From e5d6468fe9d8dced9af0c548a359a7dbeb31c931 Mon Sep 17 00:00:00 2001 From: Rocky Liao Date: Wed, 25 Mar 2020 10:26:37 +0800 Subject: Bluetooth: hci_qca: Add support for Qualcomm Bluetooth SoC QCA6390 This patch adds support for QCA6390, including the devicetree and acpi compatible hwid matching, and patch/nvm downloading. Signed-off-by: Rocky Liao Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btqca.c | 18 +++++++++++++----- drivers/bluetooth/btqca.h | 3 ++- drivers/bluetooth/hci_qca.c | 40 +++++++++++++++++++++++++++++++++------- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index a16845c0751d..3ea866d44568 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -32,7 +32,7 @@ int qca_read_soc_version(struct hci_dev *hdev, u32 *soc_version, * VSE event. WCN3991 sends version command response as a payload to * command complete event. */ - if (soc_type == QCA_WCN3991) { + if (soc_type >= QCA_WCN3991) { event_type = 0; rlen += 1; rtype = EDL_PATCH_VER_REQ_CMD; @@ -69,7 +69,7 @@ int qca_read_soc_version(struct hci_dev *hdev, u32 *soc_version, goto out; } - if (soc_type == QCA_WCN3991) + if (soc_type >= QCA_WCN3991) memmove(&edl->data, &edl->data[1], sizeof(*ver)); ver = (struct qca_btsoc_version *)(edl->data); @@ -217,7 +217,7 @@ static void qca_tlv_check_data(struct qca_fw_config *config, tlv_nvm->data[0] |= 0x80; /* UART Baud Rate */ - if (soc_type == QCA_WCN3991) + if (soc_type >= QCA_WCN3991) tlv_nvm->data[1] = nvm_baud_rate; else tlv_nvm->data[2] = nvm_baud_rate; @@ -268,7 +268,7 @@ static int qca_tlv_send_segment(struct hci_dev *hdev, int seg_size, * VSE event. WCN3991 sends version command response as a payload to * command complete event. */ - if (soc_type == QCA_WCN3991) { + if (soc_type >= QCA_WCN3991) { event_type = 0; rlen = sizeof(*edl); rtype = EDL_PATCH_TLV_REQ_CMD; @@ -301,7 +301,7 @@ static int qca_tlv_send_segment(struct hci_dev *hdev, int seg_size, err = -EIO; } - if (soc_type == QCA_WCN3991) + if (soc_type >= QCA_WCN3991) goto out; tlv_resp = (struct tlv_seg_resp *)(edl->data); @@ -442,6 +442,11 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, (soc_ver & 0x0000000f); snprintf(config.fwname, sizeof(config.fwname), "qca/crbtfw%02x.tlv", rom_ver); + } else if (soc_type == QCA_QCA6390) { + rom_ver = ((soc_ver & 0x00000f00) >> 0x04) | + (soc_ver & 0x0000000f); + snprintf(config.fwname, sizeof(config.fwname), + "qca/htbtfw%02x.tlv", rom_ver); } else { snprintf(config.fwname, sizeof(config.fwname), "qca/rampatch_%08x.bin", soc_ver); @@ -464,6 +469,9 @@ int qca_uart_setup(struct hci_dev *hdev, uint8_t baudrate, else if (qca_is_wcn399x(soc_type)) snprintf(config.fwname, sizeof(config.fwname), "qca/crnv%02x.bin", rom_ver); + else if (soc_type == QCA_QCA6390) + snprintf(config.fwname, sizeof(config.fwname), + "qca/htnv%02x.bin", rom_ver); else snprintf(config.fwname, sizeof(config.fwname), "qca/nvm_%08x.bin", soc_ver); diff --git a/drivers/bluetooth/btqca.h b/drivers/bluetooth/btqca.h index e16a4d650597..6e1e62dd4b95 100644 --- a/drivers/bluetooth/btqca.h +++ b/drivers/bluetooth/btqca.h @@ -125,8 +125,9 @@ enum qca_btsoc_type { QCA_AR3002, QCA_ROME, QCA_WCN3990, - QCA_WCN3991, QCA_WCN3998, + QCA_WCN3991, + QCA_QCA6390, }; #if IS_ENABLED(CONFIG_BT_QCA) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 439392b1c043..d0ac554584a4 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1596,7 +1597,7 @@ static int qca_setup(struct hci_uart *hu) set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); bt_dev_info(hdev, "setting up %s", - qca_is_wcn399x(soc_type) ? "wcn399x" : "ROME"); + qca_is_wcn399x(soc_type) ? "wcn399x" : "ROME/QCA6390"); retry: ret = qca_power_on(hdev); @@ -1665,10 +1666,10 @@ retry: } /* Setup bdaddr */ - if (qca_is_wcn399x(soc_type)) - hu->hdev->set_bdaddr = qca_set_bdaddr; - else + if (soc_type == QCA_ROME) hu->hdev->set_bdaddr = qca_set_bdaddr_rome; + else + hu->hdev->set_bdaddr = qca_set_bdaddr; return ret; } @@ -1721,6 +1722,11 @@ static const struct qca_vreg_data qca_soc_data_wcn3998 = { .num_vregs = 4, }; +static const struct qca_vreg_data qca_soc_data_qca6390 = { + .soc_type = QCA_QCA6390, + .num_vregs = 0, +}; + static void qca_power_shutdown(struct hci_uart *hu) { struct qca_serdev *qcadev; @@ -1764,7 +1770,7 @@ static int qca_power_off(struct hci_dev *hdev) enum qca_btsoc_type soc_type = qca_soc_type(hu); /* Stop sending shutdown command if soc crashes. */ - if (qca_is_wcn399x(soc_type) + if (soc_type != QCA_ROME && qca->memdump_state == QCA_MEMDUMP_IDLE) { qca_send_pre_shutdown_cmd(hdev); usleep_range(8000, 10000); @@ -1900,7 +1906,11 @@ static int qca_serdev_probe(struct serdev_device *serdev) return err; } } else { - qcadev->btsoc_type = QCA_ROME; + if (data) + qcadev->btsoc_type = data->soc_type; + else + qcadev->btsoc_type = QCA_ROME; + qcadev->bt_en = devm_gpiod_get_optional(&serdev->dev, "enable", GPIOD_OUT_LOW); if (!qcadev->bt_en) { @@ -2044,21 +2054,37 @@ static int __maybe_unused qca_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(qca_pm_ops, qca_suspend, qca_resume); +#ifdef CONFIG_OF static const struct of_device_id qca_bluetooth_of_match[] = { { .compatible = "qcom,qca6174-bt" }, + { .compatible = "qcom,qca6390-bt", .data = &qca_soc_data_qca6390}, { .compatible = "qcom,wcn3990-bt", .data = &qca_soc_data_wcn3990}, { .compatible = "qcom,wcn3991-bt", .data = &qca_soc_data_wcn3991}, { .compatible = "qcom,wcn3998-bt", .data = &qca_soc_data_wcn3998}, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, qca_bluetooth_of_match); +#endif + +#ifdef CONFIG_ACPI +static const struct acpi_device_id qca_bluetooth_acpi_match[] = { + { "QCOM6390", (kernel_ulong_t)&qca_soc_data_qca6390 }, + { "DLA16390", (kernel_ulong_t)&qca_soc_data_qca6390 }, + { "DLB16390", (kernel_ulong_t)&qca_soc_data_qca6390 }, + { "DLB26390", (kernel_ulong_t)&qca_soc_data_qca6390 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, qca_bluetooth_acpi_match); +#endif + static struct serdev_device_driver qca_serdev_driver = { .probe = qca_serdev_probe, .remove = qca_serdev_remove, .driver = { .name = "hci_uart_qca", - .of_match_table = qca_bluetooth_of_match, + .of_match_table = of_match_ptr(qca_bluetooth_of_match), + .acpi_match_table = ACPI_PTR(qca_bluetooth_acpi_match), .pm = &qca_pm_ops, }, }; -- cgit v1.2.3-59-g8ed1b From 139dfad6cfa0ff816ea06d70132b164a44257c12 Mon Sep 17 00:00:00 2001 From: Rocky Liao Date: Wed, 25 Mar 2020 10:26:38 +0800 Subject: dt-bindings: net: bluetooth: Add device tree bindings for QCA chip QCA6390 This patch adds compatible string for the QCA chip QCA6390. Signed-off-by: Rocky Liao Signed-off-by: Marcel Holtmann --- Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt index beca6466d59a..badf597c0e58 100644 --- a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt +++ b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt @@ -13,6 +13,7 @@ Required properties: * "qcom,wcn3990-bt" * "qcom,wcn3991-bt" * "qcom,wcn3998-bt" + * "qcom,qca6390-bt" Optional properties for compatible string qcom,qca6174-bt: -- cgit v1.2.3-59-g8ed1b From b86b0b150fed840c376145383ef5105116c81b0c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 27 Mar 2020 11:32:14 -0700 Subject: Bluetooth: L2CAP: Fix handling LE modes by L2CAP_OPTIONS L2CAP_OPTIONS shall only be used with BR/EDR modes. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_sock.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 117ba20ea194..cfb402645c26 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -424,6 +424,20 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, break; } + /* Only BR/EDR modes are supported here */ + switch (chan->mode) { + case L2CAP_MODE_BASIC: + case L2CAP_MODE_ERTM: + case L2CAP_MODE_STREAMING: + break; + default: + err = -EINVAL; + break; + } + + if (err < 0) + break; + memset(&opts, 0, sizeof(opts)); opts.imtu = chan->imtu; opts.omtu = chan->omtu; @@ -698,10 +712,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } - chan->mode = opts.mode; - switch (chan->mode) { - case L2CAP_MODE_LE_FLOWCTL: - break; + /* Only BR/EDR modes are supported here */ + switch (opts.mode) { case L2CAP_MODE_BASIC: clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); break; @@ -715,6 +727,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } + if (err < 0) + break; + + chan->mode = opts.mode; + BT_DBG("mode 0x%2.2x", chan->mode); chan->imtu = opts.imtu; -- cgit v1.2.3-59-g8ed1b From 3ee7b7cd83900bb711efadbf16fa096a615a1566 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 27 Mar 2020 11:32:15 -0700 Subject: Bluetooth: Add BT_MODE socket option This adds BT_MODE socket option which can be used to set L2CAP modes, including modes only supported over LE which were not supported using the L2CAP_OPTIONS. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 8 +++ net/bluetooth/l2cap_sock.c | 113 +++++++++++++++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 1 deletion(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 1576353a2773..3fa7b1e3c5d9 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -139,6 +139,14 @@ struct bt_voice { #define BT_PHY_LE_CODED_TX 0x00002000 #define BT_PHY_LE_CODED_RX 0x00004000 +#define BT_MODE 15 + +#define BT_MODE_BASIC 0x00 +#define BT_MODE_ERTM 0x01 +#define BT_MODE_STREAMING 0x02 +#define BT_MODE_LE_FLOWCTL 0x03 +#define BT_MODE_EXT_FLOWCTL 0x04 + __printf(1, 2) void bt_info(const char *fmt, ...); __printf(1, 2) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index cfb402645c26..1cea42ee1e92 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -395,6 +395,24 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, return sizeof(struct sockaddr_l2); } +static int l2cap_get_mode(struct l2cap_chan *chan) +{ + switch (chan->mode) { + case L2CAP_MODE_BASIC: + return BT_MODE_BASIC; + case L2CAP_MODE_ERTM: + return BT_MODE_ERTM; + case L2CAP_MODE_STREAMING: + return BT_MODE_STREAMING; + case L2CAP_MODE_LE_FLOWCTL: + return BT_MODE_LE_FLOWCTL; + case L2CAP_MODE_EXT_FLOWCTL: + return BT_MODE_EXT_FLOWCTL; + } + + return -EINVAL; +} + static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { @@ -522,7 +540,7 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, struct bt_security sec; struct bt_power pwr; u32 phys; - int len, err = 0; + int len, mode, err = 0; BT_DBG("sk %p", sk); @@ -638,6 +656,27 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_MODE: + if (!enable_ecred) { + err = -ENOPROTOOPT; + break; + } + + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + err = -EINVAL; + break; + } + + mode = l2cap_get_mode(chan); + if (mode < 0) { + err = mode; + break; + } + + if (put_user(mode, (u8 __user *) optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; @@ -780,6 +819,45 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, return err; } +static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode) +{ + switch (mode) { + case BT_MODE_BASIC: + if (bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_BASIC; + clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); + break; + case BT_MODE_ERTM: + if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_ERTM; + break; + case BT_MODE_STREAMING: + if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_STREAMING; + break; + case BT_MODE_LE_FLOWCTL: + if (!bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_LE_FLOWCTL; + break; + case BT_MODE_EXT_FLOWCTL: + /* TODO: Add support for ECRED PDUs to BR/EDR */ + if (!bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_EXT_FLOWCTL; + break; + default: + return -EINVAL; + } + + chan->mode = mode; + + return 0; +} + static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -985,6 +1063,39 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; + case BT_MODE: + if (!enable_ecred) { + err = -ENOPROTOOPT; + break; + } + + BT_DBG("sk->sk_state %u", sk->sk_state); + + if (sk->sk_state != BT_BOUND) { + err = -EINVAL; + break; + } + + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u8 __user *) optval)) { + err = -EFAULT; + break; + } + + BT_DBG("opt %u", opt); + + err = l2cap_set_mode(chan, opt); + if (err) + break; + + BT_DBG("mode 0x%2.2x", chan->mode); + + break; + default: err = -ENOPROTOOPT; break; -- cgit v1.2.3-59-g8ed1b From 92516cd97fd4d8ad5b1421a0d51771044f453a5f Mon Sep 17 00:00:00 2001 From: Sonny Sasaka Date: Fri, 27 Mar 2020 17:34:23 -0700 Subject: Bluetooth: Always request for user confirmation for Just Works To improve security, always give the user-space daemon a chance to accept or reject a Just Works pairing (LE). The daemon may decide to auto-accept based on the user's intent. Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 1476a91ce935..d0b695ee49f6 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -855,6 +855,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, struct smp_chan *smp = chan->data; u32 passkey = 0; int ret = 0; + int err; /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); @@ -883,9 +884,16 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) smp->method = JUST_WORKS; - /* If Just Works, Continue with Zero TK */ + /* If Just Works, Continue with Zero TK and ask user-space for + * confirmation */ if (smp->method == JUST_WORKS) { - set_bit(SMP_FLAG_TK_VALID, &smp->flags); + err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, + hcon->type, + hcon->dst_type, + passkey, 1); + if (err) + return SMP_UNSPECIFIED; + set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } -- cgit v1.2.3-59-g8ed1b From b25e4df4a83e516efbdeeefb5b2d3e259639a56e Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Thu, 2 Apr 2020 14:55:18 +0200 Subject: Bluetooth: hci_bcm: respect IRQ polarity from DT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IRQ polarity is be configured in bcm_setup_sleep(). Make the configured value match what is in the DeviceTree. Cc: stable@vger.kernel.org Fixes: f25a96c8eb46 ("Bluetooth: hci_bcm: enable IRQ capability from devicetree") Signed-off-by: Michał Mirosław Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_bcm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index b236cb11c0dc..36b7f0d00c4b 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -1153,7 +1153,8 @@ static int bcm_of_probe(struct bcm_device *bdev) device_property_read_u8_array(bdev->dev, "brcm,bt-pcm-int-params", bdev->pcm_int_params, 5); bdev->irq = of_irq_get_byname(bdev->dev->of_node, "host-wakeup"); - + bdev->irq_active_low = irq_get_trigger_type(bdev->irq) + & (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_LEVEL_LOW); return 0; } -- cgit v1.2.3-59-g8ed1b From 81bd5d0c62437c02caac6b3f942fcda874063cb0 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Thu, 2 Apr 2020 14:55:20 +0200 Subject: Bluetooth: hci_bcm: fix freeing not-requested IRQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When BT module can't be initialized, but it has an IRQ, unloading the driver WARNs when trying to free not-yet-requested IRQ. Fix it by noting whether the IRQ was requested. WARNING: CPU: 2 PID: 214 at kernel/irq/devres.c:144 devm_free_irq+0x49/0x4ca [...] WARNING: CPU: 2 PID: 214 at kernel/irq/manage.c:1746 __free_irq+0x8b/0x27c Trying to free already-free IRQ 264 Modules linked in: hci_uart(-) btbcm bluetooth ecdh_generic ecc libaes CPU: 2 PID: 214 Comm: rmmod Tainted: G W 5.6.1mq-00044-ga5f9ea098318-dirty #928 [...] [] (devm_free_irq) from [] (bcm_close+0x97/0x118 [hci_uart]) [] (bcm_close [hci_uart]) from [] (hci_uart_unregister_device+0x33/0x3c [hci_uart]) [] (hci_uart_unregister_device [hci_uart]) from [] (serdev_drv_remove+0x13/0x20) [] (serdev_drv_remove) from [] (device_release_driver_internal+0x97/0x118) [] (device_release_driver_internal) from [] (driver_detach+0x2f/0x58) [] (driver_detach) from [] (bus_remove_driver+0x41/0x94) [] (bus_remove_driver) from [] (bcm_deinit+0x1b/0x740 [hci_uart]) [] (bcm_deinit [hci_uart]) from [] (hci_uart_exit+0x13/0x30 [hci_uart]) [] (hci_uart_exit [hci_uart]) from [] (sys_delete_module+0x109/0x1d0) [] (sys_delete_module) from [] (ret_fast_syscall+0x1/0x5a) [...] Cc: stable@vger.kernel.org Fixes: 6cc4396c8829 ("Bluetooth: hci_bcm: Add wake-up capability") Signed-off-by: Michał Mirosław Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_bcm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 36b7f0d00c4b..19e4587f366c 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -118,6 +118,7 @@ struct bcm_device { u32 oper_speed; int irq; bool irq_active_low; + bool irq_acquired; #ifdef CONFIG_PM struct hci_uart *hu; @@ -333,6 +334,8 @@ static int bcm_request_irq(struct bcm_data *bcm) goto unlock; } + bdev->irq_acquired = true; + device_init_wakeup(bdev->dev, true); pm_runtime_set_autosuspend_delay(bdev->dev, @@ -514,7 +517,7 @@ static int bcm_close(struct hci_uart *hu) } if (bdev) { - if (IS_ENABLED(CONFIG_PM) && bdev->irq > 0) { + if (IS_ENABLED(CONFIG_PM) && bdev->irq_acquired) { devm_free_irq(bdev->dev, bdev->irq, bdev); device_init_wakeup(bdev->dev, false); pm_runtime_disable(bdev->dev); -- cgit v1.2.3-59-g8ed1b From 7fedd3bb6b77f9b6eefb0e4dcd8f79d0d00b86d7 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Mon, 23 Mar 2020 12:45:07 -0700 Subject: Bluetooth: Prioritize SCO traffic When scheduling TX packets, send all SCO/eSCO packets first, check for pending SCO/eSCO packets after every ACL/LE packet and send them if any are pending. This is done to make sure that we can meet SCO deadlines on slow interfaces like UART. If we were to queue up multiple ACL packets without checking for a SCO packet, we might miss the SCO timing. For example: The time it takes to send a maximum size ACL packet (1024 bytes): t = 10/8 * 1024 bytes * 8 bits/byte * 1 packet / baudrate where 10/8 is uart overhead due to start/stop bits per byte Replace t = 3.75ms (SCO deadline), which gives us a baudrate of 2730666. At a baudrate of 3000000, if we didn't check for SCO packets within 1024 bytes, we would miss the 3.75ms timing window. Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 106 +++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 49 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2e7bc2da8371..5fb9db0b2b7b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4240,6 +4240,54 @@ static void __check_timeout(struct hci_dev *hdev, unsigned int cnt) } } +/* Schedule SCO */ +static void hci_sched_sco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + if (!hci_conn_num(hdev, SCO_LINK)) + return; + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, "e))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(hdev, skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + +static void hci_sched_esco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + if (!hci_conn_num(hdev, ESCO_LINK)) + return; + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, + "e))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(hdev, skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + static void hci_sched_acl_pkt(struct hci_dev *hdev) { unsigned int cnt = hdev->acl_cnt; @@ -4271,6 +4319,10 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev) hdev->acl_cnt--; chan->sent++; chan->conn->sent++; + + /* Send pending SCO packets right away */ + hci_sched_sco(hdev); + hci_sched_esco(hdev); } } @@ -4355,54 +4407,6 @@ static void hci_sched_acl(struct hci_dev *hdev) } } -/* Schedule SCO */ -static void hci_sched_sco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, SCO_LINK)) - return; - - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, "e))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - -static void hci_sched_esco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, ESCO_LINK)) - return; - - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, - "e))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; @@ -4437,6 +4441,10 @@ static void hci_sched_le(struct hci_dev *hdev) cnt--; chan->sent++; chan->conn->sent++; + + /* Send pending SCO packets right away */ + hci_sched_sco(hdev); + hci_sched_esco(hdev); } } @@ -4459,9 +4467,9 @@ static void hci_tx_work(struct work_struct *work) if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ - hci_sched_acl(hdev); hci_sched_sco(hdev); hci_sched_esco(hdev); + hci_sched_acl(hdev); hci_sched_le(hdev); } -- cgit v1.2.3-59-g8ed1b From 1e5479be46a70389e1059818a2e9358858eaa5fc Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Fri, 3 Apr 2020 13:49:05 +0000 Subject: Bluetooth: fixing minor typo in comment This changes a simple typo in hci_event.c Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0a591be8b0ae..ddf77304aa8e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5269,7 +5269,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, case HCI_AUTO_CONN_ALWAYS: /* Devices advertising with ADV_IND or ADV_DIRECT_IND * are triggering a connection attempt. This means - * that incoming connectioms from slave device are + * that incoming connections from slave device are * accepted and also outgoing connections to slave * devices are established when found. */ -- cgit v1.2.3-59-g8ed1b From 1f8330ea1692c9c490b1e566e31d96d8cef99dd8 Mon Sep 17 00:00:00 2001 From: Sathish Narsimman Date: Fri, 3 Apr 2020 21:43:58 +0200 Subject: Bluetooth: add support to notify using SCO air mode notifying using HCI_NOTIFY_CONN_ADD for SCO connection is generic in case of mSBC audio. To differntiate SCO air mode introducing HCI_NOTIFY_ENABLE_SCO_CVSD and HCI_NOTIFY_ENABLE_SCO_TRANSP. Signed-off-by: Sathish Narsimman Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 3 +++ net/bluetooth/hci_conn.c | 25 +++++++++++++++++++++---- net/bluetooth/hci_event.c | 23 ++++++++++++++++++++++- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5f60e135aeb6..9ff2f7a9e131 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -53,6 +53,9 @@ #define HCI_NOTIFY_CONN_ADD 1 #define HCI_NOTIFY_CONN_DEL 2 #define HCI_NOTIFY_VOICE_SETTING 3 +#define HCI_NOTIFY_ENABLE_SCO_CVSD 4 +#define HCI_NOTIFY_ENABLE_SCO_TRANSP 5 +#define HCI_NOTIFY_DISABLE_SCO 6 /* HCI bus types */ #define HCI_VIRTUAL 0 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e245bc155cc2..07c34c55fc50 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -122,8 +122,18 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_conn_hash_del(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + case SCO_AIRMODE_TRANSP: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_DISABLE_SCO); + break; + } + } else { + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + } hci_conn_del_sysfs(conn); @@ -577,8 +587,15 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, hci_dev_hold(hdev); hci_conn_hash_add(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + + /* The SCO and eSCO connections will only be notified when their + * setup has been completed. This is different to ACL links which + * can be notified right away. + */ + if (conn->type != SCO_LINK && conn->type != ESCO_LINK) { + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + } hci_conn_init_sysfs(conn); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ddf77304aa8e..af396cb69602 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2607,8 +2607,16 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (ev->status) { hci_connect_cfm(conn, ev->status); hci_conn_del(conn); - } else if (ev->link_type != ACL_LINK) + } else if (ev->link_type == SCO_LINK) { + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + break; + } + hci_connect_cfm(conn, ev->status); + } unlock: hci_dev_unlock(hdev); @@ -4307,6 +4315,19 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, break; } + bt_dev_dbg(hdev, "SCO connected with air mode: %02x", ev->air_mode); + + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + break; + case SCO_AIRMODE_TRANSP: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP); + break; + } + hci_connect_cfm(conn, ev->status); if (ev->status) hci_conn_del(conn); -- cgit v1.2.3-59-g8ed1b From baac6276c0a9f36f1fe1f00590ef00d2ba5ba626 Mon Sep 17 00:00:00 2001 From: Sathish Narasimman Date: Fri, 3 Apr 2020 21:43:59 +0200 Subject: Bluetooth: btusb: handle mSBC audio over USB Endpoints For mSBC encoded audio stream over usb transport, btusb driver to be set to alternate settings 6 as per BT core spec 5.0. The type of air mode is used to differenting which alt setting to be used. The changes are made considering some discussion over the similar patch submitted earlier from Kuba Pawlak (link below) https://www.spinics.net/lists/linux-bluetooth/msg64577.html Reported-by: kbuild test robot Signed-off-by: Sathish Narasimman Signed-off-by: Chethan T N Signed-off-by: Hsin-Yu Chao Signed-off-by: Amit K Bag Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- drivers/bluetooth/btusb.c | 156 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 119 insertions(+), 37 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 3bdec42c9612..110e96b245e5 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -492,6 +492,8 @@ struct btusb_data { __u8 cmdreq; unsigned int sco_num; + unsigned int air_mode; + bool usb_alt6_packet_flow; int isoc_altsetting; int suspend_count; @@ -983,6 +985,42 @@ static void btusb_isoc_complete(struct urb *urb) } } +static inline void __fill_isoc_descriptor_msbc(struct urb *urb, int len, + int mtu, struct btusb_data *data) +{ + int i, offset = 0; + unsigned int interval; + + BT_DBG("len %d mtu %d", len, mtu); + + /* For mSBC ALT 6 setting the host will send the packet at continuous + * flow. As per core spec 5, vol 4, part B, table 2.1. For ALT setting + * 6 the HCI PACKET INTERVAL should be 7.5ms for every usb packets. + * To maintain the rate we send 63bytes of usb packets alternatively for + * 7ms and 8ms to maintain the rate as 7.5ms. + */ + if (data->usb_alt6_packet_flow) { + interval = 7; + data->usb_alt6_packet_flow = false; + } else { + interval = 6; + data->usb_alt6_packet_flow = true; + } + + for (i = 0; i < interval; i++) { + urb->iso_frame_desc[i].offset = offset; + urb->iso_frame_desc[i].length = offset; + } + + if (len && i < BTUSB_MAX_ISOC_FRAMES) { + urb->iso_frame_desc[i].offset = offset; + urb->iso_frame_desc[i].length = len; + i++; + } + + urb->number_of_packets = i; +} + static inline void __fill_isoc_descriptor(struct urb *urb, int len, int mtu) { int i, offset = 0; @@ -1386,9 +1424,13 @@ static struct urb *alloc_isoc_urb(struct hci_dev *hdev, struct sk_buff *skb) urb->transfer_flags = URB_ISO_ASAP; - __fill_isoc_descriptor(urb, skb->len, - le16_to_cpu(data->isoc_tx_ep->wMaxPacketSize)); - + if (data->isoc_altsetting == 6) + __fill_isoc_descriptor_msbc(urb, skb->len, + le16_to_cpu(data->isoc_tx_ep->wMaxPacketSize), + data); + else + __fill_isoc_descriptor(urb, skb->len, + le16_to_cpu(data->isoc_tx_ep->wMaxPacketSize)); skb->dev = (void *)hdev; return urb; @@ -1484,6 +1526,7 @@ static void btusb_notify(struct hci_dev *hdev, unsigned int evt) if (hci_conn_num(hdev, SCO_LINK) != data->sco_num) { data->sco_num = hci_conn_num(hdev, SCO_LINK); + data->air_mode = evt; schedule_work(&data->work); } } @@ -1531,11 +1574,67 @@ static inline int __set_isoc_interface(struct hci_dev *hdev, int altsetting) return 0; } +static int btusb_switch_alt_setting(struct hci_dev *hdev, int new_alts) +{ + struct btusb_data *data = hci_get_drvdata(hdev); + int err; + + if (data->isoc_altsetting != new_alts) { + unsigned long flags; + + clear_bit(BTUSB_ISOC_RUNNING, &data->flags); + usb_kill_anchored_urbs(&data->isoc_anchor); + + /* When isochronous alternate setting needs to be + * changed, because SCO connection has been added + * or removed, a packet fragment may be left in the + * reassembling state. This could lead to wrongly + * assembled fragments. + * + * Clear outstanding fragment when selecting a new + * alternate setting. + */ + spin_lock_irqsave(&data->rxlock, flags); + kfree_skb(data->sco_skb); + data->sco_skb = NULL; + spin_unlock_irqrestore(&data->rxlock, flags); + + err = __set_isoc_interface(hdev, new_alts); + if (err < 0) + return err; + } + + if (!test_and_set_bit(BTUSB_ISOC_RUNNING, &data->flags)) { + if (btusb_submit_isoc_urb(hdev, GFP_KERNEL) < 0) + clear_bit(BTUSB_ISOC_RUNNING, &data->flags); + else + btusb_submit_isoc_urb(hdev, GFP_KERNEL); + } + + return 0; +} + +static struct usb_host_interface *btusb_find_altsetting(struct btusb_data *data, + int alt) +{ + struct usb_interface *intf = data->isoc; + int i; + + BT_DBG("Looking for Alt no :%d", alt); + + for (i = 0; i < intf->num_altsetting; i++) { + if (intf->altsetting[i].desc.bAlternateSetting == alt) + return &intf->altsetting[i]; + } + + return NULL; +} + static void btusb_work(struct work_struct *work) { struct btusb_data *data = container_of(work, struct btusb_data, work); struct hci_dev *hdev = data->hdev; - int new_alts; + int new_alts = 0; int err; if (data->sco_num > 0) { @@ -1550,44 +1649,27 @@ static void btusb_work(struct work_struct *work) set_bit(BTUSB_DID_ISO_RESUME, &data->flags); } - if (hdev->voice_setting & 0x0020) { - static const int alts[3] = { 2, 4, 5 }; - - new_alts = alts[data->sco_num - 1]; - } else { - new_alts = data->sco_num; - } - - if (data->isoc_altsetting != new_alts) { - unsigned long flags; + if (data->air_mode == HCI_NOTIFY_ENABLE_SCO_CVSD) { + if (hdev->voice_setting & 0x0020) { + static const int alts[3] = { 2, 4, 5 }; - clear_bit(BTUSB_ISOC_RUNNING, &data->flags); - usb_kill_anchored_urbs(&data->isoc_anchor); - - /* When isochronous alternate setting needs to be - * changed, because SCO connection has been added - * or removed, a packet fragment may be left in the - * reassembling state. This could lead to wrongly - * assembled fragments. - * - * Clear outstanding fragment when selecting a new - * alternate setting. - */ - spin_lock_irqsave(&data->rxlock, flags); - kfree_skb(data->sco_skb); - data->sco_skb = NULL; - spin_unlock_irqrestore(&data->rxlock, flags); + new_alts = alts[data->sco_num - 1]; + } else { + new_alts = data->sco_num; + } + } else if (data->air_mode == HCI_NOTIFY_ENABLE_SCO_TRANSP) { - if (__set_isoc_interface(hdev, new_alts) < 0) - return; - } + data->usb_alt6_packet_flow = true; - if (!test_and_set_bit(BTUSB_ISOC_RUNNING, &data->flags)) { - if (btusb_submit_isoc_urb(hdev, GFP_KERNEL) < 0) - clear_bit(BTUSB_ISOC_RUNNING, &data->flags); + /* Check if Alt 6 is supported for Transparent audio */ + if (btusb_find_altsetting(data, 6)) + new_alts = 6; else - btusb_submit_isoc_urb(hdev, GFP_KERNEL); + bt_dev_err(hdev, "Device does not support ALT setting 6"); } + + if (btusb_switch_alt_setting(hdev, new_alts) < 0) + bt_dev_err(hdev, "set USB alt:(%d) failed!", new_alts); } else { clear_bit(BTUSB_ISOC_RUNNING, &data->flags); usb_kill_anchored_urbs(&data->isoc_anchor); -- cgit v1.2.3-59-g8ed1b From 3d2336042ae3555d4b77995402291c5795882d20 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:00 +0200 Subject: Bluetooth: Move debugfs configuration above the selftests This is just a cosmetic clean to move the selftests configuration option to the bottom of the list of options. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/Kconfig | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 165148c7c4ce..77703216a2e3 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -93,6 +93,14 @@ config BT_LEDS This option selects a few LED triggers for different Bluetooth events. +config BT_DEBUGFS + bool "Export Bluetooth internals in debugfs" + depends on BT && DEBUG_FS + default y + help + Provide extensive information about internal Bluetooth states + in debugfs. + config BT_SELFTEST bool "Bluetooth self testing support" depends on BT && DEBUG_KERNEL @@ -120,12 +128,4 @@ config BT_SELFTEST_SMP Run test cases for SMP cryptographic functionality, including both legacy SMP as well as the Secure Connections features. -config BT_DEBUGFS - bool "Export Bluetooth internals in debugfs" - depends on BT && DEBUG_FS - default y - help - Provide extensive information about internal Bluetooth states - in debugfs. - source "drivers/bluetooth/Kconfig" -- cgit v1.2.3-59-g8ed1b From 145373cb1b1fcdba2059e945d0aa2613af2e84d1 Mon Sep 17 00:00:00 2001 From: Miao-chen Chou Date: Fri, 3 Apr 2020 21:44:01 +0200 Subject: Bluetooth: Add framework for Microsoft vendor extension Micrsoft defined a set for HCI vendor extensions. Check the following link for details: https://docs.microsoft.com/en-us/windows-hardware/drivers/bluetooth/microsoft-defined-bluetooth-hci-commands-and-events This provides the basic framework to enable the extension and read its supported features. Drivers still have to declare support for this extension before it can be utilized by the host stack. Signed-off-by: Miao-chen Chou Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 13 ++++ net/bluetooth/Kconfig | 7 ++ net/bluetooth/Makefile | 1 + net/bluetooth/hci_core.c | 5 ++ net/bluetooth/hci_event.c | 5 ++ net/bluetooth/msft.c | 141 +++++++++++++++++++++++++++++++++++++++ net/bluetooth/msft.h | 18 +++++ 7 files changed, 190 insertions(+) create mode 100644 net/bluetooth/msft.c create mode 100644 net/bluetooth/msft.h diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index d4e28773d378..3cb0f82d0c83 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -484,6 +484,11 @@ struct hci_dev { struct led_trigger *power_led; #endif +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + __u16 msft_opcode; + void *msft_data; +#endif + int (*open)(struct hci_dev *hdev); int (*close)(struct hci_dev *hdev); int (*flush)(struct hci_dev *hdev); @@ -1116,6 +1121,14 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb); __printf(2, 3) void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...); __printf(2, 3) void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...); + +static inline void hci_set_msft_opcode(struct hci_dev *hdev, __u16 opcode) +{ +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + hdev->msft_opcode = opcode; +#endif +} + int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 77703216a2e3..9e25c6570170 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -93,6 +93,13 @@ config BT_LEDS This option selects a few LED triggers for different Bluetooth events. +config BT_MSFTEXT + bool "Enable Microsoft extensions" + depends on BT + help + This options enables support for the Microsoft defined HCI + vendor extensions. + config BT_DEBUGFS bool "Export Bluetooth internals in debugfs" depends on BT && DEBUG_FS diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index fda41c0b4781..41dd541a44a5 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -19,5 +19,6 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o bluetooth-$(CONFIG_BT_LEDS) += leds.o +bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5fb9db0b2b7b..ef0ee3a3d9ed 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -44,6 +44,7 @@ #include "hci_debugfs.h" #include "smp.h" #include "leds.h" +#include "msft.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -1563,6 +1564,8 @@ setup_failed: hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); + msft_do_open(hdev); + clear_bit(HCI_INIT, &hdev->flags); if (!ret) { @@ -1758,6 +1761,8 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_DOWN); + msft_do_close(hdev); + if (hdev->flush) hdev->flush(hdev); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index af396cb69602..2803beaa1c44 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -35,6 +35,7 @@ #include "a2mp.h" #include "amp.h" #include "smp.h" +#include "msft.h" #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" @@ -6166,6 +6167,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_num_comp_blocks_evt(hdev, skb); break; + case HCI_EV_VENDOR: + msft_vendor_evt(hdev, skb); + break; + default: BT_DBG("%s event 0x%2.2x", hdev->name, event); break; diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c new file mode 100644 index 000000000000..d6c4e6b5ae77 --- /dev/null +++ b/net/bluetooth/msft.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google Corporation + */ + +#include +#include + +#include "msft.h" + +#define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 +struct msft_cp_read_supported_features { + __u8 sub_opcode; +} __packed; +struct msft_rp_read_supported_features { + __u8 status; + __u8 sub_opcode; + __le64 features; + __u8 evt_prefix_len; + __u8 evt_prefix[0]; +} __packed; + +struct msft_data { + __u64 features; + __u8 evt_prefix_len; + __u8 *evt_prefix; +}; + +static bool read_supported_features(struct hci_dev *hdev, + struct msft_data *msft) +{ + struct msft_cp_read_supported_features cp; + struct msft_rp_read_supported_features *rp; + struct sk_buff *skb; + + cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES; + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", + PTR_ERR(skb)); + return false; + } + + if (skb->len < sizeof(*rp)) { + bt_dev_err(hdev, "MSFT supported features length mismatch"); + goto failed; + } + + rp = (struct msft_rp_read_supported_features *)skb->data; + + if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES) + goto failed; + + if (rp->evt_prefix_len > 0) { + msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len, + GFP_KERNEL); + if (!msft->evt_prefix) + goto failed; + } + + msft->evt_prefix_len = rp->evt_prefix_len; + msft->features = __le64_to_cpu(rp->features); + + kfree_skb(skb); + return true; + +failed: + kfree_skb(skb); + return false; +} + +void msft_do_open(struct hci_dev *hdev) +{ + struct msft_data *msft; + + if (hdev->msft_opcode == HCI_OP_NOP) + return; + + bt_dev_dbg(hdev, "Initialize MSFT extension"); + + msft = kzalloc(sizeof(*msft), GFP_KERNEL); + if (!msft) + return; + + if (!read_supported_features(hdev, msft)) { + kfree(msft); + return; + } + + hdev->msft_data = msft; +} + +void msft_do_close(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + bt_dev_dbg(hdev, "Cleanup of MSFT extension"); + + hdev->msft_data = NULL; + + kfree(msft->evt_prefix); + kfree(msft); +} + +void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct msft_data *msft = hdev->msft_data; + u8 event; + + if (!msft) + return; + + /* When the extension has defined an event prefix, check that it + * matches, and otherwise just return. + */ + if (msft->evt_prefix_len > 0) { + if (skb->len < msft->evt_prefix_len) + return; + + if (memcmp(skb->data, msft->evt_prefix, msft->evt_prefix_len)) + return; + + skb_pull(skb, msft->evt_prefix_len); + } + + /* Every event starts at least with an event code and the rest of + * the data is variable and depends on the event code. + */ + if (skb->len < 1) + return; + + event = *skb->data; + skb_pull(skb, 1); + + bt_dev_dbg(hdev, "MSFT vendor event %u", event); +} diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h new file mode 100644 index 000000000000..5aa9130e1f8a --- /dev/null +++ b/net/bluetooth/msft.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google Corporation + */ + +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + +void msft_do_open(struct hci_dev *hdev); +void msft_do_close(struct hci_dev *hdev); +void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); + +#else + +static inline void msft_do_open(struct hci_dev *hdev) {} +static inline void msft_do_close(struct hci_dev *hdev) {} +static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} + +#endif -- cgit v1.2.3-59-g8ed1b From 7fd673bcdacc8528c7d9489d31f040eac7cca164 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:02 +0200 Subject: Bluetooth: btusb: Enable Intel events even if already in operational mode In case the controller is already in operation mode, the Intel specific events will not be enabled. Fix this by jumping to a common finish section that will allow setting final details for the controller. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- drivers/bluetooth/btusb.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 110e96b245e5..43925bdeaa81 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2334,7 +2334,7 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) if (ver.fw_variant == 0x23) { clear_bit(BTUSB_BOOTLOADER, &data->flags); btintel_check_bdaddr(hdev); - return 0; + goto finish; } /* If the device is not in bootloader mode, then the only possible @@ -2534,6 +2534,14 @@ done: */ btintel_load_ddc_config(hdev, fwname); + /* Read the Intel version information after loading the FW */ + err = btintel_read_version(hdev, &ver); + if (err) + return err; + + btintel_version_info(hdev, &ver); + +finish: /* Set the event mask for Intel specific vendor events. This enables * a few extra events that are useful during general operation. It * does not enable any debugging related events. @@ -2543,13 +2551,6 @@ done: */ btintel_set_event_mask(hdev, false); - /* Read the Intel version information after loading the FW */ - err = btintel_read_version(hdev, &ver); - if (err) - return err; - - btintel_version_info(hdev, &ver); - return 0; } -- cgit v1.2.3-59-g8ed1b From fc04590e3d39213a22b7afd46c4bd5d95a6cab1f Mon Sep 17 00:00:00 2001 From: Miao-chen Chou Date: Fri, 3 Apr 2020 21:44:03 +0200 Subject: Bluetooth: btusb: Enable MSFT extension for Intel ThunderPeak devices The Intel ThundePeak BT controllers support the Microsoft vendor extension and they are using 0xFC1E for VsMsftOpCode. < HCI Command: Vendor (0x3f|0x001e) plen 1 00 > HCI Event: Command Complete (0x0e) plen 15 Vendor (0x3f|0x001e) ncmd 1 Status: Success (0x00) 00 3f 00 00 00 00 00 00 00 01 50 Signed-off-by: Miao-chen Chou Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- drivers/bluetooth/btusb.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 43925bdeaa81..09913cadd1ca 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2542,6 +2542,15 @@ done: btintel_version_info(hdev, &ver); finish: + /* All Intel controllers that support the Microsoft vendor + * extension are using 0xFC1E for VsMsftOpCode. + */ + switch (ver.hw_variant) { + case 0x12: /* ThP */ + hci_set_msft_opcode(hdev, 0xFC1E); + break; + } + /* Set the event mask for Intel specific vendor events. This enables * a few extra events that are useful during general operation. It * does not enable any debugging related events. -- cgit v1.2.3-59-g8ed1b From a479036041d6a1bcf98f72b16a425e8d45e20ae9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:04 +0200 Subject: Bluetooth: Add support for Read Local Simple Pairing Options With the Read Local Simple Pairing Options command it is possible to retrieve the support for max encryption key size supported by the controller and also if the controller correctly verifies the ECDH public key during pairing. Signed-off-by: Marcel Holtmann Reviewed-by: Alain Michaud Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 7 +++++++ include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 4 ++++ net/bluetooth/hci_event.c | 21 +++++++++++++++++++++ 4 files changed, 34 insertions(+) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 9ff2f7a9e131..086a9e9d5d03 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1275,6 +1275,13 @@ struct hci_rp_read_data_block_size { #define HCI_OP_READ_LOCAL_CODECS 0x100b +#define HCI_OP_READ_LOCAL_PAIRING_OPTS 0x100c +struct hci_rp_read_local_pairing_opts { + __u8 status; + __u8 pairing_opts; + __u8 max_key_size; +} __packed; + #define HCI_OP_READ_PAGE_SCAN_ACTIVITY 0x0c1b struct hci_rp_read_page_scan_activity { __u8 status; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 3cb0f82d0c83..2f3275f1d1c4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -312,6 +312,8 @@ struct hci_dev { __u16 conn_info_max_age; __u16 auth_payload_timeout; __u8 min_enc_key_size; + __u8 max_enc_key_size; + __u8 pairing_opts; __u8 ssp_debug_mode; __u8 hw_error_code; __u32 clock; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ef0ee3a3d9ed..589c4085499c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -827,6 +827,10 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->commands[29] & 0x20) hci_req_add(req, HCI_OP_READ_LOCAL_CODECS, 0, NULL); + /* Read local pairing options if the HCI command is supported */ + if (hdev->commands[41] & 0x08) + hci_req_add(req, HCI_OP_READ_LOCAL_PAIRING_OPTS, 0, NULL); + /* Get MWS transport configuration if the HCI command is supported */ if (hdev->commands[30] & 0x08) hci_req_add(req, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 2803beaa1c44..51e6461f0b71 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -747,6 +747,23 @@ static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb) bacpy(&hdev->setup_addr, &rp->bdaddr); } +static void hci_cc_read_local_pairing_opts(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_local_pairing_opts *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + if (hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG)) { + hdev->pairing_opts = rp->pairing_opts; + hdev->max_enc_key_size = rp->max_key_size; + } +} + static void hci_cc_read_page_scan_activity(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3343,6 +3360,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_read_bd_addr(hdev, skb); break; + case HCI_OP_READ_LOCAL_PAIRING_OPTS: + hci_cc_read_local_pairing_opts(hdev, skb); + break; + case HCI_OP_READ_PAGE_SCAN_ACTIVITY: hci_cc_read_page_scan_activity(hdev, skb); break; -- cgit v1.2.3-59-g8ed1b From bc292258c580a82c9baef0a64f66971e010a40a9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:05 +0200 Subject: Bluetooth: Add support for reading security information To allow userspace to make correcty security policy decision, the kernel needs to export a few details of the supported security features and encryption key size information. This command exports this information and also allows future extensions if needed. Signed-off-by: Marcel Holtmann Reviewed-by: Alain Michaud Signed-off-by: Johan Hedberg --- include/net/bluetooth/mgmt.h | 7 ++++++ net/bluetooth/mgmt.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index f41cd87550dc..65dd6fd1fff3 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -674,6 +674,13 @@ struct mgmt_cp_set_blocked_keys { #define MGMT_OP_SET_WIDEBAND_SPEECH 0x0047 +#define MGMT_OP_READ_SECURITY_INFO 0x0048 +#define MGMT_READ_SECURITY_INFO_SIZE 0 +struct mgmt_rp_read_security_info { + __le16 sec_len; + __u8 sec[0]; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 6552003a170e..7b9eac339c87 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -108,6 +108,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_SET_APPEARANCE, MGMT_OP_SET_BLOCKED_KEYS, MGMT_OP_SET_WIDEBAND_SPEECH, + MGMT_OP_READ_SECURITY_INFO, }; static const u16 mgmt_events[] = { @@ -155,6 +156,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, MGMT_OP_READ_EXT_INFO, + MGMT_OP_READ_SECURITY_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -3659,6 +3661,55 @@ unlock: return err; } +static int read_security_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + char buf[16]; + struct mgmt_rp_read_security_info *rp = (void *)buf; + u16 sec_len = 0; + u8 flags = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + memset(&buf, 0, sizeof(buf)); + + hci_dev_lock(hdev); + + /* When the Read Simple Pairing Options command is supported, then + * the remote public key validation is supported. + */ + if (hdev->commands[41] & 0x08) + flags |= 0x01; /* Remote public key validation (BR/EDR) */ + + flags |= 0x02; /* Remote public key validation (LE) */ + + /* When the Read Encryption Key Size command is supported, then the + * encryption key size is enforced. + */ + if (hdev->commands[20] & 0x10) + flags |= 0x04; /* Encryption key size enforcement (BR/EDR) */ + + flags |= 0x08; /* Encryption key size enforcement (LE) */ + + sec_len = eir_append_data(rp->sec, sec_len, 0x01, &flags, 1); + + /* When the Read Simple Pairing Options command is supported, then + * also max encryption key size information is provided. + */ + if (hdev->commands[41] & 0x08) + sec_len = eir_append_le16(rp->sec, sec_len, 0x02, + hdev->max_enc_key_size); + + sec_len = eir_append_le16(rp->sec, sec_len, 0x03, SMP_MAX_ENC_KEY_SIZE); + + rp->sec_len = cpu_to_le16(sec_len); + + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_SECURITY_INFO, 0, + rp, sizeof(*rp) + sec_len); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -7099,6 +7150,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE, HCI_MGMT_VAR_LEN }, { set_wideband_speech, MGMT_SETTING_SIZE }, + { read_security_info, MGMT_READ_SECURITY_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3-59-g8ed1b From 3679fe7d43c65e07f00afb216987f33e152ceb6f Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:06 +0200 Subject: Bluetooth: Increment management interface revision Increment the mgmt revision due to the recently added new commands. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7b9eac339c87..f8c0a4fc8090 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 16 +#define MGMT_REVISION 17 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, -- cgit v1.2.3-59-g8ed1b From d2a3f5f4635b7b0df8d4cd04ee0c75886ef699b9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:07 +0200 Subject: Bluetooth: Add HCI device identifier for VIRTIO devices This patch assigns the next free HCI device identifier to Bluetooth devices based on VIRTIO devices. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 086a9e9d5d03..79de2a659dd6 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -68,6 +68,7 @@ #define HCI_SPI 7 #define HCI_I2C 8 #define HCI_SMD 9 +#define HCI_VIRTIO 10 /* HCI controller types */ #define HCI_PRIMARY 0x00 -- cgit v1.2.3-59-g8ed1b From 9556dfa28b4d84edfd5b96e684ed8e7a15a51b67 Mon Sep 17 00:00:00 2001 From: Maharaja Kennadyrajan Date: Thu, 26 Mar 2020 18:36:32 +0200 Subject: ath11k: Add sta debugfs support to configure ADDBA and DELBA Add support to test aggregation procedures (addba/addba_resp/delba) manually by adding the required callbacks in sta debugfs files. To enable automatic aggregation in target, echo 0 > /sys/kernel/debug/ieee80211/phyX/netdev:wlanX/ stations/XX:XX:XX:XX:XX:XX/aggr_mode For manual mode, echo 1 > /sys/kernel/debug/ieee80211/phyX/netdev:wlanX/ stations/XX:XX:XX:XX:XX:XX/aggr_mode To send addba response, echo 0 25 > /sys/kernel/debug/ieee80211/phyX/netdev:wlanX/ stations/XX:XX:XX:XX:XX:XX/addba_resp To send addba, echo 1 32 > /sys/kernel/debug/ieee80211/phyX/netdev:wlanX/ stations/XX:XX:XX:XX:XX:XX/addba To send delba, echo 0 1 37 > /sys/kernel/debug/ieee80211/phyX/netdev:wlanX/ stations/XX:XX:XX:XX:XX:XX/delba Signed-off-by: Maharaja Kennadyrajan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585213026-28406-1-git-send-email-mkenna@codeaurora.org --- drivers/net/wireless/ath/ath11k/core.h | 5 + drivers/net/wireless/ath/ath11k/debug.h | 6 + drivers/net/wireless/ath/ath11k/debugfs_sta.c | 221 ++++++++++++++++++++++++++ drivers/net/wireless/ath/ath11k/wmi.c | 140 ++++++++++++++++ drivers/net/wireless/ath/ath11k/wmi.h | 38 +++++ 5 files changed, 410 insertions(+) diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index 6e7b8ecd09a6..96ca114c2c44 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -341,6 +341,11 @@ struct ath11k_sta { u8 rssi_comb; struct ath11k_htt_tx_stats *tx_stats; struct ath11k_rx_peer_stats *rx_stats; + +#ifdef CONFIG_MAC80211_DEBUGFS + /* protected by conf_mutex */ + bool aggr_mode; +#endif }; #define ATH11K_NUM_CHANS 41 diff --git a/drivers/net/wireless/ath/ath11k/debug.h b/drivers/net/wireless/ath/ath11k/debug.h index 97e7306c506d..4a3ff8227187 100644 --- a/drivers/net/wireless/ath/ath11k/debug.h +++ b/drivers/net/wireless/ath/ath11k/debug.h @@ -112,6 +112,12 @@ enum ath11k_pktlog_enum { ATH11K_PKTLOG_TYPE_LITE_RX = 24, }; +enum ath11k_dbg_aggr_mode { + ATH11K_DBG_AGGR_MODE_AUTO, + ATH11K_DBG_AGGR_MODE_MANUAL, + ATH11K_DBG_AGGR_MODE_MAX, +}; + __printf(2, 3) void ath11k_info(struct ath11k_base *ab, const char *fmt, ...); __printf(2, 3) void ath11k_err(struct ath11k_base *ab, const char *fmt, ...); __printf(2, 3) void ath11k_warn(struct ath11k_base *ab, const char *fmt, ...); diff --git a/drivers/net/wireless/ath/ath11k/debugfs_sta.c b/drivers/net/wireless/ath/ath11k/debugfs_sta.c index 389dac219238..68963cfc5097 100644 --- a/drivers/net/wireless/ath/ath11k/debugfs_sta.c +++ b/drivers/net/wireless/ath/ath11k/debugfs_sta.c @@ -533,6 +533,222 @@ static const struct file_operations fops_peer_pktlog = { .llseek = default_llseek, }; +static ssize_t ath11k_dbg_sta_write_delba(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_sta *sta = file->private_data; + struct ath11k_sta *arsta = (struct ath11k_sta *)sta->drv_priv; + struct ath11k *ar = arsta->arvif->ar; + u32 tid, initiator, reason; + int ret; + char buf[64] = {0}; + + ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, + user_buf, count); + if (ret <= 0) + return ret; + + ret = sscanf(buf, "%u %u %u", &tid, &initiator, &reason); + if (ret != 3) + return -EINVAL; + + /* Valid TID values are 0 through 15 */ + if (tid > HAL_DESC_REO_NON_QOS_TID - 1) + return -EINVAL; + + mutex_lock(&ar->conf_mutex); + if (ar->state != ATH11K_STATE_ON || + arsta->aggr_mode != ATH11K_DBG_AGGR_MODE_MANUAL) { + ret = count; + goto out; + } + + ret = ath11k_wmi_delba_send(ar, arsta->arvif->vdev_id, sta->addr, + tid, initiator, reason); + if (ret) { + ath11k_warn(ar->ab, "failed to send delba: vdev_id %u peer %pM tid %u initiator %u reason %u\n", + arsta->arvif->vdev_id, sta->addr, tid, initiator, + reason); + } + ret = count; +out: + mutex_unlock(&ar->conf_mutex); + return ret; +} + +static const struct file_operations fops_delba = { + .write = ath11k_dbg_sta_write_delba, + .open = simple_open, + .owner = THIS_MODULE, + .llseek = default_llseek, +}; + +static ssize_t ath11k_dbg_sta_write_addba_resp(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_sta *sta = file->private_data; + struct ath11k_sta *arsta = (struct ath11k_sta *)sta->drv_priv; + struct ath11k *ar = arsta->arvif->ar; + u32 tid, status; + int ret; + char buf[64] = {0}; + + ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, + user_buf, count); + if (ret <= 0) + return ret; + + ret = sscanf(buf, "%u %u", &tid, &status); + if (ret != 2) + return -EINVAL; + + /* Valid TID values are 0 through 15 */ + if (tid > HAL_DESC_REO_NON_QOS_TID - 1) + return -EINVAL; + + mutex_lock(&ar->conf_mutex); + if (ar->state != ATH11K_STATE_ON || + arsta->aggr_mode != ATH11K_DBG_AGGR_MODE_MANUAL) { + ret = count; + goto out; + } + + ret = ath11k_wmi_addba_set_resp(ar, arsta->arvif->vdev_id, sta->addr, + tid, status); + if (ret) { + ath11k_warn(ar->ab, "failed to send addba response: vdev_id %u peer %pM tid %u status%u\n", + arsta->arvif->vdev_id, sta->addr, tid, status); + } + ret = count; +out: + mutex_unlock(&ar->conf_mutex); + return ret; +} + +static const struct file_operations fops_addba_resp = { + .write = ath11k_dbg_sta_write_addba_resp, + .open = simple_open, + .owner = THIS_MODULE, + .llseek = default_llseek, +}; + +static ssize_t ath11k_dbg_sta_write_addba(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_sta *sta = file->private_data; + struct ath11k_sta *arsta = (struct ath11k_sta *)sta->drv_priv; + struct ath11k *ar = arsta->arvif->ar; + u32 tid, buf_size; + int ret; + char buf[64] = {0}; + + ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, + user_buf, count); + if (ret <= 0) + return ret; + + ret = sscanf(buf, "%u %u", &tid, &buf_size); + if (ret != 2) + return -EINVAL; + + /* Valid TID values are 0 through 15 */ + if (tid > HAL_DESC_REO_NON_QOS_TID - 1) + return -EINVAL; + + mutex_lock(&ar->conf_mutex); + if (ar->state != ATH11K_STATE_ON || + arsta->aggr_mode != ATH11K_DBG_AGGR_MODE_MANUAL) { + ret = count; + goto out; + } + + ret = ath11k_wmi_addba_send(ar, arsta->arvif->vdev_id, sta->addr, + tid, buf_size); + if (ret) { + ath11k_warn(ar->ab, "failed to send addba request: vdev_id %u peer %pM tid %u buf_size %u\n", + arsta->arvif->vdev_id, sta->addr, tid, buf_size); + } + + ret = count; +out: + mutex_unlock(&ar->conf_mutex); + return ret; +} + +static const struct file_operations fops_addba = { + .write = ath11k_dbg_sta_write_addba, + .open = simple_open, + .owner = THIS_MODULE, + .llseek = default_llseek, +}; + +static ssize_t ath11k_dbg_sta_read_aggr_mode(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_sta *sta = file->private_data; + struct ath11k_sta *arsta = (struct ath11k_sta *)sta->drv_priv; + struct ath11k *ar = arsta->arvif->ar; + char buf[64]; + int len = 0; + + mutex_lock(&ar->conf_mutex); + len = scnprintf(buf, sizeof(buf) - len, + "aggregation mode: %s\n\n%s\n%s\n", + (arsta->aggr_mode == ATH11K_DBG_AGGR_MODE_AUTO) ? + "auto" : "manual", "auto = 0", "manual = 1"); + mutex_unlock(&ar->conf_mutex); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t ath11k_dbg_sta_write_aggr_mode(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_sta *sta = file->private_data; + struct ath11k_sta *arsta = (struct ath11k_sta *)sta->drv_priv; + struct ath11k *ar = arsta->arvif->ar; + u32 aggr_mode; + int ret; + + if (kstrtouint_from_user(user_buf, count, 0, &aggr_mode)) + return -EINVAL; + + if (aggr_mode >= ATH11K_DBG_AGGR_MODE_MAX) + return -EINVAL; + + mutex_lock(&ar->conf_mutex); + if (ar->state != ATH11K_STATE_ON || + aggr_mode == arsta->aggr_mode) { + ret = count; + goto out; + } + + ret = ath11k_wmi_addba_clear_resp(ar, arsta->arvif->vdev_id, sta->addr); + if (ret) { + ath11k_warn(ar->ab, "failed to clear addba session ret: %d\n", + ret); + goto out; + } + + arsta->aggr_mode = aggr_mode; +out: + mutex_unlock(&ar->conf_mutex); + return ret; +} + +static const struct file_operations fops_aggr_mode = { + .read = ath11k_dbg_sta_read_aggr_mode, + .write = ath11k_dbg_sta_write_aggr_mode, + .open = simple_open, + .owner = THIS_MODULE, + .llseek = default_llseek, +}; + void ath11k_sta_add_debugfs(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir) { @@ -550,4 +766,9 @@ void ath11k_sta_add_debugfs(struct ieee80211_hw *hw, struct ieee80211_vif *vif, debugfs_create_file("peer_pktlog", 0644, dir, sta, &fops_peer_pktlog); + + debugfs_create_file("aggr_mode", 0644, dir, sta, &fops_aggr_mode); + debugfs_create_file("addba", 0200, dir, sta, &fops_addba); + debugfs_create_file("addba_resp", 0200, dir, sta, &fops_addba_resp); + debugfs_create_file("delba", 0200, dir, sta, &fops_delba); } diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c index e7ce36966d6a..49a17c85303a 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.c +++ b/drivers/net/wireless/ath/ath11k/wmi.c @@ -2368,6 +2368,146 @@ int ath11k_wmi_send_dfs_phyerr_offload_enable_cmd(struct ath11k *ar, return ret; } +int ath11k_wmi_delba_send(struct ath11k *ar, u32 vdev_id, const u8 *mac, + u32 tid, u32 initiator, u32 reason) +{ + struct ath11k_pdev_wmi *wmi = ar->wmi; + struct wmi_delba_send_cmd *cmd; + struct sk_buff *skb; + int ret; + + skb = ath11k_wmi_alloc_skb(wmi->wmi_ab, sizeof(*cmd)); + if (!skb) + return -ENOMEM; + + cmd = (struct wmi_delba_send_cmd *)skb->data; + cmd->tlv_header = FIELD_PREP(WMI_TLV_TAG, WMI_TAG_DELBA_SEND_CMD) | + FIELD_PREP(WMI_TLV_LEN, sizeof(*cmd) - TLV_HDR_SIZE); + cmd->vdev_id = vdev_id; + ether_addr_copy(cmd->peer_macaddr.addr, mac); + cmd->tid = tid; + cmd->initiator = initiator; + cmd->reasoncode = reason; + + ath11k_dbg(ar->ab, ATH11K_DBG_WMI, + "wmi delba send vdev_id 0x%X mac_addr %pM tid %u initiator %u reason %u\n", + vdev_id, mac, tid, initiator, reason); + + ret = ath11k_wmi_cmd_send(wmi, skb, WMI_DELBA_SEND_CMDID); + + if (ret) { + ath11k_warn(ar->ab, + "failed to send WMI_DELBA_SEND_CMDID cmd\n"); + dev_kfree_skb(skb); + } + + return ret; +} + +int ath11k_wmi_addba_set_resp(struct ath11k *ar, u32 vdev_id, const u8 *mac, + u32 tid, u32 status) +{ + struct ath11k_pdev_wmi *wmi = ar->wmi; + struct wmi_addba_setresponse_cmd *cmd; + struct sk_buff *skb; + int ret; + + skb = ath11k_wmi_alloc_skb(wmi->wmi_ab, sizeof(*cmd)); + if (!skb) + return -ENOMEM; + + cmd = (struct wmi_addba_setresponse_cmd *)skb->data; + cmd->tlv_header = + FIELD_PREP(WMI_TLV_TAG, WMI_TAG_ADDBA_SETRESPONSE_CMD) | + FIELD_PREP(WMI_TLV_LEN, sizeof(*cmd) - TLV_HDR_SIZE); + cmd->vdev_id = vdev_id; + ether_addr_copy(cmd->peer_macaddr.addr, mac); + cmd->tid = tid; + cmd->statuscode = status; + + ath11k_dbg(ar->ab, ATH11K_DBG_WMI, + "wmi addba set resp vdev_id 0x%X mac_addr %pM tid %u status %u\n", + vdev_id, mac, tid, status); + + ret = ath11k_wmi_cmd_send(wmi, skb, WMI_ADDBA_SET_RESP_CMDID); + + if (ret) { + ath11k_warn(ar->ab, + "failed to send WMI_ADDBA_SET_RESP_CMDID cmd\n"); + dev_kfree_skb(skb); + } + + return ret; +} + +int ath11k_wmi_addba_send(struct ath11k *ar, u32 vdev_id, const u8 *mac, + u32 tid, u32 buf_size) +{ + struct ath11k_pdev_wmi *wmi = ar->wmi; + struct wmi_addba_send_cmd *cmd; + struct sk_buff *skb; + int ret; + + skb = ath11k_wmi_alloc_skb(wmi->wmi_ab, sizeof(*cmd)); + if (!skb) + return -ENOMEM; + + cmd = (struct wmi_addba_send_cmd *)skb->data; + cmd->tlv_header = FIELD_PREP(WMI_TLV_TAG, WMI_TAG_ADDBA_SEND_CMD) | + FIELD_PREP(WMI_TLV_LEN, sizeof(*cmd) - TLV_HDR_SIZE); + cmd->vdev_id = vdev_id; + ether_addr_copy(cmd->peer_macaddr.addr, mac); + cmd->tid = tid; + cmd->buffersize = buf_size; + + ath11k_dbg(ar->ab, ATH11K_DBG_WMI, + "wmi addba send vdev_id 0x%X mac_addr %pM tid %u bufsize %u\n", + vdev_id, mac, tid, buf_size); + + ret = ath11k_wmi_cmd_send(wmi, skb, WMI_ADDBA_SEND_CMDID); + + if (ret) { + ath11k_warn(ar->ab, + "failed to send WMI_ADDBA_SEND_CMDID cmd\n"); + dev_kfree_skb(skb); + } + + return ret; +} + +int ath11k_wmi_addba_clear_resp(struct ath11k *ar, u32 vdev_id, const u8 *mac) +{ + struct ath11k_pdev_wmi *wmi = ar->wmi; + struct wmi_addba_clear_resp_cmd *cmd; + struct sk_buff *skb; + int ret; + + skb = ath11k_wmi_alloc_skb(wmi->wmi_ab, sizeof(*cmd)); + if (!skb) + return -ENOMEM; + + cmd = (struct wmi_addba_clear_resp_cmd *)skb->data; + cmd->tlv_header = + FIELD_PREP(WMI_TLV_TAG, WMI_TAG_ADDBA_CLEAR_RESP_CMD) | + FIELD_PREP(WMI_TLV_LEN, sizeof(*cmd) - TLV_HDR_SIZE); + cmd->vdev_id = vdev_id; + ether_addr_copy(cmd->peer_macaddr.addr, mac); + + ath11k_dbg(ar->ab, ATH11K_DBG_WMI, + "wmi addba clear resp vdev_id 0x%X mac_addr %pM\n", + vdev_id, mac); + + ret = ath11k_wmi_cmd_send(wmi, skb, WMI_ADDBA_CLEAR_RESP_CMDID); + + if (ret) { + ath11k_warn(ar->ab, + "failed to send WMI_ADDBA_CLEAR_RESP_CMDID cmd\n"); + dev_kfree_skb(skb); + } + + return ret; +} + int ath11k_wmi_pdev_peer_pktlog_filter(struct ath11k *ar, u8 *addr, u8 enable) { struct ath11k_pdev_wmi *wmi = ar->wmi; diff --git a/drivers/net/wireless/ath/ath11k/wmi.h b/drivers/net/wireless/ath/ath11k/wmi.h index 510f9c6bc1d7..780e6620142d 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.h +++ b/drivers/net/wireless/ath/ath11k/wmi.h @@ -3649,6 +3649,37 @@ struct wmi_therm_throt_level_config_info { u32 prio; } __packed; +struct wmi_delba_send_cmd { + u32 tlv_header; + u32 vdev_id; + struct wmi_mac_addr peer_macaddr; + u32 tid; + u32 initiator; + u32 reasoncode; +} __packed; + +struct wmi_addba_setresponse_cmd { + u32 tlv_header; + u32 vdev_id; + struct wmi_mac_addr peer_macaddr; + u32 tid; + u32 statuscode; +} __packed; + +struct wmi_addba_send_cmd { + u32 tlv_header; + u32 vdev_id; + struct wmi_mac_addr peer_macaddr; + u32 tid; + u32 buffersize; +} __packed; + +struct wmi_addba_clear_resp_cmd { + u32 tlv_header; + u32 vdev_id; + struct wmi_mac_addr peer_macaddr; +} __packed; + struct wmi_pdev_pktlog_filter_info { u32 tlv_header; struct wmi_mac_addr peer_macaddr; @@ -4822,6 +4853,13 @@ int ath11k_wmi_send_scan_chan_list_cmd(struct ath11k *ar, struct scan_chan_list_params *chan_list); int ath11k_wmi_send_dfs_phyerr_offload_enable_cmd(struct ath11k *ar, u32 pdev_id); +int ath11k_wmi_addba_clear_resp(struct ath11k *ar, u32 vdev_id, const u8 *mac); +int ath11k_wmi_addba_send(struct ath11k *ar, u32 vdev_id, const u8 *mac, + u32 tid, u32 buf_size); +int ath11k_wmi_addba_set_resp(struct ath11k *ar, u32 vdev_id, const u8 *mac, + u32 tid, u32 status); +int ath11k_wmi_delba_send(struct ath11k *ar, u32 vdev_id, const u8 *mac, + u32 tid, u32 initiator, u32 reason); int ath11k_wmi_send_bcn_offload_control_cmd(struct ath11k *ar, u32 vdev_id, u32 bcn_ctrl_op); int -- cgit v1.2.3-59-g8ed1b From 3d1c60460fb2823a19ead9e6ec8f184dd7271aa7 Mon Sep 17 00:00:00 2001 From: Maharaja Kennadyrajan Date: Thu, 26 Mar 2020 18:36:36 +0200 Subject: ath10k: Fix the race condition in firmware dump work queue There is a race condition, when the user writes 'hw-restart' and 'hard' in the simulate_fw_crash debugfs file without any delay. In the above scenario, the firmware dump work queue(scheduled by 'hard') should be handled gracefully, while the target is in the 'hw-restart'. Tested HW: QCA9984 Tested FW: 10.4-3.9.0.2-00044 Co-developed-by: Govindaraj Saminathan Signed-off-by: Govindaraj Saminathan Signed-off-by: Maharaja Kennadyrajan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585213077-28439-1-git-send-email-mkenna@codeaurora.org --- drivers/net/wireless/ath/ath10k/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c index ded7a220a4aa..cd1c5d60261f 100644 --- a/drivers/net/wireless/ath/ath10k/pci.c +++ b/drivers/net/wireless/ath/ath10k/pci.c @@ -2074,6 +2074,7 @@ static void ath10k_pci_hif_stop(struct ath10k *ar) ath10k_pci_irq_sync(ar); napi_synchronize(&ar->napi); napi_disable(&ar->napi); + cancel_work_sync(&ar_pci->dump_work); /* Most likely the device has HTT Rx ring configured. The only way to * prevent the device from accessing (and possible corrupting) host -- cgit v1.2.3-59-g8ed1b From 21c1b063f4b98c14b2438734c93fe24d517233cb Mon Sep 17 00:00:00 2001 From: Maharaja Kennadyrajan Date: Thu, 26 Mar 2020 20:19:15 +0530 Subject: ath11k: add pktlog checksum in trace events to support pktlog Pktlog data are different among the chipset & chipset versions. As part of enhancing the user space script to decode the pktlog trace events generated, it is desirable to know which chipset or which chipset version has provided the events and thereby decode the pktlogs appropriately. Pktlog checksum helps to determine the chipset variant which is given by the firmware in the struct wmi_ready_event. Pktlog checksums are computed during the firmware build. So, adding that pktlog checksum in the pklog trace events. Signed-off-by: Maharaja Kennadyrajan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585234155-30574-1-git-send-email-mkenna@codeaurora.org --- drivers/net/wireless/ath/ath11k/core.h | 1 + drivers/net/wireless/ath/ath11k/dp_rx.c | 3 ++- drivers/net/wireless/ath/ath11k/trace.h | 12 ++++++++---- drivers/net/wireless/ath/ath11k/wmi.c | 21 +++++++++++++-------- drivers/net/wireless/ath/ath11k/wmi.h | 8 +++++++- 5 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index 96ca114c2c44..b4c3e0418eef 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -655,6 +655,7 @@ struct ath11k_base { /* protected by data_lock */ u32 fw_crash_counter; } stats; + u32 pktlog_defs_checksum; }; struct ath11k_fw_stats_pdev { diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c index f74a0e74bf3e..a3f2c76b3471 100644 --- a/drivers/net/wireless/ath/ath11k/dp_rx.c +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c @@ -1491,7 +1491,8 @@ static void ath11k_htt_pktlog(struct ath11k_base *ab, struct sk_buff *skb) return; } - trace_ath11k_htt_pktlog(ar, data->payload, hdr->size); + trace_ath11k_htt_pktlog(ar, data->payload, hdr->size, + ar->ab->pktlog_defs_checksum); } static void ath11k_htt_backpressure_event_handler(struct ath11k_base *ab, diff --git a/drivers/net/wireless/ath/ath11k/trace.h b/drivers/net/wireless/ath/ath11k/trace.h index 8700a622be7b..66d0aae7816c 100644 --- a/drivers/net/wireless/ath/ath11k/trace.h +++ b/drivers/net/wireless/ath/ath11k/trace.h @@ -21,14 +21,16 @@ static inline void trace_ ## name(proto) {} #define TRACE_SYSTEM ath11k TRACE_EVENT(ath11k_htt_pktlog, - TP_PROTO(struct ath11k *ar, const void *buf, u16 buf_len), + TP_PROTO(struct ath11k *ar, const void *buf, u16 buf_len, + u32 pktlog_checksum), - TP_ARGS(ar, buf, buf_len), + TP_ARGS(ar, buf, buf_len, pktlog_checksum), TP_STRUCT__entry( __string(device, dev_name(ar->ab->dev)) __string(driver, dev_driver_string(ar->ab->dev)) __field(u16, buf_len) + __field(u32, pktlog_checksum) __dynamic_array(u8, pktlog, buf_len) ), @@ -36,14 +38,16 @@ TRACE_EVENT(ath11k_htt_pktlog, __assign_str(device, dev_name(ar->ab->dev)); __assign_str(driver, dev_driver_string(ar->ab->dev)); __entry->buf_len = buf_len; + __entry->pktlog_checksum = pktlog_checksum; memcpy(__get_dynamic_array(pktlog), buf, buf_len); ), TP_printk( - "%s %s size %hu", + "%s %s size %hu pktlog_checksum %d", __get_str(driver), __get_str(device), - __entry->buf_len + __entry->buf_len, + __entry->pktlog_checksum ) ); diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c index 49a17c85303a..09150de53321 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.c +++ b/drivers/net/wireless/ath/ath11k/wmi.c @@ -87,8 +87,8 @@ static const struct wmi_tlv_policy wmi_tlv_policies[] = { = { .min_len = sizeof(struct wmi_pdev_bss_chan_info_event) }, [WMI_TAG_VDEV_INSTALL_KEY_COMPLETE_EVENT] = { .min_len = sizeof(struct wmi_vdev_install_key_compl_event) }, - [WMI_TAG_READY_EVENT] - = {.min_len = sizeof(struct wmi_ready_event) }, + [WMI_TAG_READY_EVENT] = { + .min_len = sizeof(struct wmi_ready_event_min) }, [WMI_TAG_SERVICE_AVAILABLE_EVENT] = {.min_len = sizeof(struct wmi_service_available_event) }, [WMI_TAG_PEER_ASSOC_CONF_EVENT] @@ -4991,7 +4991,7 @@ static int ath11k_wmi_tlv_rdy_parse(struct ath11k_base *ab, u16 tag, u16 len, const void *ptr, void *data) { struct wmi_tlv_rdy_parse *rdy_parse = data; - struct wmi_ready_event *fixed_param; + struct wmi_ready_event fixed_param; struct wmi_mac_addr *addr_list; struct ath11k_pdev *pdev; u32 num_mac_addr; @@ -4999,11 +4999,16 @@ static int ath11k_wmi_tlv_rdy_parse(struct ath11k_base *ab, u16 tag, u16 len, switch (tag) { case WMI_TAG_READY_EVENT: - fixed_param = (struct wmi_ready_event *)ptr; - ab->wlan_init_status = fixed_param->status; - rdy_parse->num_extra_mac_addr = fixed_param->num_extra_mac_addr; - - ether_addr_copy(ab->mac_addr, fixed_param->mac_addr.addr); + memset(&fixed_param, 0, sizeof(fixed_param)); + memcpy(&fixed_param, (struct wmi_ready_event *)ptr, + min_t(u16, sizeof(fixed_param), len)); + ab->wlan_init_status = fixed_param.ready_event_min.status; + rdy_parse->num_extra_mac_addr = + fixed_param.ready_event_min.num_extra_mac_addr; + + ether_addr_copy(ab->mac_addr, + fixed_param.ready_event_min.mac_addr.addr); + ab->pktlog_defs_checksum = fixed_param.pktlog_defs_checksum; ab->wmi_ready = true; break; case WMI_TAG_ARRAY_FIXED_STRUCT: diff --git a/drivers/net/wireless/ath/ath11k/wmi.h b/drivers/net/wireless/ath/ath11k/wmi.h index 780e6620142d..ba05935b715a 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.h +++ b/drivers/net/wireless/ath/ath11k/wmi.h @@ -2345,7 +2345,7 @@ struct wmi_mac_addr { } __packed; } __packed; -struct wmi_ready_event { +struct wmi_ready_event_min { struct wmi_abi_version fw_abi_vers; struct wmi_mac_addr mac_addr; u32 status; @@ -2355,6 +2355,12 @@ struct wmi_ready_event { u32 num_extra_peers; } __packed; +struct wmi_ready_event { + struct wmi_ready_event_min ready_event_min; + u32 max_ast_index; + u32 pktlog_defs_checksum; +} __packed; + struct wmi_service_available_event { u32 wmi_service_segment_offset; u32 wmi_service_segment_bitmap[WMI_SERVICE_SEGMENT_BM_SIZE32]; -- cgit v1.2.3-59-g8ed1b From 9a8074e3bcd7956ec6b4f7c26360af1b0b0abe38 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 27 Mar 2020 19:26:39 +0000 Subject: ath11k: fix error message to correctly report the command that failed Currently the error message refers to the command WMI_TWT_DIeABLE_CMDID which looks like a cut-n-paste mangled typo. Fix the message to match the command WMI_BSS_COLOR_CHANGE_ENABLE_CMDID that failed. Fixes: 5a032c8d1953 ("ath11k: add WMI calls required for handling BSS color") Signed-off-by: Colin Ian King Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200327192639.363354-1-colin.king@canonical.com --- drivers/net/wireless/ath/ath11k/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c index 09150de53321..8832b8c8e63f 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.c +++ b/drivers/net/wireless/ath/ath11k/wmi.c @@ -2919,7 +2919,7 @@ int ath11k_wmi_send_bss_color_change_enable_cmd(struct ath11k *ar, u32 vdev_id, ret = ath11k_wmi_cmd_send(wmi, skb, WMI_BSS_COLOR_CHANGE_ENABLE_CMDID); if (ret) { - ath11k_warn(ab, "Failed to send WMI_TWT_DIeABLE_CMDID"); + ath11k_warn(ab, "Failed to send WMI_BSS_COLOR_CHANGE_ENABLE_CMDID"); dev_kfree_skb(skb); } return ret; -- cgit v1.2.3-59-g8ed1b From bdef56a36eeaccf236af43578f77938f3561a2b1 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Mon, 30 Mar 2020 16:46:46 +0530 Subject: ath11k: Increase the tx completion ring size Increase the tx completion ring size to 0x8000.Also set the idr size to be same as the completion ring size. This avoids backpressure on the TX Completion and corresponding TCL Data ring during high data traffic. Signed-off-by: Sriram R Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585567006-9173-1-git-send-email-srirrama@codeaurora.org --- drivers/net/wireless/ath/ath11k/dp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/dp.h b/drivers/net/wireless/ath/ath11k/dp.h index 551f9c9fb847..d4e19dc4bce1 100644 --- a/drivers/net/wireless/ath/ath11k/dp.h +++ b/drivers/net/wireless/ath/ath11k/dp.h @@ -169,8 +169,8 @@ struct ath11k_pdev_dp { #define DP_WBM_RELEASE_RING_SIZE 64 #define DP_TCL_DATA_RING_SIZE 512 -#define DP_TX_COMP_RING_SIZE 8192 -#define DP_TX_IDR_SIZE (DP_TX_COMP_RING_SIZE << 1) +#define DP_TX_COMP_RING_SIZE 32768 +#define DP_TX_IDR_SIZE DP_TX_COMP_RING_SIZE #define DP_TCL_CMD_RING_SIZE 32 #define DP_TCL_STATUS_RING_SIZE 32 #define DP_REO_DST_RING_MAX 4 -- cgit v1.2.3-59-g8ed1b From 800113ff4b1d277c2b66ffc04d4d38f202a0d187 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Mon, 30 Mar 2020 16:47:08 +0530 Subject: ath11k: Avoid mgmt tx count underflow The mgmt tx count reference is incremented/decremented on every mgmt tx and on tx completion event from firmware. In case of an unexpected mgmt tx completion event from firmware, the counter would underflow. Avoid this by decrementing only when the tx count is greater than 0. Signed-off-by: Sriram R Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585567028-9242-1-git-send-email-srirrama@codeaurora.org --- drivers/net/wireless/ath/ath11k/wmi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c index 8832b8c8e63f..973b72a0ca69 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.c +++ b/drivers/net/wireless/ath/ath11k/wmi.c @@ -3880,8 +3880,9 @@ static int wmi_process_mgmt_tx_comp(struct ath11k *ar, u32 desc_id, ieee80211_tx_status_irqsafe(ar->hw, msdu); - WARN_ON_ONCE(atomic_read(&ar->num_pending_mgmt_tx) == 0); - atomic_dec(&ar->num_pending_mgmt_tx); + /* WARN when we received this event without doing any mgmt tx */ + if (atomic_dec_if_positive(&ar->num_pending_mgmt_tx) < 0) + WARN_ON_ONCE(1); return 0; } -- cgit v1.2.3-59-g8ed1b From 3db24065c2c824e9ea419c453b810b5f301d91c8 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Mon, 30 Mar 2020 18:56:31 +0530 Subject: ath10k: enable VHT160 and VHT80+80 modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set right channel frequencies in VHT160 mode according to the VHT160 interoperability workaround added as part of IEEE Std 802.11™-2016 in "Table 9-252—VHT Operation Information subfields", band_center_freq2 corresponds to CCFS1 in Table 9-253. Previous implementation (band_center_freq2 = 0 for VHT160) is only deprecated. Enable VHT80+80 mode and set the proper peer RX nss value for VHT160 and VHT80+80 mode. Based on patches by Sebastian Gottschall: https://lkml.kernel.org/r/20180704095444.662-1-s.gottschall@dd-wrt.com https://lkml.kernel.org/r/20180704120519.6479-1-s.gottschall@dd-wrt.com Tested: qca9984 with firmware ver 10.4-3.10-00047 Co-developed-by: Sebastian Gottschall Signed-off-by: Sebastian Gottschall Co-developed-by: Rick Wu Signed-off-by: Rick Wu Signed-off-by: Lei Wang Signed-off-by: Sowmiya Sree Elavalagan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585574792-719-1-git-send-email-ssreeela@codeaurora.org --- drivers/net/wireless/ath/ath10k/mac.c | 84 +++++++++++++++++++++++++---------- drivers/net/wireless/ath/ath10k/wmi.c | 23 ++++++---- drivers/net/wireless/ath/ath10k/wmi.h | 5 ++- 3 files changed, 80 insertions(+), 32 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 2d03b8dd3b8c..a59a7a5631a8 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -2505,6 +2505,30 @@ ath10k_peer_assoc_h_vht_limit(u16 tx_mcs_set, return tx_mcs_set; } +static u32 get_160mhz_nss_from_maxrate(int rate) +{ + u32 nss; + + switch (rate) { + case 780: + nss = 1; + break; + case 1560: + nss = 2; + break; + case 2106: + nss = 3; /* not support MCS9 from spec*/ + break; + case 3120: + nss = 4; + break; + default: + nss = 1; + } + + return nss; +} + static void ath10k_peer_assoc_h_vht(struct ath10k *ar, struct ieee80211_vif *vif, struct ieee80211_sta *sta, @@ -2512,6 +2536,7 @@ static void ath10k_peer_assoc_h_vht(struct ath10k *ar, { const struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; struct ath10k_vif *arvif = (void *)vif->drv_priv; + struct ath10k_hw_params *hw = &ar->hw_params; struct cfg80211_chan_def def; enum nl80211_band band; const u16 *vht_mcs_mask; @@ -2578,22 +2603,38 @@ static void ath10k_peer_assoc_h_vht(struct ath10k *ar, arg->peer_vht_rates.tx_mcs_set = ath10k_peer_assoc_h_vht_limit( __le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map), vht_mcs_mask); - ath10k_dbg(ar, ATH10K_DBG_MAC, "mac vht peer %pM max_mpdu %d flags 0x%x\n", - sta->addr, arg->peer_max_mpdu, arg->peer_flags); + /* Configure bandwidth-NSS mapping to FW + * for the chip's tx chains setting on 160Mhz bw + */ + if (arg->peer_phymode == MODE_11AC_VHT160 || + arg->peer_phymode == MODE_11AC_VHT80_80) { + u32 rx_nss; + u32 max_rate; - if (arg->peer_vht_rates.rx_max_rate && - (sta->vht_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK)) { - switch (arg->peer_vht_rates.rx_max_rate) { - case 1560: - /* Must be 2x2 at 160Mhz is all it can do. */ - arg->peer_bw_rxnss_override = 2; - break; - case 780: - /* Can only do 1x1 at 160Mhz (Long Guard Interval) */ - arg->peer_bw_rxnss_override = 1; - break; + max_rate = arg->peer_vht_rates.rx_max_rate; + rx_nss = get_160mhz_nss_from_maxrate(max_rate); + + if (rx_nss == 0) + rx_nss = arg->peer_num_spatial_streams; + else + rx_nss = min(arg->peer_num_spatial_streams, rx_nss); + + max_rate = hw->vht160_mcs_tx_highest; + rx_nss = min(rx_nss, get_160mhz_nss_from_maxrate(max_rate)); + + arg->peer_bw_rxnss_override = + FIELD_PREP(WMI_PEER_NSS_MAP_ENABLE, 1) | + FIELD_PREP(WMI_PEER_NSS_160MHZ_MASK, (rx_nss - 1)); + + if (arg->peer_phymode == MODE_11AC_VHT80_80) { + arg->peer_bw_rxnss_override |= + FIELD_PREP(WMI_PEER_NSS_80_80MHZ_MASK, (rx_nss - 1)); } } + ath10k_dbg(ar, ATH10K_DBG_MAC, + "mac vht peer %pM max_mpdu %d flags 0x%x peer_rx_nss_override 0x%x\n", + sta->addr, arg->peer_max_mpdu, + arg->peer_flags, arg->peer_bw_rxnss_override); } static void ath10k_peer_assoc_h_qos(struct ath10k *ar, @@ -2745,9 +2786,9 @@ static int ath10k_peer_assoc_prepare(struct ath10k *ar, ath10k_peer_assoc_h_crypto(ar, vif, sta, arg); ath10k_peer_assoc_h_rates(ar, vif, sta, arg); ath10k_peer_assoc_h_ht(ar, vif, sta, arg); + ath10k_peer_assoc_h_phymode(ar, vif, sta, arg); ath10k_peer_assoc_h_vht(ar, vif, sta, arg); ath10k_peer_assoc_h_qos(ar, vif, sta, arg); - ath10k_peer_assoc_h_phymode(ar, vif, sta, arg); return 0; } @@ -4563,13 +4604,6 @@ static struct ieee80211_sta_vht_cap ath10k_create_vht_cap(struct ath10k *ar) vht_cap.cap |= val; } - /* Currently the firmware seems to be buggy, don't enable 80+80 - * mode until that's resolved. - */ - if ((ar->vht_cap_info & IEEE80211_VHT_CAP_SHORT_GI_160) && - (ar->vht_cap_info & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) == 0) - vht_cap.cap |= IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ; - mcs_map = 0; for (i = 0; i < 8; i++) { if ((i < ar->num_rf_chains) && (ar->cfg_tx_chainmask & BIT(i))) @@ -8625,7 +8659,9 @@ static const struct ieee80211_iface_combination ath10k_10_4_if_comb[] = { .radar_detect_widths = BIT(NL80211_CHAN_WIDTH_20_NOHT) | BIT(NL80211_CHAN_WIDTH_20) | BIT(NL80211_CHAN_WIDTH_40) | - BIT(NL80211_CHAN_WIDTH_80), + BIT(NL80211_CHAN_WIDTH_80) | + BIT(NL80211_CHAN_WIDTH_80P80) | + BIT(NL80211_CHAN_WIDTH_160), #endif }, }; @@ -8643,7 +8679,9 @@ ieee80211_iface_combination ath10k_10_4_bcn_int_if_comb[] = { .radar_detect_widths = BIT(NL80211_CHAN_WIDTH_20_NOHT) | BIT(NL80211_CHAN_WIDTH_20) | BIT(NL80211_CHAN_WIDTH_40) | - BIT(NL80211_CHAN_WIDTH_80), + BIT(NL80211_CHAN_WIDTH_80) | + BIT(NL80211_CHAN_WIDTH_80P80) | + BIT(NL80211_CHAN_WIDTH_160), #endif }, }; diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 2ea77bb880b1..db6f4c751485 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -1714,12 +1714,23 @@ void ath10k_wmi_put_wmi_channel(struct wmi_channel *ch, if (arg->chan_radar) flags |= WMI_CHAN_FLAG_DFS; + ch->band_center_freq2 = 0; ch->mhz = __cpu_to_le32(arg->freq); ch->band_center_freq1 = __cpu_to_le32(arg->band_center_freq1); if (arg->mode == MODE_11AC_VHT80_80) ch->band_center_freq2 = __cpu_to_le32(arg->band_center_freq2); - else - ch->band_center_freq2 = 0; + + if (arg->mode == MODE_11AC_VHT160) { + if (arg->freq > arg->band_center_freq1) + ch->band_center_freq1 = + __cpu_to_le32(arg->band_center_freq1 + 40); + else + ch->band_center_freq1 = + __cpu_to_le32(arg->band_center_freq1 - 40); + + ch->band_center_freq2 = __cpu_to_le32(arg->band_center_freq1); + } + ch->min_power = arg->min_power; ch->max_power = arg->max_power; ch->reg_power = arg->max_reg_power; @@ -7628,12 +7639,8 @@ ath10k_wmi_peer_assoc_fill_10_4(struct ath10k *ar, void *buf, struct wmi_10_4_peer_assoc_complete_cmd *cmd = buf; ath10k_wmi_peer_assoc_fill_10_2(ar, buf, arg); - if (arg->peer_bw_rxnss_override) - cmd->peer_bw_rxnss_override = - __cpu_to_le32((arg->peer_bw_rxnss_override - 1) | - BIT(PEER_BW_RXNSS_OVERRIDE_OFFSET)); - else - cmd->peer_bw_rxnss_override = 0; + cmd->peer_bw_rxnss_override = + __cpu_to_le32(arg->peer_bw_rxnss_override); } static int diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index 6df415778374..5ba0c9a7d18c 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -6508,7 +6508,10 @@ struct wmi_10_2_peer_assoc_complete_cmd { __le32 info0; /* WMI_PEER_ASSOC_INFO0_ */ } __packed; -#define PEER_BW_RXNSS_OVERRIDE_OFFSET 31 +/* NSS Mapping to FW */ +#define WMI_PEER_NSS_MAP_ENABLE BIT(31) +#define WMI_PEER_NSS_160MHZ_MASK GENMASK(2, 0) +#define WMI_PEER_NSS_80_80MHZ_MASK GENMASK(5, 3) struct wmi_10_4_peer_assoc_complete_cmd { struct wmi_10_2_peer_assoc_complete_cmd cmd; -- cgit v1.2.3-59-g8ed1b From 795def8b14ffa334881264823444eaab4d1879c3 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Mon, 30 Mar 2020 18:56:32 +0530 Subject: ath10k: enable radar detection in secondary segment Enable radar detection in secondary segment for VHT160 and VHT80+80 mode on DFS channels. Otherwise, when injecting radar pulse in the secondary segment, the DUT can't detect radar pulse. Tested: qca9984 with firmware ver 10.4-3.10-00047 Signed-off-by: Lei Wang Signed-off-by: Sowmiya Sree Elavalagan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585574792-719-2-git-send-email-ssreeela@codeaurora.org --- drivers/net/wireless/ath/ath10k/wmi-tlv.c | 6 ++--- drivers/net/wireless/ath/ath10k/wmi.c | 39 ++++++++++++++++++++++--------- drivers/net/wireless/ath/ath10k/wmi.h | 5 ++-- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c index 4e68debda9bf..e1ab900f2662 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c @@ -2123,7 +2123,7 @@ ath10k_wmi_tlv_op_gen_vdev_start(struct ath10k *ar, tlv->tag = __cpu_to_le16(WMI_TLV_TAG_STRUCT_CHANNEL); tlv->len = __cpu_to_le16(sizeof(*ch)); ch = (void *)tlv->value; - ath10k_wmi_put_wmi_channel(ch, &arg->channel); + ath10k_wmi_put_wmi_channel(ar, ch, &arg->channel); ptr += sizeof(*tlv); ptr += sizeof(*ch); @@ -2763,7 +2763,7 @@ ath10k_wmi_tlv_op_gen_scan_chan_list(struct ath10k *ar, tlv->len = __cpu_to_le16(sizeof(*ci)); ci = (void *)tlv->value; - ath10k_wmi_put_wmi_channel(ci, ch); + ath10k_wmi_put_wmi_channel(ar, ci, ch); chans += sizeof(*tlv); chans += sizeof(*ci); @@ -3450,7 +3450,7 @@ ath10k_wmi_tlv_op_gen_tdls_peer_update(struct ath10k *ar, tlv->tag = __cpu_to_le16(WMI_TLV_TAG_STRUCT_CHANNEL); tlv->len = __cpu_to_le16(sizeof(*chan)); chan = (void *)tlv->value; - ath10k_wmi_put_wmi_channel(chan, &chan_arg[i]); + ath10k_wmi_put_wmi_channel(ar, chan, &chan_arg[i]); ptr += sizeof(*tlv); ptr += sizeof(*chan); diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index db6f4c751485..4a3a698fe059 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -1694,10 +1694,11 @@ static const struct wmi_peer_flags_map wmi_10_2_peer_flags_map = { .bw160 = WMI_10_2_PEER_160MHZ, }; -void ath10k_wmi_put_wmi_channel(struct wmi_channel *ch, +void ath10k_wmi_put_wmi_channel(struct ath10k *ar, struct wmi_channel *ch, const struct wmi_channel_arg *arg) { u32 flags = 0; + struct ieee80211_channel *chan = NULL; memset(ch, 0, sizeof(*ch)); @@ -1717,20 +1718,36 @@ void ath10k_wmi_put_wmi_channel(struct wmi_channel *ch, ch->band_center_freq2 = 0; ch->mhz = __cpu_to_le32(arg->freq); ch->band_center_freq1 = __cpu_to_le32(arg->band_center_freq1); - if (arg->mode == MODE_11AC_VHT80_80) + if (arg->mode == MODE_11AC_VHT80_80) { ch->band_center_freq2 = __cpu_to_le32(arg->band_center_freq2); + chan = ieee80211_get_channel(ar->hw->wiphy, + arg->band_center_freq2 - 10); + } if (arg->mode == MODE_11AC_VHT160) { - if (arg->freq > arg->band_center_freq1) - ch->band_center_freq1 = - __cpu_to_le32(arg->band_center_freq1 + 40); - else - ch->band_center_freq1 = - __cpu_to_le32(arg->band_center_freq1 - 40); + u32 band_center_freq1; + u32 band_center_freq2; + + if (arg->freq > arg->band_center_freq1) { + band_center_freq1 = arg->band_center_freq1 + 40; + band_center_freq2 = arg->band_center_freq1 - 40; + } else { + band_center_freq1 = arg->band_center_freq1 - 40; + band_center_freq2 = arg->band_center_freq1 + 40; + } + ch->band_center_freq1 = + __cpu_to_le32(band_center_freq1); + /* Minus 10 to get a defined 5G channel frequency*/ + chan = ieee80211_get_channel(ar->hw->wiphy, + band_center_freq2 - 10); + /* The center frequency of the entire VHT160 */ ch->band_center_freq2 = __cpu_to_le32(arg->band_center_freq1); } + if (chan && chan->flags & IEEE80211_CHAN_RADAR) + flags |= WMI_CHAN_FLAG_DFS_CFREQ2; + ch->min_power = arg->min_power; ch->max_power = arg->max_power; ch->reg_power = arg->max_reg_power; @@ -7176,7 +7193,7 @@ ath10k_wmi_op_gen_vdev_start(struct ath10k *ar, memcpy(cmd->ssid.ssid, arg->ssid, arg->ssid_len); } - ath10k_wmi_put_wmi_channel(&cmd->chan, &arg->channel); + ath10k_wmi_put_wmi_channel(ar, &cmd->chan, &arg->channel); ath10k_dbg(ar, ATH10K_DBG_WMI, "wmi vdev %s id 0x%x flags: 0x%0X, freq %d, mode %d, ch_flags: 0x%0X, max_power: %d\n", @@ -7548,7 +7565,7 @@ ath10k_wmi_op_gen_scan_chan_list(struct ath10k *ar, ch = &arg->channels[i]; ci = &cmd->chan_info[i]; - ath10k_wmi_put_wmi_channel(ci, ch); + ath10k_wmi_put_wmi_channel(ar, ci, ch); } return skb; @@ -8952,7 +8969,7 @@ ath10k_wmi_10_4_gen_tdls_peer_update(struct ath10k *ar, for (i = 0; i < cap->peer_chan_len; i++) { chan = (struct wmi_channel *)&peer_cap->peer_chan_list[i]; - ath10k_wmi_put_wmi_channel(chan, &chan_arg[i]); + ath10k_wmi_put_wmi_channel(ar, chan, &chan_arg[i]); } ath10k_dbg(ar, ATH10K_DBG_WMI, diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index 5ba0c9a7d18c..209070714d1a 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -2094,7 +2094,8 @@ enum wmi_channel_change_cause { /* Indicate reason for channel switch */ #define WMI_CHANNEL_CHANGE_CAUSE_CSA (1 << 13) - +/* DFS required on channel for 2nd segment of VHT160 and VHT80+80*/ +#define WMI_CHAN_FLAG_DFS_CFREQ2 (1 << 15) #define WMI_MAX_SPATIAL_STREAM 3 /* default max ss */ /* HT Capabilities*/ @@ -7351,7 +7352,7 @@ void ath10k_wmi_put_start_scan_common(struct wmi_start_scan_common *cmn, const struct wmi_start_scan_arg *arg); void ath10k_wmi_set_wmm_param(struct wmi_wmm_params *params, const struct wmi_wmm_params_arg *arg); -void ath10k_wmi_put_wmi_channel(struct wmi_channel *ch, +void ath10k_wmi_put_wmi_channel(struct ath10k *ar, struct wmi_channel *ch, const struct wmi_channel_arg *arg); int ath10k_wmi_start_scan_verify(const struct wmi_start_scan_arg *arg); -- cgit v1.2.3-59-g8ed1b From acb31476adc9ff271140cdd4d3c707ff0c97f5a4 Mon Sep 17 00:00:00 2001 From: Venkateswara Naralasetty Date: Wed, 1 Apr 2020 15:48:10 +0530 Subject: ath10k: fix kernel null pointer dereference Currently sta airtime is updated without any lock in case of host based airtime calculation. Which may result in accessing the invalid sta pointer in case of continuous station connect/disconnect. This patch fix the kernel null pointer dereference by updating the station airtime with proper RCU lock in case of host based airtime calculation. Proceeding with the analysis of "ARM Kernel Panic". The APSS crash happened due to OOPS on CPU 0. Crash Signature : Unable to handle kernel NULL pointer dereference at virtual address 00000300 During the crash, PC points to "ieee80211_sta_register_airtime+0x1c/0x448 [mac80211]" LR points to "ath10k_txrx_tx_unref+0x17c/0x364 [ath10k_core]". The Backtrace obtained is as follows: [] (ieee80211_sta_register_airtime [mac80211]) from [] (ath10k_txrx_tx_unref+0x17c/0x364 [ath10k_core]) [] (ath10k_txrx_tx_unref [ath10k_core]) from [] (ath10k_htt_txrx_compl_task+0xa50/0xfc0 [ath10k_core]) [] (ath10k_htt_txrx_compl_task [ath10k_core]) from [] (ath10k_pci_napi_poll+0x50/0xf8 [ath10k_pci]) [] (ath10k_pci_napi_poll [ath10k_pci]) from [] (net_rx_action+0xac/0x160) [] (net_rx_action) from [] (__do_softirq+0x104/0x294) [] (__do_softirq) from [] (run_ksoftirqd+0x30/0x90) [] (run_ksoftirqd) from [] (smpboot_thread_fn+0x25c/0x274) [] (smpboot_thread_fn) from [] (kthread+0xd8/0xec) Tested HW: QCA9888 Tested FW: 10.4-3.10-00047 Signed-off-by: Venkateswara Naralasetty Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585736290-17661-1-git-send-email-vnaralas@codeaurora.org --- drivers/net/wireless/ath/ath10k/txrx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/txrx.c b/drivers/net/wireless/ath/ath10k/txrx.c index 39abf8b12903..f46b9083bbf1 100644 --- a/drivers/net/wireless/ath/ath10k/txrx.c +++ b/drivers/net/wireless/ath/ath10k/txrx.c @@ -84,9 +84,11 @@ int ath10k_txrx_tx_unref(struct ath10k_htt *htt, wake_up(&htt->empty_tx_wq); spin_unlock_bh(&htt->tx_lock); + rcu_read_lock(); if (txq && txq->sta && skb_cb->airtime_est) ieee80211_sta_register_airtime(txq->sta, txq->tid, skb_cb->airtime_est, 0); + rcu_read_unlock(); if (ar->bus_param.dev_type != ATH10K_DEV_TYPE_HL) dma_unmap_single(dev, skb_cb->paddr, msdu->len, DMA_TO_DEVICE); -- cgit v1.2.3-59-g8ed1b From ced21a4c726bdc60b1680c050a284b08803bc64c Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 4 Apr 2020 12:18:34 +0800 Subject: ath9k: Fix use-after-free Read in htc_connect_service The skb is consumed by htc_send_epid, so it needn't release again. The case reported by syzbot: https://lore.kernel.org/linux-usb/000000000000590f6b05a1c05d15@google.com usb 1-1: ath9k_htc: Firmware ath9k_htc/htc_9271-1.4.0.fw requested usb 1-1: ath9k_htc: Transferred FW: ath9k_htc/htc_9271-1.4.0.fw, size: 51008 usb 1-1: Service connection timeout for: 256 ================================================================== BUG: KASAN: use-after-free in atomic_read include/asm-generic/atomic-instrumented.h:26 [inline] BUG: KASAN: use-after-free in refcount_read include/linux/refcount.h:134 [inline] BUG: KASAN: use-after-free in skb_unref include/linux/skbuff.h:1042 [inline] BUG: KASAN: use-after-free in kfree_skb+0x32/0x3d0 net/core/skbuff.c:692 Read of size 4 at addr ffff8881d0957994 by task kworker/1:2/83 Call Trace: kfree_skb+0x32/0x3d0 net/core/skbuff.c:692 htc_connect_service.cold+0xa9/0x109 drivers/net/wireless/ath/ath9k/htc_hst.c:282 ath9k_wmi_connect+0xd2/0x1a0 drivers/net/wireless/ath/ath9k/wmi.c:265 ath9k_init_htc_services.constprop.0+0xb4/0x650 drivers/net/wireless/ath/ath9k/htc_drv_init.c:146 ath9k_htc_probe_device+0x25a/0x1d80 drivers/net/wireless/ath/ath9k/htc_drv_init.c:959 ath9k_htc_hw_init+0x31/0x60 drivers/net/wireless/ath/ath9k/htc_hst.c:501 ath9k_hif_usb_firmware_cb+0x26b/0x500 drivers/net/wireless/ath/ath9k/hif_usb.c:1187 request_firmware_work_func+0x126/0x242 drivers/base/firmware_loader/main.c:976 process_one_work+0x94b/0x1620 kernel/workqueue.c:2264 worker_thread+0x96/0xe20 kernel/workqueue.c:2410 kthread+0x318/0x420 kernel/kthread.c:255 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352 Allocated by task 83: kmem_cache_alloc_node+0xdc/0x330 mm/slub.c:2814 __alloc_skb+0xba/0x5a0 net/core/skbuff.c:198 alloc_skb include/linux/skbuff.h:1081 [inline] htc_connect_service+0x2cc/0x840 drivers/net/wireless/ath/ath9k/htc_hst.c:257 ath9k_wmi_connect+0xd2/0x1a0 drivers/net/wireless/ath/ath9k/wmi.c:265 ath9k_init_htc_services.constprop.0+0xb4/0x650 drivers/net/wireless/ath/ath9k/htc_drv_init.c:146 ath9k_htc_probe_device+0x25a/0x1d80 drivers/net/wireless/ath/ath9k/htc_drv_init.c:959 ath9k_htc_hw_init+0x31/0x60 drivers/net/wireless/ath/ath9k/htc_hst.c:501 ath9k_hif_usb_firmware_cb+0x26b/0x500 drivers/net/wireless/ath/ath9k/hif_usb.c:1187 request_firmware_work_func+0x126/0x242 drivers/base/firmware_loader/main.c:976 process_one_work+0x94b/0x1620 kernel/workqueue.c:2264 worker_thread+0x96/0xe20 kernel/workqueue.c:2410 kthread+0x318/0x420 kernel/kthread.c:255 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352 Freed by task 0: kfree_skb+0x102/0x3d0 net/core/skbuff.c:690 ath9k_htc_txcompletion_cb+0x1f8/0x2b0 drivers/net/wireless/ath/ath9k/htc_hst.c:356 hif_usb_regout_cb+0x10b/0x1b0 drivers/net/wireless/ath/ath9k/hif_usb.c:90 __usb_hcd_giveback_urb+0x29a/0x550 drivers/usb/core/hcd.c:1650 usb_hcd_giveback_urb+0x368/0x420 drivers/usb/core/hcd.c:1716 dummy_timer+0x1258/0x32ae drivers/usb/gadget/udc/dummy_hcd.c:1966 call_timer_fn+0x195/0x6f0 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x5f9/0x1500 kernel/time/timer.c:1786 __do_softirq+0x21e/0x950 kernel/softirq.c:292 Reported-and-tested-by: syzbot+9505af1ae303dabdc646@syzkaller.appspotmail.com Signed-off-by: Qiujun Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200404041838.10426-2-hqjagain@gmail.com --- drivers/net/wireless/ath/ath9k/htc_hst.c | 3 --- drivers/net/wireless/ath/ath9k/wmi.c | 1 - 2 files changed, 4 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/htc_hst.c b/drivers/net/wireless/ath/ath9k/htc_hst.c index d091c8ebdcf0..1bf63a4efb4c 100644 --- a/drivers/net/wireless/ath/ath9k/htc_hst.c +++ b/drivers/net/wireless/ath/ath9k/htc_hst.c @@ -170,7 +170,6 @@ static int htc_config_pipe_credits(struct htc_target *target) time_left = wait_for_completion_timeout(&target->cmd_wait, HZ); if (!time_left) { dev_err(target->dev, "HTC credit config timeout\n"); - kfree_skb(skb); return -ETIMEDOUT; } @@ -206,7 +205,6 @@ static int htc_setup_complete(struct htc_target *target) time_left = wait_for_completion_timeout(&target->cmd_wait, HZ); if (!time_left) { dev_err(target->dev, "HTC start timeout\n"); - kfree_skb(skb); return -ETIMEDOUT; } @@ -279,7 +277,6 @@ int htc_connect_service(struct htc_target *target, if (!time_left) { dev_err(target->dev, "Service connection timeout for: %d\n", service_connreq->service_id); - kfree_skb(skb); return -ETIMEDOUT; } diff --git a/drivers/net/wireless/ath/ath9k/wmi.c b/drivers/net/wireless/ath/ath9k/wmi.c index cdc146091194..d1f6710ca63b 100644 --- a/drivers/net/wireless/ath/ath9k/wmi.c +++ b/drivers/net/wireless/ath/ath9k/wmi.c @@ -336,7 +336,6 @@ int ath9k_wmi_cmd(struct wmi *wmi, enum wmi_cmd_id cmd_id, ath_dbg(common, WMI, "Timeout waiting for WMI command: %s\n", wmi_cmd_to_name(cmd_id)); mutex_unlock(&wmi->op_mutex); - kfree_skb(skb); return -ETIMEDOUT; } -- cgit v1.2.3-59-g8ed1b From abeaa85054ff8cfe8b99aafc5c70ea067e5d0908 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 4 Apr 2020 12:18:35 +0800 Subject: ath9k: Fix use-after-free Read in ath9k_wmi_ctrl_rx Free wmi later after cmd urb has been killed, as urb cb will access wmi. the case reported by syzbot: https://lore.kernel.org/linux-usb/0000000000000002fc05a1d61a68@google.com BUG: KASAN: use-after-free in ath9k_wmi_ctrl_rx+0x416/0x500 drivers/net/wireless/ath/ath9k/wmi.c:215 Read of size 1 at addr ffff8881cef1417c by task swapper/1/0 Call Trace: ath9k_wmi_ctrl_rx+0x416/0x500 drivers/net/wireless/ath/ath9k/wmi.c:215 ath9k_htc_rx_msg+0x2da/0xaf0 drivers/net/wireless/ath/ath9k/htc_hst.c:459 ath9k_hif_usb_reg_in_cb+0x1ba/0x630 drivers/net/wireless/ath/ath9k/hif_usb.c:718 __usb_hcd_giveback_urb+0x29a/0x550 drivers/usb/core/hcd.c:1650 usb_hcd_giveback_urb+0x368/0x420 drivers/usb/core/hcd.c:1716 dummy_timer+0x1258/0x32ae drivers/usb/gadget/udc/dummy_hcd.c:1966 call_timer_fn+0x195/0x6f0 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x5f9/0x1500 kernel/time/timer.c:1786 Reported-and-tested-by: syzbot+5d338854440137ea0fef@syzkaller.appspotmail.com Signed-off-by: Qiujun Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200404041838.10426-3-hqjagain@gmail.com --- drivers/net/wireless/ath/ath9k/hif_usb.c | 5 +++-- drivers/net/wireless/ath/ath9k/hif_usb.h | 1 + drivers/net/wireless/ath/ath9k/htc_drv_init.c | 10 +++++++--- drivers/net/wireless/ath/ath9k/wmi.c | 5 ++++- drivers/net/wireless/ath/ath9k/wmi.h | 3 ++- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.c b/drivers/net/wireless/ath/ath9k/hif_usb.c index dd0c32379375..f227e19087ff 100644 --- a/drivers/net/wireless/ath/ath9k/hif_usb.c +++ b/drivers/net/wireless/ath/ath9k/hif_usb.c @@ -973,7 +973,7 @@ err: return -ENOMEM; } -static void ath9k_hif_usb_dealloc_urbs(struct hif_device_usb *hif_dev) +void ath9k_hif_usb_dealloc_urbs(struct hif_device_usb *hif_dev) { usb_kill_anchored_urbs(&hif_dev->regout_submitted); ath9k_hif_usb_dealloc_reg_in_urbs(hif_dev); @@ -1341,8 +1341,9 @@ static void ath9k_hif_usb_disconnect(struct usb_interface *interface) if (hif_dev->flags & HIF_USB_READY) { ath9k_htc_hw_deinit(hif_dev->htc_handle, unplugged); - ath9k_htc_hw_free(hif_dev->htc_handle); ath9k_hif_usb_dev_deinit(hif_dev); + ath9k_destoy_wmi(hif_dev->htc_handle->drv_priv); + ath9k_htc_hw_free(hif_dev->htc_handle); } usb_set_intfdata(interface, NULL); diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.h b/drivers/net/wireless/ath/ath9k/hif_usb.h index 7846916aa01d..a94e7e1c86e9 100644 --- a/drivers/net/wireless/ath/ath9k/hif_usb.h +++ b/drivers/net/wireless/ath/ath9k/hif_usb.h @@ -133,5 +133,6 @@ struct hif_device_usb { int ath9k_hif_usb_init(void); void ath9k_hif_usb_exit(void); +void ath9k_hif_usb_dealloc_urbs(struct hif_device_usb *hif_dev); #endif /* HTC_USB_H */ diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_init.c b/drivers/net/wireless/ath/ath9k/htc_drv_init.c index d961095ab01f..40a065028ebe 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_init.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_init.c @@ -931,8 +931,9 @@ err_init: int ath9k_htc_probe_device(struct htc_target *htc_handle, struct device *dev, u16 devid, char *product, u32 drv_info) { - struct ieee80211_hw *hw; + struct hif_device_usb *hif_dev; struct ath9k_htc_priv *priv; + struct ieee80211_hw *hw; int ret; hw = ieee80211_alloc_hw(sizeof(struct ath9k_htc_priv), &ath9k_htc_ops); @@ -967,7 +968,10 @@ int ath9k_htc_probe_device(struct htc_target *htc_handle, struct device *dev, return 0; err_init: - ath9k_deinit_wmi(priv); + ath9k_stop_wmi(priv); + hif_dev = (struct hif_device_usb *)htc_handle->hif_dev; + ath9k_hif_usb_dealloc_urbs(hif_dev); + ath9k_destoy_wmi(priv); err_free: ieee80211_free_hw(hw); return ret; @@ -982,7 +986,7 @@ void ath9k_htc_disconnect_device(struct htc_target *htc_handle, bool hotunplug) htc_handle->drv_priv->ah->ah_flags |= AH_UNPLUGGED; ath9k_deinit_device(htc_handle->drv_priv); - ath9k_deinit_wmi(htc_handle->drv_priv); + ath9k_stop_wmi(htc_handle->drv_priv); ieee80211_free_hw(htc_handle->drv_priv->hw); } } diff --git a/drivers/net/wireless/ath/ath9k/wmi.c b/drivers/net/wireless/ath/ath9k/wmi.c index d1f6710ca63b..e7a3127395be 100644 --- a/drivers/net/wireless/ath/ath9k/wmi.c +++ b/drivers/net/wireless/ath/ath9k/wmi.c @@ -112,14 +112,17 @@ struct wmi *ath9k_init_wmi(struct ath9k_htc_priv *priv) return wmi; } -void ath9k_deinit_wmi(struct ath9k_htc_priv *priv) +void ath9k_stop_wmi(struct ath9k_htc_priv *priv) { struct wmi *wmi = priv->wmi; mutex_lock(&wmi->op_mutex); wmi->stopped = true; mutex_unlock(&wmi->op_mutex); +} +void ath9k_destoy_wmi(struct ath9k_htc_priv *priv) +{ kfree(priv->wmi); } diff --git a/drivers/net/wireless/ath/ath9k/wmi.h b/drivers/net/wireless/ath/ath9k/wmi.h index 380175d5ecd7..d8b912206232 100644 --- a/drivers/net/wireless/ath/ath9k/wmi.h +++ b/drivers/net/wireless/ath/ath9k/wmi.h @@ -179,7 +179,6 @@ struct wmi { }; struct wmi *ath9k_init_wmi(struct ath9k_htc_priv *priv); -void ath9k_deinit_wmi(struct ath9k_htc_priv *priv); int ath9k_wmi_connect(struct htc_target *htc, struct wmi *wmi, enum htc_endpoint_id *wmi_ctrl_epid); int ath9k_wmi_cmd(struct wmi *wmi, enum wmi_cmd_id cmd_id, @@ -189,6 +188,8 @@ int ath9k_wmi_cmd(struct wmi *wmi, enum wmi_cmd_id cmd_id, void ath9k_wmi_event_tasklet(unsigned long data); void ath9k_fatal_work(struct work_struct *work); void ath9k_wmi_event_drain(struct ath9k_htc_priv *priv); +void ath9k_stop_wmi(struct ath9k_htc_priv *priv); +void ath9k_destoy_wmi(struct ath9k_htc_priv *priv); #define WMI_CMD(_wmi_cmd) \ do { \ -- cgit v1.2.3-59-g8ed1b From e4ff08a4d727146bb6717a39a8d399d834654345 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 4 Apr 2020 12:18:36 +0800 Subject: ath9k: Fix use-after-free Write in ath9k_htc_rx_msg Write out of slab bounds. We should check epid. The case reported by syzbot: https://lore.kernel.org/linux-usb/0000000000006ac55b05a1c05d72@google.com BUG: KASAN: use-after-free in htc_process_conn_rsp drivers/net/wireless/ath/ath9k/htc_hst.c:131 [inline] BUG: KASAN: use-after-free in ath9k_htc_rx_msg+0xa25/0xaf0 drivers/net/wireless/ath/ath9k/htc_hst.c:443 Write of size 2 at addr ffff8881cea291f0 by task swapper/1/0 Call Trace: htc_process_conn_rsp drivers/net/wireless/ath/ath9k/htc_hst.c:131 [inline] ath9k_htc_rx_msg+0xa25/0xaf0 drivers/net/wireless/ath/ath9k/htc_hst.c:443 ath9k_hif_usb_reg_in_cb+0x1ba/0x630 drivers/net/wireless/ath/ath9k/hif_usb.c:718 __usb_hcd_giveback_urb+0x29a/0x550 drivers/usb/core/hcd.c:1650 usb_hcd_giveback_urb+0x368/0x420 drivers/usb/core/hcd.c:1716 dummy_timer+0x1258/0x32ae drivers/usb/gadget/udc/dummy_hcd.c:1966 call_timer_fn+0x195/0x6f0 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x5f9/0x1500 kernel/time/timer.c:1786 Reported-and-tested-by: syzbot+b1c61e5f11be5782f192@syzkaller.appspotmail.com Signed-off-by: Qiujun Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200404041838.10426-4-hqjagain@gmail.com --- drivers/net/wireless/ath/ath9k/htc_hst.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/ath/ath9k/htc_hst.c b/drivers/net/wireless/ath/ath9k/htc_hst.c index 1bf63a4efb4c..d2e062eaf561 100644 --- a/drivers/net/wireless/ath/ath9k/htc_hst.c +++ b/drivers/net/wireless/ath/ath9k/htc_hst.c @@ -113,6 +113,9 @@ static void htc_process_conn_rsp(struct htc_target *target, if (svc_rspmsg->status == HTC_SERVICE_SUCCESS) { epid = svc_rspmsg->endpoint_id; + if (epid < 0 || epid >= ENDPOINT_MAX) + return; + service_id = be16_to_cpu(svc_rspmsg->service_id); max_msglen = be16_to_cpu(svc_rspmsg->max_msg_len); endpoint = &target->endpoint[epid]; -- cgit v1.2.3-59-g8ed1b From 19d6c375d671ce9949a864fb9a03e19f5487b4d3 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 4 Apr 2020 12:18:37 +0800 Subject: ath9x: Fix stack-out-of-bounds Write in ath9k_hif_usb_rx_cb Add barrier to accessing the stack array skb_pool. The case reported by syzbot: https://lore.kernel.org/linux-usb/0000000000003d7c1505a2168418@google.com BUG: KASAN: stack-out-of-bounds in ath9k_hif_usb_rx_stream drivers/net/wireless/ath/ath9k/hif_usb.c:626 [inline] BUG: KASAN: stack-out-of-bounds in ath9k_hif_usb_rx_cb+0xdf6/0xf70 drivers/net/wireless/ath/ath9k/hif_usb.c:666 Write of size 8 at addr ffff8881db309a28 by task swapper/1/0 Call Trace: ath9k_hif_usb_rx_stream drivers/net/wireless/ath/ath9k/hif_usb.c:626 [inline] ath9k_hif_usb_rx_cb+0xdf6/0xf70 drivers/net/wireless/ath/ath9k/hif_usb.c:666 __usb_hcd_giveback_urb+0x1f2/0x470 drivers/usb/core/hcd.c:1648 usb_hcd_giveback_urb+0x368/0x420 drivers/usb/core/hcd.c:1713 dummy_timer+0x1258/0x32ae drivers/usb/gadget/udc/dummy_hcd.c:1966 call_timer_fn+0x195/0x6f0 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x5f9/0x1500 kernel/time/timer.c:1786 Reported-and-tested-by: syzbot+d403396d4df67ad0bd5f@syzkaller.appspotmail.com Signed-off-by: Qiujun Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200404041838.10426-5-hqjagain@gmail.com --- drivers/net/wireless/ath/ath9k/hif_usb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.c b/drivers/net/wireless/ath/ath9k/hif_usb.c index f227e19087ff..6049d3766c64 100644 --- a/drivers/net/wireless/ath/ath9k/hif_usb.c +++ b/drivers/net/wireless/ath/ath9k/hif_usb.c @@ -612,6 +612,11 @@ static void ath9k_hif_usb_rx_stream(struct hif_device_usb *hif_dev, hif_dev->remain_skb = nskb; spin_unlock(&hif_dev->rx_lock); } else { + if (pool_index == MAX_PKT_NUM_IN_TRANSFER) { + dev_err(&hif_dev->udev->dev, + "ath9k_htc: over RX MAX_PKT_NUM\n"); + goto err; + } nskb = __dev_alloc_skb(pkt_len + 32, GFP_ATOMIC); if (!nskb) { dev_err(&hif_dev->udev->dev, -- cgit v1.2.3-59-g8ed1b From 2bbcaaee1fcbd83272e29f31e2bb7e70d8c49e05 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 4 Apr 2020 12:18:38 +0800 Subject: ath9k: Fix general protection fault in ath9k_hif_usb_rx_cb In ath9k_hif_usb_rx_cb interface number is assumed to be 0. usb_ifnum_to_if(urb->dev, 0) But it isn't always true. The case reported by syzbot: https://lore.kernel.org/linux-usb/000000000000666c9c05a1c05d12@google.com usb 2-1: new high-speed USB device number 2 using dummy_hcd usb 2-1: config 1 has an invalid interface number: 2 but max is 0 usb 2-1: config 1 has no interface number 0 usb 2-1: New USB device found, idVendor=0cf3, idProduct=9271, bcdDevice= 1.08 usb 2-1: New USB device strings: Mfr=1, Product=2, SerialNumber=3 general protection fault, probably for non-canonical address 0xdffffc0000000015: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x00000000000000a8-0x00000000000000af] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.6.0-rc5-syzkaller #0 Call Trace __usb_hcd_giveback_urb+0x29a/0x550 drivers/usb/core/hcd.c:1650 usb_hcd_giveback_urb+0x368/0x420 drivers/usb/core/hcd.c:1716 dummy_timer+0x1258/0x32ae drivers/usb/gadget/udc/dummy_hcd.c:1966 call_timer_fn+0x195/0x6f0 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x5f9/0x1500 kernel/time/timer.c:1786 __do_softirq+0x21e/0x950 kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0x178/0x1a0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:546 [inline] smp_apic_timer_interrupt+0x141/0x540 arch/x86/kernel/apic/apic.c:1146 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:829 Reported-and-tested-by: syzbot+40d5d2e8a4680952f042@syzkaller.appspotmail.com Signed-off-by: Qiujun Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200404041838.10426-6-hqjagain@gmail.com --- drivers/net/wireless/ath/ath9k/hif_usb.c | 48 ++++++++++++++++++++++++-------- drivers/net/wireless/ath/ath9k/hif_usb.h | 5 ++++ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.c b/drivers/net/wireless/ath/ath9k/hif_usb.c index 6049d3766c64..4ed21dad6a8e 100644 --- a/drivers/net/wireless/ath/ath9k/hif_usb.c +++ b/drivers/net/wireless/ath/ath9k/hif_usb.c @@ -643,9 +643,9 @@ err: static void ath9k_hif_usb_rx_cb(struct urb *urb) { - struct sk_buff *skb = (struct sk_buff *) urb->context; - struct hif_device_usb *hif_dev = - usb_get_intfdata(usb_ifnum_to_if(urb->dev, 0)); + struct rx_buf *rx_buf = (struct rx_buf *)urb->context; + struct hif_device_usb *hif_dev = rx_buf->hif_dev; + struct sk_buff *skb = rx_buf->skb; int ret; if (!skb) @@ -685,14 +685,15 @@ resubmit: return; free: kfree_skb(skb); + kfree(rx_buf); } static void ath9k_hif_usb_reg_in_cb(struct urb *urb) { - struct sk_buff *skb = (struct sk_buff *) urb->context; + struct rx_buf *rx_buf = (struct rx_buf *)urb->context; + struct hif_device_usb *hif_dev = rx_buf->hif_dev; + struct sk_buff *skb = rx_buf->skb; struct sk_buff *nskb; - struct hif_device_usb *hif_dev = - usb_get_intfdata(usb_ifnum_to_if(urb->dev, 0)); int ret; if (!skb) @@ -750,6 +751,7 @@ resubmit: return; free: kfree_skb(skb); + kfree(rx_buf); urb->context = NULL; } @@ -795,7 +797,7 @@ static int ath9k_hif_usb_alloc_tx_urbs(struct hif_device_usb *hif_dev) init_usb_anchor(&hif_dev->mgmt_submitted); for (i = 0; i < MAX_TX_URB_NUM; i++) { - tx_buf = kzalloc(sizeof(struct tx_buf), GFP_KERNEL); + tx_buf = kzalloc(sizeof(*tx_buf), GFP_KERNEL); if (!tx_buf) goto err; @@ -832,8 +834,9 @@ static void ath9k_hif_usb_dealloc_rx_urbs(struct hif_device_usb *hif_dev) static int ath9k_hif_usb_alloc_rx_urbs(struct hif_device_usb *hif_dev) { - struct urb *urb = NULL; + struct rx_buf *rx_buf = NULL; struct sk_buff *skb = NULL; + struct urb *urb = NULL; int i, ret; init_usb_anchor(&hif_dev->rx_submitted); @@ -841,6 +844,12 @@ static int ath9k_hif_usb_alloc_rx_urbs(struct hif_device_usb *hif_dev) for (i = 0; i < MAX_RX_URB_NUM; i++) { + rx_buf = kzalloc(sizeof(*rx_buf), GFP_KERNEL); + if (!rx_buf) { + ret = -ENOMEM; + goto err_rxb; + } + /* Allocate URB */ urb = usb_alloc_urb(0, GFP_KERNEL); if (urb == NULL) { @@ -855,11 +864,14 @@ static int ath9k_hif_usb_alloc_rx_urbs(struct hif_device_usb *hif_dev) goto err_skb; } + rx_buf->hif_dev = hif_dev; + rx_buf->skb = skb; + usb_fill_bulk_urb(urb, hif_dev->udev, usb_rcvbulkpipe(hif_dev->udev, USB_WLAN_RX_PIPE), skb->data, MAX_RX_BUF_SIZE, - ath9k_hif_usb_rx_cb, skb); + ath9k_hif_usb_rx_cb, rx_buf); /* Anchor URB */ usb_anchor_urb(urb, &hif_dev->rx_submitted); @@ -885,6 +897,8 @@ err_submit: err_skb: usb_free_urb(urb); err_urb: + kfree(rx_buf); +err_rxb: ath9k_hif_usb_dealloc_rx_urbs(hif_dev); return ret; } @@ -896,14 +910,21 @@ static void ath9k_hif_usb_dealloc_reg_in_urbs(struct hif_device_usb *hif_dev) static int ath9k_hif_usb_alloc_reg_in_urbs(struct hif_device_usb *hif_dev) { - struct urb *urb = NULL; + struct rx_buf *rx_buf = NULL; struct sk_buff *skb = NULL; + struct urb *urb = NULL; int i, ret; init_usb_anchor(&hif_dev->reg_in_submitted); for (i = 0; i < MAX_REG_IN_URB_NUM; i++) { + rx_buf = kzalloc(sizeof(*rx_buf), GFP_KERNEL); + if (!rx_buf) { + ret = -ENOMEM; + goto err_rxb; + } + /* Allocate URB */ urb = usb_alloc_urb(0, GFP_KERNEL); if (urb == NULL) { @@ -918,11 +939,14 @@ static int ath9k_hif_usb_alloc_reg_in_urbs(struct hif_device_usb *hif_dev) goto err_skb; } + rx_buf->hif_dev = hif_dev; + rx_buf->skb = skb; + usb_fill_int_urb(urb, hif_dev->udev, usb_rcvintpipe(hif_dev->udev, USB_REG_IN_PIPE), skb->data, MAX_REG_IN_BUF_SIZE, - ath9k_hif_usb_reg_in_cb, skb, 1); + ath9k_hif_usb_reg_in_cb, rx_buf, 1); /* Anchor URB */ usb_anchor_urb(urb, &hif_dev->reg_in_submitted); @@ -948,6 +972,8 @@ err_submit: err_skb: usb_free_urb(urb); err_urb: + kfree(rx_buf); +err_rxb: ath9k_hif_usb_dealloc_reg_in_urbs(hif_dev); return ret; } diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.h b/drivers/net/wireless/ath/ath9k/hif_usb.h index a94e7e1c86e9..5985aa15ca93 100644 --- a/drivers/net/wireless/ath/ath9k/hif_usb.h +++ b/drivers/net/wireless/ath/ath9k/hif_usb.h @@ -86,6 +86,11 @@ struct tx_buf { struct list_head list; }; +struct rx_buf { + struct sk_buff *skb; + struct hif_device_usb *hif_dev; +}; + #define HIF_USB_TX_STOP BIT(0) #define HIF_USB_TX_FLUSH BIT(1) -- cgit v1.2.3-59-g8ed1b From c2aa30db744d9cbdde127d4ed8aeea18273834c6 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Tue, 7 Apr 2020 12:26:27 +0800 Subject: Bluetooth: debugfs option to unset MITM flag The BT qualification test SM/MAS/PKE/BV-01-C needs us to turn off the MITM flag when pairing, and at the same time also set the io capability to something other than no input no output. Currently the MITM flag is only unset when the io capability is set to no input no output, therefore the test cannot be executed. This patch introduces a debugfs option to force MITM flag to be turned off. Signed-off-by: Archie Pusaka Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_debugfs.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ net/bluetooth/smp.c | 15 ++++++++++----- 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 79de2a659dd6..f4e8e2a0b7c1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -298,6 +298,7 @@ enum { HCI_FORCE_STATIC_ADDR, HCI_LL_RPA_RESOLUTION, HCI_CMD_PENDING, + HCI_FORCE_NO_MITM, __HCI_NUM_FLAGS, }; diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 6b1314c738b8..5e8af2658e44 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -1075,6 +1075,50 @@ DEFINE_SIMPLE_ATTRIBUTE(auth_payload_timeout_fops, auth_payload_timeout_get, auth_payload_timeout_set, "%llu\n"); +static ssize_t force_no_mitm_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM) ? 'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t force_no_mitm_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + bool enable; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &enable)) + return -EINVAL; + + if (enable == hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM)) + return -EALREADY; + + hci_dev_change_flag(hdev, HCI_FORCE_NO_MITM); + + return count; +} + +static const struct file_operations force_no_mitm_fops = { + .open = simple_open, + .read = force_no_mitm_read, + .write = force_no_mitm_write, + .llseek = default_llseek, +}; + DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter, HCI_QUIRK_STRICT_DUPLICATE_FILTER); DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery, @@ -1134,6 +1178,8 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &max_key_size_fops); debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev, &auth_payload_timeout_fops); + debugfs_create_file("force_no_mitm", 0644, hdev->debugfs, hdev, + &force_no_mitm_fops); debugfs_create_file("quirk_strict_duplicate_filter", 0644, hdev->debugfs, hdev, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index d0b695ee49f6..a85e3e49cd0d 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2393,12 +2393,17 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) authreq |= SMP_AUTH_CT2; } - /* Require MITM if IO Capability allows or the security level - * requires it. + /* Don't attempt to set MITM if setting is overridden by debugfs + * Needed to pass certification test SM/MAS/PKE/BV-01-C */ - if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || - hcon->pending_sec_level > BT_SECURITY_MEDIUM) - authreq |= SMP_AUTH_MITM; + if (!hci_dev_test_flag(hcon->hdev, HCI_FORCE_NO_MITM)) { + /* Require MITM if IO Capability allows or the security level + * requires it. + */ + if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || + hcon->pending_sec_level > BT_SECURITY_MEDIUM) + authreq |= SMP_AUTH_MITM; + } if (hcon->role == HCI_ROLE_MASTER) { struct smp_cmd_pairing cp; -- cgit v1.2.3-59-g8ed1b From d1d900f822b6b2874de9c1ef8094fc8df56a2f9f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 6 Apr 2020 11:54:38 -0700 Subject: Bluetooth: Simplify / fix return values from tk_request Some static checker run by 0day reports a variableScope warning. net/bluetooth/smp.c:870:6: warning: The scope of the variable 'err' can be reduced. [variableScope] There is no need for two separate variables holding return values. Stick with the existing variable. While at it, don't pre-initialize 'ret' because it is set in each code path. tk_request() is supposed to return a negative error code on errors, not a bluetooth return code. The calling code converts the return value to SMP_UNSPECIFIED if needed. Fixes: 92516cd97fd4 ("Bluetooth: Always request for user confirmation for Just Works") Cc: Sonny Sasaka Signed-off-by: Guenter Roeck Reviewed-by: Sonny Sasaka Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index a85e3e49cd0d..daf198fb2b31 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -854,8 +854,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; u32 passkey = 0; - int ret = 0; - int err; + int ret; /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); @@ -887,12 +886,12 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, /* If Just Works, Continue with Zero TK and ask user-space for * confirmation */ if (smp->method == JUST_WORKS) { - err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, + ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 1); - if (err) - return SMP_UNSPECIFIED; + if (ret) + return ret; set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } -- cgit v1.2.3-59-g8ed1b From fcd156ee8bc6a989dbf7e5efbccdc9cdb831fd27 Mon Sep 17 00:00:00 2001 From: Sathish Narasimman Date: Wed, 8 Apr 2020 10:57:03 +0530 Subject: Bluetooth: btusb: check for NULL in btusb_find_altsetting() The new btusb_find_altsetting() dereferences it without checking the check is added in this patch Signed-off-by: Sathish Narasimman Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btusb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 09913cadd1ca..871162790a0e 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -1622,6 +1622,9 @@ static struct usb_host_interface *btusb_find_altsetting(struct btusb_data *data, BT_DBG("Looking for Alt no :%d", alt); + if (!intf) + return NULL; + for (i = 0; i < intf->num_altsetting; i++) { if (intf->altsetting[i].desc.bAlternateSetting == alt) return &intf->altsetting[i]; -- cgit v1.2.3-59-g8ed1b From ffee202a78c2980688bc5d2f7d56480e69a5e0c9 Mon Sep 17 00:00:00 2001 From: Sonny Sasaka Date: Mon, 6 Apr 2020 11:04:02 -0700 Subject: Bluetooth: Always request for user confirmation for Just Works (LE SC) To improve security, always give the user-space daemon a chance to accept or reject a Just Works pairing (LE). The daemon may decide to auto-accept based on the user's intent. This patch is similar to the previous patch but applies for LE Secure Connections (SC). Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index daf198fb2b31..df22cbf94693 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2201,7 +2201,7 @@ mackey_and_ltk: if (err) return SMP_UNSPECIFIED; - if (smp->method == JUST_WORKS || smp->method == REQ_OOB) { + if (smp->method == REQ_OOB) { if (hcon->out) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); @@ -2216,6 +2216,9 @@ mackey_and_ltk: confirm_hint = 0; confirm: + if (smp->method == JUST_WORKS) + confirm_hint = 1; + err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, confirm_hint); if (err) -- cgit v1.2.3-59-g8ed1b From d81686d3335648197c5da3992b151648706dc0f8 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Tue, 7 Apr 2020 08:12:30 +0300 Subject: ath10k: disable TX complete indication of htt for sdio For sdio chip, it is high latency bus, all the TX packet's content will be tranferred from HOST memory to firmware memory via sdio bus, then it need much more memory in firmware than low latency bus chip, for low latency chip, such as PCI-E, it only need to transfer the TX descriptor via PCI-E bus to firmware memory. For sdio chip, reduce the complexity of TX logic will help TX efficiency since its memory is limited, and it will reduce the TX circle's time of each packet and then firmware will have more memory for TX since TX complete also need memeory. This patch disable TX complete indication from firmware for htt data packet, it will not have TX complete indication from firmware to ath10k. It will cut the cost of bus bandwidth of TX complete and make the TX logic of firmware simpler, it results in significant performance improvement on TX path. Udp TX throughout is 130Mbps without this patch, and it arrives 400Mbps with this patch. The downside of this patch is the command "iw wlan0 station dump" will show 0 for "tx retries" and "tx failed" since all tx packet's status is success. This patch only effect sdio chip, it will not effect PCI, SNOC etc. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00017-QCARMSWPZ-1 Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200212080415.31265-2-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/core.c | 5 +---- drivers/net/wireless/ath/ath10k/hif.h | 9 ++++++++ drivers/net/wireless/ath/ath10k/htc.c | 10 +++++++++ drivers/net/wireless/ath/ath10k/htc.h | 3 +++ drivers/net/wireless/ath/ath10k/htt.c | 5 +++++ drivers/net/wireless/ath/ath10k/htt.h | 13 +++++++++++- drivers/net/wireless/ath/ath10k/htt_rx.c | 34 ++++++++++++++++++++++++++++++- drivers/net/wireless/ath/ath10k/htt_tx.c | 35 ++++++++++++++++++++++++++++++++ drivers/net/wireless/ath/ath10k/hw.h | 2 +- drivers/net/wireless/ath/ath10k/sdio.c | 23 +++++++++++++++++++++ 10 files changed, 132 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 70f3bae92a85..4cd50a353047 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -723,10 +723,7 @@ static int ath10k_init_sdio(struct ath10k *ar, enum ath10k_firmware_mode mode) if (ret) return ret; - /* Data transfer is not initiated, when reduced Tx completion - * is used for SDIO. disable it until fixed - */ - param &= ~HI_ACS_FLAGS_SDIO_REDUCE_TX_COMPL_SET; + param |= HI_ACS_FLAGS_SDIO_REDUCE_TX_COMPL_SET; /* Alternate credit size of 1544 as used by SDIO firmware is * not big enough for mac80211 / native wifi frames. disable it diff --git a/drivers/net/wireless/ath/ath10k/hif.h b/drivers/net/wireless/ath/ath10k/hif.h index 496ee34a4d78..0dd8973d0acf 100644 --- a/drivers/net/wireless/ath/ath10k/hif.h +++ b/drivers/net/wireless/ath/ath10k/hif.h @@ -56,6 +56,8 @@ struct ath10k_hif_ops { int (*swap_mailbox)(struct ath10k *ar); + int (*get_htt_tx_complete)(struct ath10k *ar); + int (*map_service_to_pipe)(struct ath10k *ar, u16 service_id, u8 *ul_pipe, u8 *dl_pipe); @@ -144,6 +146,13 @@ static inline int ath10k_hif_swap_mailbox(struct ath10k *ar) return 0; } +static inline int ath10k_hif_get_htt_tx_complete(struct ath10k *ar) +{ + if (ar->hif.ops->get_htt_tx_complete) + return ar->hif.ops->get_htt_tx_complete(ar); + return 0; +} + static inline int ath10k_hif_map_service_to_pipe(struct ath10k *ar, u16 service_id, u8 *ul_pipe, u8 *dl_pipe) diff --git a/drivers/net/wireless/ath/ath10k/htc.c b/drivers/net/wireless/ath/ath10k/htc.c index 2248d6c022f4..61ee413d902a 100644 --- a/drivers/net/wireless/ath/ath10k/htc.c +++ b/drivers/net/wireless/ath/ath10k/htc.c @@ -660,6 +660,16 @@ int ath10k_htc_wait_target(struct ath10k_htc *htc) return 0; } +void ath10k_htc_change_tx_credit_flow(struct ath10k_htc *htc, + enum ath10k_htc_ep_id eid, + bool enable) +{ + struct ath10k *ar = htc->ar; + struct ath10k_htc_ep *ep = &ar->htc.endpoint[eid]; + + ep->tx_credit_flow_enabled = enable; +} + int ath10k_htc_connect_service(struct ath10k_htc *htc, struct ath10k_htc_svc_conn_req *conn_req, struct ath10k_htc_svc_conn_resp *conn_resp) diff --git a/drivers/net/wireless/ath/ath10k/htc.h b/drivers/net/wireless/ath/ath10k/htc.h index 065c82d9d689..14e5c3f712c1 100644 --- a/drivers/net/wireless/ath/ath10k/htc.h +++ b/drivers/net/wireless/ath/ath10k/htc.h @@ -386,6 +386,9 @@ int ath10k_htc_start(struct ath10k_htc *htc); int ath10k_htc_connect_service(struct ath10k_htc *htc, struct ath10k_htc_svc_conn_req *conn_req, struct ath10k_htc_svc_conn_resp *conn_resp); +void ath10k_htc_change_tx_credit_flow(struct ath10k_htc *htc, + enum ath10k_htc_ep_id eid, + bool enable); int ath10k_htc_send(struct ath10k_htc *htc, enum ath10k_htc_ep_id eid, struct sk_buff *packet); struct sk_buff *ath10k_htc_alloc_skb(struct ath10k *ar, int size); diff --git a/drivers/net/wireless/ath/ath10k/htt.c b/drivers/net/wireless/ath/ath10k/htt.c index 7b75200ceae5..4354bf285ff1 100644 --- a/drivers/net/wireless/ath/ath10k/htt.c +++ b/drivers/net/wireless/ath/ath10k/htt.c @@ -10,6 +10,7 @@ #include "htt.h" #include "core.h" #include "debug.h" +#include "hif.h" static const enum htt_t2h_msg_type htt_main_t2h_msg_types[] = { [HTT_MAIN_T2H_MSG_TYPE_VERSION_CONF] = HTT_T2H_MSG_TYPE_VERSION_CONF, @@ -153,6 +154,10 @@ int ath10k_htt_connect(struct ath10k_htt *htt) htt->eid = conn_resp.eid; + htt->disable_tx_comp = ath10k_hif_get_htt_tx_complete(htt->ar); + if (htt->disable_tx_comp) + ath10k_htc_change_tx_credit_flow(&htt->ar->htc, htt->eid, true); + return 0; } diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h index 4a12564fc30e..b88c2f3787d8 100644 --- a/drivers/net/wireless/ath/ath10k/htt.h +++ b/drivers/net/wireless/ath/ath10k/htt.h @@ -150,9 +150,19 @@ enum htt_data_tx_desc_flags1 { HTT_DATA_TX_DESC_FLAGS1_MORE_IN_BATCH = 1 << 12, HTT_DATA_TX_DESC_FLAGS1_CKSUM_L3_OFFLOAD = 1 << 13, HTT_DATA_TX_DESC_FLAGS1_CKSUM_L4_OFFLOAD = 1 << 14, - HTT_DATA_TX_DESC_FLAGS1_RSVD1 = 1 << 15 + HTT_DATA_TX_DESC_FLAGS1_TX_COMPLETE = 1 << 15 }; +#define HTT_TX_CREDIT_DELTA_ABS_M 0xffff0000 +#define HTT_TX_CREDIT_DELTA_ABS_S 16 +#define HTT_TX_CREDIT_DELTA_ABS_GET(word) \ + (((word) & HTT_TX_CREDIT_DELTA_ABS_M) >> HTT_TX_CREDIT_DELTA_ABS_S) + +#define HTT_TX_CREDIT_SIGN_BIT_M 0x00000100 +#define HTT_TX_CREDIT_SIGN_BIT_S 8 +#define HTT_TX_CREDIT_SIGN_BIT_GET(word) \ + (((word) & HTT_TX_CREDIT_SIGN_BIT_M) >> HTT_TX_CREDIT_SIGN_BIT_S) + enum htt_data_tx_ext_tid { HTT_DATA_TX_EXT_TID_NON_QOS_MCAST_BCAST = 16, HTT_DATA_TX_EXT_TID_MGMT = 17, @@ -2021,6 +2031,7 @@ struct ath10k_htt { bool tx_mem_allocated; const struct ath10k_htt_tx_ops *tx_ops; const struct ath10k_htt_rx_ops *rx_ops; + bool disable_tx_comp; }; struct ath10k_htt_tx_ops { diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c index f883f2a724dd..64e45bfa5d05 100644 --- a/drivers/net/wireless/ath/ath10k/htt_rx.c +++ b/drivers/net/wireless/ath/ath10k/htt_rx.c @@ -3789,6 +3789,9 @@ bool ath10k_htt_t2h_msg_handler(struct ath10k *ar, struct sk_buff *skb) } case HTT_T2H_MSG_TYPE_MGMT_TX_COMPLETION: { struct htt_tx_done tx_done = {}; + struct ath10k_htt *htt = &ar->htt; + struct ath10k_htc *htc = &ar->htc; + struct ath10k_htc_ep *ep = &ar->htc.endpoint[htt->eid]; int status = __le32_to_cpu(resp->mgmt_tx_completion.status); int info = __le32_to_cpu(resp->mgmt_tx_completion.info); @@ -3814,6 +3817,12 @@ bool ath10k_htt_t2h_msg_handler(struct ath10k *ar, struct sk_buff *skb) break; } + if (htt->disable_tx_comp) { + spin_lock_bh(&htc->tx_lock); + ep->tx_credits++; + spin_unlock_bh(&htc->tx_lock); + } + status = ath10k_txrx_tx_unref(htt, &tx_done); if (!status) { spin_lock_bh(&htt->tx_lock); @@ -3888,8 +3897,31 @@ bool ath10k_htt_t2h_msg_handler(struct ath10k *ar, struct sk_buff *skb) skb_queue_tail(&htt->rx_in_ord_compl_q, skb); return false; } - case HTT_T2H_MSG_TYPE_TX_CREDIT_UPDATE_IND: + case HTT_T2H_MSG_TYPE_TX_CREDIT_UPDATE_IND: { + struct ath10k_htt *htt = &ar->htt; + struct ath10k_htc *htc = &ar->htc; + struct ath10k_htc_ep *ep = &ar->htc.endpoint[htt->eid]; + u32 msg_word = __le32_to_cpu(*(__le32 *)resp); + int htt_credit_delta; + + htt_credit_delta = HTT_TX_CREDIT_DELTA_ABS_GET(msg_word); + if (HTT_TX_CREDIT_SIGN_BIT_GET(msg_word)) + htt_credit_delta = -htt_credit_delta; + + ath10k_dbg(ar, ATH10K_DBG_HTT, + "htt credit update delta %d\n", + htt_credit_delta); + + if (htt->disable_tx_comp) { + spin_lock_bh(&htc->tx_lock); + ep->tx_credits += htt_credit_delta; + spin_unlock_bh(&htc->tx_lock); + ath10k_dbg(ar, ATH10K_DBG_HTT, + "htt credit total %d\n", + ep->tx_credits); + } break; + } case HTT_T2H_MSG_TYPE_CHAN_CHANGE: { u32 phymode = __le32_to_cpu(resp->chan_change.phymode); u32 freq = __le32_to_cpu(resp->chan_change.freq); diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c index e9d12ea708b6..bcecf05fe2fd 100644 --- a/drivers/net/wireless/ath/ath10k/htt_tx.c +++ b/drivers/net/wireless/ath/ath10k/htt_tx.c @@ -543,7 +543,39 @@ void ath10k_htt_tx_free(struct ath10k_htt *htt) void ath10k_htt_htc_tx_complete(struct ath10k *ar, struct sk_buff *skb) { + struct ath10k_htt *htt = &ar->htt; + struct htt_tx_done tx_done = {0}; + struct htt_cmd_hdr *htt_hdr; + struct htt_data_tx_desc *desc_hdr = NULL; + u16 flags1 = 0; + u8 msg_type = 0; + + if (htt->disable_tx_comp) { + htt_hdr = (struct htt_cmd_hdr *)skb->data; + msg_type = htt_hdr->msg_type; + + if (msg_type == HTT_H2T_MSG_TYPE_TX_FRM) { + desc_hdr = (struct htt_data_tx_desc *) + (skb->data + sizeof(*htt_hdr)); + flags1 = __le16_to_cpu(desc_hdr->flags1); + } + } + dev_kfree_skb_any(skb); + + if ((!htt->disable_tx_comp) || (msg_type != HTT_H2T_MSG_TYPE_TX_FRM)) + return; + + ath10k_dbg(ar, ATH10K_DBG_HTT, + "htt tx complete msdu id:%u ,flags1:%x\n", + __le16_to_cpu(desc_hdr->id), flags1); + + if (flags1 & HTT_DATA_TX_DESC_FLAGS1_TX_COMPLETE) + return; + + tx_done.status = HTT_TX_COMPL_STATE_ACK; + tx_done.msdu_id = __le16_to_cpu(desc_hdr->id); + ath10k_txrx_tx_unref(&ar->htt, &tx_done); } void ath10k_htt_hif_tx_complete(struct ath10k *ar, struct sk_buff *skb) @@ -1279,6 +1311,9 @@ static int ath10k_htt_tx_hl(struct ath10k_htt *htt, enum ath10k_hw_txrx_mode txm flags0 |= SM(ATH10K_HW_TXRX_MGMT, HTT_DATA_TX_DESC_FLAGS0_PKT_TYPE); flags0 |= HTT_DATA_TX_DESC_FLAGS0_MAC_HDR_PRESENT; + + if (htt->disable_tx_comp) + flags1 |= HTT_DATA_TX_DESC_FLAGS1_TX_COMPLETE; break; } diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h index 970c736ac6bb..2a7af5861788 100644 --- a/drivers/net/wireless/ath/ath10k/hw.h +++ b/drivers/net/wireless/ath/ath10k/hw.h @@ -765,7 +765,7 @@ ath10k_is_rssi_enable(struct ath10k_hw_params *hw, #define TARGET_TLV_NUM_TDLS_VDEVS 1 #define TARGET_TLV_NUM_TIDS ((TARGET_TLV_NUM_PEERS) * 2) #define TARGET_TLV_NUM_MSDU_DESC (1024 + 32) -#define TARGET_TLV_NUM_MSDU_DESC_HL 64 +#define TARGET_TLV_NUM_MSDU_DESC_HL 1024 #define TARGET_TLV_NUM_WOW_PATTERNS 22 #define TARGET_TLV_MGMT_NUM_MSDU_DESC (50) diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c index 1f709b65c29b..5a0db342e5ad 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.c +++ b/drivers/net/wireless/ath/ath10k/sdio.c @@ -1752,6 +1752,28 @@ static int ath10k_sdio_hif_swap_mailbox(struct ath10k *ar) return 0; } +static int ath10k_sdio_get_htt_tx_complete(struct ath10k *ar) +{ + u32 addr, val; + int ret; + + addr = host_interest_item_address(HI_ITEM(hi_acs_flags)); + + ret = ath10k_sdio_hif_diag_read32(ar, addr, &val); + if (ret) { + ath10k_warn(ar, + "unable to read hi_acs_flags for htt tx comple : %d\n", ret); + return ret; + } + + ret = (val & HI_ACS_FLAGS_SDIO_REDUCE_TX_COMPL_FW_ACK); + + ath10k_dbg(ar, ATH10K_DBG_SDIO, "sdio reduce tx complete fw%sack\n", + ret ? " " : " not "); + + return ret; +} + /* HIF start/stop */ static int ath10k_sdio_hif_start(struct ath10k *ar) @@ -2026,6 +2048,7 @@ static const struct ath10k_hif_ops ath10k_sdio_hif_ops = { .start = ath10k_sdio_hif_start, .stop = ath10k_sdio_hif_stop, .swap_mailbox = ath10k_sdio_hif_swap_mailbox, + .get_htt_tx_complete = ath10k_sdio_get_htt_tx_complete, .map_service_to_pipe = ath10k_sdio_hif_map_service_to_pipe, .get_default_pipe = ath10k_sdio_hif_get_default_pipe, .send_complete_check = ath10k_sdio_hif_send_complete_check, -- cgit v1.2.3-59-g8ed1b From c61a748370438ca1ae8389071664b2520f16820c Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Tue, 7 Apr 2020 08:12:34 +0300 Subject: ath10k: change ATH10K_SDIO_BUS_REQUEST_MAX_NUM from 64 to 1024 sdio bus bandwidth is low, sometimes for high performance TX test, it will lack of ath10k_sdio_bus_request, it will print message: ath10k_sdio mmc1:0001:1: unable to allocate bus request for async request change the num from 64 to 1024 will not happen it. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00017-QCARMSWP-1. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200212080415.31265-3-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/sdio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/sdio.h b/drivers/net/wireless/ath/ath10k/sdio.h index 33195f49acab..1c987494ad22 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.h +++ b/drivers/net/wireless/ath/ath10k/sdio.h @@ -37,7 +37,7 @@ (ATH10K_SDIO_MAX_BUFFER_SIZE - sizeof(struct ath10k_htc_hdr)) #define ATH10K_HIF_MBOX_NUM_MAX 4 -#define ATH10K_SDIO_BUS_REQUEST_MAX_NUM 64 +#define ATH10K_SDIO_BUS_REQUEST_MAX_NUM 1024 #define ATH10K_SDIO_HIF_COMMUNICATION_TIMEOUT_HZ (100 * HZ) @@ -98,6 +98,7 @@ #define ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL_DISABLE_SLEEP_OFF 0xFFFEFFFF #define ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL_DISABLE_SLEEP_ON 0x10000 +/* TODO: remove this and use skb->cb instead, much cleaner approach */ struct ath10k_sdio_bus_request { struct list_head list; -- cgit v1.2.3-59-g8ed1b From 943d5d92c5e87aa8293aae6de2b3ee977aa7d3cf Mon Sep 17 00:00:00 2001 From: Daniels Umanovskis Date: Thu, 9 Apr 2020 13:18:29 +0200 Subject: Bluetooth: log advertisement packet length if it gets corrected The error could indicate a problem with the Bluetooth device. It is easier to investigate if the packet's actual length gets logged, not just the fact that a discrepancy occurred. Signed-off-by: Daniels Umanovskis Reviewed-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 51e6461f0b71..966fc543c01d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5396,7 +5396,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, /* Adjust for actual length */ if (len != real_len) { - bt_dev_err_ratelimited(hdev, "advertising data len corrected"); + bt_dev_err_ratelimited(hdev, "advertising data len corrected %u -> %u", + len, real_len); len = real_len; } -- cgit v1.2.3-59-g8ed1b From 04896832c94aae4842100cafb8d3a73e1bed3a45 Mon Sep 17 00:00:00 2001 From: "Ziqian SUN (Zamir)" Date: Sat, 11 Apr 2020 09:34:27 +0800 Subject: Bluetooth: btrtl: Add support for RTL8761B Add new compatible device RTL8761B. RTL8761B is a USB Bluetooth device, with support of BLE and BR/EDR. The USB info is T: Bus=03 Lev=04 Prnt=04 Port=00 Cnt=01 Dev#= 29 Spd=12 MxCh= 0 D: Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0bda ProdID=8771 Rev= 2.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=XXXXXXXXXXXX C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Ziqian SUN (Zamir) Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btrtl.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c index 67f4bc21e7c5..3a9afc905f24 100644 --- a/drivers/bluetooth/btrtl.c +++ b/drivers/bluetooth/btrtl.c @@ -130,12 +130,19 @@ static const struct id_table ic_id_table[] = { .cfg_name = "rtl_bt/rtl8821c_config" }, /* 8761A */ - { IC_MATCH_FL_LMPSUBV, RTL_ROM_LMP_8761A, 0x0, + { IC_INFO(RTL_ROM_LMP_8761A, 0xa), .config_needed = false, .has_rom_version = true, .fw_name = "rtl_bt/rtl8761a_fw.bin", .cfg_name = "rtl_bt/rtl8761a_config" }, + /* 8761B */ + { IC_INFO(RTL_ROM_LMP_8761A, 0xb), + .config_needed = false, + .has_rom_version = true, + .fw_name = "rtl_bt/rtl8761b_fw.bin", + .cfg_name = "rtl_bt/rtl8761b_config" }, + /* 8822C with UART interface */ { .match_flags = IC_MATCH_FL_LMPSUBV | IC_MATCH_FL_HCIREV | IC_MATCH_FL_HCIBUS, @@ -267,6 +274,7 @@ static int rtlbt_parse_firmware(struct hci_dev *hdev, { RTL_ROM_LMP_8723B, 9 }, /* 8723D */ { RTL_ROM_LMP_8821A, 10 }, /* 8821C */ { RTL_ROM_LMP_8822B, 13 }, /* 8822C */ + { RTL_ROM_LMP_8761A, 14 }, /* 8761B */ }; min_size = sizeof(struct rtl_epatch_header) + sizeof(extension_sig) + 3; -- cgit v1.2.3-59-g8ed1b From 1e744bf218b54d2e241aa6107484828d4f4a9fdc Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Wed, 8 Apr 2020 16:33:15 +0530 Subject: ath11k: fix duplication peer create on same radio Add the pdev index information in the peer object to validate the peer creation. Ignore the peer creation request, if the given MAC address is already present in the peer list with same radio. If we allow the peer creation in above scenario, FW assert will happen. Above scenario occurred in two cases, where Multiple AP VAP created in the same radio. 1. when testing tool sends association request to two AP with same MAC address 2. when a station do roaming from one AP VAP to another AP VAP. Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586343795-21422-1-git-send-email-periyasa@codeaurora.org --- drivers/net/wireless/ath/ath11k/peer.c | 35 ++++++++++++++++++++++++++++++++-- drivers/net/wireless/ath/ath11k/peer.h | 1 + 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/peer.c b/drivers/net/wireless/ath/ath11k/peer.c index f43deacc01bd..297172538620 100644 --- a/drivers/net/wireless/ath/ath11k/peer.c +++ b/drivers/net/wireless/ath/ath11k/peer.c @@ -17,7 +17,26 @@ struct ath11k_peer *ath11k_peer_find(struct ath11k_base *ab, int vdev_id, list_for_each_entry(peer, &ab->peers, list) { if (peer->vdev_id != vdev_id) continue; - if (memcmp(peer->addr, addr, ETH_ALEN)) + if (!ether_addr_equal(peer->addr, addr)) + continue; + + return peer; + } + + return NULL; +} + +static struct ath11k_peer *ath11k_peer_find_by_pdev_idx(struct ath11k_base *ab, + u8 pdev_idx, const u8 *addr) +{ + struct ath11k_peer *peer; + + lockdep_assert_held(&ab->base_lock); + + list_for_each_entry(peer, &ab->peers, list) { + if (peer->pdev_idx != pdev_idx) + continue; + if (!ether_addr_equal(peer->addr, addr)) continue; return peer; @@ -34,7 +53,7 @@ struct ath11k_peer *ath11k_peer_find_by_addr(struct ath11k_base *ab, lockdep_assert_held(&ab->base_lock); list_for_each_entry(peer, &ab->peers, list) { - if (memcmp(peer->addr, addr, ETH_ALEN)) + if (!ether_addr_equal(peer->addr, addr)) continue; return peer; @@ -200,6 +219,17 @@ int ath11k_peer_create(struct ath11k *ar, struct ath11k_vif *arvif, return -ENOBUFS; } + spin_lock_bh(&ar->ab->base_lock); + peer = ath11k_peer_find_by_pdev_idx(ar->ab, ar->pdev_idx, param->peer_addr); + if (peer) { + spin_unlock_bh(&ar->ab->base_lock); + ath11k_info(ar->ab, + "ignoring the peer %pM creation on same pdev idx %d\n", + param->peer_addr, ar->pdev_idx); + return -EINVAL; + } + spin_unlock_bh(&ar->ab->base_lock); + ret = ath11k_wmi_send_peer_create_cmd(ar, param); if (ret) { ath11k_warn(ar->ab, @@ -225,6 +255,7 @@ int ath11k_peer_create(struct ath11k *ar, struct ath11k_vif *arvif, return -ENOENT; } + peer->pdev_idx = ar->pdev_idx; peer->sta = sta; arvif->ast_hash = peer->ast_hash; diff --git a/drivers/net/wireless/ath/ath11k/peer.h b/drivers/net/wireless/ath/ath11k/peer.h index ccca1523a6ea..5d125ce8984e 100644 --- a/drivers/net/wireless/ath/ath11k/peer.h +++ b/drivers/net/wireless/ath/ath11k/peer.h @@ -13,6 +13,7 @@ struct ath11k_peer { u8 addr[ETH_ALEN]; int peer_id; u16 ast_hash; + u8 pdev_idx; /* protected by ab->data_lock */ struct ieee80211_key_conf *keys[WMI_MAX_KEY_INDEX + 1]; -- cgit v1.2.3-59-g8ed1b From bd902b1bdb25729be44c25630f44735fd6b8b254 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Wed, 8 Apr 2020 16:35:57 +0530 Subject: ath11k: Modify the interrupt timer threshold Modify the interrupt timer threshold param as 256 to avoid HW watchdog in heavy multicast traffic scenario. Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586343957-21474-1-git-send-email-periyasa@codeaurora.org --- drivers/net/wireless/ath/ath11k/hal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/hal.h b/drivers/net/wireless/ath/ath11k/hal.h index 7722822a0456..780a3e11b609 100644 --- a/drivers/net/wireless/ath/ath11k/hal.h +++ b/drivers/net/wireless/ath/ath11k/hal.h @@ -599,7 +599,7 @@ struct hal_srng { /* Interrupt mitigation - timer threshold in us */ #define HAL_SRNG_INT_TIMER_THRESHOLD_TX 1000 #define HAL_SRNG_INT_TIMER_THRESHOLD_RX 500 -#define HAL_SRNG_INT_TIMER_THRESHOLD_OTHER 1000 +#define HAL_SRNG_INT_TIMER_THRESHOLD_OTHER 256 /* HW SRNG configuration table */ struct hal_srng_config { -- cgit v1.2.3-59-g8ed1b From a3baa8f084198949f3739651d96634d897f3224d Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Wed, 8 Apr 2020 10:41:17 -0700 Subject: ath11k: Fix TWT radio count TWT feature fails on radio2 because physical device count is hardcoded to 2. Set value dynamically. Signed-off-by: Aloka Dixit Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200408174117.22957-1-alokad@codeaurora.org --- drivers/net/wireless/ath/ath11k/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c index 973b72a0ca69..c2a972377687 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.c +++ b/drivers/net/wireless/ath/ath11k/wmi.c @@ -3245,7 +3245,7 @@ int ath11k_wmi_cmd_init(struct ath11k_base *ab) config.beacon_tx_offload_max_vdev = ab->num_radios * TARGET_MAX_BCN_OFFLD; config.rx_batchmode = TARGET_RX_BATCHMODE; config.peer_map_unmap_v2_support = 1; - config.twt_ap_pdev_count = 2; + config.twt_ap_pdev_count = ab->num_radios; config.twt_ap_sta_count = 1000; memcpy(&wmi_sc->wlan_resource_config, &config, sizeof(config)); -- cgit v1.2.3-59-g8ed1b From 05090864fc7ecfe72558087216fcccc5eb46add8 Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Thu, 9 Apr 2020 14:00:13 +0530 Subject: ath11k: set IRQ_DISABLE_UNLAZY flag for DP interrupts Unlike CE interrupts, DP interrupts are not enabled/disabled at source; they are enabled/disabled only at GIC level, therefore it is required to set IRQ_DISABLE_UNLAZY flag to avoid spurious interrupts. Signed-off-by: Manikanta Pubbisetty Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586421013-23025-1-git-send-email-mpubbise@codeaurora.org --- drivers/net/wireless/ath/ath11k/ahb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/ahb.c b/drivers/net/wireless/ath/ath11k/ahb.c index 59342d2797ca..3b2b76d602f2 100644 --- a/drivers/net/wireless/ath/ath11k/ahb.c +++ b/drivers/net/wireless/ath/ath11k/ahb.c @@ -788,7 +788,7 @@ static int ath11k_ahb_ext_irq_config(struct ath11k_base *ab) irq = platform_get_irq_byname(ab->pdev, irq_name[irq_idx]); ab->irq_num[irq_idx] = irq; - irq_set_status_flags(irq, IRQ_NOAUTOEN); + irq_set_status_flags(irq, IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY); ret = request_irq(irq, ath11k_ahb_ext_interrupt_handler, IRQF_TRIGGER_RISING, irq_name[irq_idx], irq_grp); -- cgit v1.2.3-59-g8ed1b From 7395fb496577f0f9abf7fd278f00a8941b2f7ad8 Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Thu, 9 Apr 2020 14:13:17 +0530 Subject: ath11k: rx path optimizations During RX, accessing the reo dest ring descriptor directly is consuming a lot of CPU cycles. Accessing the descriptor after copying it locally has improved CPU usage by around ~10-15% while measuring throughput in RX DBTC test cases(all radios are involved in the throughput measurement). HW tested: IPQ8074 Signed-off-by: Manikanta Pubbisetty Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586421797-885-1-git-send-email-mpubbise@codeaurora.org --- drivers/net/wireless/ath/ath11k/dp_rx.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c index a3f2c76b3471..203fd44ff352 100644 --- a/drivers/net/wireless/ath/ath11k/dp_rx.c +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c @@ -2403,12 +2403,12 @@ int ath11k_dp_process_rx(struct ath11k_base *ab, int ring_id, try_again: while ((rx_desc = ath11k_hal_srng_dst_get_next_entry(ab, srng))) { - struct hal_reo_dest_ring *desc = (struct hal_reo_dest_ring *)rx_desc; + struct hal_reo_dest_ring desc = *(struct hal_reo_dest_ring *)rx_desc; enum hal_reo_dest_ring_push_reason push_reason; u32 cookie; cookie = FIELD_GET(BUFFER_ADDR_INFO1_SW_COOKIE, - desc->buf_addr_info.info1); + desc.buf_addr_info.info1); buf_id = FIELD_GET(DP_RXDMA_BUF_COOKIE_BUF_ID, cookie); mac_id = FIELD_GET(DP_RXDMA_BUF_COOKIE_PDEV_ID, cookie); @@ -2436,7 +2436,7 @@ try_again: total_msdu_reaped++; push_reason = FIELD_GET(HAL_REO_DEST_RING_INFO0_PUSH_REASON, - desc->info0); + desc.info0); if (push_reason != HAL_REO_DEST_RING_PUSH_REASON_ROUTING_INSTRUCTION) { dev_kfree_skb_any(msdu); @@ -2444,15 +2444,15 @@ try_again: continue; } - rxcb->is_first_msdu = !!(desc->rx_msdu_info.info0 & + rxcb->is_first_msdu = !!(desc.rx_msdu_info.info0 & RX_MSDU_DESC_INFO0_FIRST_MSDU_IN_MPDU); - rxcb->is_last_msdu = !!(desc->rx_msdu_info.info0 & + rxcb->is_last_msdu = !!(desc.rx_msdu_info.info0 & RX_MSDU_DESC_INFO0_LAST_MSDU_IN_MPDU); - rxcb->is_continuation = !!(desc->rx_msdu_info.info0 & + rxcb->is_continuation = !!(desc.rx_msdu_info.info0 & RX_MSDU_DESC_INFO0_MSDU_CONTINUATION); rxcb->mac_id = mac_id; rxcb->tid = FIELD_GET(HAL_REO_DEST_RING_INFO0_RX_QUEUE_NUM, - desc->info0); + desc.info0); __skb_queue_tail(&msdu_list, msdu); -- cgit v1.2.3-59-g8ed1b From ca2c6881dccabe00a38cda00ddcccb55e6abe245 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Mar 2020 11:05:24 +0800 Subject: rtw88: Make two functions static Fix sparse warnings: drivers/net/wireless/realtek/rtw88/fw.c:633:4: warning: symbol 'rtw_get_rsvd_page_probe_req_location' was not declared. Should it be static? drivers/net/wireless/realtek/rtw88/fw.c:650:5: warning: symbol 'rtw_get_rsvd_page_probe_req_size' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200328030524.16032-1-yuehaibing@huawei.com --- drivers/net/wireless/realtek/rtw88/fw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/fw.c b/drivers/net/wireless/realtek/rtw88/fw.c index 05c430b3489c..9192ab26e39b 100644 --- a/drivers/net/wireless/realtek/rtw88/fw.c +++ b/drivers/net/wireless/realtek/rtw88/fw.c @@ -630,8 +630,8 @@ void rtw_fw_set_pg_info(struct rtw_dev *rtwdev) rtw_fw_send_h2c_command(rtwdev, h2c_pkt); } -u8 rtw_get_rsvd_page_probe_req_location(struct rtw_dev *rtwdev, - struct cfg80211_ssid *ssid) +static u8 rtw_get_rsvd_page_probe_req_location(struct rtw_dev *rtwdev, + struct cfg80211_ssid *ssid) { struct rtw_rsvd_page *rsvd_pkt; u8 location = 0; @@ -647,8 +647,8 @@ u8 rtw_get_rsvd_page_probe_req_location(struct rtw_dev *rtwdev, return location; } -u16 rtw_get_rsvd_page_probe_req_size(struct rtw_dev *rtwdev, - struct cfg80211_ssid *ssid) +static u16 rtw_get_rsvd_page_probe_req_size(struct rtw_dev *rtwdev, + struct cfg80211_ssid *ssid) { struct rtw_rsvd_page *rsvd_pkt; u16 size = 0; -- cgit v1.2.3-59-g8ed1b From c57673852062428cdeabdd6501ac8b8e4c302067 Mon Sep 17 00:00:00 2001 From: Jaehoon Chung Date: Mon, 30 Mar 2020 14:25:28 +0900 Subject: brcmfmac: fix wrong location to get firmware feature sup_wpa feature is getting after setting feature_disable flag. If firmware is supported sup_wpa feature, it's always enabled regardless of feature_disable flag. Fixes: b8a64f0e96c2 ("brcmfmac: support 4-way handshake offloading for WPA/WPA2-PSK") Signed-off-by: Jaehoon Chung Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200330052528.10503-1-jh80.chung@samsung.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c index 5da0dda0d899..0dcefbd0c000 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c @@ -285,13 +285,14 @@ void brcmf_feat_attach(struct brcmf_pub *drvr) if (!err) ifp->drvr->feat_flags |= BIT(BRCMF_FEAT_SCAN_RANDOM_MAC); + brcmf_feat_iovar_int_get(ifp, BRCMF_FEAT_FWSUP, "sup_wpa"); + if (drvr->settings->feature_disable) { brcmf_dbg(INFO, "Features: 0x%02x, disable: 0x%02x\n", ifp->drvr->feat_flags, drvr->settings->feature_disable); ifp->drvr->feat_flags &= ~drvr->settings->feature_disable; } - brcmf_feat_iovar_int_get(ifp, BRCMF_FEAT_FWSUP, "sup_wpa"); brcmf_feat_firmware_overrides(drvr); -- cgit v1.2.3-59-g8ed1b From a24993e54b9cac81b2814da53a664261af10a829 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 2 Apr 2020 16:19:17 +0800 Subject: rtlwifi: rtl8723ae: fix spelling mistake "chang" -> "change" There is a spelling mistake in a trace message. Fix it. Signed-off-by: Qiujun Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585815557-20212-1-git-send-email-hqjagain@gmail.com --- drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c index 680198280f8f..652d8ff9cccb 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c @@ -131,7 +131,7 @@ static bool rtl8723e_dm_bt_is_same_coexist_state(struct ieee80211_hw *hw) (rtlpriv->btcoexist.previous_state_h == rtlpriv->btcoexist.cstate_h)) { RT_TRACE(rtlpriv, COMP_BT_COEXIST, DBG_DMESG, - "[DM][BT], Coexist state do not chang!!\n"); + "[DM][BT], Coexist state do not change!!\n"); return true; } else { RT_TRACE(rtlpriv, COMP_BT_COEXIST, DBG_DMESG, -- cgit v1.2.3-59-g8ed1b From f9f46bca59d11d0fa04087c840e23ca94cd239b5 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 2 Apr 2020 22:17:58 +0800 Subject: rsi: fix a typo "throld" -> "threshold" There is a typo in debug message. Fix it. s/throld/threshold Signed-off-by: Qiujun Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1585837078-6149-1-git-send-email-hqjagain@gmail.com --- drivers/net/wireless/rsi/rsi_91x_mac80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c index 440088293aff..5c0adb0efc5d 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c +++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c @@ -832,7 +832,7 @@ static void rsi_mac80211_bss_info_changed(struct ieee80211_hw *hw, common->cqm_info.last_cqm_event_rssi = 0; common->cqm_info.rssi_thold = bss_conf->cqm_rssi_thold; common->cqm_info.rssi_hyst = bss_conf->cqm_rssi_hyst; - rsi_dbg(INFO_ZONE, "RSSI throld & hysteresis are: %d %d\n", + rsi_dbg(INFO_ZONE, "RSSI threshold & hysteresis are: %d %d\n", common->cqm_info.rssi_thold, common->cqm_info.rssi_hyst); } -- cgit v1.2.3-59-g8ed1b From 09667ea7ce6d6a90152aeba631f11b55f283a898 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 5 Apr 2020 14:39:06 +0100 Subject: brcm80211: remove redundant pointer 'address' Pointer 'address' is being assigned and updated in a few places by it is never read. Hence the assignments are redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200405133906.381358-1-colin.king@canonical.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/commonring.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/commonring.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/commonring.c index 49db54d23e03..e44236cb210e 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/commonring.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/commonring.c @@ -180,14 +180,8 @@ again: int brcmf_commonring_write_complete(struct brcmf_commonring *commonring) { - void *address; - - address = commonring->buf_addr; - address += (commonring->f_ptr * commonring->item_len); - if (commonring->f_ptr > commonring->w_ptr) { - address = commonring->buf_addr; + if (commonring->f_ptr > commonring->w_ptr) commonring->f_ptr = 0; - } commonring->f_ptr = commonring->w_ptr; -- cgit v1.2.3-59-g8ed1b From 63e49a9fdac1b4e97ac26cb3fe953f210d83bc53 Mon Sep 17 00:00:00 2001 From: Giuseppe Marco Randazzo Date: Mon, 6 Apr 2020 00:06:59 +0200 Subject: p54usb: add AirVasT USB stick device-id This patch adds the AirVasT USB wireless devices 124a:4026 to the list of supported devices. It's using the ISL3886 usb firmware. Without this modification, the wiki adapter is not recognized. Cc: Signed-off-by: Giuseppe Marco Randazzo Signed-off-by: Christian Lamparter [formatted, reworded] Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200405220659.45621-1-chunkeey@gmail.com --- drivers/net/wireless/intersil/p54/p54usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intersil/p54/p54usb.c b/drivers/net/wireless/intersil/p54/p54usb.c index b94764c88750..ff0e30c0c14c 100644 --- a/drivers/net/wireless/intersil/p54/p54usb.c +++ b/drivers/net/wireless/intersil/p54/p54usb.c @@ -61,6 +61,7 @@ static const struct usb_device_id p54u_table[] = { {USB_DEVICE(0x0db0, 0x6826)}, /* MSI UB54G (MS-6826) */ {USB_DEVICE(0x107b, 0x55f2)}, /* Gateway WGU-210 (Gemtek) */ {USB_DEVICE(0x124a, 0x4023)}, /* Shuttle PN15, Airvast WM168g, IOGear GWU513 */ + {USB_DEVICE(0x124a, 0x4026)}, /* AirVasT USB wireless device */ {USB_DEVICE(0x1435, 0x0210)}, /* Inventel UR054G */ {USB_DEVICE(0x15a9, 0x0002)}, /* Gemtek WUBI-100GW 802.11g */ {USB_DEVICE(0x1630, 0x0005)}, /* 2Wire 802.11g USB (v1) / Z-Com */ -- cgit v1.2.3-59-g8ed1b From 6343a6d4b2130be9323f347d60af8a7ba8f7242c Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 7 Apr 2020 15:33:31 +0800 Subject: rtw88: Add delay on polling h2c command status bit On some systems we can constanly see rtw88 complains: [39584.721375] rtw_pci 0000:03:00.0: failed to send h2c command Increase interval of each check to wait the status bit really changed. Use read_poll_timeout() macro which fits anything we need here. Suggested-by: Kalle Valo Signed-off-by: Kai-Heng Feng Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200407073331.397-1-kai.heng.feng@canonical.com --- drivers/net/wireless/realtek/rtw88/fw.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/fw.c b/drivers/net/wireless/realtek/rtw88/fw.c index 9192ab26e39b..245da96dfddc 100644 --- a/drivers/net/wireless/realtek/rtw88/fw.c +++ b/drivers/net/wireless/realtek/rtw88/fw.c @@ -2,6 +2,8 @@ /* Copyright(c) 2018-2019 Realtek Corporation */ +#include + #include "main.h" #include "coex.h" #include "fw.h" @@ -193,8 +195,8 @@ static void rtw_fw_send_h2c_command(struct rtw_dev *rtwdev, u8 box; u8 box_state; u32 box_reg, box_ex_reg; - u32 h2c_wait; int idx; + int ret; rtw_dbg(rtwdev, RTW_DBG_FW, "send H2C content %02x%02x%02x%02x %02x%02x%02x%02x\n", @@ -226,12 +228,11 @@ static void rtw_fw_send_h2c_command(struct rtw_dev *rtwdev, goto out; } - h2c_wait = 20; - do { - box_state = rtw_read8(rtwdev, REG_HMETFR); - } while ((box_state >> box) & 0x1 && --h2c_wait > 0); + ret = read_poll_timeout(rtw_read8, box_state, + !((box_state >> box) & 0x1), 100, 3000, false, + rtwdev, REG_HMETFR); - if (!h2c_wait) { + if (ret) { rtw_err(rtwdev, "failed to send h2c command\n"); goto out; } -- cgit v1.2.3-59-g8ed1b From ec4d3e3a054578de34cd0b587ab8a1ac36f629d9 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Tue, 7 Apr 2020 14:00:43 -0500 Subject: b43legacy: Fix case where channel status is corrupted This patch fixes commit 75388acd0cd8 ("add mac80211-based driver for legacy BCM43xx devices") In https://bugzilla.kernel.org/show_bug.cgi?id=207093, a defect in b43legacy is reported. Upon testing, thus problem exists on PPC and X86 platforms, is present in the oldest kernel tested (3.2), and has been present in the driver since it was first added to the kernel. The problem is a corrupted channel status received from the device. Both the internal card in a PowerBook G4 and the PCMCIA version (Broadcom BCM4306 with PCI ID 14e4:4320) have the problem. Only Rev, 2 (revision 4 of the 802.11 core) of the chip has been tested. No other devices using b43legacy are available for testing. Various sources of the problem were considered. Buffer overrun and other sources of corruption within the driver were rejected because the faulty channel status is always the same, not a random value. It was concluded that the faulty data is coming from the device, probably due to a firmware bug. As that source is not available, the driver must take appropriate action to recover. At present, the driver reports the error, and them continues to process the bad packet. This is believed that to be a mistake, and the correct action is to drop the correpted packet. Fixes: 75388acd0cd8 ("add mac80211-based driver for legacy BCM43xx devices") Cc: Stable Signed-off-by: Larry Finger Reported-and-tested by: F. Erhard Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200407190043.1686-1-Larry.Finger@lwfinger.net --- drivers/net/wireless/broadcom/b43legacy/xmit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/broadcom/b43legacy/xmit.c b/drivers/net/wireless/broadcom/b43legacy/xmit.c index e9b23c2e5bd4..efd63f4ce74f 100644 --- a/drivers/net/wireless/broadcom/b43legacy/xmit.c +++ b/drivers/net/wireless/broadcom/b43legacy/xmit.c @@ -558,6 +558,7 @@ void b43legacy_rx(struct b43legacy_wldev *dev, default: b43legacywarn(dev->wl, "Unexpected value for chanstat (0x%X)\n", chanstat); + goto drop; } memcpy(IEEE80211_SKB_RXCB(skb), &status, sizeof(status)); -- cgit v1.2.3-59-g8ed1b From c960e2b384ef3cec4dd447ac90dbdc27a3c41a08 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 7 Apr 2020 21:32:33 +0200 Subject: qtnfmac: Simplify code in _attach functions There is no need to re-implement 'netdev_alloc_skb_ip_align()' here. Keep the code simple. Signed-off-by: Christophe JAILLET Reviewed-by: Sergey Matyukevich Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200407193233.9439-1-christophe.jaillet@wanadoo.fr --- drivers/net/wireless/quantenna/qtnfmac/pcie/pearl_pcie.c | 2 +- drivers/net/wireless/quantenna/qtnfmac/pcie/topaz_pcie.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/quantenna/qtnfmac/pcie/pearl_pcie.c b/drivers/net/wireless/quantenna/qtnfmac/pcie/pearl_pcie.c index dbb241106d8a..eb67b66b846b 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/pcie/pearl_pcie.c +++ b/drivers/net/wireless/quantenna/qtnfmac/pcie/pearl_pcie.c @@ -286,7 +286,7 @@ static int pearl_skb2rbd_attach(struct qtnf_pcie_pearl_state *ps, u16 index) struct sk_buff *skb; dma_addr_t paddr; - skb = __netdev_alloc_skb_ip_align(NULL, SKB_BUF_SIZE, GFP_ATOMIC); + skb = netdev_alloc_skb_ip_align(NULL, SKB_BUF_SIZE); if (!skb) { priv->rx_skb[index] = NULL; return -ENOMEM; diff --git a/drivers/net/wireless/quantenna/qtnfmac/pcie/topaz_pcie.c b/drivers/net/wireless/quantenna/qtnfmac/pcie/topaz_pcie.c index dbf3c5fd751f..d1b850aa4657 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/pcie/topaz_pcie.c +++ b/drivers/net/wireless/quantenna/qtnfmac/pcie/topaz_pcie.c @@ -247,7 +247,7 @@ topaz_skb2rbd_attach(struct qtnf_pcie_topaz_state *ts, u16 index, u32 wrap) struct sk_buff *skb; dma_addr_t paddr; - skb = __netdev_alloc_skb_ip_align(NULL, SKB_BUF_SIZE, GFP_ATOMIC); + skb = netdev_alloc_skb_ip_align(NULL, SKB_BUF_SIZE); if (!skb) { ts->base.rx_skb[index] = NULL; return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From fd7fb0253cdf96241308336e7833186b928336a8 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 10 Apr 2020 17:08:17 +0800 Subject: brcmsmac: make brcms_c_set_mac() void Fix the following coccicheck warning: drivers/net/wireless/broadcom/brcm80211/brcmsmac/main.c:3773:5-8: Unneeded variable: "err". Return "0" on line 3781 Reported-by: Hulk Robot Signed-off-by: Jason Yan Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200410090817.26883-1-yanaijie@huawei.com --- drivers/net/wireless/broadcom/brcm80211/brcmsmac/main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/main.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/main.c index 7f2c15c799d2..d88f8d456b94 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/main.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/main.c @@ -3768,17 +3768,14 @@ static void brcms_c_set_ps_ctrl(struct brcms_c_info *wlc) * Write this BSS config's MAC address to core. * Updates RXE match engine. */ -static int brcms_c_set_mac(struct brcms_bss_cfg *bsscfg) +static void brcms_c_set_mac(struct brcms_bss_cfg *bsscfg) { - int err = 0; struct brcms_c_info *wlc = bsscfg->wlc; /* enter the MAC addr into the RXE match registers */ brcms_c_set_addrmatch(wlc, RCM_MAC_OFFSET, wlc->pub->cur_etheraddr); brcms_c_ampdu_macaddr_upd(wlc); - - return err; } /* Write the BSS config's BSSID address to core (set_bssid in d11procs.tcl). -- cgit v1.2.3-59-g8ed1b From 6fc3b94ef5964736408cd4f8e85a816dcf1dd510 Mon Sep 17 00:00:00 2001 From: Maharaja Kennadyrajan Date: Fri, 10 Apr 2020 22:36:43 +0530 Subject: ath11k: Cleanup in pdev destroy and mac register during crash on recovery Debugfs pdev entries should be cleaned up during the crash on recovery. If not, mac register will fail for the reason that it is already registered during core reconfigure. Signed-off-by: Maharaja Kennadyrajan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586538405-16226-1-git-send-email-mkenna@codeaurora.org --- drivers/net/wireless/ath/ath11k/debug.c | 3 +++ drivers/net/wireless/ath/ath11k/mac.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/net/wireless/ath/ath11k/debug.c b/drivers/net/wireless/ath/ath11k/debug.c index 8d485171b0b3..825e7ba61f45 100644 --- a/drivers/net/wireless/ath/ath11k/debug.c +++ b/drivers/net/wireless/ath/ath11k/debug.c @@ -803,6 +803,9 @@ static const struct file_operations fops_soc_rx_stats = { int ath11k_debug_pdev_create(struct ath11k_base *ab) { + if (test_bit(ATH11K_FLAG_REGISTERED, &ab->dev_flags)) + return 0; + ab->debugfs_soc = debugfs_create_dir(ab->hw_params.name, ab->debugfs_ath11k); if (IS_ERR_OR_NULL(ab->debugfs_soc)) { diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 9f8bc19cc5ae..4783394b8575 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -5891,6 +5891,9 @@ int ath11k_mac_register(struct ath11k_base *ab) int i; int ret; + if (test_bit(ATH11K_FLAG_REGISTERED, &ab->dev_flags)) + return 0; + for (i = 0; i < ab->num_radios; i++) { pdev = &ab->pdevs[i]; ar = pdev->ar; -- cgit v1.2.3-59-g8ed1b From 40c766d4a49cdfe30e0fb40825321b4d4de651aa Mon Sep 17 00:00:00 2001 From: Ritesh Singh Date: Fri, 10 Apr 2020 22:36:44 +0530 Subject: ath11k: Fix fw assert by setting proper vht cap After setting fixed vht-rate if new station is trying to assoc with mu_bfee cap, or if a sta is already connected with mu_bfee cap then set the fixed vht-rate and reconnecting the sta, FW assert is happening. So to avoid this, reset the MU_BEAMFORMEE bit in vht->caps, if mcs_index is invalid for nss 1. Signed-off-by: Ritesh Singh Signed-off-by: Maharaja Kennadyrajan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586538405-16226-2-git-send-email-mkenna@codeaurora.org --- drivers/net/wireless/ath/ath11k/mac.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 4783394b8575..0834089a61e7 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1142,6 +1142,10 @@ static void ath11k_peer_assoc_h_vht(struct ath11k *ar, arg->tx_mcs_set &= ~IEEE80211_VHT_MCS_SUPPORT_0_11_MASK; arg->tx_mcs_set |= IEEE80211_DISABLE_VHT_MCS_SUPPORT_0_11; + if ((arg->tx_mcs_set & IEEE80211_VHT_MCS_NOT_SUPPORTED) == + IEEE80211_VHT_MCS_NOT_SUPPORTED) + arg->peer_vht_caps &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + /* TODO: Check */ arg->tx_max_mcs_nss = 0xFF; -- cgit v1.2.3-59-g8ed1b From ec48d28ba291943d4ae2f873a4330debddecbca6 Mon Sep 17 00:00:00 2001 From: Maharaja Kennadyrajan Date: Fri, 10 Apr 2020 22:36:45 +0530 Subject: ath11k: Fix rx_filter flags setting for per peer rx_stats Rx_filter flags are set with default filter flags during wifi up/down sequence even though the 'ext_rx_stats' debugfs is enabled as 1. So, that we are not getting proper per peer rx_stats. Hence, fixing this by setting the missing rx_filter when ext_rx_stats is already set/enabled. Signed-off-by: Maharaja Kennadyrajan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586538405-16226-3-git-send-email-mkenna@codeaurora.org --- drivers/net/wireless/ath/ath11k/core.h | 1 + drivers/net/wireless/ath/ath11k/debug.c | 2 ++ drivers/net/wireless/ath/ath11k/debug.h | 10 ++++++++++ drivers/net/wireless/ath/ath11k/mac.c | 4 +++- 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index b4c3e0418eef..a8ef95f98616 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -392,6 +392,7 @@ struct ath11k_debug { u32 pktlog_mode; u32 pktlog_peer_valid; u8 pktlog_peer_addr[ETH_ALEN]; + u32 rx_filter; }; struct ath11k_per_peer_tx_stats { diff --git a/drivers/net/wireless/ath/ath11k/debug.c b/drivers/net/wireless/ath/ath11k/debug.c index 825e7ba61f45..a2e3dfeae3d5 100644 --- a/drivers/net/wireless/ath/ath11k/debug.c +++ b/drivers/net/wireless/ath/ath11k/debug.c @@ -698,6 +698,8 @@ static ssize_t ath11k_write_extd_rx_stats(struct file *file, tlv_filter = ath11k_mac_mon_status_filter_default; } + ar->debug.rx_filter = tlv_filter.rx_filter; + ring_id = ar->dp.rx_mon_status_refill_ring.refill_buf_ring.ring_id; ret = ath11k_dp_tx_htt_rx_filter_setup(ar->ab, ring_id, ar->dp.mac_id, HAL_RXDMA_MONITOR_STATUS, diff --git a/drivers/net/wireless/ath/ath11k/debug.h b/drivers/net/wireless/ath/ath11k/debug.h index 4a3ff8227187..45454fcef346 100644 --- a/drivers/net/wireless/ath/ath11k/debug.h +++ b/drivers/net/wireless/ath/ath11k/debug.h @@ -188,6 +188,11 @@ static inline int ath11k_debug_is_extd_rx_stats_enabled(struct ath11k *ar) return ar->debug.extd_rx_stats; } +static inline int ath11k_debug_rx_filter(struct ath11k *ar) +{ + return ar->debug.rx_filter; +} + void ath11k_sta_add_debugfs(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); void @@ -269,6 +274,11 @@ static inline bool ath11k_debug_is_pktlog_peer_valid(struct ath11k *ar, u8 *addr return false; } +static inline int ath11k_debug_rx_filter(struct ath11k *ar) +{ + return 0; +} + static inline void ath11k_accumulate_per_peer_tx_stats(struct ath11k_sta *arsta, struct ath11k_per_peer_tx_stats *peer_stats, diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 0834089a61e7..065b7d6d4ab2 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -3881,8 +3881,10 @@ static int ath11k_mac_config_mon_status_default(struct ath11k *ar, bool enable) struct htt_rx_ring_tlv_filter tlv_filter = {0}; u32 ring_id; - if (enable) + if (enable) { tlv_filter = ath11k_mac_mon_status_filter_default; + tlv_filter.rx_filter = ath11k_debug_rx_filter(ar); + } ring_id = ar->dp.rx_mon_status_refill_ring.refill_buf_ring.ring_id; -- cgit v1.2.3-59-g8ed1b From 8a7968bee8d08835caa0d7bc0c25d750a5b52389 Mon Sep 17 00:00:00 2001 From: Mamatha Telu Date: Sun, 12 Apr 2020 23:54:35 +0530 Subject: ath10k: Fix typo in warning messages Fix some typo: s/fnrom/from s/pkgs/pkts/ s/AMSUs/AMSDUs/ Signed-off-by: Mamatha Telu Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586715875-5182-1-git-send-email-telumamatha36@gmail.com --- drivers/net/wireless/ath/ath10k/debug.c | 2 +- drivers/net/wireless/ath/ath10k/sdio.c | 2 +- drivers/net/wireless/ath/ath10k/wmi.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c index f811e6940fb0..69139c2e6f82 100644 --- a/drivers/net/wireless/ath/ath10k/debug.c +++ b/drivers/net/wireless/ath/ath10k/debug.c @@ -778,7 +778,7 @@ static ssize_t ath10k_mem_value_read(struct file *file, ret = ath10k_hif_diag_read(ar, *ppos, buf, count); if (ret) { - ath10k_warn(ar, "failed to read address 0x%08x via diagnose window fnrom debugfs: %d\n", + ath10k_warn(ar, "failed to read address 0x%08x via diagnose window from debugfs: %d\n", (u32)(*ppos), ret); goto exit; } diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c index 5a0db342e5ad..943db9f401d8 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.c +++ b/drivers/net/wireless/ath/ath10k/sdio.c @@ -542,7 +542,7 @@ static int ath10k_sdio_mbox_rx_alloc(struct ath10k *ar, int pkt_cnt = 0; if (n_lookaheads > ATH10K_SDIO_MAX_RX_MSGS) { - ath10k_warn(ar, "the total number of pkgs to be fetched (%u) exceeds maximum %u\n", + ath10k_warn(ar, "the total number of pkts to be fetched (%u) exceeds maximum %u\n", n_lookaheads, ATH10K_SDIO_MAX_RX_MSGS); ret = -ENOMEM; goto err; diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 4a3a698fe059..a81a1ab2de19 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -8336,7 +8336,7 @@ ath10k_wmi_fw_pdev_rx_stats_fill(const struct ath10k_fw_stats_pdev *pdev, len += scnprintf(buf + len, buf_len - len, "%30s %10d\n", "MPDUs delivered to stack", pdev->loc_mpdus); len += scnprintf(buf + len, buf_len - len, "%30s %10d\n", - "Oversized AMSUs", pdev->oversize_amsdu); + "Oversized AMSDUs", pdev->oversize_amsdu); len += scnprintf(buf + len, buf_len - len, "%30s %10d\n", "PHY errors", pdev->phy_errs); len += scnprintf(buf + len, buf_len - len, "%30s %10d\n", -- cgit v1.2.3-59-g8ed1b From e190bc05b191ff62157ca63322aedd14c7e87d32 Mon Sep 17 00:00:00 2001 From: Govindaraj Saminathan Date: Mon, 13 Apr 2020 16:51:12 +0530 Subject: ath11k: cleanup reo command error code overwritten should not overwrite the error code. No buffer available then return invalid. For other failures return the error code of actual failure. Signed-off-by: Govindaraj Saminathan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586776872-25766-1-git-send-email-gsamin@codeaurora.org --- drivers/net/wireless/ath/ath11k/dp_tx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/dp_tx.c b/drivers/net/wireless/ath/ath11k/dp_tx.c index 7aac4b0eea0c..6f40e72f41e6 100644 --- a/drivers/net/wireless/ath/ath11k/dp_tx.c +++ b/drivers/net/wireless/ath/ath11k/dp_tx.c @@ -543,8 +543,12 @@ int ath11k_dp_tx_send_reo_cmd(struct ath11k_base *ab, struct dp_rx_tid *rx_tid, cmd_ring = &ab->hal.srng_list[dp->reo_cmd_ring.ring_id]; cmd_num = ath11k_hal_reo_cmd_send(ab, cmd_ring, type, cmd); + /* cmd_num should start from 1, during failure return the error code */ + if (cmd_num < 0) + return cmd_num; + /* reo cmd ring descriptors has cmd_num starting from 1 */ - if (cmd_num <= 0) + if (cmd_num == 0) return -EINVAL; if (!cb) -- cgit v1.2.3-59-g8ed1b From d687275b268b09c350b24b1947d1bf3496f49137 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Mon, 13 Apr 2020 18:27:02 +0530 Subject: ath11k: Add dynamic tcl ring selection logic with retry mechanism IPQ8074 HW supports three TCL rings for tx. Currently these rings are mapped based on the Access categories, viz. VO, VI, BE, BK. In case, one of the traffic type dominates, then it could stress the same tcl rings. Rather, it would be optimal to make use of all the rings in a round robin fashion irrespective of the traffic type so that the load could be evenly distributed among all the rings. Also, in case the selected ring is busy or full, a retry mechanism is used to ensure other available ring is selected without dropping the packet. In SMP systems, this change avoids a single CPU from getting hogged when heavy traffic of same category is transmitted. The tx completion interrupts corresponding to the used tcl ring would be more which causes the assigned CPU to get hogged. Distribution of tx packets to different tcl rings helps balance this load. Signed-off-by: Sriram R Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1586782622-22570-1-git-send-email-srirrama@codeaurora.org --- drivers/net/wireless/ath/ath11k/core.h | 3 +++ drivers/net/wireless/ath/ath11k/dp_tx.c | 46 +++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index a8ef95f98616..33237eaf0371 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -657,6 +657,9 @@ struct ath11k_base { u32 fw_crash_counter; } stats; u32 pktlog_defs_checksum; + + /* Round robbin based TCL ring selector */ + atomic_t tcl_ring_selector; }; struct ath11k_fw_stats_pdev { diff --git a/drivers/net/wireless/ath/ath11k/dp_tx.c b/drivers/net/wireless/ath/ath11k/dp_tx.c index 6f40e72f41e6..59018ccb14da 100644 --- a/drivers/net/wireless/ath/ath11k/dp_tx.c +++ b/drivers/net/wireless/ath/ath11k/dp_tx.c @@ -9,10 +9,6 @@ #include "hw.h" #include "peer.h" -/* NOTE: Any of the mapped ring id value must not exceed DP_TCL_NUM_RING_MAX */ -static const u8 -ath11k_txq_tcl_ring_map[ATH11K_HW_MAX_QUEUES] = { 0x0, 0x1, 0x2, 0x2 }; - static enum hal_tcl_encap_type ath11k_dp_tx_get_encap_type(struct ath11k_vif *arvif, struct sk_buff *skb) { @@ -84,6 +80,8 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif, u8 pool_id; u8 hal_ring_id; int ret; + u8 ring_selector = 0, ring_map = 0; + bool tcl_ring_retry; if (test_bit(ATH11K_FLAG_CRASH_FLUSH, &ar->ab->dev_flags)) return -ESHUTDOWN; @@ -92,7 +90,20 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif, return -ENOTSUPP; pool_id = skb_get_queue_mapping(skb) & (ATH11K_HW_MAX_QUEUES - 1); - ti.ring_id = ath11k_txq_tcl_ring_map[pool_id]; + + /* Let the default ring selection be based on a round robin + * fashion where one of the 3 tcl rings are selected based on + * the tcl_ring_selector counter. In case that ring + * is full/busy, we resort to other available rings. + * If all rings are full, we drop the packet. + * //TODO Add throttling logic when all rings are full + */ + ring_selector = atomic_inc_return(&ab->tcl_ring_selector); + +tcl_ring_sel: + tcl_ring_retry = false; + ti.ring_id = ring_selector % DP_TCL_NUM_RING_MAX; + ring_map |= BIT(ti.ring_id); tx_ring = &dp->tx_ring[ti.ring_id]; @@ -101,8 +112,14 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif, DP_TX_IDR_SIZE - 1, GFP_ATOMIC); spin_unlock_bh(&tx_ring->tx_idr_lock); - if (ret < 0) - return -ENOSPC; + if (ret < 0) { + if (ring_map == (BIT(DP_TCL_NUM_RING_MAX) - 1)) + return -ENOSPC; + + /* Check if the next ring is available */ + ring_selector++; + goto tcl_ring_sel; + } ti.desc_id = FIELD_PREP(DP_TX_DESC_ID_MAC_ID, ar->pdev_idx) | FIELD_PREP(DP_TX_DESC_ID_MSDU_ID, ret) | @@ -178,11 +195,21 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif, if (!hal_tcl_desc) { /* NOTE: It is highly unlikely we'll be running out of tcl_ring * desc because the desc is directly enqueued onto hw queue. - * So add tx packet throttling logic in future if required. */ ath11k_hal_srng_access_end(ab, tcl_ring); spin_unlock_bh(&tcl_ring->lock); ret = -ENOMEM; + + /* Checking for available tcl descritors in another ring in + * case of failure due to full tcl ring now, is better than + * checking this ring earlier for each pkt tx. + * Restart ring selection if some rings are not checked yet. + */ + if (ring_map != (BIT(DP_TCL_NUM_RING_MAX) - 1)) { + tcl_ring_retry = true; + ring_selector++; + } + goto fail_unmap_dma; } @@ -206,6 +233,9 @@ fail_remove_idr: FIELD_GET(DP_TX_DESC_ID_MSDU_ID, ti.desc_id)); spin_unlock_bh(&tx_ring->tx_idr_lock); + if (tcl_ring_retry) + goto tcl_ring_sel; + return ret; } -- cgit v1.2.3-59-g8ed1b From a69a1328fb03309555d70f4add76eae3780a6fba Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 10 Apr 2020 17:08:50 +0800 Subject: ipw2x00: make ipw_qos_association_resp() void Fix the following coccicheck warning: drivers/net/wireless/intel/ipw2x00/ipw2200.c:7048:5-8: Unneeded variable: "ret". Return "0" on line 7055 Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200410090850.27025-1-yanaijie@huawei.com --- drivers/net/wireless/intel/ipw2x00/ipw2200.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index 60b5e08dd6df..201a1eb0e2f6 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -7042,23 +7042,22 @@ static int ipw_qos_association(struct ipw_priv *priv, * off the network from the associated setting, adjust the QoS * setting */ -static int ipw_qos_association_resp(struct ipw_priv *priv, +static void ipw_qos_association_resp(struct ipw_priv *priv, struct libipw_network *network) { - int ret = 0; unsigned long flags; u32 size = sizeof(struct libipw_qos_parameters); int set_qos_param = 0; if ((priv == NULL) || (network == NULL) || (priv->assoc_network == NULL)) - return ret; + return; if (!(priv->status & STATUS_ASSOCIATED)) - return ret; + return; if ((priv->ieee->iw_mode != IW_MODE_INFRA)) - return ret; + return; spin_lock_irqsave(&priv->ieee->lock, flags); if (network->flags & NETWORK_HAS_QOS_PARAMETERS) { @@ -7088,8 +7087,6 @@ static int ipw_qos_association_resp(struct ipw_priv *priv, if (set_qos_param == 1) schedule_work(&priv->qos_activate); - - return ret; } static u32 ipw_qos_get_burst_duration(struct ipw_priv *priv) -- cgit v1.2.3-59-g8ed1b From 80efb443ea0346791a79dade0e65c4a252f0571f Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 10 Apr 2020 17:09:10 +0800 Subject: cw1200: make cw1200_spi_irq_unsubscribe() void Fix the following coccicheck warning: drivers/net/wireless/st/cw1200/cw1200_spi.c:273:5-8: Unneeded variable: "ret". Return "0" on line 279 Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200410090910.27132-1-yanaijie@huawei.com --- drivers/net/wireless/st/cw1200/cw1200_spi.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/wireless/st/cw1200/cw1200_spi.c b/drivers/net/wireless/st/cw1200/cw1200_spi.c index ef01caac629c..271ed2ce2d7f 100644 --- a/drivers/net/wireless/st/cw1200/cw1200_spi.c +++ b/drivers/net/wireless/st/cw1200/cw1200_spi.c @@ -268,15 +268,11 @@ exit: return ret; } -static int cw1200_spi_irq_unsubscribe(struct hwbus_priv *self) +static void cw1200_spi_irq_unsubscribe(struct hwbus_priv *self) { - int ret = 0; - pr_debug("SW IRQ unsubscribe\n"); disable_irq_wake(self->func->irq); free_irq(self->func->irq, self); - - return ret; } static int cw1200_spi_off(const struct cw1200_platform_data_spi *pdata) -- cgit v1.2.3-59-g8ed1b From 2fd5fdca6a3ae0d7333c086e019ba6c4330fa0a8 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 10 Apr 2020 17:09:42 +0800 Subject: libertas: make lbs_init_mesh() void Fix the following coccicheck warning: drivers/net/wireless/marvell/libertas/mesh.c:833:5-8: Unneeded variable: "ret". Return "0" on line 874 Reported-by: Hulk Robot Signed-off-by: Jason Yan Reviewed-by: Lubomir Rintel Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200410090942.27239-1-yanaijie@huawei.com --- drivers/net/wireless/marvell/libertas/mesh.c | 6 +----- drivers/net/wireless/marvell/libertas/mesh.h | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/marvell/libertas/mesh.c b/drivers/net/wireless/marvell/libertas/mesh.c index 44c8a550da4c..f5b78257d551 100644 --- a/drivers/net/wireless/marvell/libertas/mesh.c +++ b/drivers/net/wireless/marvell/libertas/mesh.c @@ -828,10 +828,8 @@ static void lbs_persist_config_remove(struct net_device *dev) * Check mesh FW version and appropriately send the mesh start * command */ -int lbs_init_mesh(struct lbs_private *priv) +void lbs_init_mesh(struct lbs_private *priv) { - int ret = 0; - /* Determine mesh_fw_ver from fwrelease and fwcapinfo */ /* 5.0.16p0 9.0.0.p0 is known to NOT support any mesh */ /* 5.110.22 have mesh command with 0xa3 command id */ @@ -870,8 +868,6 @@ int lbs_init_mesh(struct lbs_private *priv) /* Stop meshing until interface is brought up */ lbs_mesh_config(priv, CMD_ACT_MESH_CONFIG_STOP, 1); - - return ret; } void lbs_start_mesh(struct lbs_private *priv) diff --git a/drivers/net/wireless/marvell/libertas/mesh.h b/drivers/net/wireless/marvell/libertas/mesh.h index 1561018f226f..d49717b20c09 100644 --- a/drivers/net/wireless/marvell/libertas/mesh.h +++ b/drivers/net/wireless/marvell/libertas/mesh.h @@ -16,7 +16,7 @@ struct net_device; -int lbs_init_mesh(struct lbs_private *priv); +void lbs_init_mesh(struct lbs_private *priv); void lbs_start_mesh(struct lbs_private *priv); int lbs_deinit_mesh(struct lbs_private *priv); -- cgit v1.2.3-59-g8ed1b From b9ed7e9505ba6346a101384d21ddd1139ae69eef Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Fri, 10 Apr 2020 18:09:49 +0800 Subject: rtw88: make rtw_chip_ops::set_antenna return int To support ieee80211_ops::set_antenna, the driver can decide if the antenna mask is accepted, otherwise it can return an error code. Because each chip could have different limitations, let the chip check the mask and return. Also the antenna mask for TRX from upper space is 32-bit long. Change the antenna mask for rtw_chip_ops::set_antenna from u8 to u32. Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200410100950.3199-2-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/main.h | 9 +++++---- drivers/net/wireless/realtek/rtw88/rtw8822b.c | 18 ++++++++++++------ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index c6b590fdb573..c9edcabd7c42 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -798,8 +798,9 @@ struct rtw_chip_ops { void (*set_tx_power_index)(struct rtw_dev *rtwdev); int (*rsvd_page_dump)(struct rtw_dev *rtwdev, u8 *buf, u32 offset, u32 size); - void (*set_antenna)(struct rtw_dev *rtwdev, u8 antenna_tx, - u8 antenna_rx); + int (*set_antenna)(struct rtw_dev *rtwdev, + u32 antenna_tx, + u32 antenna_rx); void (*cfg_ldo25)(struct rtw_dev *rtwdev, bool enable); void (*false_alarm_statistics)(struct rtw_dev *rtwdev); void (*phy_calibration)(struct rtw_dev *rtwdev); @@ -1567,8 +1568,8 @@ struct rtw_hal { u8 sec_ch_offset; u8 rf_type; u8 rf_path_num; - u8 antenna_tx; - u8 antenna_rx; + u32 antenna_tx; + u32 antenna_rx; u8 bfee_sts_cap; /* protect tx power section */ diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822b.c b/drivers/net/wireless/realtek/rtw88/rtw8822b.c index 4dd7d4143b04..c02f3a730369 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822b.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822b.c @@ -998,8 +998,9 @@ static bool rtw8822b_check_rf_path(u8 antenna) } } -static void rtw8822b_set_antenna(struct rtw_dev *rtwdev, u8 antenna_tx, - u8 antenna_rx) +static int rtw8822b_set_antenna(struct rtw_dev *rtwdev, + u32 antenna_tx, + u32 antenna_rx) { struct rtw_hal *hal = &rtwdev->hal; @@ -1007,16 +1008,21 @@ static void rtw8822b_set_antenna(struct rtw_dev *rtwdev, u8 antenna_tx, antenna_tx, antenna_rx); if (!rtw8822b_check_rf_path(antenna_tx)) { - rtw_info(rtwdev, "unsupport tx path, set to default path ab\n"); - antenna_tx = BB_PATH_AB; + rtw_info(rtwdev, "unsupport tx path 0x%x\n", antenna_tx); + return -EINVAL; } + if (!rtw8822b_check_rf_path(antenna_rx)) { - rtw_info(rtwdev, "unsupport rx path, set to default path ab\n"); - antenna_rx = BB_PATH_AB; + rtw_info(rtwdev, "unsupport rx path 0x%x\n", antenna_rx); + return -EINVAL; } + hal->antenna_tx = antenna_tx; hal->antenna_rx = antenna_rx; + rtw8822b_config_trx_mode(rtwdev, antenna_tx, antenna_rx, false); + + return 0; } static void rtw8822b_cfg_ldo25(struct rtw_dev *rtwdev, bool enable) -- cgit v1.2.3-59-g8ed1b From 297bcf8222f222fd7defead862de4b8e3ea0b08a Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Fri, 10 Apr 2020 18:09:50 +0800 Subject: rtw88: add support for set/get antennas User space program such as iw can set antenna mask for the device. So add set antenna support by configure the trx mode. This is useful for some tests want to see the output of different antenna configuration (e.g. path A v.s. path B). Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200410100950.3199-3-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/mac80211.c | 33 +++++++++++++++++++++++++ drivers/net/wireless/realtek/rtw88/main.c | 3 +++ drivers/net/wireless/realtek/rtw88/rtw8822c.c | 35 +++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c index d7d02e4c0184..a2e6ef4ad9ee 100644 --- a/drivers/net/wireless/realtek/rtw88/mac80211.c +++ b/drivers/net/wireless/realtek/rtw88/mac80211.c @@ -754,6 +754,37 @@ static int rtw_ops_set_bitrate_mask(struct ieee80211_hw *hw, return 0; } +static int rtw_ops_set_antenna(struct ieee80211_hw *hw, + u32 tx_antenna, + u32 rx_antenna) +{ + struct rtw_dev *rtwdev = hw->priv; + struct rtw_chip_info *chip = rtwdev->chip; + int ret; + + if (!chip->ops->set_antenna) + return -EOPNOTSUPP; + + mutex_lock(&rtwdev->mutex); + ret = chip->ops->set_antenna(rtwdev, tx_antenna, rx_antenna); + mutex_unlock(&rtwdev->mutex); + + return ret; +} + +static int rtw_ops_get_antenna(struct ieee80211_hw *hw, + u32 *tx_antenna, + u32 *rx_antenna) +{ + struct rtw_dev *rtwdev = hw->priv; + struct rtw_hal *hal = &rtwdev->hal; + + *tx_antenna = hal->antenna_tx; + *rx_antenna = hal->antenna_rx; + + return 0; +} + #ifdef CONFIG_PM static int rtw_ops_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) @@ -815,6 +846,8 @@ const struct ieee80211_ops rtw_ops = { .sta_statistics = rtw_ops_sta_statistics, .flush = rtw_ops_flush, .set_bitrate_mask = rtw_ops_set_bitrate_mask, + .set_antenna = rtw_ops_set_antenna, + .get_antenna = rtw_ops_get_antenna, #ifdef CONFIG_PM .suspend = rtw_ops_suspend, .resume = rtw_ops_resume, diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index 7640e97706f5..1e1d2c774287 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -1450,6 +1450,7 @@ EXPORT_SYMBOL(rtw_core_deinit); int rtw_register_hw(struct rtw_dev *rtwdev, struct ieee80211_hw *hw) { + struct rtw_hal *hal = &rtwdev->hal; int max_tx_headroom = 0; int ret; @@ -1478,6 +1479,8 @@ int rtw_register_hw(struct rtw_dev *rtwdev, struct ieee80211_hw *hw) BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_ADHOC) | BIT(NL80211_IFTYPE_MESH_POINT); + hw->wiphy->available_antennas_tx = hal->antenna_tx; + hw->wiphy->available_antennas_rx = hal->antenna_rx; hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_TDLS | WIPHY_FLAG_TDLS_EXTERNAL_SETUP; diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.c b/drivers/net/wireless/realtek/rtw88/rtw8822c.c index dc07e6be38e8..c99b1de54bfc 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822c.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.c @@ -1890,6 +1890,40 @@ static void rtw8822c_set_tx_power_index(struct rtw_dev *rtwdev) } } +static int rtw8822c_set_antenna(struct rtw_dev *rtwdev, + u32 antenna_tx, + u32 antenna_rx) +{ + struct rtw_hal *hal = &rtwdev->hal; + + switch (antenna_tx) { + case BB_PATH_A: + case BB_PATH_B: + case BB_PATH_AB: + break; + default: + rtw_info(rtwdev, "unsupport tx path 0x%x\n", antenna_tx); + return -EINVAL; + } + + /* path B only is not available for RX */ + switch (antenna_rx) { + case BB_PATH_A: + case BB_PATH_AB: + break; + default: + rtw_info(rtwdev, "unsupport rx path 0x%x\n", antenna_rx); + return -EINVAL; + } + + hal->antenna_tx = antenna_tx; + hal->antenna_rx = antenna_rx; + + rtw8822c_config_trx_mode(rtwdev, antenna_tx, antenna_rx, false); + + return 0; +} + static void rtw8822c_cfg_ldo25(struct rtw_dev *rtwdev, bool enable) { u8 ldo_pwr; @@ -3794,6 +3828,7 @@ static struct rtw_chip_ops rtw8822c_ops = { .read_rf = rtw_phy_read_rf, .write_rf = rtw_phy_write_rf_reg_mix, .set_tx_power_index = rtw8822c_set_tx_power_index, + .set_antenna = rtw8822c_set_antenna, .cfg_ldo25 = rtw8822c_cfg_ldo25, .false_alarm_statistics = rtw8822c_false_alarm_statistics, .dpk_track = rtw8822c_dpk_track, -- cgit v1.2.3-59-g8ed1b From 1c0e3c73e98dd55bc9551279fed6233997425c23 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Sat, 11 Apr 2020 01:19:27 +0100 Subject: hostap: Add missing annotations for prism2_bss_list_proc_start() and prism2_bss_list_proc_stop Sparse reports warnings at prism2_bss_list_proc_start() and prism2_bss_list_proc_stop() warning: context imbalance in prism2_wds_proc_stop() - unexpected unlock warning: context imbalance in prism2_bss_list_proc_start() - wrong count at exit The root cause is the missing annotations at prism2_bss_list_proc_start() Add the missing __acquires(&local->lock) annotation Add the missing __releases(&local->lock) annotation Signed-off-by: Jules Irenge Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200411001933.10072-4-jbi.octave@gmail.com --- drivers/net/wireless/intersil/hostap/hostap_proc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intersil/hostap/hostap_proc.c b/drivers/net/wireless/intersil/hostap/hostap_proc.c index a2ee4693eaed..97c270845fd1 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_proc.c +++ b/drivers/net/wireless/intersil/hostap/hostap_proc.c @@ -149,6 +149,7 @@ static int prism2_bss_list_proc_show(struct seq_file *m, void *v) } static void *prism2_bss_list_proc_start(struct seq_file *m, loff_t *_pos) + __acquires(&local->lock) { local_info_t *local = PDE_DATA(file_inode(m->file)); spin_lock_bh(&local->lock); @@ -162,6 +163,7 @@ static void *prism2_bss_list_proc_next(struct seq_file *m, void *v, loff_t *_pos } static void prism2_bss_list_proc_stop(struct seq_file *m, void *v) + __releases(&local->lock) { local_info_t *local = PDE_DATA(file_inode(m->file)); spin_unlock_bh(&local->lock); -- cgit v1.2.3-59-g8ed1b From 2fe5efb8a475c856cd72a37fd73d82f5b5b563e0 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Sat, 11 Apr 2020 01:19:28 +0100 Subject: brcmsmac: Add missing annotation for brcms_rfkill_set_hw_state() Sparse reports a warning at brcms_rfkill_set_hw_state() warning: context imbalance in brcms_rfkill_set_hw_state() - unexpected unlock The root cause is the missing annotation at brcms_rfkill_set_hw_state() Add the missing __must_hold(&wl->lock) annotation Signed-off-by: Jules Irenge Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200411001933.10072-5-jbi.octave@gmail.com --- drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c index 8e8b685cfe09..c3dbeacea6ca 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c @@ -1717,6 +1717,7 @@ int brcms_check_firmwares(struct brcms_info *wl) * precondition: perimeter lock has been acquired */ bool brcms_rfkill_set_hw_state(struct brcms_info *wl) + __must_hold(&wl->lock) { bool blocked = brcms_c_check_radio_disabled(wl->wlc); -- cgit v1.2.3-59-g8ed1b From 40fb232c02d1b012c6c84b8c22465d01e20eddf9 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Sat, 11 Apr 2020 01:19:29 +0100 Subject: brcmsmac: Add missing annotation for brcms_down() Sparse reports a warning at brcms_down() warning: context imbalance in brcms_down() - unexpected unlock The root cause is the missing annotation at brcms_down() Add the missing __must_hold(&wl->lock) annotation Signed-off-by: Jules Irenge Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200411001933.10072-6-jbi.octave@gmail.com --- drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c index c3dbeacea6ca..648efcbc819f 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c @@ -1431,6 +1431,7 @@ int brcms_up(struct brcms_info *wl) * precondition: perimeter lock has been acquired */ void brcms_down(struct brcms_info *wl) + __must_hold(&wl->lock) { uint callbacks, ret_val = 0; -- cgit v1.2.3-59-g8ed1b From 99cd87d63c0b0724a8e4f1405107ca06c10341e8 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 13 Apr 2020 16:20:22 +0800 Subject: libertas: make lbs_process_event() void Fix the following coccicheck warning: drivers/net/wireless/marvell/libertas/cmdresp.c:225:5-8: Unneeded variable: "ret". Return "0" on line 355 Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200413082022.22380-1-yanaijie@huawei.com --- drivers/net/wireless/marvell/libertas/cmd.h | 2 +- drivers/net/wireless/marvell/libertas/cmdresp.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/marvell/libertas/cmd.h b/drivers/net/wireless/marvell/libertas/cmd.h index 80878561cb90..3c193074662b 100644 --- a/drivers/net/wireless/marvell/libertas/cmd.h +++ b/drivers/net/wireless/marvell/libertas/cmd.h @@ -76,7 +76,7 @@ void lbs_mac_event_disconnected(struct lbs_private *priv, /* Events */ -int lbs_process_event(struct lbs_private *priv, u32 event); +void lbs_process_event(struct lbs_private *priv, u32 event); /* Actual commands */ diff --git a/drivers/net/wireless/marvell/libertas/cmdresp.c b/drivers/net/wireless/marvell/libertas/cmdresp.c index b73d08381398..cb515c5584c1 100644 --- a/drivers/net/wireless/marvell/libertas/cmdresp.c +++ b/drivers/net/wireless/marvell/libertas/cmdresp.c @@ -220,9 +220,8 @@ done: return ret; } -int lbs_process_event(struct lbs_private *priv, u32 event) +void lbs_process_event(struct lbs_private *priv, u32 event) { - int ret = 0; struct cmd_header cmd; switch (event) { @@ -351,6 +350,4 @@ int lbs_process_event(struct lbs_private *priv, u32 event) netdev_alert(priv->dev, "EVENT: unknown event id %d\n", event); break; } - - return ret; } -- cgit v1.2.3-59-g8ed1b From 7b9ae69d5441d98ce55da7d651efff2c7ce27551 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 13 Apr 2020 16:20:43 +0800 Subject: orinoco: remove useless variable 'err' in spectrum_cs_suspend() Fix the following coccicheck warning: drivers/net/wireless/intersil/orinoco/spectrum_cs.c:281:5-8: Unneeded variable: "err". Return "0" on line 286 Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200413082043.22468-1-yanaijie@huawei.com --- drivers/net/wireless/intersil/orinoco/spectrum_cs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/intersil/orinoco/spectrum_cs.c b/drivers/net/wireless/intersil/orinoco/spectrum_cs.c index b60048c95e0a..291ef97ed45e 100644 --- a/drivers/net/wireless/intersil/orinoco/spectrum_cs.c +++ b/drivers/net/wireless/intersil/orinoco/spectrum_cs.c @@ -278,12 +278,11 @@ static int spectrum_cs_suspend(struct pcmcia_device *link) { struct orinoco_private *priv = link->priv; - int err = 0; /* Mark the device as stopped, to block IO until later */ orinoco_down(priv); - return err; + return 0; } static int -- cgit v1.2.3-59-g8ed1b From e871b8bfedda84924ac5767883c80deae67a2657 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 13 Apr 2020 16:21:26 +0800 Subject: brcmsmac: make brcms_c_stf_ss_update() void Fix the following coccicheck warning: drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.c:309:5-13: Unneeded variable: "ret_code". Return "0" on line 328 Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200413082126.22572-1-yanaijie@huawei.com --- drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.c | 7 ++----- drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.h | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.c index 0ab865de1491..79d4a7a4da8b 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.c @@ -304,9 +304,8 @@ int brcms_c_stf_txchain_set(struct brcms_c_info *wlc, s32 int_val, bool force) * update wlc->stf->ss_opmode which represents the operational stf_ss mode * we're using */ -int brcms_c_stf_ss_update(struct brcms_c_info *wlc, struct brcms_band *band) +void brcms_c_stf_ss_update(struct brcms_c_info *wlc, struct brcms_band *band) { - int ret_code = 0; u8 prev_stf_ss; u8 upd_stf_ss; @@ -325,7 +324,7 @@ int brcms_c_stf_ss_update(struct brcms_c_info *wlc, struct brcms_band *band) PHY_TXC1_MODE_SISO : PHY_TXC1_MODE_CDD; } else { if (wlc->band != band) - return ret_code; + return; upd_stf_ss = (wlc->stf->txstreams == 1) ? PHY_TXC1_MODE_SISO : band->band_stf_ss_mode; } @@ -333,8 +332,6 @@ int brcms_c_stf_ss_update(struct brcms_c_info *wlc, struct brcms_band *band) wlc->stf->ss_opmode = upd_stf_ss; brcms_b_band_stf_ss_set(wlc->hw, upd_stf_ss); } - - return ret_code; } int brcms_c_stf_attach(struct brcms_c_info *wlc) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.h b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.h index ba9493009a33..aa4ab53bf634 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/stf.h @@ -25,7 +25,7 @@ void brcms_c_stf_detach(struct brcms_c_info *wlc); void brcms_c_tempsense_upd(struct brcms_c_info *wlc); void brcms_c_stf_ss_algo_channel_get(struct brcms_c_info *wlc, u16 *ss_algo_channel, u16 chanspec); -int brcms_c_stf_ss_update(struct brcms_c_info *wlc, struct brcms_band *band); +void brcms_c_stf_ss_update(struct brcms_c_info *wlc, struct brcms_band *band); void brcms_c_stf_phy_txant_upd(struct brcms_c_info *wlc); int brcms_c_stf_txchain_set(struct brcms_c_info *wlc, s32 int_val, bool force); bool brcms_c_stf_stbc_rx_set(struct brcms_c_info *wlc, s32 int_val); -- cgit v1.2.3-59-g8ed1b From 5a652b49b41b1cbffe2beedbaf253f60f768fd92 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Tue, 14 Apr 2020 20:02:51 +0800 Subject: ipw2x00: make ipw_setup_deferred_work() void This function actually needs no return value. So remove the unneeded variable 'ret' and make it void. This also fixes the following coccicheck warning: drivers/net/wireless/intel/ipw2x00/ipw2200.c:10648:5-8: Unneeded variable: "ret". Return "0" on line 10684 Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200414120251.35869-1-yanaijie@huawei.com --- drivers/net/wireless/intel/ipw2x00/ipw2200.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index 201a1eb0e2f6..923be3781c92 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -10640,10 +10640,8 @@ static void ipw_bg_link_down(struct work_struct *work) mutex_unlock(&priv->mutex); } -static int ipw_setup_deferred_work(struct ipw_priv *priv) +static void ipw_setup_deferred_work(struct ipw_priv *priv) { - int ret = 0; - init_waitqueue_head(&priv->wait_command_queue); init_waitqueue_head(&priv->wait_state); @@ -10677,8 +10675,6 @@ static int ipw_setup_deferred_work(struct ipw_priv *priv) tasklet_init(&priv->irq_tasklet, ipw_irq_tasklet, (unsigned long)priv); - - return ret; } static void shim__set_security(struct net_device *dev, @@ -11659,11 +11655,7 @@ static int ipw_pci_probe(struct pci_dev *pdev, IPW_DEBUG_INFO("pci_resource_len = 0x%08x\n", length); IPW_DEBUG_INFO("pci_resource_base = %p\n", base); - err = ipw_setup_deferred_work(priv); - if (err) { - IPW_ERROR("Unable to setup deferred work\n"); - goto out_iounmap; - } + ipw_setup_deferred_work(priv); ipw_sw_reset(priv, 1); -- cgit v1.2.3-59-g8ed1b From 55beec10710d10cb4a1cbbc5b1a0d9c9cfbd2c1e Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 9 Apr 2020 08:05:47 +0200 Subject: Bluetooth: Sort list of LE features constants The list of LE features constants has gotten a bit confused. It lost the order and gained duplicated. Clean this up. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index f4e8e2a0b7c1..ff42d05b3e72 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -460,12 +460,10 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 -#define HCI_LE_PHY_2M 0x01 -#define HCI_LE_PHY_CODED 0x08 -#define HCI_LE_EXT_ADV 0x10 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 +#define HCI_LE_EXT_ADV 0x10 #define HCI_LE_CHAN_SEL_ALG2 0x40 #define HCI_LE_CIS_MASTER 0x10 #define HCI_LE_CIS_SLAVE 0x20 -- cgit v1.2.3-59-g8ed1b From 849c9c35e80d73c215c65b6023658b371bdeb5ed Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 9 Apr 2020 08:05:48 +0200 Subject: Bluetooth: Use extra variable to make code more readable When starting active scanning for discovery the whitelist is not needed to be used. So the filter_policy is 0x00. To make the core more readable use a variable name instead of just setting 0 as paramter. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_request.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 649e1e5ed446..9ea40106ef17 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -2723,6 +2723,8 @@ static int active_scan(struct hci_request *req, unsigned long opt) uint16_t interval = opt; struct hci_dev *hdev = req->hdev; u8 own_addr_type; + /* White list is not used for discovery */ + u8 filter_policy = 0x00; int err; BT_DBG("%s", hdev->name); @@ -2744,7 +2746,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) own_addr_type = ADDR_LE_DEV_PUBLIC; hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN, - own_addr_type, 0); + own_addr_type, filter_policy); return 0; } -- cgit v1.2.3-59-g8ed1b From ff3b8df2bd758d97aa3dd7c021864be05fec9bd5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 9 Apr 2020 08:05:49 +0200 Subject: Bluetooth: Enable LE Enhanced Connection Complete event. In case LL Privacy is supported by the controller, it is also a good idea to use the LE Enhanced Connection Complete event for getting all information about the new connection and its addresses. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_core.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ff42d05b3e72..1da8cec8e210 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -460,6 +460,7 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 +#define HCI_LE_LL_PRIVACY 0x40 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 589c4085499c..0d726d59a492 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -638,6 +638,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ + /* If the controller supports LL Privacy feature, enable + * the corresponding event. + */ + if (hdev->le_features[0] & HCI_LE_LL_PRIVACY) + events[1] |= 0x02; /* LE Enhanced Connection + * Complete + */ + /* If the controller supports Extended Scanner Filter * Policies, enable the correspondig event. */ -- cgit v1.2.3-59-g8ed1b From 2eb71a3a68c387274cfc1bc43eac25138add528d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 9 Apr 2020 08:05:50 +0200 Subject: Bluetooth: Clear HCI_LL_RPA_RESOLUTION flag on reset When the controller is being reset or power cycled, then the flag HCI_LL_RPA_RESOLUTION which indicates if controller based address resolution is active needs to be also reset. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2f3275f1d1c4..239ab72f16c6 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -645,6 +645,7 @@ extern struct mutex hci_cb_list_lock; do { \ hci_dev_clear_flag(hdev, HCI_LE_SCAN); \ hci_dev_clear_flag(hdev, HCI_LE_ADV); \ + hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\ hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); \ } while (0) -- cgit v1.2.3-59-g8ed1b From f0f383347ced96416d5e3062f8bb2b0f99ac9d5b Mon Sep 17 00:00:00 2001 From: Nils ANDRÉ-CHANG Date: Sun, 12 Apr 2020 18:19:00 +0100 Subject: brcmfmac: remove leading space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nils ANDRÉ-CHANG Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200412171900.xzedxhzd56gox5kf@nixos --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index b684a5b6d904..22a17ae09e94 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -961,7 +961,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43340), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43341), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43362), - BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43364), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43364), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4335_4339), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4339), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43430), -- cgit v1.2.3-59-g8ed1b From 7edc9079540b65026f3d3386b3642d1820d5fed5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 15 Apr 2020 17:35:16 +0200 Subject: Bluetooth: Enhanced Connection Complete event belongs to LL Privacy The Enhanced Connection Complete event is use in conjunction with LL Privacy and not Extended Advertising. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0d726d59a492..51d399273276 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -719,14 +719,6 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) * Report */ - /* If the controller supports the LE Extended Create Connection - * command, enable the corresponding event. - */ - if (use_ext_conn(hdev)) - events[1] |= 0x02; /* LE Enhanced Connection - * Complete - */ - /* If the controller supports the LE Extended Advertising * command, enable the corresponding event. */ -- cgit v1.2.3-59-g8ed1b From 34428dff3679f0c4c9b185ff8eccefd12a7f55f8 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Mon, 10 Feb 2020 09:04:15 +0200 Subject: igc: Add GSO partial support Partial generic segmentation offload is a hybrid between TSO and GSO. What is effectively does is take advantage of certain traits of TCP and tunnels so that instead of having to rewrite the packet headers for each segment only in the inner-most transport header and possible the outer-most network header need to be updated. This allows devices that do not support tunnel offload or tunnels offloads with checksum to still make use of segmentation. Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 69fa1ce1f927..46ab035c2032 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -4727,6 +4727,16 @@ static int igc_probe(struct pci_dev *pdev, netdev->features |= NETIF_F_HW_CSUM; netdev->features |= NETIF_F_SCTP_CRC; +#define IGC_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \ + NETIF_F_GSO_GRE_CSUM | \ + NETIF_F_GSO_IPXIP4 | \ + NETIF_F_GSO_IPXIP6 | \ + NETIF_F_GSO_UDP_TUNNEL | \ + NETIF_F_GSO_UDP_TUNNEL_CSUM) + + netdev->gso_partial_features = IGC_GSO_PARTIAL_FEATURES; + netdev->features |= NETIF_F_GSO_PARTIAL | IGC_GSO_PARTIAL_FEATURES; + /* setup the private structure */ err = igc_sw_init(adapter); if (err) -- cgit v1.2.3-59-g8ed1b From ec50a9d437f05dd76444a65fdd3cfbfad90ee9d6 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Fri, 14 Feb 2020 15:52:02 -0800 Subject: igc: Add support for taprio offloading Adds support for translating taprio schedules into i225 cycles. This will allow schedules to run in the hardware, making the schedules enforcement more precise and saving CPU time. Right now, the only simple schedules are allowed, complex schedules are rejected. "simple" in this context are schedules that each HW queue is opened and closed only once in each cycle. Changing schedules is still not supported as well. Signed-off-by: Vinicius Costa Gomes Reviewed-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/Makefile | 2 +- drivers/net/ethernet/intel/igc/igc.h | 7 ++ drivers/net/ethernet/intel/igc/igc_defines.h | 12 +++ drivers/net/ethernet/intel/igc/igc_main.c | 113 +++++++++++++++++++++ drivers/net/ethernet/intel/igc/igc_regs.h | 12 +++ drivers/net/ethernet/intel/igc/igc_tsn.c | 140 +++++++++++++++++++++++++++ drivers/net/ethernet/intel/igc/igc_tsn.h | 9 ++ 7 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/intel/igc/igc_tsn.c create mode 100644 drivers/net/ethernet/intel/igc/igc_tsn.h diff --git a/drivers/net/ethernet/intel/igc/Makefile b/drivers/net/ethernet/intel/igc/Makefile index e3c164c12e10..3652f211f351 100644 --- a/drivers/net/ethernet/intel/igc/Makefile +++ b/drivers/net/ethernet/intel/igc/Makefile @@ -8,4 +8,4 @@ obj-$(CONFIG_IGC) += igc.o igc-objs := igc_main.o igc_mac.o igc_i225.o igc_base.o igc_nvm.o igc_phy.o \ -igc_ethtool.o igc_ptp.o igc_dump.o +igc_ethtool.o igc_ptp.o igc_dump.o igc_tsn.o diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index a1f845a2aa80..5e36822de5ec 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -70,6 +70,7 @@ extern char igc_driver_version[]; #define IGC_FLAG_HAS_MSIX BIT(13) #define IGC_FLAG_VLAN_PROMISC BIT(15) #define IGC_FLAG_RX_LEGACY BIT(16) +#define IGC_FLAG_TSN_QBV_ENABLED BIT(17) #define IGC_FLAG_RSS_FIELD_IPV4_UDP BIT(6) #define IGC_FLAG_RSS_FIELD_IPV6_UDP BIT(7) @@ -287,6 +288,9 @@ struct igc_ring { u8 reg_idx; /* physical index of the ring */ bool launchtime_enable; /* true if LaunchTime is enabled */ + u32 start_time; + u32 end_time; + /* everything past this point are written often */ u16 next_to_clean; u16 next_to_use; @@ -421,6 +425,9 @@ struct igc_adapter { u32 max_frame_size; u32 min_frame_size; + ktime_t base_time; + ktime_t cycle_time; + /* OS defined structs */ struct pci_dev *pdev; /* lock for statistics */ diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 4ddccccf42cc..2da5a9b012af 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -377,6 +377,11 @@ #define I225_TXPBSIZE_DEFAULT 0x04000014 /* TXPBSIZE default */ #define IGC_RXPBS_CFG_TS_EN 0x80000000 /* Timestamp in Rx buffer */ +#define IGC_TXPBSIZE_TSN 0x04145145 /* 5k bytes buffer for each queue */ + +#define IGC_DTXMXPKTSZ_TSN 0x19 /* 1600 bytes of max TX DMA packet size */ +#define IGC_DTXMXPKTSZ_DEFAULT 0x98 /* 9728-byte Jumbo frames */ + /* Time Sync Interrupt Causes */ #define IGC_TSICR_SYS_WRAP BIT(0) /* SYSTIM Wrap around. */ #define IGC_TSICR_TXTS BIT(1) /* Transmit Timestamp. */ @@ -431,6 +436,13 @@ #define IGC_TSYNCTXCTL_START_SYNC 0x80000000 /* initiate sync */ #define IGC_TSYNCTXCTL_TXSYNSIG 0x00000020 /* Sample TX tstamp in PHY sop */ +/* Transmit Scheduling */ +#define IGC_TQAVCTRL_TRANSMIT_MODE_TSN 0x00000001 +#define IGC_TQAVCTRL_ENHANCED_QAV 0x00000008 + +#define IGC_TXQCTL_STRICT_CYCLE 0x00000002 +#define IGC_TXQCTL_STRICT_END 0x00000004 + /* Receive Checksum Control */ #define IGC_RXCSUM_CRCOFL 0x00000800 /* CRC32 offload enable */ #define IGC_RXCSUM_PCSD 0x00002000 /* packet checksum disabled */ diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 46ab035c2032..12d672a6bc45 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -9,11 +9,13 @@ #include #include #include +#include #include #include "igc.h" #include "igc_hw.h" +#include "igc_tsn.h" #define DRV_VERSION "0.0.1-k" #define DRV_SUMMARY "Intel(R) 2.5G Ethernet Linux Driver" @@ -106,6 +108,9 @@ void igc_reset(struct igc_adapter *adapter) /* Re-enable PTP, where applicable. */ igc_ptp_reset(adapter); + /* Re-enable TSN offloading, where applicable. */ + igc_tsn_offload_apply(adapter); + igc_get_phy_info(hw); } @@ -4491,6 +4496,113 @@ static int igc_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) } } +static bool validate_schedule(const struct tc_taprio_qopt_offload *qopt) +{ + int queue_uses[IGC_MAX_TX_QUEUES] = { }; + size_t n; + + if (qopt->cycle_time_extension) + return false; + + for (n = 0; n < qopt->num_entries; n++) { + const struct tc_taprio_sched_entry *e; + int i; + + e = &qopt->entries[n]; + + /* i225 only supports "global" frame preemption + * settings. + */ + if (e->command != TC_TAPRIO_CMD_SET_GATES) + return false; + + for (i = 0; i < IGC_MAX_TX_QUEUES; i++) { + if (e->gate_mask & BIT(i)) + queue_uses[i]++; + + if (queue_uses[i] > 1) + return false; + } + } + + return true; +} + +static int igc_save_qbv_schedule(struct igc_adapter *adapter, + struct tc_taprio_qopt_offload *qopt) +{ + u32 start_time = 0, end_time = 0; + size_t n; + + if (!qopt->enable) { + adapter->base_time = 0; + return 0; + } + + if (adapter->base_time) + return -EALREADY; + + if (!validate_schedule(qopt)) + return -EINVAL; + + adapter->cycle_time = qopt->cycle_time; + adapter->base_time = qopt->base_time; + + /* FIXME: be a little smarter about cases when the gate for a + * queue stays open for more than one entry. + */ + for (n = 0; n < qopt->num_entries; n++) { + struct tc_taprio_sched_entry *e = &qopt->entries[n]; + int i; + + end_time += e->interval; + + for (i = 0; i < IGC_MAX_TX_QUEUES; i++) { + struct igc_ring *ring = adapter->tx_ring[i]; + + if (!(e->gate_mask & BIT(i))) + continue; + + ring->start_time = start_time; + ring->end_time = end_time; + } + + start_time += e->interval; + } + + return 0; +} + +static int igc_tsn_enable_qbv_scheduling(struct igc_adapter *adapter, + struct tc_taprio_qopt_offload *qopt) +{ + struct igc_hw *hw = &adapter->hw; + int err; + + if (hw->mac.type != igc_i225) + return -EOPNOTSUPP; + + err = igc_save_qbv_schedule(adapter, qopt); + if (err) + return err; + + return igc_tsn_offload_apply(adapter); +} + +static int igc_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct igc_adapter *adapter = netdev_priv(dev); + + switch (type) { + case TC_SETUP_QDISC_TAPRIO: + return igc_tsn_enable_qbv_scheduling(adapter, type_data); + + default: + return -EOPNOTSUPP; + } +} + static const struct net_device_ops igc_netdev_ops = { .ndo_open = igc_open, .ndo_stop = igc_close, @@ -4503,6 +4615,7 @@ static const struct net_device_ops igc_netdev_ops = { .ndo_set_features = igc_set_features, .ndo_features_check = igc_features_check, .ndo_do_ioctl = igc_ioctl, + .ndo_setup_tc = igc_setup_tc, }; /* PCIe configuration access */ diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index d4af53a80f11..6093cde2351c 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -231,6 +231,18 @@ #define IGC_RXPBS 0x02404 /* Rx Packet Buffer Size - RW */ +/* Transmit Scheduling Registers */ +#define IGC_TQAVCTRL 0x3570 +#define IGC_TXQCTL(_n) (0x3344 + 0x4 * (_n)) +#define IGC_BASET_L 0x3314 +#define IGC_BASET_H 0x3318 +#define IGC_QBVCYCLET 0x331C +#define IGC_QBVCYCLET_S 0x3320 + +#define IGC_STQT(_n) (0x3324 + 0x4 * (_n)) +#define IGC_ENDQT(_n) (0x3334 + 0x4 * (_n)) +#define IGC_DTXMXPKTSZ 0x355C + /* System Time Registers */ #define IGC_SYSTIML 0x0B600 /* System time register Low - RO */ #define IGC_SYSTIMH 0x0B604 /* System time register High - RO */ diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c new file mode 100644 index 000000000000..257fe970afe8 --- /dev/null +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Intel Corporation */ + +#include "igc.h" +#include "igc_tsn.h" + +/* Returns the TSN specific registers to their default values after + * TSN offloading is disabled. + */ +static int igc_tsn_disable_offload(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + u32 tqavctrl; + int i; + + if (!(adapter->flags & IGC_FLAG_TSN_QBV_ENABLED)) + return 0; + + adapter->cycle_time = 0; + + wr32(IGC_TXPBS, I225_TXPBSIZE_DEFAULT); + wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_DEFAULT); + + tqavctrl = rd32(IGC_TQAVCTRL); + tqavctrl &= ~(IGC_TQAVCTRL_TRANSMIT_MODE_TSN | + IGC_TQAVCTRL_ENHANCED_QAV); + wr32(IGC_TQAVCTRL, tqavctrl); + + for (i = 0; i < adapter->num_tx_queues; i++) { + struct igc_ring *ring = adapter->tx_ring[i]; + + ring->start_time = 0; + ring->end_time = 0; + ring->launchtime_enable = false; + + wr32(IGC_TXQCTL(i), 0); + wr32(IGC_STQT(i), 0); + wr32(IGC_ENDQT(i), NSEC_PER_SEC); + } + + wr32(IGC_QBVCYCLET_S, NSEC_PER_SEC); + wr32(IGC_QBVCYCLET, NSEC_PER_SEC); + + adapter->flags &= ~IGC_FLAG_TSN_QBV_ENABLED; + + return 0; +} + +static int igc_tsn_enable_offload(struct igc_adapter *adapter) +{ + struct igc_hw *hw = &adapter->hw; + u32 tqavctrl, baset_l, baset_h; + u32 sec, nsec, cycle; + ktime_t base_time, systim; + int i; + + if (adapter->flags & IGC_FLAG_TSN_QBV_ENABLED) + return 0; + + cycle = adapter->cycle_time; + base_time = adapter->base_time; + + wr32(IGC_TSAUXC, 0); + wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_TSN); + wr32(IGC_TXPBS, IGC_TXPBSIZE_TSN); + + tqavctrl = rd32(IGC_TQAVCTRL); + tqavctrl |= IGC_TQAVCTRL_TRANSMIT_MODE_TSN | IGC_TQAVCTRL_ENHANCED_QAV; + wr32(IGC_TQAVCTRL, tqavctrl); + + wr32(IGC_QBVCYCLET_S, cycle); + wr32(IGC_QBVCYCLET, cycle); + + for (i = 0; i < adapter->num_tx_queues; i++) { + struct igc_ring *ring = adapter->tx_ring[i]; + u32 txqctl = 0; + + wr32(IGC_STQT(i), ring->start_time); + wr32(IGC_ENDQT(i), ring->end_time); + + if (adapter->base_time) { + /* If we have a base_time we are in "taprio" + * mode and we need to be strict about the + * cycles: only transmit a packet if it can be + * completed during that cycle. + */ + txqctl |= IGC_TXQCTL_STRICT_CYCLE | + IGC_TXQCTL_STRICT_END; + } + + wr32(IGC_TXQCTL(i), txqctl); + } + + nsec = rd32(IGC_SYSTIML); + sec = rd32(IGC_SYSTIMH); + + systim = ktime_set(sec, nsec); + + if (ktime_compare(systim, base_time) > 0) { + s64 n; + + n = div64_s64(ktime_sub_ns(systim, base_time), cycle); + base_time = ktime_add_ns(base_time, (n + 1) * cycle); + } + + baset_h = div_s64_rem(base_time, NSEC_PER_SEC, &baset_l); + + wr32(IGC_BASET_H, baset_h); + wr32(IGC_BASET_L, baset_l); + + adapter->flags |= IGC_FLAG_TSN_QBV_ENABLED; + + return 0; +} + +int igc_tsn_offload_apply(struct igc_adapter *adapter) +{ + bool is_any_enabled = adapter->base_time; + + if (!(adapter->flags & IGC_FLAG_TSN_QBV_ENABLED) && !is_any_enabled) + return 0; + + if (!is_any_enabled) { + int err = igc_tsn_disable_offload(adapter); + + if (err < 0) + return err; + + /* The BASET registers aren't cleared when writing + * into them, force a reset if the interface is + * running. + */ + if (netif_running(adapter->netdev)) + schedule_work(&adapter->reset_task); + + return 0; + } + + return igc_tsn_enable_offload(adapter); +} diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.h b/drivers/net/ethernet/intel/igc/igc_tsn.h new file mode 100644 index 000000000000..f76bc86ddccd --- /dev/null +++ b/drivers/net/ethernet/intel/igc/igc_tsn.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Intel Corporation */ + +#ifndef _IGC_TSN_H_ +#define _IGC_TSN_H_ + +int igc_tsn_offload_apply(struct igc_adapter *adapter); + +#endif /* _IGC_BASE_H */ -- cgit v1.2.3-59-g8ed1b From 82faa9b799500f9e002067c6d8cb027ab12acca4 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Fri, 14 Feb 2020 15:52:03 -0800 Subject: igc: Add support for ETF offloading This adds support for ETF offloading for the i225 controller. For i225, the LaunchTime feature is almost a subset of the Qbv feature. The main change from the i210 is that the launchtime of each packet is specified as an offset applied to the BASET register. BASET is automatically incremented each cycle. For i225, the approach chosen is to re-use most of the setup used for taprio offloading. With a few changes: - The more or less obvious one is that when ETF is enabled, we should set add the expected launchtime to the (advanced) transmit descriptor; - The less obvious, is that when taprio offloading is not enabled, we add a dummy schedule (all queues are open all the time, with a cycle time of 1 second). Signed-off-by: Vinicius Costa Gomes Reviewed-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_defines.h | 1 + drivers/net/ethernet/intel/igc/igc_main.c | 70 ++++++++++++++++++++++++++-- drivers/net/ethernet/intel/igc/igc_tsn.c | 19 +++++++- 3 files changed, 86 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 2da5a9b012af..1b0fd2ffd08d 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -440,6 +440,7 @@ #define IGC_TQAVCTRL_TRANSMIT_MODE_TSN 0x00000001 #define IGC_TQAVCTRL_ENHANCED_QAV 0x00000008 +#define IGC_TXQCTL_QUEUE_MODE_LAUNCHT 0x00000001 #define IGC_TXQCTL_STRICT_CYCLE 0x00000002 #define IGC_TXQCTL_STRICT_END 0x00000004 diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 12d672a6bc45..896b314035c9 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -869,6 +869,23 @@ static int igc_write_mc_addr_list(struct net_device *netdev) return netdev_mc_count(netdev); } +static __le32 igc_tx_launchtime(struct igc_adapter *adapter, ktime_t txtime) +{ + ktime_t cycle_time = adapter->cycle_time; + ktime_t base_time = adapter->base_time; + u32 launchtime; + + /* FIXME: when using ETF together with taprio, we may have a + * case where 'delta' is larger than the cycle_time, this may + * cause problems if we don't read the current value of + * IGC_BASET, as the value writen into the launchtime + * descriptor field may be misinterpreted. + */ + div_s64_rem(ktime_sub_ns(txtime, base_time), cycle_time, &launchtime); + + return cpu_to_le32(launchtime); +} + static void igc_tx_ctxtdesc(struct igc_ring *tx_ring, struct igc_tx_buffer *first, u32 vlan_macip_lens, u32 type_tucmd, @@ -876,7 +893,6 @@ static void igc_tx_ctxtdesc(struct igc_ring *tx_ring, { struct igc_adv_tx_context_desc *context_desc; u16 i = tx_ring->next_to_use; - struct timespec64 ts; context_desc = IGC_TX_CTXTDESC(tx_ring, i); @@ -898,9 +914,12 @@ static void igc_tx_ctxtdesc(struct igc_ring *tx_ring, * should have been handled by the upper layers. */ if (tx_ring->launchtime_enable) { - ts = ktime_to_timespec64(first->skb->tstamp); + struct igc_adapter *adapter = netdev_priv(tx_ring->netdev); + ktime_t txtime = first->skb->tstamp; + first->skb->tstamp = ktime_set(0, 0); - context_desc->launch_time = cpu_to_le32(ts.tv_nsec / 32); + context_desc->launch_time = igc_tx_launchtime(adapter, + txtime); } else { context_desc->launch_time = 0; } @@ -4496,6 +4515,32 @@ static int igc_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) } } +static int igc_save_launchtime_params(struct igc_adapter *adapter, int queue, + bool enable) +{ + struct igc_ring *ring; + int i; + + if (queue < 0 || queue >= adapter->num_tx_queues) + return -EINVAL; + + ring = adapter->tx_ring[queue]; + ring->launchtime_enable = enable; + + if (adapter->base_time) + return 0; + + adapter->cycle_time = NSEC_PER_SEC; + + for (i = 0; i < adapter->num_tx_queues; i++) { + ring = adapter->tx_ring[i]; + ring->start_time = 0; + ring->end_time = NSEC_PER_SEC; + } + + return 0; +} + static bool validate_schedule(const struct tc_taprio_qopt_offload *qopt) { int queue_uses[IGC_MAX_TX_QUEUES] = { }; @@ -4528,6 +4573,22 @@ static bool validate_schedule(const struct tc_taprio_qopt_offload *qopt) return true; } +static int igc_tsn_enable_launchtime(struct igc_adapter *adapter, + struct tc_etf_qopt_offload *qopt) +{ + struct igc_hw *hw = &adapter->hw; + int err; + + if (hw->mac.type != igc_i225) + return -EOPNOTSUPP; + + err = igc_save_launchtime_params(adapter, qopt->queue, qopt->enable); + if (err) + return err; + + return igc_tsn_offload_apply(adapter); +} + static int igc_save_qbv_schedule(struct igc_adapter *adapter, struct tc_taprio_qopt_offload *qopt) { @@ -4598,6 +4659,9 @@ static int igc_setup_tc(struct net_device *dev, enum tc_setup_type type, case TC_SETUP_QDISC_TAPRIO: return igc_tsn_enable_qbv_scheduling(adapter, type_data); + case TC_SETUP_QDISC_ETF: + return igc_tsn_enable_launchtime(adapter, type_data); + default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 257fe970afe8..174103c4bea6 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -4,6 +4,20 @@ #include "igc.h" #include "igc_tsn.h" +static bool is_any_launchtime(struct igc_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_tx_queues; i++) { + struct igc_ring *ring = adapter->tx_ring[i]; + + if (ring->launchtime_enable) + return true; + } + + return false; +} + /* Returns the TSN specific registers to their default values after * TSN offloading is disabled. */ @@ -88,6 +102,9 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) IGC_TXQCTL_STRICT_END; } + if (ring->launchtime_enable) + txqctl |= IGC_TXQCTL_QUEUE_MODE_LAUNCHT; + wr32(IGC_TXQCTL(i), txqctl); } @@ -115,7 +132,7 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) int igc_tsn_offload_apply(struct igc_adapter *adapter) { - bool is_any_enabled = adapter->base_time; + bool is_any_enabled = adapter->base_time || is_any_launchtime(adapter); if (!(adapter->flags & IGC_FLAG_TSN_QBV_ENABLED) && !is_any_enabled) return 0; -- cgit v1.2.3-59-g8ed1b From a34c7f5156654ebaf7eaace102938be7ff7036cb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Feb 2020 22:23:02 -0800 Subject: e1000: Distribute switch variables for initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables declared in a switch statement before any case statements cannot be automatically initialized with compiler instrumentation (as they are not part of any execution flow). With GCC's proposed automatic stack variable initialization feature, this triggers a warning (and they don't get initialized). Clang's automatic stack variable initialization (via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also doesn't initialize such variables[1]. Note that these warnings (or silent skipping) happen before the dead-store elimination optimization phase, so even when the automatic initializations are later elided in favor of direct initializations, the warnings remain. To avoid these problems, move such variables into the "case" where they're used or lift them up into the main function body. drivers/net/ethernet/intel/e1000/e1000_main.c: In function ‘e1000_xmit_frame’: drivers/net/ethernet/intel/e1000/e1000_main.c:3143:18: warning: statement will never be executed [-Wswitch-unreachable] 3143 | unsigned int pull_size; | ^~~~~~~~~ [1] https://bugs.llvm.org/show_bug.cgi?id=44916 Signed-off-by: Kees Cook Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/e1000/e1000_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index f7103356ef56..ac5146d53c4c 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -3136,8 +3136,9 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); if (skb->data_len && hdr_len == len) { switch (hw->mac_type) { + case e1000_82544: { unsigned int pull_size; - case e1000_82544: + /* Make sure we have room to chop off 4 bytes, * and that the end alignment will work out to * this hardware's requirements @@ -3158,6 +3159,7 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, } len = skb_headlen(skb); break; + } default: /* do nothing */ break; -- cgit v1.2.3-59-g8ed1b From f1fd45598205b3eb52779f5d92b7df9d42fb755b Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Thu, 27 Feb 2020 18:57:23 +0200 Subject: igc: Remove unused MDIC_DEST mask Formally Destination bit should be kept reserved to support legacy drivers and ignore on write/read operation Not applicable for i225 parts Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_defines.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 1b0fd2ffd08d..d6e07f81ca4c 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -510,7 +510,6 @@ #define IGC_MDIC_READY 0x10000000 #define IGC_MDIC_INT_EN 0x20000000 #define IGC_MDIC_ERROR 0x40000000 -#define IGC_MDIC_DEST 0x80000000 #define IGC_N0_QUEUE -1 -- cgit v1.2.3-59-g8ed1b From 3d1ce3fa83917b19e150352c5a0f9c6324b2da9b Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Thu, 27 Feb 2020 19:19:12 +0200 Subject: igc: Remove unused CTRL_EXT_LINK_MODE_MASK We support only copper mode Not applicable for i225 parts Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_defines.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index d6e07f81ca4c..40d6f557079b 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -44,9 +44,6 @@ /* Wake Up Packet Memory stores the first 128 bytes of the wake up packet */ #define IGC_WUPM_BYTES 128 -/* Physical Func Reset Done Indication */ -#define IGC_CTRL_EXT_LINK_MODE_MASK 0x00C00000 - /* Loop limit on how long we wait for auto-negotiation to complete */ #define COPPER_LINK_UP_LIMIT 10 #define PHY_AUTO_NEG_LIMIT 45 -- cgit v1.2.3-59-g8ed1b From 89d35511f38da851c71c3ad9d2b8197ee34e0846 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Fri, 28 Feb 2020 02:25:15 +0200 Subject: igc: Remove forward declaration Move igc_adapter and igc_ring structures up to avoid forward declaration It is not necessary to forward declare these structures Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc.h | 391 +++++++++++++++++------------------ 1 file changed, 194 insertions(+), 197 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 5e36822de5ec..c7b0afd370d4 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -19,8 +19,200 @@ /* forward declaration */ void igc_set_ethtool_ops(struct net_device *); -struct igc_adapter; -struct igc_ring; +/* Transmit and receive queues */ +#define IGC_MAX_RX_QUEUES 4 +#define IGC_MAX_TX_QUEUES 4 + +#define MAX_Q_VECTORS 8 +#define MAX_STD_JUMBO_FRAME_SIZE 9216 + +#define MAX_ETYPE_FILTER (4 - 1) +#define IGC_RETA_SIZE 128 + +struct igc_tx_queue_stats { + u64 packets; + u64 bytes; + u64 restart_queue; + u64 restart_queue2; +}; + +struct igc_rx_queue_stats { + u64 packets; + u64 bytes; + u64 drops; + u64 csum_err; + u64 alloc_failed; +}; + +struct igc_rx_packet_stats { + u64 ipv4_packets; /* IPv4 headers processed */ + u64 ipv4e_packets; /* IPv4E headers with extensions processed */ + u64 ipv6_packets; /* IPv6 headers processed */ + u64 ipv6e_packets; /* IPv6E headers with extensions processed */ + u64 tcp_packets; /* TCP headers processed */ + u64 udp_packets; /* UDP headers processed */ + u64 sctp_packets; /* SCTP headers processed */ + u64 nfs_packets; /* NFS headers processe */ + u64 other_packets; +}; + +struct igc_ring_container { + struct igc_ring *ring; /* pointer to linked list of rings */ + unsigned int total_bytes; /* total bytes processed this int */ + unsigned int total_packets; /* total packets processed this int */ + u16 work_limit; /* total work allowed per interrupt */ + u8 count; /* total number of rings in vector */ + u8 itr; /* current ITR setting for ring */ +}; + +struct igc_ring { + struct igc_q_vector *q_vector; /* backlink to q_vector */ + struct net_device *netdev; /* back pointer to net_device */ + struct device *dev; /* device for dma mapping */ + union { /* array of buffer info structs */ + struct igc_tx_buffer *tx_buffer_info; + struct igc_rx_buffer *rx_buffer_info; + }; + void *desc; /* descriptor ring memory */ + unsigned long flags; /* ring specific flags */ + void __iomem *tail; /* pointer to ring tail register */ + dma_addr_t dma; /* phys address of the ring */ + unsigned int size; /* length of desc. ring in bytes */ + + u16 count; /* number of desc. in the ring */ + u8 queue_index; /* logical index of the ring*/ + u8 reg_idx; /* physical index of the ring */ + bool launchtime_enable; /* true if LaunchTime is enabled */ + + u32 start_time; + u32 end_time; + + /* everything past this point are written often */ + u16 next_to_clean; + u16 next_to_use; + u16 next_to_alloc; + + union { + /* TX */ + struct { + struct igc_tx_queue_stats tx_stats; + struct u64_stats_sync tx_syncp; + struct u64_stats_sync tx_syncp2; + }; + /* RX */ + struct { + struct igc_rx_queue_stats rx_stats; + struct igc_rx_packet_stats pkt_stats; + struct u64_stats_sync rx_syncp; + struct sk_buff *skb; + }; + }; +} ____cacheline_internodealigned_in_smp; + +/* Board specific private data structure */ +struct igc_adapter { + struct net_device *netdev; + + unsigned long state; + unsigned int flags; + unsigned int num_q_vectors; + + struct msix_entry *msix_entries; + + /* TX */ + u16 tx_work_limit; + u32 tx_timeout_count; + int num_tx_queues; + struct igc_ring *tx_ring[IGC_MAX_TX_QUEUES]; + + /* RX */ + int num_rx_queues; + struct igc_ring *rx_ring[IGC_MAX_RX_QUEUES]; + + struct timer_list watchdog_timer; + struct timer_list dma_err_timer; + struct timer_list phy_info_timer; + + u32 wol; + u32 en_mng_pt; + u16 link_speed; + u16 link_duplex; + + u8 port_num; + + u8 __iomem *io_addr; + /* Interrupt Throttle Rate */ + u32 rx_itr_setting; + u32 tx_itr_setting; + + struct work_struct reset_task; + struct work_struct watchdog_task; + struct work_struct dma_err_task; + bool fc_autoneg; + + u8 tx_timeout_factor; + + int msg_enable; + u32 max_frame_size; + u32 min_frame_size; + + ktime_t base_time; + ktime_t cycle_time; + + /* OS defined structs */ + struct pci_dev *pdev; + /* lock for statistics */ + spinlock_t stats64_lock; + struct rtnl_link_stats64 stats64; + + /* structs defined in igc_hw.h */ + struct igc_hw hw; + struct igc_hw_stats stats; + + struct igc_q_vector *q_vector[MAX_Q_VECTORS]; + u32 eims_enable_mask; + u32 eims_other; + + u16 tx_ring_count; + u16 rx_ring_count; + + u32 tx_hwtstamp_timeouts; + u32 tx_hwtstamp_skipped; + u32 rx_hwtstamp_cleared; + + u32 rss_queues; + u32 rss_indir_tbl_init; + + /* RX network flow classification support */ + struct hlist_head nfc_filter_list; + struct hlist_head cls_flower_list; + unsigned int nfc_filter_count; + + /* lock for RX network flow classification filter */ + spinlock_t nfc_lock; + bool etype_bitmap[MAX_ETYPE_FILTER]; + + struct igc_mac_addr *mac_table; + + u8 rss_indir_tbl[IGC_RETA_SIZE]; + + unsigned long link_check_timeout; + struct igc_info ei; + + struct ptp_clock *ptp_clock; + struct ptp_clock_info ptp_caps; + struct work_struct ptp_tx_work; + struct sk_buff *ptp_tx_skb; + struct hwtstamp_config tstamp_config; + unsigned long ptp_tx_start; + unsigned long last_rx_ptp_check; + unsigned long last_rx_timestamp; + unsigned int ptp_flags; + /* System time value lock */ + spinlock_t tmreg_lock; + struct cyclecounter cc; + struct timecounter tc; +}; void igc_up(struct igc_adapter *adapter); void igc_down(struct igc_adapter *adapter); @@ -50,7 +242,6 @@ extern char igc_driver_name[]; extern char igc_driver_version[]; #define IGC_REGS_LEN 740 -#define IGC_RETA_SIZE 128 /* flags controlling PTP/1588 function */ #define IGC_PTP_ENABLED BIT(0) @@ -100,13 +291,6 @@ extern char igc_driver_version[]; #define IGC_MIN_RXD 80 #define IGC_MAX_RXD 4096 -/* Transmit and receive queues */ -#define IGC_MAX_RX_QUEUES 4 -#define IGC_MAX_TX_QUEUES 4 - -#define MAX_Q_VECTORS 8 -#define MAX_STD_JUMBO_FRAME_SIZE 9216 - /* Supported Rx Buffer Sizes */ #define IGC_RXBUFFER_256 256 #define IGC_RXBUFFER_2048 2048 @@ -233,86 +417,6 @@ struct igc_rx_buffer { __u16 pagecnt_bias; }; -struct igc_tx_queue_stats { - u64 packets; - u64 bytes; - u64 restart_queue; - u64 restart_queue2; -}; - -struct igc_rx_queue_stats { - u64 packets; - u64 bytes; - u64 drops; - u64 csum_err; - u64 alloc_failed; -}; - -struct igc_rx_packet_stats { - u64 ipv4_packets; /* IPv4 headers processed */ - u64 ipv4e_packets; /* IPv4E headers with extensions processed */ - u64 ipv6_packets; /* IPv6 headers processed */ - u64 ipv6e_packets; /* IPv6E headers with extensions processed */ - u64 tcp_packets; /* TCP headers processed */ - u64 udp_packets; /* UDP headers processed */ - u64 sctp_packets; /* SCTP headers processed */ - u64 nfs_packets; /* NFS headers processe */ - u64 other_packets; -}; - -struct igc_ring_container { - struct igc_ring *ring; /* pointer to linked list of rings */ - unsigned int total_bytes; /* total bytes processed this int */ - unsigned int total_packets; /* total packets processed this int */ - u16 work_limit; /* total work allowed per interrupt */ - u8 count; /* total number of rings in vector */ - u8 itr; /* current ITR setting for ring */ -}; - -struct igc_ring { - struct igc_q_vector *q_vector; /* backlink to q_vector */ - struct net_device *netdev; /* back pointer to net_device */ - struct device *dev; /* device for dma mapping */ - union { /* array of buffer info structs */ - struct igc_tx_buffer *tx_buffer_info; - struct igc_rx_buffer *rx_buffer_info; - }; - void *desc; /* descriptor ring memory */ - unsigned long flags; /* ring specific flags */ - void __iomem *tail; /* pointer to ring tail register */ - dma_addr_t dma; /* phys address of the ring */ - unsigned int size; /* length of desc. ring in bytes */ - - u16 count; /* number of desc. in the ring */ - u8 queue_index; /* logical index of the ring*/ - u8 reg_idx; /* physical index of the ring */ - bool launchtime_enable; /* true if LaunchTime is enabled */ - - u32 start_time; - u32 end_time; - - /* everything past this point are written often */ - u16 next_to_clean; - u16 next_to_use; - u16 next_to_alloc; - - union { - /* TX */ - struct { - struct igc_tx_queue_stats tx_stats; - struct u64_stats_sync tx_syncp; - struct u64_stats_sync tx_syncp2; - }; - /* RX */ - struct { - struct igc_rx_queue_stats rx_stats; - struct igc_rx_packet_stats pkt_stats; - struct u64_stats_sync rx_syncp; - struct sk_buff *skb; - }; - }; -} ____cacheline_internodealigned_in_smp; - struct igc_q_vector { struct igc_adapter *adapter; /* backlink */ void __iomem *itr_register; @@ -333,8 +437,6 @@ struct igc_q_vector { struct igc_ring ring[] ____cacheline_internodealigned_in_smp; }; -#define MAX_ETYPE_FILTER (4 - 1) - enum igc_filter_match_flags { IGC_FILTER_FLAG_ETHER_TYPE = 0x1, IGC_FILTER_FLAG_VLAN_TCI = 0x2, @@ -378,111 +480,6 @@ struct igc_mac_addr { #define IGC_MAX_RXNFC_FILTERS 16 -/* Board specific private data structure */ -struct igc_adapter { - struct net_device *netdev; - - unsigned long state; - unsigned int flags; - unsigned int num_q_vectors; - - struct msix_entry *msix_entries; - - /* TX */ - u16 tx_work_limit; - u32 tx_timeout_count; - int num_tx_queues; - struct igc_ring *tx_ring[IGC_MAX_TX_QUEUES]; - - /* RX */ - int num_rx_queues; - struct igc_ring *rx_ring[IGC_MAX_RX_QUEUES]; - - struct timer_list watchdog_timer; - struct timer_list dma_err_timer; - struct timer_list phy_info_timer; - - u32 wol; - u32 en_mng_pt; - u16 link_speed; - u16 link_duplex; - - u8 port_num; - - u8 __iomem *io_addr; - /* Interrupt Throttle Rate */ - u32 rx_itr_setting; - u32 tx_itr_setting; - - struct work_struct reset_task; - struct work_struct watchdog_task; - struct work_struct dma_err_task; - bool fc_autoneg; - - u8 tx_timeout_factor; - - int msg_enable; - u32 max_frame_size; - u32 min_frame_size; - - ktime_t base_time; - ktime_t cycle_time; - - /* OS defined structs */ - struct pci_dev *pdev; - /* lock for statistics */ - spinlock_t stats64_lock; - struct rtnl_link_stats64 stats64; - - /* structs defined in igc_hw.h */ - struct igc_hw hw; - struct igc_hw_stats stats; - - struct igc_q_vector *q_vector[MAX_Q_VECTORS]; - u32 eims_enable_mask; - u32 eims_other; - - u16 tx_ring_count; - u16 rx_ring_count; - - u32 tx_hwtstamp_timeouts; - u32 tx_hwtstamp_skipped; - u32 rx_hwtstamp_cleared; - - u32 rss_queues; - u32 rss_indir_tbl_init; - - /* RX network flow classification support */ - struct hlist_head nfc_filter_list; - struct hlist_head cls_flower_list; - unsigned int nfc_filter_count; - - /* lock for RX network flow classification filter */ - spinlock_t nfc_lock; - bool etype_bitmap[MAX_ETYPE_FILTER]; - - struct igc_mac_addr *mac_table; - - u8 rss_indir_tbl[IGC_RETA_SIZE]; - - unsigned long link_check_timeout; - struct igc_info ei; - - struct ptp_clock *ptp_clock; - struct ptp_clock_info ptp_caps; - struct work_struct ptp_tx_work; - struct sk_buff *ptp_tx_skb; - struct hwtstamp_config tstamp_config; - unsigned long ptp_tx_start; - unsigned long last_rx_ptp_check; - unsigned long last_rx_timestamp; - unsigned int ptp_flags; - /* System time value lock */ - spinlock_t tmreg_lock; - struct cyclecounter cc; - struct timecounter tc; -}; - /* igc_desc_unused - calculate if we have unused descriptors */ static inline u16 igc_desc_unused(const struct igc_ring *ring) { -- cgit v1.2.3-59-g8ed1b From b8a61ea15cdf8524f91dbad90a4f1fae13b0120b Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Fri, 28 Feb 2020 00:19:57 -0800 Subject: igc: Fix overwrites when dumping registers This patch fixes some register overwriting when dumping registers via ethtool. We have a total of 16 RAL registers, starting at offset 139. So RAH offset should be 139 + 16 = 155, not 145. As result some RAL registers are overwritten. Likewise, RAH registers are also overwritten by TDBAL, TDBAH, TDLEN, and TDH registers. To fix this bug while preserving the ABI, this patch re-writes RAL and RAH registers at the end of 'regs_buff' and bumps regs->version. It also removes some pointless comments in the middle of igc_set_regs(). Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index f530fc29b074..ff2a40496e4e 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -153,7 +153,7 @@ static void igc_get_regs(struct net_device *netdev, memset(p, 0, IGC_REGS_LEN * sizeof(u32)); - regs->version = (1u << 24) | (hw->revision_id << 16) | hw->device_id; + regs->version = (2u << 24) | (hw->revision_id << 16) | hw->device_id; /* General Registers */ regs_buff[0] = rd32(IGC_CTRL); @@ -306,6 +306,15 @@ static void igc_get_regs(struct net_device *netdev, regs_buff[164 + i] = rd32(IGC_TDT(i)); for (i = 0; i < 4; i++) regs_buff[168 + i] = rd32(IGC_TXDCTL(i)); + + /* XXX: Due to a bug few lines above, RAL and RAH registers are + * overwritten. To preserve the ABI, we write these registers again in + * regs_buff. + */ + for (i = 0; i < 16; i++) + regs_buff[172 + i] = rd32(IGC_RAL(i)); + for (i = 0; i < 16; i++) + regs_buff[188 + i] = rd32(IGC_RAH(i)); } static void igc_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) -- cgit v1.2.3-59-g8ed1b From 64900e8ff551dd6ae891651b6d74789378472ee1 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Fri, 28 Feb 2020 19:50:07 +0200 Subject: igc: Fix double definition IGC_START_ITR has beed defined twice This patch come to fix it Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index c7b0afd370d4..4643f358b843 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -246,9 +246,6 @@ extern char igc_driver_version[]; /* flags controlling PTP/1588 function */ #define IGC_PTP_ENABLED BIT(0) -/* Interrupt defines */ -#define IGC_START_ITR 648 /* ~6000 ints/sec */ - /* Flags definitions */ #define IGC_FLAG_HAS_MSI BIT(0) #define IGC_FLAG_QUEUE_PAIRS BIT(3) @@ -270,6 +267,7 @@ extern char igc_driver_version[]; #define IGC_MRQC_RSS_FIELD_IPV4_UDP 0x00400000 #define IGC_MRQC_RSS_FIELD_IPV6_UDP 0x00800000 +/* Interrupt defines */ #define IGC_START_ITR 648 /* ~6000 ints/sec */ #define IGC_4K_ITR 980 #define IGC_20K_ITR 196 -- cgit v1.2.3-59-g8ed1b From 635071e2c48d4a9261a0df8401155dbd959efd3d Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 3 Mar 2020 02:28:08 +0200 Subject: igc: Enable NETIF_F_HW_TC flag This assignment of the feature NETIF_F_HW_TC occurs prior to the initial setup of the local hw_features variable. This ensures that NETIF_F_HW_TC are marked as user changeable, and also enables it by default when the driver loads. Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 896b314035c9..800268da0834 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -4903,6 +4903,7 @@ static int igc_probe(struct pci_dev *pdev, netdev->features |= NETIF_F_RXCSUM; netdev->features |= NETIF_F_HW_CSUM; netdev->features |= NETIF_F_SCTP_CRC; + netdev->features |= NETIF_F_HW_TC; #define IGC_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \ NETIF_F_GSO_GRE_CSUM | \ -- cgit v1.2.3-59-g8ed1b From 2e39d2c8ff9654ba508c973ade5df332f53f41cc Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 3 Mar 2020 20:26:10 +0200 Subject: igc: Remove copper fiber switch control i225 device support copper mode only PHY signal detect indication for copper fiber switch not applicable to i225 part Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_defines.h | 2 -- drivers/net/ethernet/intel/igc/igc_main.c | 9 --------- 2 files changed, 11 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 40d6f557079b..42fe4d75cc0d 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -91,8 +91,6 @@ #define IGC_CTRL_RFCE 0x08000000 /* Receive Flow Control enable */ #define IGC_CTRL_TFCE 0x10000000 /* Transmit flow control enable */ -#define IGC_CONNSW_AUTOSENSE_EN 0x1 - /* As per the EAS the maximum supported size is 9.5KB (9728 bytes) */ #define MAX_JUMBO_FRAME_SIZE 0x2600 diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 800268da0834..44366c1bec19 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -4033,7 +4033,6 @@ static void igc_watchdog_task(struct work_struct *work) struct igc_hw *hw = &adapter->hw; struct igc_phy_info *phy = &hw->phy; u16 phy_data, retry_count = 20; - u32 connsw; u32 link; int i; @@ -4046,14 +4045,6 @@ static void igc_watchdog_task(struct work_struct *work) link = false; } - /* Force link down if we have fiber to swap to */ - if (adapter->flags & IGC_FLAG_MAS_ENABLE) { - if (hw->phy.media_type == igc_media_type_copper) { - connsw = rd32(IGC_CONNSW); - if (!(connsw & IGC_CONNSW_AUTOSENSE_EN)) - link = 0; - } - } if (link) { /* Cancel scheduled suspend requests. */ pm_runtime_resume(netdev->dev.parent); -- cgit v1.2.3-59-g8ed1b From 27945ebe5b980f796fa04dd61511796ac5b80cc2 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Fri, 6 Mar 2020 15:54:03 -0800 Subject: igc: Fix NFC queue redirection support The support for ethtool Network Flow Classification (NFC) queue redirection based on destination MAC address is currently broken in IGC. For instance, if we add the following rule, matching frames aren't enqueued on the expected rx queue. $ ethtool -N IFNAME flow-type ether dst 3c:fd:fe:9e:7f:71 queue 2 The issue here is due to the fact that igc_rar_set_index() is missing code to enable the queue selection feature from Receive Address High (RAH) register. This patch adds the missing code and fixes the issue. Signed-off-by: Andre Guedes Acked-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_defines.h | 5 ++++- drivers/net/ethernet/intel/igc/igc_main.c | 11 ++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 42fe4d75cc0d..af0c03d77a39 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -63,8 +63,11 @@ * (RAR[15]) for our directed address used by controllers with * manageability enabled, allowing us room for 15 multicast addresses. */ +#define IGC_RAH_QSEL_MASK 0x000C0000 +#define IGC_RAH_QSEL_SHIFT 18 +#define IGC_RAH_QSEL_ENABLE BIT(28) #define IGC_RAH_AV 0x80000000 /* Receive descriptor valid */ -#define IGC_RAH_POOL_1 0x00040000 + #define IGC_RAL_MAC_ADDR_LEN 4 #define IGC_RAH_MAC_ADDR_LEN 2 diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 44366c1bec19..85df9366e172 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -780,13 +780,18 @@ static void igc_rar_set_index(struct igc_adapter *adapter, u32 index) rar_low = le32_to_cpup((__le32 *)(addr)); rar_high = le16_to_cpup((__le16 *)(addr + 4)); + if (adapter->mac_table[index].state & IGC_MAC_STATE_QUEUE_STEERING) { + u8 queue = adapter->mac_table[index].queue; + u32 qsel = IGC_RAH_QSEL_MASK & (queue << IGC_RAH_QSEL_SHIFT); + + rar_high |= qsel; + rar_high |= IGC_RAH_QSEL_ENABLE; + } + /* Indicate to hardware the Address is Valid. */ if (adapter->mac_table[index].state & IGC_MAC_STATE_IN_USE) { if (is_valid_ether_addr(addr)) rar_high |= IGC_RAH_AV; - - rar_high |= IGC_RAH_POOL_1 << - adapter->mac_table[index].queue; } wr32(IGC_RAL(index), rar_low); -- cgit v1.2.3-59-g8ed1b From c24fd2481e0bd3d2c5755c7a3dc898ef249c0ddb Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Fri, 6 Mar 2020 16:36:42 -0800 Subject: igc: Remove dead code related to flower filter IGC driver has no support for tc-flower filters so this patch removes some leftover code, probably copied from IGB driver by mistake. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc.h | 1 - drivers/net/ethernet/intel/igc/igc_main.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 4643f358b843..5f21dcfe99ce 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -185,7 +185,6 @@ struct igc_adapter { /* RX network flow classification support */ struct hlist_head nfc_filter_list; - struct hlist_head cls_flower_list; unsigned int nfc_filter_count; /* lock for RX network flow classification filter */ diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 85df9366e172..6acb85842d0a 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3487,9 +3487,6 @@ static void igc_nfc_filter_exit(struct igc_adapter *adapter) hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) igc_erase_filter(adapter, rule); - hlist_for_each_entry(rule, &adapter->cls_flower_list, nfc_node) - igc_erase_filter(adapter, rule); - spin_unlock(&adapter->nfc_lock); } -- cgit v1.2.3-59-g8ed1b From ac9156b27564a089ec52f526bfcb59f61c34e7c6 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Mon, 9 Mar 2020 16:10:40 -0700 Subject: igc: Fix default MAC address filter override This patch fixes a bug when the user adds the first MAC address filter via ethtool NFC mechanism. When the first MAC address filter is added, it overwrites the default MAC address filter configured at RAL[0] and RAH[0]. As consequence, frames addressed to the interface MAC address are not sent to host anymore. This patch fixes the bug by calling igc_set_default_mac_filter() during adapter init so the position 0 of adapter->mac_table[] is assigned to the default MAC address. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 6acb85842d0a..9d1792e80e2e 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2354,7 +2354,9 @@ static void igc_configure(struct igc_adapter *adapter) igc_setup_mrqc(adapter); igc_setup_rctl(adapter); + igc_set_default_mac_filter(adapter); igc_nfc_filter_restore(adapter); + igc_configure_tx(adapter); igc_configure_rx(adapter); -- cgit v1.2.3-59-g8ed1b From 5800091a206172e2016e84906035f1d757cc6547 Mon Sep 17 00:00:00 2001 From: David Bauer Date: Fri, 17 Apr 2020 15:41:59 +0200 Subject: net: phy: at803x: add support for AR8032 PHY This adds support for the Qualcomm Atheros AR8032 Fast Ethernet PHY. It shares many similarities with the already supported AR8030 PHY but additionally supports MII connection to the MAC. Signed-off-by: David Bauer Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 31f731e6df72..31b6edcc1fd1 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -106,6 +106,7 @@ #define ATH9331_PHY_ID 0x004dd041 #define ATH8030_PHY_ID 0x004dd076 #define ATH8031_PHY_ID 0x004dd074 +#define ATH8032_PHY_ID 0x004dd023 #define ATH8035_PHY_ID 0x004dd072 #define AT803X_PHY_ID_MASK 0xffffffef @@ -762,6 +763,21 @@ static struct phy_driver at803x_driver[] = { .aneg_done = at803x_aneg_done, .ack_interrupt = &at803x_ack_interrupt, .config_intr = &at803x_config_intr, +}, { + /* Qualcomm Atheros AR8032 */ + PHY_ID_MATCH_EXACT(ATH8032_PHY_ID), + .name = "Qualcomm Atheros AR8032", + .probe = at803x_probe, + .remove = at803x_remove, + .config_init = at803x_config_init, + .link_change_notify = at803x_link_change_notify, + .set_wol = at803x_set_wol, + .get_wol = at803x_get_wol, + .suspend = at803x_suspend, + .resume = at803x_resume, + /* PHY_BASIC_FEATURES */ + .ack_interrupt = at803x_ack_interrupt, + .config_intr = at803x_config_intr, }, { /* ATHEROS AR9331 */ PHY_ID_MATCH_EXACT(ATH9331_PHY_ID), @@ -778,6 +794,7 @@ module_phy_driver(at803x_driver); static struct mdio_device_id __maybe_unused atheros_tbl[] = { { ATH8030_PHY_ID, AT803X_PHY_ID_MASK }, { ATH8031_PHY_ID, AT803X_PHY_ID_MASK }, + { PHY_ID_MATCH_EXACT(ATH8032_PHY_ID) }, { ATH8035_PHY_ID, AT803X_PHY_ID_MASK }, { PHY_ID_MATCH_EXACT(ATH9331_PHY_ID) }, { } -- cgit v1.2.3-59-g8ed1b From 007fc3c0ca478f3a8ad687cf9ecbe672d3a64700 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 17 Apr 2020 11:33:41 -0700 Subject: net: dsa: b53: per-port interrupts are optional Make use of platform_get_irq_byname_optional() to avoid printing messages on the kernel console that interrupts cannot be found. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_srab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c index 0a1be5259be0..1207c3095027 100644 --- a/drivers/net/dsa/b53/b53_srab.c +++ b/drivers/net/dsa/b53/b53_srab.c @@ -524,7 +524,7 @@ static void b53_srab_prepare_irq(struct platform_device *pdev) port->num = i; port->dev = dev; - port->irq = platform_get_irq_byname(pdev, name); + port->irq = platform_get_irq_byname_optional(pdev, name); kfree(name); } -- cgit v1.2.3-59-g8ed1b From c6f5f242f5ed58cc531e75507e8447a8c9b6cd30 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 17 Apr 2020 11:34:20 -0700 Subject: net: phy: mdio-bcm-iproc: Do not show kernel pointer Displaying the virtual address at which the MDIO base register address has been mapped is not useful and is not visible with pointer hashing in place, replace the message with something indicating successful registration instead. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-bcm-iproc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/mdio-bcm-iproc.c b/drivers/net/phy/mdio-bcm-iproc.c index f1ded03f0229..38bf40e0d673 100644 --- a/drivers/net/phy/mdio-bcm-iproc.c +++ b/drivers/net/phy/mdio-bcm-iproc.c @@ -159,7 +159,7 @@ static int iproc_mdio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, priv); - dev_info(&pdev->dev, "Broadcom iProc MDIO bus at 0x%p\n", priv->base); + dev_info(&pdev->dev, "Broadcom iProc MDIO bus registered\n"); return 0; -- cgit v1.2.3-59-g8ed1b From 123aff2a789c3975c2235653939ff00107d6156c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 17 Apr 2020 11:38:02 -0700 Subject: net: phy: broadcom: Add support for BCM53125 internal PHYs BCM53125 has internal Gigabit PHYs which support interrupts as well as statistics, make it possible to configure both of those features with a PHY driver entry. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 14 ++++++++++++++ include/linux/brcmphy.h | 1 + 2 files changed, 15 insertions(+) diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index ae4873f2f86e..97201d5cf007 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -781,6 +781,19 @@ static struct phy_driver broadcom_drivers[] = { .get_strings = bcm_phy_get_strings, .get_stats = bcm53xx_phy_get_stats, .probe = bcm53xx_phy_probe, +}, { + .phy_id = PHY_ID_BCM53125, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM53125", + .flags = PHY_IS_INTERNAL, + /* PHY_GBIT_FEATURES */ + .get_sset_count = bcm_phy_get_sset_count, + .get_strings = bcm_phy_get_strings, + .get_stats = bcm53xx_phy_get_stats, + .probe = bcm53xx_phy_probe, + .config_init = bcm54xx_config_init, + .ack_interrupt = bcm_phy_ack_intr, + .config_intr = bcm_phy_config_intr, }, { .phy_id = PHY_ID_BCM89610, .phy_id_mask = 0xfffffff0, @@ -810,6 +823,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = { { PHY_ID_BCMAC131, 0xfffffff0 }, { PHY_ID_BCM5241, 0xfffffff0 }, { PHY_ID_BCM5395, 0xfffffff0 }, + { PHY_ID_BCM53125, 0xfffffff0 }, { PHY_ID_BCM89610, 0xfffffff0 }, { } }; diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6462c5447872..7e1d857c8468 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -15,6 +15,7 @@ #define PHY_ID_BCMAC131 0x0143bc70 #define PHY_ID_BCM5481 0x0143bca0 #define PHY_ID_BCM5395 0x0143bcf0 +#define PHY_ID_BCM53125 0x03625f20 #define PHY_ID_BCM54810 0x03625d00 #define PHY_ID_BCM5482 0x0143bcb0 #define PHY_ID_BCM5411 0x00206070 -- cgit v1.2.3-59-g8ed1b From 89f9ffd3eb670bad1260bc579f5e13b8f2d5b3e0 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Apr 2020 22:03:08 +0300 Subject: net: mscc: ocelot: deal with problematic MAC_ETYPE VCAP IS2 rules By default, the VCAP IS2 will produce a single match for each frame, on the most specific classification. Example: a ping packet (ICMP over IPv4 over Ethernet) sent from an IP address of 10.0.0.1 and a MAC address of 96:18:82:00:04:01 will match this rule: tc filter add dev swp0 ingress protocol ipv4 \ flower skip_sw src_ip 10.0.0.1 action drop but not this one: tc filter add dev swp0 ingress \ flower skip_sw src_mac 96:18:82:00:04:01 action drop Currently the driver does not really warn the user in any way about this, and the behavior is rather strange anyway. The current patch is a workaround to force matches on MAC_ETYPE keys (DMAC and SMAC) for all packets irrespective of higher layer protocol. The setting is made at the port level. Of course this breaks all other non-src_mac and non-dst_mac matches, so rule exclusivity checks have been added to the driver, in order to never have rules of both types on any ingress port. The bits that discard higher-level protocol information are set only once a MAC_ETYPE rule is added to a filter block, and only for the ports that are bound to that filter block. Then all further non-MAC_ETYPE rules added to that filter block should be denied by the ports bound to it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_ace.c | 103 +++++++++++++++++++++++++++++- drivers/net/ethernet/mscc/ocelot_ace.h | 5 +- drivers/net/ethernet/mscc/ocelot_flower.c | 2 +- 3 files changed, 106 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_ace.c b/drivers/net/ethernet/mscc/ocelot_ace.c index 3bd286044480..8a2f7d13ef6d 100644 --- a/drivers/net/ethernet/mscc/ocelot_ace.c +++ b/drivers/net/ethernet/mscc/ocelot_ace.c @@ -706,13 +706,114 @@ ocelot_ace_rule_get_rule_index(struct ocelot_acl_block *block, int index) return NULL; } +/* If @on=false, then SNAP, ARP, IP and OAM frames will not match on keys based + * on destination and source MAC addresses, but only on higher-level protocol + * information. The only frame types to match on keys containing MAC addresses + * in this case are non-SNAP, non-ARP, non-IP and non-OAM frames. + * + * If @on=true, then the above frame types (SNAP, ARP, IP and OAM) will match + * on MAC_ETYPE keys such as destination and source MAC on this ingress port. + * However the setting has the side effect of making these frames not matching + * on any _other_ keys than MAC_ETYPE ones. + */ +static void ocelot_match_all_as_mac_etype(struct ocelot *ocelot, int port, + bool on) +{ + u32 val = 0; + + if (on) + val = ANA_PORT_VCAP_S2_CFG_S2_SNAP_DIS(3) | + ANA_PORT_VCAP_S2_CFG_S2_ARP_DIS(3) | + ANA_PORT_VCAP_S2_CFG_S2_IP_TCPUDP_DIS(3) | + ANA_PORT_VCAP_S2_CFG_S2_IP_OTHER_DIS(3) | + ANA_PORT_VCAP_S2_CFG_S2_OAM_DIS(3); + + ocelot_rmw_gix(ocelot, val, + ANA_PORT_VCAP_S2_CFG_S2_SNAP_DIS_M | + ANA_PORT_VCAP_S2_CFG_S2_ARP_DIS_M | + ANA_PORT_VCAP_S2_CFG_S2_IP_TCPUDP_DIS_M | + ANA_PORT_VCAP_S2_CFG_S2_IP_OTHER_DIS_M | + ANA_PORT_VCAP_S2_CFG_S2_OAM_DIS_M, + ANA_PORT_VCAP_S2_CFG, port); +} + +static bool ocelot_ace_is_problematic_mac_etype(struct ocelot_ace_rule *ace) +{ + if (ace->type != OCELOT_ACE_TYPE_ETYPE) + return false; + if (ether_addr_to_u64(ace->frame.etype.dmac.value) & + ether_addr_to_u64(ace->frame.etype.dmac.mask)) + return true; + if (ether_addr_to_u64(ace->frame.etype.smac.value) & + ether_addr_to_u64(ace->frame.etype.smac.mask)) + return true; + return false; +} + +static bool ocelot_ace_is_problematic_non_mac_etype(struct ocelot_ace_rule *ace) +{ + if (ace->type == OCELOT_ACE_TYPE_SNAP) + return true; + if (ace->type == OCELOT_ACE_TYPE_ARP) + return true; + if (ace->type == OCELOT_ACE_TYPE_IPV4) + return true; + if (ace->type == OCELOT_ACE_TYPE_IPV6) + return true; + return false; +} + +static bool ocelot_exclusive_mac_etype_ace_rules(struct ocelot *ocelot, + struct ocelot_ace_rule *ace) +{ + struct ocelot_acl_block *block = &ocelot->acl_block; + struct ocelot_ace_rule *tmp; + unsigned long port; + int i; + + if (ocelot_ace_is_problematic_mac_etype(ace)) { + /* Search for any non-MAC_ETYPE rules on the port */ + for (i = 0; i < block->count; i++) { + tmp = ocelot_ace_rule_get_rule_index(block, i); + if (tmp->ingress_port_mask & ace->ingress_port_mask && + ocelot_ace_is_problematic_non_mac_etype(tmp)) + return false; + } + + for_each_set_bit(port, &ace->ingress_port_mask, + ocelot->num_phys_ports) + ocelot_match_all_as_mac_etype(ocelot, port, true); + } else if (ocelot_ace_is_problematic_non_mac_etype(ace)) { + /* Search for any MAC_ETYPE rules on the port */ + for (i = 0; i < block->count; i++) { + tmp = ocelot_ace_rule_get_rule_index(block, i); + if (tmp->ingress_port_mask & ace->ingress_port_mask && + ocelot_ace_is_problematic_mac_etype(tmp)) + return false; + } + + for_each_set_bit(port, &ace->ingress_port_mask, + ocelot->num_phys_ports) + ocelot_match_all_as_mac_etype(ocelot, port, false); + } + + return true; +} + int ocelot_ace_rule_offload_add(struct ocelot *ocelot, - struct ocelot_ace_rule *rule) + struct ocelot_ace_rule *rule, + struct netlink_ext_ack *extack) { struct ocelot_acl_block *block = &ocelot->acl_block; struct ocelot_ace_rule *ace; int i, index; + if (!ocelot_exclusive_mac_etype_ace_rules(ocelot, rule)) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot mix MAC_ETYPE with non-MAC_ETYPE rules"); + return -EBUSY; + } + /* Add rule to the linked list */ ocelot_ace_rule_add(ocelot, block, rule); diff --git a/drivers/net/ethernet/mscc/ocelot_ace.h b/drivers/net/ethernet/mscc/ocelot_ace.h index 29d22c566786..099e177f2617 100644 --- a/drivers/net/ethernet/mscc/ocelot_ace.h +++ b/drivers/net/ethernet/mscc/ocelot_ace.h @@ -194,7 +194,7 @@ struct ocelot_ace_rule { enum ocelot_ace_action action; struct ocelot_ace_stats stats; - u16 ingress_port_mask; + unsigned long ingress_port_mask; enum ocelot_vcap_bit dmac_mc; enum ocelot_vcap_bit dmac_bc; @@ -215,7 +215,8 @@ struct ocelot_ace_rule { }; int ocelot_ace_rule_offload_add(struct ocelot *ocelot, - struct ocelot_ace_rule *rule); + struct ocelot_ace_rule *rule, + struct netlink_ext_ack *extack); int ocelot_ace_rule_offload_del(struct ocelot *ocelot, struct ocelot_ace_rule *rule); int ocelot_ace_rule_stats_update(struct ocelot *ocelot, diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 341923311fec..954cb67eeaa2 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -205,7 +205,7 @@ int ocelot_cls_flower_replace(struct ocelot *ocelot, int port, return ret; } - return ocelot_ace_rule_offload_add(ocelot, ace); + return ocelot_ace_rule_offload_add(ocelot, ace, f->common.extack); } EXPORT_SYMBOL_GPL(ocelot_cls_flower_replace); -- cgit v1.2.3-59-g8ed1b From 7070eea5e95a3c1ea30b9afcedd8d29efcc28477 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Apr 2020 22:07:55 +0300 Subject: enetc: permit configuration of rx-vlan-filter with ethtool Each ENETC station interface (SI) has a VLAN filter list and a port flag (PSIPVMR) by which it can be put in "VLAN promiscuous" mode, which enables the reception of VLAN-tagged traffic even if it is not in the VLAN filtering list. Currently the handling of this setting works like this: the port starts off as VLAN promiscuous, then it switches to enabling VLAN filtering as soon as the first VLAN is installed in its filter via .ndo_vlan_rx_add_vid. In practice that does not work out very well, because more often than not, the first VLAN to be installed is out of the control of the user: the 8021q module, if loaded, adds its rule for 802.1p (VID 0) traffic upon bringing the interface up. What the user is currently seeing in ethtool is this: ethtool -k eno2 rx-vlan-filter: on [fixed] which doesn't match the intention of the code, but the practical reality of having the 8021q module install its VID which has the side-effect of turning on VLAN filtering in this driver. All in all, a slightly confusing experience. So instead of letting this driver switch the VLAN filtering state by itself, just wire it up with the rx-vlan-filter feature from ethtool, and let it be user-configurable just through that knob, except for one case, see below. In promiscuous mode, it is more intuitive that all traffic is received, including VLAN tagged traffic. It appears that it is necessary to set the flag in PSIPVMR for that to be the case, so VLAN promiscuous mode is also temporarily enabled. On exit from promiscuous mode, the setting made by ethtool is restored. Signed-off-by: Vladimir Oltean Reviewed-by: Claudiu Manoil Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc_pf.c | 44 +++++++++---------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 85e2b741df41..de1ad4975074 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -50,21 +50,6 @@ static void enetc_set_vlan_promisc(struct enetc_hw *hw, char si_map) enetc_port_wr(hw, ENETC_PSIPVMR, ENETC_PSIPVMR_SET_VP(si_map) | val); } -static bool enetc_si_vlan_promisc_is_on(struct enetc_pf *pf, int si_idx) -{ - return pf->vlan_promisc_simap & BIT(si_idx); -} - -static bool enetc_vlan_filter_is_on(struct enetc_pf *pf) -{ - int i; - - for_each_set_bit(i, pf->active_vlans, VLAN_N_VID) - return true; - - return false; -} - static void enetc_enable_si_vlan_promisc(struct enetc_pf *pf, int si_idx) { pf->vlan_promisc_simap |= BIT(si_idx); @@ -204,6 +189,7 @@ static void enetc_pf_set_rx_mode(struct net_device *ndev) { struct enetc_ndev_priv *priv = netdev_priv(ndev); struct enetc_pf *pf = enetc_si_priv(priv->si); + char vlan_promisc_simap = pf->vlan_promisc_simap; struct enetc_hw *hw = &priv->si->hw; bool uprom = false, mprom = false; struct enetc_mac_filter *filter; @@ -216,16 +202,16 @@ static void enetc_pf_set_rx_mode(struct net_device *ndev) psipmr = ENETC_PSIPMR_SET_UP(0) | ENETC_PSIPMR_SET_MP(0); uprom = true; mprom = true; - /* enable VLAN promisc mode for SI0 */ - if (!enetc_si_vlan_promisc_is_on(pf, 0)) - enetc_enable_si_vlan_promisc(pf, 0); - + /* Enable VLAN promiscuous mode for SI0 (PF) */ + vlan_promisc_simap |= BIT(0); } else if (ndev->flags & IFF_ALLMULTI) { /* enable multi cast promisc mode for SI0 (PF) */ psipmr = ENETC_PSIPMR_SET_MP(0); mprom = true; } + enetc_set_vlan_promisc(&pf->si->hw, vlan_promisc_simap); + /* first 2 filter entries belong to PF */ if (!uprom) { /* Update unicast filters */ @@ -306,9 +292,6 @@ static int enetc_vlan_rx_add_vid(struct net_device *ndev, __be16 prot, u16 vid) struct enetc_pf *pf = enetc_si_priv(priv->si); int idx; - if (enetc_si_vlan_promisc_is_on(pf, 0)) - enetc_disable_si_vlan_promisc(pf, 0); - __set_bit(vid, pf->active_vlans); idx = enetc_vid_hash_idx(vid); @@ -326,9 +309,6 @@ static int enetc_vlan_rx_del_vid(struct net_device *ndev, __be16 prot, u16 vid) __clear_bit(vid, pf->active_vlans); enetc_sync_vlan_ht_filter(pf, true); - if (!enetc_vlan_filter_is_on(pf)) - enetc_enable_si_vlan_promisc(pf, 0); - return 0; } @@ -677,6 +657,15 @@ static int enetc_pf_set_features(struct net_device *ndev, enetc_enable_txvlan(&priv->si->hw, 0, !!(features & NETIF_F_HW_VLAN_CTAG_TX)); + if (changed & NETIF_F_HW_VLAN_CTAG_FILTER) { + struct enetc_pf *pf = enetc_si_priv(priv->si); + + if (!!(features & NETIF_F_HW_VLAN_CTAG_FILTER)) + enetc_disable_si_vlan_promisc(pf, 0); + else + enetc_enable_si_vlan_promisc(pf, 0); + } + if (changed & NETIF_F_LOOPBACK) enetc_set_loopback(ndev, !!(features & NETIF_F_LOOPBACK)); @@ -719,12 +708,11 @@ static void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev, ndev->hw_features = NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | - NETIF_F_LOOPBACK; + NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_LOOPBACK; ndev->features = NETIF_F_HIGHDMA | NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_CTAG_RX | - NETIF_F_HW_VLAN_CTAG_FILTER; + NETIF_F_HW_VLAN_CTAG_RX; if (si->num_rss) ndev->hw_features |= NETIF_F_RXHASH; -- cgit v1.2.3-59-g8ed1b From 0360c046ca186be1953d185d5a3631e415381820 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 18 Apr 2020 23:06:51 +0200 Subject: r8169: move setting OCP base to generic init code Move setting the ocp_base to rtl_init_one(). Where supported the value is always the same, and if not supported it doesn't hurt. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index bf5bf05970a2..f882e8c09987 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -75,6 +75,8 @@ #define R8169_TX_RING_BYTES (NUM_TX_DESC * sizeof(struct TxDesc)) #define R8169_RX_RING_BYTES (NUM_RX_DESC * sizeof(struct RxDesc)) +#define OCP_STD_PHY_BASE 0xa400 + #define RTL_CFG_NO_GBIT 1 /* write/read MMIO register */ @@ -847,8 +849,6 @@ static void r8168_mac_ocp_modify(struct rtl8169_private *tp, u32 reg, u16 mask, r8168_mac_ocp_write(tp, reg, (data & ~mask) | set); } -#define OCP_STD_PHY_BASE 0xa400 - static void r8168g_mdio_write(struct rtl8169_private *tp, int reg, int value) { if (reg == 0x1f) { @@ -5189,8 +5189,6 @@ static int r8169_mdio_register(struct rtl8169_private *tp) static void rtl_hw_init_8168g(struct rtl8169_private *tp) { - tp->ocp_base = OCP_STD_PHY_BASE; - RTL_W32(tp, MISC, RTL_R32(tp, MISC) | RXDV_GATED_EN); if (!rtl_udelay_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 42)) @@ -5215,8 +5213,6 @@ static void rtl_hw_init_8168g(struct rtl8169_private *tp) static void rtl_hw_init_8125(struct rtl8169_private *tp) { - tp->ocp_base = OCP_STD_PHY_BASE; - RTL_W32(tp, MISC, RTL_R32(tp, MISC) | RXDV_GATED_EN); if (!rtl_udelay_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42)) @@ -5353,6 +5349,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) tp->msg_enable = netif_msg_init(debug.msg_enable, R8169_MSG_DEFAULT); tp->supports_gmii = ent->driver_data == RTL_CFG_NO_GBIT ? 0 : 1; tp->eee_adv = -1; + tp->ocp_base = OCP_STD_PHY_BASE; /* Get the *optional* external "ether_clk" used on some boards */ rc = rtl_get_ether_clk(tp); -- cgit v1.2.3-59-g8ed1b From a9b3d56830a3d626a0d85adc285ea94e829b6e9d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 18 Apr 2020 23:07:41 +0200 Subject: r8169: remove NETIF_F_HIGHDMA from vlan_features NETIF_F_HIGHDMA is added to vlan_features by register_netdev(), therefore we can omit this here. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index f882e8c09987..2d6c94652dc7 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5440,8 +5440,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; - dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | - NETIF_F_HIGHDMA; + dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; tp->cp_cmd |= RxChkSum; -- cgit v1.2.3-59-g8ed1b From 85ab8b245ec6572eb489d0fe76c5573851d7b7ad Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 18 Apr 2020 23:08:43 +0200 Subject: r8169: preserve VLAN setting on RTL8125 in rtl_init_rxcfg So far we set RX_VLAN_8125 unconditionally, even if NETIF_F_HW_VLAN_CTAG_RX may not be set. Don't touch these bits, and let only rtl8169_set_features() control them. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 2d6c94652dc7..e37ff1a5161d 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2397,6 +2397,8 @@ static void rtl_pll_power_up(struct rtl8169_private *tp) static void rtl_init_rxcfg(struct rtl8169_private *tp) { + u32 vlan; + switch (tp->mac_version) { case RTL_GIGA_MAC_VER_02 ... RTL_GIGA_MAC_VER_06: case RTL_GIGA_MAC_VER_10 ... RTL_GIGA_MAC_VER_17: @@ -2411,8 +2413,9 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF); break; case RTL_GIGA_MAC_VER_60 ... RTL_GIGA_MAC_VER_61: - RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_VLAN_8125 | - RX_DMA_BURST); + /* VLAN flags are controlled by NETIF_F_HW_VLAN_CTAG_RX */ + vlan = RTL_R32(tp, RxConfig) & RX_VLAN_8125; + RTL_W32(tp, RxConfig, vlan | RX_FETCH_DFLT_8125 | RX_DMA_BURST); break; default: RTL_W32(tp, RxConfig, RX128_INT_EN | RX_DMA_BURST); -- cgit v1.2.3-59-g8ed1b From 145192f83a1184ca8f2ef4508e7a93bb783bb444 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 18 Apr 2020 23:10:03 +0200 Subject: r8169: use rtl8169_set_features in rtl8169_init_one At that place in rtl_init_one() we can safely use rtl8169_set_features() to configure the chip according to the default features. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index e37ff1a5161d..e8c55b795c76 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5446,10 +5446,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - tp->cp_cmd |= RxChkSum; - /* RTL8125 uses register RxConfig for VLAN offloading config */ - if (!rtl_is_8125(tp)) - tp->cp_cmd |= RxVlan; /* * Pretend we are using VLANs; This bypasses a nasty bug where * Interrupts stop flowing on high load on 8110SCd controllers. @@ -5481,6 +5477,9 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->hw_features |= NETIF_F_RXALL; dev->hw_features |= NETIF_F_RXFCS; + /* configure chip for default features */ + rtl8169_set_features(dev, dev->features); + jumbo_max = rtl_jumbo_max(tp); if (jumbo_max) dev->max_mtu = jumbo_max; -- cgit v1.2.3-59-g8ed1b From 0623b98b41cd16073ee15f35e058194313e6dc51 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 18 Apr 2020 23:10:44 +0200 Subject: r8169: improve rtl8169_tso_csum_v2 Simplify the code and avoid the overhead of calling vlan_get_protocol(). Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index e8c55b795c76..2cd2b038e4eb 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4127,25 +4127,20 @@ static bool rtl8169_tso_csum_v2(struct rtl8169_private *tp, struct sk_buff *skb, u32 *opts) { u32 transport_offset = (u32)skb_transport_offset(skb); - u32 mss = skb_shinfo(skb)->gso_size; + struct skb_shared_info *shinfo = skb_shinfo(skb); + u32 mss = shinfo->gso_size; if (mss) { - switch (vlan_get_protocol(skb)) { - case htons(ETH_P_IP): + if (shinfo->gso_type & SKB_GSO_TCPV4) { opts[0] |= TD1_GTSENV4; - break; - - case htons(ETH_P_IPV6): + } else if (shinfo->gso_type & SKB_GSO_TCPV6) { if (skb_cow_head(skb, 0)) return false; tcp_v6_gso_csum_prep(skb); opts[0] |= TD1_GTSENV6; - break; - - default: + } else { WARN_ON_ONCE(1); - break; } opts[0] |= transport_offset << GTTCPHO_SHIFT; -- cgit v1.2.3-59-g8ed1b From 773235f4e1cc41ea98f520a0cfe0a51a58b2d411 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 18 Apr 2020 23:11:32 +0200 Subject: r8169: add workaround for RTL8168evl TSO hw issues Add workaround for hw issues with TSO on RTL8168evl. This workaround is based on information I got from Realtek, and *should* allow to safely enable TSO on this chip version. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 34 +++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 2cd2b038e4eb..a8696d958cd1 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4306,6 +4306,37 @@ err_stop_0: return NETDEV_TX_BUSY; } +static unsigned int rtl_last_frag_len(struct sk_buff *skb) +{ + struct skb_shared_info *info = skb_shinfo(skb); + unsigned int nr_frags = info->nr_frags; + + if (!nr_frags) + return UINT_MAX; + + return skb_frag_size(info->frags + nr_frags - 1); +} + +/* Workaround for hw issues with TSO on RTL8168evl */ +static netdev_features_t rtl8168evl_fix_tso(struct sk_buff *skb, + netdev_features_t features) +{ + /* IPv4 header has options field */ + if (vlan_get_protocol(skb) == htons(ETH_P_IP) && + ip_hdrlen(skb) > sizeof(struct iphdr)) + features &= ~NETIF_F_ALL_TSO; + + /* IPv4 TCP header has options field */ + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4 && + tcp_hdrlen(skb) > sizeof(struct tcphdr)) + features &= ~NETIF_F_ALL_TSO; + + else if (rtl_last_frag_len(skb) <= 6) + features &= ~NETIF_F_ALL_TSO; + + return features; +} + static netdev_features_t rtl8169_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) @@ -4314,6 +4345,9 @@ static netdev_features_t rtl8169_features_check(struct sk_buff *skb, struct rtl8169_private *tp = netdev_priv(dev); if (skb_is_gso(skb)) { + if (tp->mac_version == RTL_GIGA_MAC_VER_34) + features = rtl8168evl_fix_tso(skb, features); + if (transport_offset > GTTCPHO_MAX && rtl_chip_supports_csum_v2(tp)) features &= ~NETIF_F_ALL_TSO; -- cgit v1.2.3-59-g8ed1b From bb7fc863729b45f0fbcdea991d0465d855ffd831 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 5 Apr 2020 20:57:00 +0300 Subject: net/mlx5: Provide simplified command interfaces Many mlx5_cmd_exec() callers are not interested in the output from that command or have standard in/out structures. Those callers simply allocate those structure on the stack and use sizeof() to provide in/out arguments. In this naive approach provide simplified versions of mlx5_cmd_exec(). Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6f8f79ef829b..1caddfa85c4d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -903,6 +903,19 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); + +#define mlx5_cmd_exec_inout(dev, ifc_cmd, in, out) \ + ({ \ + mlx5_cmd_exec(dev, in, MLX5_ST_SZ_BYTES(ifc_cmd##_in), out, \ + MLX5_ST_SZ_BYTES(ifc_cmd##_out)); \ + }) + +#define mlx5_cmd_exec_in(dev, ifc_cmd, in) \ + ({ \ + u32 _out[MLX5_ST_SZ_DW(ifc_cmd##_out)] = {}; \ + mlx5_cmd_exec_inout(dev, ifc_cmd, in, _out); \ + }) + int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); -- cgit v1.2.3-59-g8ed1b From ec44e72b73b74af489196352152e53a20c8ad5eb Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 2 Apr 2020 15:17:40 +0300 Subject: net/mlx5: Open-code create and destroy QP calls FPGA, IPoIB and SW steering don't need anything from the mlx5_core_create_qp() and mlx5_core_destroy_qp() except calls to mlx5_cmd_exec(). Let's open-code it, so we will be able to move qp.c to mlx5_ib. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/fpga/conn.c | 24 +++++++++-------- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 30 ++++++++++------------ .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 18 ++++++++----- 3 files changed, 39 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index 61021133029e..7c3e7232852e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -534,8 +534,9 @@ static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn, unsigned int tx_size, unsigned int rx_size) { struct mlx5_fpga_device *fdev = conn->fdev; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; struct mlx5_core_dev *mdev = fdev->mdev; - u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {0}; + u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {}; void *in = NULL, *qpc; int err, inlen; @@ -600,10 +601,12 @@ static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn, mlx5_fill_page_frag_array(&conn->qp.wq_ctrl.buf, (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas)); - err = mlx5_core_create_qp(mdev, &conn->qp.mqp, in, inlen); + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); if (err) goto err_sq_bufs; + conn->qp.mqp.qpn = MLX5_GET(create_qp_out, out, qpn); conn->qp.mqp.event = mlx5_fpga_conn_event; mlx5_fpga_dbg(fdev, "Created QP #0x%x\n", conn->qp.mqp.qpn); @@ -658,7 +661,14 @@ static void mlx5_fpga_conn_flush_send_bufs(struct mlx5_fpga_conn *conn) static void mlx5_fpga_conn_destroy_qp(struct mlx5_fpga_conn *conn) { - mlx5_core_destroy_qp(conn->fdev->mdev, &conn->qp.mqp); + struct mlx5_core_dev *dev = conn->fdev->mdev; + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + struct mlx5_core_qp *qp = &conn->qp.mqp; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + mlx5_cmd_exec_in(dev, destroy_qp, in); + mlx5_fpga_conn_free_recv_bufs(conn); mlx5_fpga_conn_flush_send_bufs(conn); kvfree(conn->qp.sq.bufs); @@ -972,19 +982,11 @@ out: void mlx5_fpga_conn_destroy(struct mlx5_fpga_conn *conn) { - struct mlx5_fpga_device *fdev = conn->fdev; - struct mlx5_core_dev *mdev = fdev->mdev; - int err = 0; - conn->qp.active = false; tasklet_disable(&conn->cq.tasklet); synchronize_irq(conn->cq.mcq.irqn); mlx5_fpga_destroy_qp(conn->fdev->mdev, conn->fpga_qpn); - err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2ERR_QP, 0, NULL, - &conn->qp.mqp); - if (err) - mlx5_fpga_warn(fdev, "qp_modify 2ERR failed: %d\n", err); mlx5_fpga_conn_destroy_qp(conn); mlx5_fpga_conn_destroy_cq(conn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 673aaa815f57..8bca11cb1e19 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -219,17 +219,12 @@ void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv) int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp) { - u32 *in = NULL; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(create_qp_in)] = {}; void *addr_path; int ret = 0; - int inlen; void *qpc; - inlen = MLX5_ST_SZ_BYTES(create_qp_in); - in = kvzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; - qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD); MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); @@ -240,20 +235,23 @@ int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp MLX5_SET(ads, addr_path, vhca_port_num, 1); MLX5_SET(ads, addr_path, grh, 1); - ret = mlx5_core_create_qp(mdev, qp, in, inlen); - if (ret) { - mlx5_core_err(mdev, "Failed creating IPoIB QP err : %d\n", ret); - goto out; - } + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + ret = mlx5_cmd_exec_inout(mdev, create_qp, in, out); + if (ret) + return ret; -out: - kvfree(in); - return ret; + qp->qpn = MLX5_GET(create_qp_out, out, qpn); + + return 0; } void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp) { - mlx5_core_destroy_qp(mdev, qp); + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + mlx5_cmd_exec_in(mdev, destroy_qp, in); } int mlx5i_create_tis(struct mlx5_core_dev *mdev, u32 underlay_qpn, u32 *tisn) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index c0ab9cf74929..88bc94a8b8f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -108,6 +108,7 @@ static void dr_qp_event(struct mlx5_core_qp *mqp, int event) static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, struct dr_qp_init_attr *attr) { + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {}; struct mlx5_wq_param wqp; struct mlx5dr_qp *dr_qp; @@ -180,13 +181,12 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas)); - err = mlx5_core_create_qp(mdev, &dr_qp->mqp, in, inlen); + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + dr_qp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn); kfree(in); - - if (err) { - mlx5_core_warn(mdev, " Can't create QP\n"); + if (err) goto err_in; - } dr_qp->mqp.event = dr_qp_event; dr_qp->uar = attr->uar; @@ -204,7 +204,13 @@ err_wq: static void dr_destroy_qp(struct mlx5_core_dev *mdev, struct mlx5dr_qp *dr_qp) { - mlx5_core_destroy_qp(mdev, &dr_qp->mqp); + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + struct mlx5_core_qp *qp = &dr_qp->mqp; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + mlx5_cmd_exec_in(mdev, destroy_qp, in); + kfree(dr_qp->sq.wqe_head); mlx5_wq_destroy(&dr_qp->wq_ctrl); kfree(dr_qp); -- cgit v1.2.3-59-g8ed1b From 73a75b96fc9a79779ad7491b61d65f0bbae04d11 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 6 Apr 2020 08:40:52 +0300 Subject: net/mlx5: Remove empty QP and CQ events handlers The QP and CQ events functions do nothing except printing some debug messages. There is nothing to do with this knowledge and such events, so remove them. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 19 ------------------- .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 14 -------------- 2 files changed, 33 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index 7c3e7232852e..1d49894399af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -362,23 +362,6 @@ static void mlx5_fpga_conn_arm_cq(struct mlx5_fpga_conn *conn) conn->fdev->conn_res.uar->map, conn->cq.wq.cc); } -static void mlx5_fpga_conn_cq_event(struct mlx5_core_cq *mcq, - enum mlx5_event event) -{ - struct mlx5_fpga_conn *conn; - - conn = container_of(mcq, struct mlx5_fpga_conn, cq.mcq); - mlx5_fpga_warn(conn->fdev, "CQ event %u on CQ #%u\n", event, mcq->cqn); -} - -static void mlx5_fpga_conn_event(struct mlx5_core_qp *mqp, int event) -{ - struct mlx5_fpga_conn *conn; - - conn = container_of(mqp, struct mlx5_fpga_conn, qp.mqp); - mlx5_fpga_warn(conn->fdev, "QP event %u on QP #%u\n", event, mqp->qpn); -} - static inline void mlx5_fpga_conn_cqes(struct mlx5_fpga_conn *conn, unsigned int budget) { @@ -493,7 +476,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) *conn->cq.mcq.arm_db = 0; conn->cq.mcq.vector = 0; conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; - conn->cq.mcq.event = mlx5_fpga_conn_cq_event; conn->cq.mcq.irqn = irqn; conn->cq.mcq.uar = fdev->conn_res.uar; tasklet_init(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet, @@ -607,7 +589,6 @@ static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn, goto err_sq_bufs; conn->qp.mqp.qpn = MLX5_GET(create_qp_out, out, qpn); - conn->qp.mqp.event = mlx5_fpga_conn_event; mlx5_fpga_dbg(fdev, "Created QP #0x%x\n", conn->qp.mqp.qpn); goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 88bc94a8b8f1..690e4181db4c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -100,11 +100,6 @@ static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne) return err == CQ_POLL_ERR ? err : npolled; } -static void dr_qp_event(struct mlx5_core_qp *mqp, int event) -{ - pr_info("DR QP event %u on QP #%u\n", event, mqp->qpn); -} - static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, struct dr_qp_init_attr *attr) { @@ -187,7 +182,6 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, kfree(in); if (err) goto err_in; - dr_qp->mqp.event = dr_qp_event; dr_qp->uar = attr->uar; return dr_qp; @@ -695,12 +689,6 @@ static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) return 0; } -static void dr_cq_event(struct mlx5_core_cq *mcq, - enum mlx5_event event) -{ - pr_info("CQ event %u on CQ #%u\n", event, mcq->cqn); -} - static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, struct mlx5_uars_page *uar, size_t ncqe) @@ -761,8 +749,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas); - cq->mcq.event = dr_cq_event; - err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); kvfree(in); -- cgit v1.2.3-59-g8ed1b From acab4b88e93ceb467623b991bb4b1594c893667a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 4 Apr 2020 18:25:48 +0300 Subject: net/mlx5: Open-code modify QP in steering module Remove dependency on qp.c from SW steering by open coding modify QP interface. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 690e4181db4c..266b913d2f9a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -585,8 +585,10 @@ static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev, MLX5_SET(qpc, qpc, rre, 1); MLX5_SET(qpc, qpc, rwe, 1); - return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, qpc, - &dr_qp->mqp); + MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); + MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->mqp.qpn); + + return mlx5_cmd_exec_in(mdev, rst2init_qp, in); } static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev, @@ -600,12 +602,13 @@ static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev, MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->mqp.qpn); - MLX5_SET(qpc, qpc, log_ack_req_freq, 0); MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt); MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry); - return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, 0, qpc, - &dr_qp->mqp); + MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->mqp.qpn); + + return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in); } static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev, @@ -636,8 +639,10 @@ static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev, MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num); MLX5_SET(qpc, qpc, min_rnr_nak, 1); - return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_INIT2RTR_QP, 0, qpc, - &dr_qp->mqp); + MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); + MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->mqp.qpn); + + return mlx5_cmd_exec_in(mdev, init2rtr_qp, in); } static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) -- cgit v1.2.3-59-g8ed1b From a6532fd925b981863161275dea6fd26b2e2c02e4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 6 Apr 2020 09:42:43 +0300 Subject: net/mlx5: Open-code modify QP in the FPGA module Remove dependency on qp.c from the FPGA by open coding modify QP interface. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/fpga/conn.c | 84 ++++++++-------------- 1 file changed, 28 insertions(+), 56 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index 1d49894399af..b00d834d2dbf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -657,30 +657,29 @@ static void mlx5_fpga_conn_destroy_qp(struct mlx5_fpga_conn *conn) mlx5_wq_destroy(&conn->qp.wq_ctrl); } -static inline int mlx5_fpga_conn_reset_qp(struct mlx5_fpga_conn *conn) +static int mlx5_fpga_conn_reset_qp(struct mlx5_fpga_conn *conn) { struct mlx5_core_dev *mdev = conn->fdev->mdev; + u32 in[MLX5_ST_SZ_DW(qp_2rst_in)] = {}; mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to RST\n", conn->qp.mqp.qpn); - return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2RST_QP, 0, NULL, - &conn->qp.mqp); + MLX5_SET(qp_2rst_in, in, opcode, MLX5_CMD_OP_2RST_QP); + MLX5_SET(qp_2rst_in, in, qpn, conn->qp.mqp.qpn); + + return mlx5_cmd_exec_in(mdev, qp_2rst, in); } -static inline int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn) +static int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn) { + u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; struct mlx5_fpga_device *fdev = conn->fdev; struct mlx5_core_dev *mdev = fdev->mdev; - u32 *qpc = NULL; - int err; + u32 *qpc; mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to INIT\n", conn->qp.mqp.qpn); - qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL); - if (!qpc) { - err = -ENOMEM; - goto out; - } + qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc); MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); @@ -691,32 +690,22 @@ static inline int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn) MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn); MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma); - err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, qpc, - &conn->qp.mqp); - if (err) { - mlx5_fpga_warn(fdev, "qp_modify RST2INIT failed: %d\n", err); - goto out; - } + MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); + MLX5_SET(rst2init_qp_in, in, qpn, conn->qp.mqp.qpn); -out: - kfree(qpc); - return err; + return mlx5_cmd_exec_in(mdev, rst2init_qp, in); } -static inline int mlx5_fpga_conn_rtr_qp(struct mlx5_fpga_conn *conn) +static int mlx5_fpga_conn_rtr_qp(struct mlx5_fpga_conn *conn) { + u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; struct mlx5_fpga_device *fdev = conn->fdev; struct mlx5_core_dev *mdev = fdev->mdev; - u32 *qpc = NULL; - int err; + u32 *qpc; mlx5_fpga_dbg(conn->fdev, "QP RTR\n"); - qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL); - if (!qpc) { - err = -ENOMEM; - goto out; - } + qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc); MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_1K_BYTES); MLX5_SET(qpc, qpc, log_msg_max, (u8)MLX5_CAP_GEN(mdev, log_max_msg)); @@ -736,33 +725,22 @@ static inline int mlx5_fpga_conn_rtr_qp(struct mlx5_fpga_conn *conn) MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, fpga_ip), MLX5_FLD_SZ_BYTES(qpc, primary_address_path.rgid_rip)); - err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_INIT2RTR_QP, 0, qpc, - &conn->qp.mqp); - if (err) { - mlx5_fpga_warn(fdev, "qp_modify RST2INIT failed: %d\n", err); - goto out; - } + MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); + MLX5_SET(init2rtr_qp_in, in, qpn, conn->qp.mqp.qpn); -out: - kfree(qpc); - return err; + return mlx5_cmd_exec_in(mdev, init2rtr_qp, in); } -static inline int mlx5_fpga_conn_rts_qp(struct mlx5_fpga_conn *conn) +static int mlx5_fpga_conn_rts_qp(struct mlx5_fpga_conn *conn) { struct mlx5_fpga_device *fdev = conn->fdev; + u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; struct mlx5_core_dev *mdev = fdev->mdev; - u32 *qpc = NULL; - u32 opt_mask; - int err; + u32 *qpc; mlx5_fpga_dbg(conn->fdev, "QP RTS\n"); - qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL); - if (!qpc) { - err = -ENOMEM; - goto out; - } + qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc); MLX5_SET(qpc, qpc, log_ack_req_freq, 8); MLX5_SET(qpc, qpc, min_rnr_nak, 0x12); @@ -772,17 +750,11 @@ static inline int mlx5_fpga_conn_rts_qp(struct mlx5_fpga_conn *conn) MLX5_SET(qpc, qpc, retry_count, 7); MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */ - opt_mask = MLX5_QP_OPTPAR_RNR_TIMEOUT; - err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, opt_mask, qpc, - &conn->qp.mqp); - if (err) { - mlx5_fpga_warn(fdev, "qp_modify RST2INIT failed: %d\n", err); - goto out; - } + MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + MLX5_SET(rtr2rts_qp_in, in, qpn, conn->qp.mqp.qpn); + MLX5_SET(rtr2rts_qp_in, in, opt_param_mask, MLX5_QP_OPTPAR_RNR_TIMEOUT); -out: - kfree(qpc); - return err; + return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in); } static int mlx5_fpga_conn_connect(struct mlx5_fpga_conn *conn) -- cgit v1.2.3-59-g8ed1b From a452e0e43669d5223f3f2264d0d4f08acdba98c0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 6 Apr 2020 13:43:14 +0300 Subject: net/mlx5: Open-code modify QP in the IPoIB module Remove dependency on qp.c from the IPoIB by open coding modify QP interface. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 76 ++++++++++++---------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 8bca11cb1e19..83b198d8e3d6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -161,44 +161,54 @@ int mlx5i_init_underlay_qp(struct mlx5e_priv *priv) struct mlx5_core_dev *mdev = priv->mdev; struct mlx5i_priv *ipriv = priv->ppriv; struct mlx5_core_qp *qp = &ipriv->qp; - struct mlx5_qp_context *context; int ret; - /* QP states */ - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return -ENOMEM; + { + u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; + u32 *qpc; - context->flags = cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); - context->pri_path.port = 1; - context->pri_path.pkey_index = cpu_to_be16(ipriv->pkey_index); - context->qkey = cpu_to_be32(IB_DEFAULT_Q_KEY); + qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc); - ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, context, qp); - if (ret) { - mlx5_core_err(mdev, "Failed to modify qp RST2INIT, err: %d\n", ret); - goto err_qp_modify_to_err; - } - memset(context, 0, sizeof(*context)); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, primary_address_path.pkey_index, + ipriv->pkey_index); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); + MLX5_SET(qpc, qpc, q_key, IB_DEFAULT_Q_KEY); - ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_INIT2RTR_QP, 0, context, qp); - if (ret) { - mlx5_core_err(mdev, "Failed to modify qp INIT2RTR, err: %d\n", ret); - goto err_qp_modify_to_err; + MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); + MLX5_SET(rst2init_qp_in, in, qpn, qp->qpn); + ret = mlx5_cmd_exec_in(mdev, rst2init_qp, in); + if (ret) + goto err_qp_modify_to_err; } - - ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, 0, context, qp); - if (ret) { - mlx5_core_err(mdev, "Failed to modify qp RTR2RTS, err: %d\n", ret); - goto err_qp_modify_to_err; + { + u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; + + MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); + MLX5_SET(init2rtr_qp_in, in, qpn, qp->qpn); + ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, in); + if (ret) + goto err_qp_modify_to_err; + } + { + u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; + + MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + MLX5_SET(rtr2rts_qp_in, in, qpn, qp->qpn); + ret = mlx5_cmd_exec_in(mdev, rtr2rts_qp, in); + if (ret) + goto err_qp_modify_to_err; } - - kfree(context); return 0; err_qp_modify_to_err: - mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2ERR_QP, 0, &context, qp); - kfree(context); + { + u32 in[MLX5_ST_SZ_DW(qp_2err_in)] = {}; + + MLX5_SET(qp_2err_in, in, opcode, MLX5_CMD_OP_2ERR_QP); + MLX5_SET(qp_2err_in, in, qpn, qp->qpn); + mlx5_cmd_exec_in(mdev, qp_2err, in); + } return ret; } @@ -206,13 +216,11 @@ void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv) { struct mlx5i_priv *ipriv = priv->ppriv; struct mlx5_core_dev *mdev = priv->mdev; - struct mlx5_qp_context context; - int err; + u32 in[MLX5_ST_SZ_DW(qp_2rst_in)] = {}; - err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2RST_QP, 0, &context, - &ipriv->qp); - if (err) - mlx5_core_err(mdev, "Failed to modify qp 2RST, err: %d\n", err); + MLX5_SET(qp_2rst_in, in, opcode, MLX5_CMD_OP_2RST_QP); + MLX5_SET(qp_2rst_in, in, qpn, ipriv->qp.qpn); + mlx5_cmd_exec_in(mdev, qp_2rst, in); } #define MLX5_QP_ENHANCED_ULP_STATELESS_MODE 2 -- cgit v1.2.3-59-g8ed1b From f93f4f4f31492468d5c6903e35cc0e31a9cb2c48 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 6 Apr 2020 11:17:44 +0300 Subject: net/mlx5: Remove extra indirection while storing QPN The FPGA, SW steering and IPoIB need to have only QPN from the mlx5_core_qp struct, so reduce memory footprint by storing QPN directly. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/fpga/conn.c | 23 ++++----- .../net/ethernet/mellanox/mlx5/core/fpga/conn.h | 2 +- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 60 +++++++++++----------- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 6 +-- .../ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c | 19 +++---- .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 19 ++++--- .../mellanox/mlx5/core/steering/dr_types.h | 2 +- 7 files changed, 64 insertions(+), 67 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index b00d834d2dbf..182d3ac3e73f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -165,7 +165,7 @@ static void mlx5_fpga_conn_post_send(struct mlx5_fpga_conn *conn, ctrl->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; ctrl->opmod_idx_opcode = cpu_to_be32(((conn->qp.sq.pc & 0xffff) << 8) | MLX5_OPCODE_SEND); - ctrl->qpn_ds = cpu_to_be32(size | (conn->qp.mqp.qpn << 8)); + ctrl->qpn_ds = cpu_to_be32(size | (conn->qp.qpn << 8)); conn->qp.sq.pc++; conn->qp.sq.bufs[ix] = buf; @@ -588,8 +588,8 @@ static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn, if (err) goto err_sq_bufs; - conn->qp.mqp.qpn = MLX5_GET(create_qp_out, out, qpn); - mlx5_fpga_dbg(fdev, "Created QP #0x%x\n", conn->qp.mqp.qpn); + conn->qp.qpn = MLX5_GET(create_qp_out, out, qpn); + mlx5_fpga_dbg(fdev, "Created QP #0x%x\n", conn->qp.qpn); goto out; @@ -644,10 +644,9 @@ static void mlx5_fpga_conn_destroy_qp(struct mlx5_fpga_conn *conn) { struct mlx5_core_dev *dev = conn->fdev->mdev; u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; - struct mlx5_core_qp *qp = &conn->qp.mqp; MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); - MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, in, qpn, conn->qp.qpn); mlx5_cmd_exec_in(dev, destroy_qp, in); mlx5_fpga_conn_free_recv_bufs(conn); @@ -662,10 +661,10 @@ static int mlx5_fpga_conn_reset_qp(struct mlx5_fpga_conn *conn) struct mlx5_core_dev *mdev = conn->fdev->mdev; u32 in[MLX5_ST_SZ_DW(qp_2rst_in)] = {}; - mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to RST\n", conn->qp.mqp.qpn); + mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to RST\n", conn->qp.qpn); MLX5_SET(qp_2rst_in, in, opcode, MLX5_CMD_OP_2RST_QP); - MLX5_SET(qp_2rst_in, in, qpn, conn->qp.mqp.qpn); + MLX5_SET(qp_2rst_in, in, qpn, conn->qp.qpn); return mlx5_cmd_exec_in(mdev, qp_2rst, in); } @@ -677,7 +676,7 @@ static int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn) struct mlx5_core_dev *mdev = fdev->mdev; u32 *qpc; - mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to INIT\n", conn->qp.mqp.qpn); + mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to INIT\n", conn->qp.qpn); qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc); @@ -691,7 +690,7 @@ static int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn) MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma); MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); - MLX5_SET(rst2init_qp_in, in, qpn, conn->qp.mqp.qpn); + MLX5_SET(rst2init_qp_in, in, qpn, conn->qp.qpn); return mlx5_cmd_exec_in(mdev, rst2init_qp, in); } @@ -726,7 +725,7 @@ static int mlx5_fpga_conn_rtr_qp(struct mlx5_fpga_conn *conn) MLX5_FLD_SZ_BYTES(qpc, primary_address_path.rgid_rip)); MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); - MLX5_SET(init2rtr_qp_in, in, qpn, conn->qp.mqp.qpn); + MLX5_SET(init2rtr_qp_in, in, qpn, conn->qp.qpn); return mlx5_cmd_exec_in(mdev, init2rtr_qp, in); } @@ -751,7 +750,7 @@ static int mlx5_fpga_conn_rts_qp(struct mlx5_fpga_conn *conn) MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */ MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); - MLX5_SET(rtr2rts_qp_in, in, qpn, conn->qp.mqp.qpn); + MLX5_SET(rtr2rts_qp_in, in, qpn, conn->qp.qpn); MLX5_SET(rtr2rts_qp_in, in, opt_param_mask, MLX5_QP_OPTPAR_RNR_TIMEOUT); return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in); @@ -894,7 +893,7 @@ struct mlx5_fpga_conn *mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev, MLX5_SET(fpga_qpc, conn->fpga_qpc, next_rcv_psn, 1); MLX5_SET(fpga_qpc, conn->fpga_qpc, next_send_psn, 0); MLX5_SET(fpga_qpc, conn->fpga_qpc, pkey, MLX5_FPGA_PKEY); - MLX5_SET(fpga_qpc, conn->fpga_qpc, remote_qpn, conn->qp.mqp.qpn); + MLX5_SET(fpga_qpc, conn->fpga_qpc, remote_qpn, conn->qp.qpn); MLX5_SET(fpga_qpc, conn->fpga_qpc, rnr_retry, 7); MLX5_SET(fpga_qpc, conn->fpga_qpc, retry_count, 7); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h index 634ae10e287b..5116e869a6e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h @@ -65,7 +65,7 @@ struct mlx5_fpga_conn { int sgid_index; struct mlx5_wq_qp wq; struct mlx5_wq_ctrl wq_ctrl; - struct mlx5_core_qp mqp; + u32 qpn; struct { spinlock_t lock; /* Protects all SQ state */ unsigned int pc; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 83b198d8e3d6..068578be00f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -160,7 +160,6 @@ int mlx5i_init_underlay_qp(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5i_priv *ipriv = priv->ppriv; - struct mlx5_core_qp *qp = &ipriv->qp; int ret; { @@ -176,7 +175,7 @@ int mlx5i_init_underlay_qp(struct mlx5e_priv *priv) MLX5_SET(qpc, qpc, q_key, IB_DEFAULT_Q_KEY); MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); - MLX5_SET(rst2init_qp_in, in, qpn, qp->qpn); + MLX5_SET(rst2init_qp_in, in, qpn, ipriv->qpn); ret = mlx5_cmd_exec_in(mdev, rst2init_qp, in); if (ret) goto err_qp_modify_to_err; @@ -185,7 +184,7 @@ int mlx5i_init_underlay_qp(struct mlx5e_priv *priv) u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); - MLX5_SET(init2rtr_qp_in, in, qpn, qp->qpn); + MLX5_SET(init2rtr_qp_in, in, qpn, ipriv->qpn); ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, in); if (ret) goto err_qp_modify_to_err; @@ -194,7 +193,7 @@ int mlx5i_init_underlay_qp(struct mlx5e_priv *priv) u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); - MLX5_SET(rtr2rts_qp_in, in, qpn, qp->qpn); + MLX5_SET(rtr2rts_qp_in, in, qpn, ipriv->qpn); ret = mlx5_cmd_exec_in(mdev, rtr2rts_qp, in); if (ret) goto err_qp_modify_to_err; @@ -206,7 +205,7 @@ err_qp_modify_to_err: u32 in[MLX5_ST_SZ_DW(qp_2err_in)] = {}; MLX5_SET(qp_2err_in, in, opcode, MLX5_CMD_OP_2ERR_QP); - MLX5_SET(qp_2err_in, in, qpn, qp->qpn); + MLX5_SET(qp_2err_in, in, qpn, ipriv->qpn); mlx5_cmd_exec_in(mdev, qp_2err, in); } return ret; @@ -219,16 +218,17 @@ void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv) u32 in[MLX5_ST_SZ_DW(qp_2rst_in)] = {}; MLX5_SET(qp_2rst_in, in, opcode, MLX5_CMD_OP_2RST_QP); - MLX5_SET(qp_2rst_in, in, qpn, ipriv->qp.qpn); + MLX5_SET(qp_2rst_in, in, qpn, ipriv->qpn); mlx5_cmd_exec_in(mdev, qp_2rst, in); } #define MLX5_QP_ENHANCED_ULP_STATELESS_MODE 2 -int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp) +int mlx5i_create_underlay_qp(struct mlx5e_priv *priv) { u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; u32 in[MLX5_ST_SZ_DW(create_qp_in)] = {}; + struct mlx5i_priv *ipriv = priv->ppriv; void *addr_path; int ret = 0; void *qpc; @@ -244,21 +244,21 @@ int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp MLX5_SET(ads, addr_path, grh, 1); MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); - ret = mlx5_cmd_exec_inout(mdev, create_qp, in, out); + ret = mlx5_cmd_exec_inout(priv->mdev, create_qp, in, out); if (ret) return ret; - qp->qpn = MLX5_GET(create_qp_out, out, qpn); + ipriv->qpn = MLX5_GET(create_qp_out, out, qpn); return 0; } -void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp) +void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, u32 qpn) { u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); - MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, in, qpn, qpn); mlx5_cmd_exec_in(mdev, destroy_qp, in); } @@ -279,13 +279,13 @@ static int mlx5i_init_tx(struct mlx5e_priv *priv) struct mlx5i_priv *ipriv = priv->ppriv; int err; - err = mlx5i_create_underlay_qp(priv->mdev, &ipriv->qp); + err = mlx5i_create_underlay_qp(priv); if (err) { mlx5_core_warn(priv->mdev, "create underlay QP failed, %d\n", err); return err; } - err = mlx5i_create_tis(priv->mdev, ipriv->qp.qpn, &priv->tisn[0][0]); + err = mlx5i_create_tis(priv->mdev, ipriv->qpn, &priv->tisn[0][0]); if (err) { mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err); goto err_destroy_underlay_qp; @@ -294,7 +294,7 @@ static int mlx5i_init_tx(struct mlx5e_priv *priv) return 0; err_destroy_underlay_qp: - mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp); + mlx5i_destroy_underlay_qp(priv->mdev, ipriv->qpn); return err; } @@ -303,7 +303,7 @@ static void mlx5i_cleanup_tx(struct mlx5e_priv *priv) struct mlx5i_priv *ipriv = priv->ppriv; mlx5e_destroy_tis(priv->mdev, priv->tisn[0][0]); - mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp); + mlx5i_destroy_underlay_qp(priv->mdev, ipriv->qpn); } static int mlx5i_create_flow_steering(struct mlx5e_priv *priv) @@ -506,12 +506,12 @@ int mlx5i_dev_init(struct net_device *dev) struct mlx5i_priv *ipriv = priv->ppriv; /* Set dev address using underlay QP */ - dev->dev_addr[1] = (ipriv->qp.qpn >> 16) & 0xff; - dev->dev_addr[2] = (ipriv->qp.qpn >> 8) & 0xff; - dev->dev_addr[3] = (ipriv->qp.qpn) & 0xff; + dev->dev_addr[1] = (ipriv->qpn >> 16) & 0xff; + dev->dev_addr[2] = (ipriv->qpn >> 8) & 0xff; + dev->dev_addr[3] = (ipriv->qpn) & 0xff; /* Add QPN to net-device mapping to HT */ - mlx5i_pkey_add_qpn(dev ,ipriv->qp.qpn); + mlx5i_pkey_add_qpn(dev, ipriv->qpn); return 0; } @@ -538,7 +538,7 @@ void mlx5i_dev_cleanup(struct net_device *dev) mlx5i_uninit_underlay_qp(priv); /* Delete QPN to net-device mapping from HT */ - mlx5i_pkey_del_qpn(dev, ipriv->qp.qpn); + mlx5i_pkey_del_qpn(dev, ipriv->qpn); } static int mlx5i_open(struct net_device *netdev) @@ -558,7 +558,7 @@ static int mlx5i_open(struct net_device *netdev) goto err_clear_state_opened_flag; } - err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qp.qpn); + err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qpn); if (err) { mlx5_core_warn(mdev, "attach underlay qp to ft failed, %d\n", err); goto err_reset_qp; @@ -575,7 +575,7 @@ static int mlx5i_open(struct net_device *netdev) return 0; err_remove_fs_underlay_qp: - mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn); + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qpn); err_reset_qp: mlx5i_uninit_underlay_qp(epriv); err_clear_state_opened_flag: @@ -601,7 +601,7 @@ static int mlx5i_close(struct net_device *netdev) clear_bit(MLX5E_STATE_OPENED, &epriv->state); netif_carrier_off(epriv->netdev); - mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn); + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qpn); mlx5e_deactivate_priv_channels(epriv); mlx5e_close_channels(&epriv->channels); mlx5i_uninit_underlay_qp(epriv); @@ -620,11 +620,12 @@ static int mlx5i_attach_mcast(struct net_device *netdev, struct ib_device *hca, struct mlx5i_priv *ipriv = epriv->ppriv; int err; - mlx5_core_dbg(mdev, "attaching QPN 0x%x, MGID %pI6\n", ipriv->qp.qpn, gid->raw); - err = mlx5_core_attach_mcg(mdev, gid, ipriv->qp.qpn); + mlx5_core_dbg(mdev, "attaching QPN 0x%x, MGID %pI6\n", ipriv->qpn, + gid->raw); + err = mlx5_core_attach_mcg(mdev, gid, ipriv->qpn); if (err) mlx5_core_warn(mdev, "failed attaching QPN 0x%x, MGID %pI6\n", - ipriv->qp.qpn, gid->raw); + ipriv->qpn, gid->raw); if (set_qkey) { mlx5_core_dbg(mdev, "%s setting qkey 0x%x\n", @@ -643,12 +644,13 @@ static int mlx5i_detach_mcast(struct net_device *netdev, struct ib_device *hca, struct mlx5i_priv *ipriv = epriv->ppriv; int err; - mlx5_core_dbg(mdev, "detaching QPN 0x%x, MGID %pI6\n", ipriv->qp.qpn, gid->raw); + mlx5_core_dbg(mdev, "detaching QPN 0x%x, MGID %pI6\n", ipriv->qpn, + gid->raw); - err = mlx5_core_detach_mcg(mdev, gid, ipriv->qp.qpn); + err = mlx5_core_detach_mcg(mdev, gid, ipriv->qpn); if (err) mlx5_core_dbg(mdev, "failed detaching QPN 0x%x, MGID %pI6\n", - ipriv->qp.qpn, gid->raw); + ipriv->qpn, gid->raw); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h index de7e01a027bb..3483ba642cfe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h @@ -51,7 +51,7 @@ extern const struct ethtool_ops mlx5i_pkey_ethtool_ops; /* ipoib rdma netdev's private data structure */ struct mlx5i_priv { struct rdma_netdev rn; /* keep this first */ - struct mlx5_core_qp qp; + u32 qpn; bool sub_interface; u32 qkey; u16 pkey_index; @@ -62,8 +62,8 @@ struct mlx5i_priv { int mlx5i_create_tis(struct mlx5_core_dev *mdev, u32 underlay_qpn, u32 *tisn); /* Underlay QP create/destroy functions */ -int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp); -void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp); +int mlx5i_create_underlay_qp(struct mlx5e_priv *priv); +void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, u32 qpn); /* Underlay QP state modification init/uninit functions */ int mlx5i_init_underlay_qp(struct mlx5e_priv *priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c index 96e64187c089..b9af37ad40bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c @@ -204,13 +204,13 @@ static int mlx5i_pkey_open(struct net_device *netdev) goto err_release_lock; } - err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qp.qpn); + err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qpn); if (err) { mlx5_core_warn(mdev, "attach child underlay qp to ft failed, %d\n", err); goto err_unint_underlay_qp; } - err = mlx5i_create_tis(mdev, ipriv->qp.qpn, &epriv->tisn[0][0]); + err = mlx5i_create_tis(mdev, ipriv->qpn, &epriv->tisn[0][0]); if (err) { mlx5_core_warn(mdev, "create child tis failed, %d\n", err); goto err_remove_rx_uderlay_qp; @@ -230,7 +230,7 @@ static int mlx5i_pkey_open(struct net_device *netdev) err_clear_state_opened_flag: mlx5e_destroy_tis(mdev, epriv->tisn[0][0]); err_remove_rx_uderlay_qp: - mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn); + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qpn); err_unint_underlay_qp: mlx5i_uninit_underlay_qp(epriv); err_release_lock: @@ -253,7 +253,7 @@ static int mlx5i_pkey_close(struct net_device *netdev) clear_bit(MLX5E_STATE_OPENED, &priv->state); netif_carrier_off(priv->netdev); - mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn); + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qpn); mlx5i_uninit_underlay_qp(priv); mlx5e_deactivate_priv_channels(priv); mlx5e_close_channels(&priv->channels); @@ -307,23 +307,20 @@ static void mlx5i_pkey_cleanup(struct mlx5e_priv *priv) static int mlx5i_pkey_init_tx(struct mlx5e_priv *priv) { - struct mlx5i_priv *ipriv = priv->ppriv; int err; - err = mlx5i_create_underlay_qp(priv->mdev, &ipriv->qp); - if (err) { + err = mlx5i_create_underlay_qp(priv); + if (err) mlx5_core_warn(priv->mdev, "create child underlay QP failed, %d\n", err); - return err; - } - return 0; + return err; } static void mlx5i_pkey_cleanup_tx(struct mlx5e_priv *priv) { struct mlx5i_priv *ipriv = priv->ppriv; - mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp); + mlx5i_destroy_underlay_qp(priv->mdev, ipriv->qpn); } static int mlx5i_pkey_init_rx(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 266b913d2f9a..c4ed25bb9ac8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -178,7 +178,7 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); - dr_qp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn); + dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn); kfree(in); if (err) goto err_in; @@ -199,10 +199,9 @@ static void dr_destroy_qp(struct mlx5_core_dev *mdev, struct mlx5dr_qp *dr_qp) { u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; - struct mlx5_core_qp *qp = &dr_qp->mqp; MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); - MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn); mlx5_cmd_exec_in(mdev, destroy_qp, in); kfree(dr_qp->sq.wqe_head); @@ -242,7 +241,7 @@ static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr, MLX5_WQE_CTRL_CQ_UPDATE : 0; wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) | opcode); - wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->mqp.qpn << 8); + wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8); wq_raddr = (void *)(wq_ctrl + 1); wq_raddr->raddr = cpu_to_be64(remote_addr); wq_raddr->rkey = cpu_to_be32(rkey); @@ -586,7 +585,7 @@ static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev, MLX5_SET(qpc, qpc, rwe, 1); MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); - MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->mqp.qpn); + MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn); return mlx5_cmd_exec_in(mdev, rst2init_qp, in); } @@ -600,13 +599,13 @@ static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev, qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc); - MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->mqp.qpn); + MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn); MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt); MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry); MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); - MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->mqp.qpn); + MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn); return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in); } @@ -620,7 +619,7 @@ static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev, qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc); - MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->mqp.qpn); + MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn); MLX5_SET(qpc, qpc, mtu, attr->mtu); MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1); @@ -640,7 +639,7 @@ static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev, MLX5_SET(qpc, qpc, min_rnr_nak, 1); MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); - MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->mqp.qpn); + MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn); return mlx5_cmd_exec_in(mdev, init2rtr_qp, in); } @@ -668,7 +667,7 @@ static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) return ret; rtr_attr.mtu = mtu; - rtr_attr.qp_num = dr_qp->mqp.qpn; + rtr_attr.qp_num = dr_qp->qpn; rtr_attr.min_rnr_timer = 12; rtr_attr.port_num = port; rtr_attr.sgid_index = gid_index; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 3fa739951b34..984783238baa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -990,7 +990,7 @@ struct mlx5dr_qp { struct mlx5_wq_qp wq; struct mlx5_uars_page *uar; struct mlx5_wq_ctrl wq_ctrl; - struct mlx5_core_qp mqp; + u32 qpn; struct { unsigned int pc; unsigned int cc; -- cgit v1.2.3-59-g8ed1b From 57a6c5e992f5d6ab92764a7eaaba855f6d4b2df8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 6 Apr 2020 15:53:06 +0300 Subject: net/mlx5: Replace hand written QP context struct with automatic getters By changing debugfs to not use any QP related API, convert the debugfs code to use MLX5_GET() directly to access QP context instead of hand written struct. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 51 +++++++++-------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 04854e5fbcd7..d40c3d5bd496 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -202,42 +202,37 @@ void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev) static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, int index, int *is_str) { - int outlen = MLX5_ST_SZ_BYTES(query_qp_out); - struct mlx5_qp_context *ctx; + u32 out[MLX5_ST_SZ_BYTES(query_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {}; u64 param = 0; - u32 *out; + int state; + u32 *qpc; int err; - int no_sq; - out = kzalloc(outlen, GFP_KERNEL); - if (!out) - return param; - - err = mlx5_core_qp_query(dev, qp, out, outlen); - if (err) { - mlx5_core_warn(dev, "failed to query qp err=%d\n", err); - goto out; - } + MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); + MLX5_SET(query_qp_in, in, qpn, qp->qpn); + err = mlx5_cmd_exec_inout(dev, query_qp, in, out); + if (err) + return 0; *is_str = 0; - /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */ - ctx = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, out, qpc); - + qpc = MLX5_ADDR_OF(query_qp_out, out, qpc); switch (index) { case QP_PID: param = qp->pid; break; case QP_STATE: - param = (unsigned long)mlx5_qp_state_str(be32_to_cpu(ctx->flags) >> 28); + state = MLX5_GET(qpc, qpc, state); + param = (unsigned long)mlx5_qp_state_str(state); *is_str = 1; break; case QP_XPORT: - param = (unsigned long)mlx5_qp_type_str((be32_to_cpu(ctx->flags) >> 16) & 0xff); + param = (unsigned long)mlx5_qp_type_str(MLX5_GET(qpc, qpc, st)); *is_str = 1; break; case QP_MTU: - switch (ctx->mtu_msgmax >> 5) { + switch (MLX5_GET(qpc, qpc, mtu)) { case IB_MTU_256: param = 256; break; @@ -258,29 +253,23 @@ static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, } break; case QP_N_RECV: - param = 1 << ((ctx->rq_size_stride >> 3) & 0xf); + param = 1 << MLX5_GET(qpc, qpc, log_rq_size); break; case QP_RECV_SZ: - param = 1 << ((ctx->rq_size_stride & 7) + 4); + param = 1 << (MLX5_GET(qpc, qpc, log_rq_stride) + 4); break; case QP_N_SEND: - no_sq = be16_to_cpu(ctx->sq_crq_size) >> 15; - if (!no_sq) - param = 1 << (be16_to_cpu(ctx->sq_crq_size) >> 11); - else - param = 0; + if (!MLX5_GET(qpc, qpc, no_sq)) + param = 1 << MLX5_GET(qpc, qpc, log_sq_size); break; case QP_LOG_PG_SZ: - param = (be32_to_cpu(ctx->log_pg_sz_remote_qpn) >> 24) & 0x1f; - param += 12; + param = MLX5_GET(qpc, qpc, log_page_size) + 12; break; case QP_RQPN: - param = be32_to_cpu(ctx->log_pg_sz_remote_qpn) & 0xffffff; + param = MLX5_GET(qpc, qpc, remote_qpn); break; } -out: - kfree(out); return param; } -- cgit v1.2.3-59-g8ed1b From 66247fbb280c2a699a8621708c52dae6acd2e4bc Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 3 Apr 2020 11:28:28 +0300 Subject: net/mlx5: Remove Q counter low level helper APIs mlx5 core users are encouraged to use low level API (mlx5_cmd_exec) without the need of helper functions, do this for q counters, remove helper functions and call mlx5_cmd_exec directly from users. This will help reduce the total amount of code and reduction of the mlx5_core symbol table. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 55 +++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 39 +++++++++------ drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 35 +++++++++----- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 39 --------------- include/linux/mlx5/qp.h | 4 -- 5 files changed, 80 insertions(+), 92 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6679756506e6..b02d027ebf3b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5439,15 +5439,21 @@ static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev) static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; int num_cnt_ports; int i; num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports; + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + for (i = 0; i < num_cnt_ports; i++) { - if (dev->port[i].cnts.set_id_valid) - mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].cnts.set_id); + if (dev->port[i].cnts.set_id_valid) { + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, + dev->port[i].cnts.set_id); + mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in); + } kfree(dev->port[i].cnts.names); kfree(dev->port[i].cnts.offsets); } @@ -5638,27 +5644,23 @@ static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, struct rdma_hw_stats *stats, u16 set_id) { - int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); - void *out; + u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {}; __be32 val; int ret, i; - out = kvzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; - - ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen); + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); + MLX5_SET(query_q_counter_in, in, counter_set_id, set_id); + ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out); if (ret) - goto free; + return ret; for (i = 0; i < cnts->num_q_counters; i++) { - val = *(__be32 *)(out + cnts->offsets[i]); + val = *(__be32 *)((void *)out + cnts->offsets[i]); stats->value[i] = (u64)be32_to_cpu(val); } -free: - kvfree(out); - return ret; + return 0; } static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev, @@ -5765,6 +5767,20 @@ static int mlx5_ib_counter_update_stats(struct rdma_counter *counter) counter->stats, counter->id); } +static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; + + if (!counter->id) + return 0; + + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id); + return mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in); +} + static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp) { @@ -5788,7 +5804,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, return 0; fail_set_counter: - mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id); + mlx5_ib_counter_dealloc(counter); counter->id = 0; return err; @@ -5799,13 +5815,6 @@ static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp) return mlx5_ib_qp_set_counter(qp, NULL); } -static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) -{ - struct mlx5_ib_dev *dev = to_mdev(counter->device); - - return mlx5_core_dealloc_q_counter(dev->mdev, counter->id); -} - static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dd7f338425eb..30970b405040 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4997,29 +4997,40 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) void mlx5e_create_q_counters(struct mlx5e_priv *priv) { + u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; struct mlx5_core_dev *mdev = priv->mdev; int err; - err = mlx5_core_alloc_q_counter(mdev, &priv->q_counter); - if (err) { - mlx5_core_warn(mdev, "alloc queue counter failed, %d\n", err); - priv->q_counter = 0; - } + MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); + err = mlx5_cmd_exec_inout(mdev, alloc_q_counter, in, out); + if (!err) + priv->q_counter = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); - err = mlx5_core_alloc_q_counter(mdev, &priv->drop_rq_q_counter); - if (err) { - mlx5_core_warn(mdev, "alloc drop RQ counter failed, %d\n", err); - priv->drop_rq_q_counter = 0; - } + err = mlx5_cmd_exec_inout(mdev, alloc_q_counter, in, out); + if (!err) + priv->drop_rq_q_counter = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); } void mlx5e_destroy_q_counters(struct mlx5e_priv *priv) { - if (priv->q_counter) - mlx5_core_dealloc_q_counter(priv->mdev, priv->q_counter); + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; + + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + if (priv->q_counter) { + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, + priv->q_counter); + mlx5_cmd_exec_in(priv->mdev, dealloc_q_counter, in); + } - if (priv->drop_rq_q_counter) - mlx5_core_dealloc_q_counter(priv->mdev, priv->drop_rq_q_counter); + if (priv->drop_rq_q_counter) { + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, + priv->drop_rq_q_counter); + mlx5_cmd_exec_in(priv->mdev, dealloc_q_counter, in); + } } static int mlx5e_nic_init(struct mlx5_core_dev *mdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 30b216d9284c..ff4002ebad90 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -411,18 +411,29 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(qcnt) static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(qcnt) { struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt; - u32 out[MLX5_ST_SZ_DW(query_q_counter_out)]; - - if (priv->q_counter && - !mlx5_core_query_q_counter(priv->mdev, priv->q_counter, 0, out, - sizeof(out))) - qcnt->rx_out_of_buffer = MLX5_GET(query_q_counter_out, - out, out_of_buffer); - if (priv->drop_rq_q_counter && - !mlx5_core_query_q_counter(priv->mdev, priv->drop_rq_q_counter, 0, - out, sizeof(out))) - qcnt->rx_if_down_packets = MLX5_GET(query_q_counter_out, out, - out_of_buffer); + u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {}; + int ret; + + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); + + if (priv->q_counter) { + MLX5_SET(query_q_counter_in, in, counter_set_id, + priv->q_counter); + ret = mlx5_cmd_exec_inout(priv->mdev, query_q_counter, in, out); + if (!ret) + qcnt->rx_out_of_buffer = MLX5_GET(query_q_counter_out, + out, out_of_buffer); + } + + if (priv->drop_rq_q_counter) { + MLX5_SET(query_q_counter_in, in, counter_set_id, + priv->drop_rq_q_counter); + ret = mlx5_cmd_exec_inout(priv->mdev, query_q_counter, in, out); + if (!ret) + qcnt->rx_if_down_packets = MLX5_GET(query_q_counter_out, + out, out_of_buffer); + } } #define VNIC_ENV_OFF(c) MLX5_BYTE_OFF(query_vnic_env_out, c) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index c3aea4cc2fff..e36790ad5256 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -680,45 +680,6 @@ void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, } EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); -int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id) -{ - u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {0}; - int err; - - MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (!err) - *counter_id = MLX5_GET(alloc_q_counter_out, out, - counter_set_id); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_alloc_q_counter); - -int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id) -{ - u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)] = {0}; - - MLX5_SET(dealloc_q_counter_in, in, opcode, - MLX5_CMD_OP_DEALLOC_Q_COUNTER); - MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter_id); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} -EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter); - -int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id, - int reset, void *out, int out_size) -{ - u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {0}; - - MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); - MLX5_SET(query_q_counter_in, in, clear, reset); - MLX5_SET(query_q_counter_in, in, counter_set_id, counter_id); - return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); -} -EXPORT_SYMBOL_GPL(mlx5_core_query_q_counter); - struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_core_dev *dev, int res_num, enum mlx5_res_type res_type) diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index ae63b1ae9004..4d25a3d24182 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -595,10 +595,6 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, struct mlx5_core_qp *sq); void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, struct mlx5_core_qp *sq); -int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id); -int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id); -int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id, - int reset, void *out, int out_size); struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_core_dev *dev, int res_num, -- cgit v1.2.3-59-g8ed1b From bfd745f8f327c40d74c8207ca62db05a264b5b7c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 3 Apr 2020 13:31:18 +0300 Subject: RDMA/mlx5: Delete Q counter allocations command Remove mlx5_ib implementation of Q counter allocation logic together with cleaning boolean which controlled validity of the counter. It is not needed, because counter_id == 0 means that counter is not valid. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/cmd.c | 17 ----------------- drivers/infiniband/hw/mlx5/cmd.h | 2 -- drivers/infiniband/hw/mlx5/main.c | 31 ++++++++++++++++++++----------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - 4 files changed, 20 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 4c26492ab8a3..a2fcbc49131e 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -327,23 +327,6 @@ int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid) return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } -int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id, - u16 uid) -{ - u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {0}; - int err; - - MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); - MLX5_SET(alloc_q_counter_in, in, uid, uid); - - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (!err) - *counter_id = MLX5_GET(alloc_q_counter_out, out, - counter_set_id); - return err; -} - int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, u16 opmod, u8 port) { diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 945ebce73613..43079b18d9b4 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -61,8 +61,6 @@ int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid); int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid); -int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id, - u16 uid); int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, u16 opmod, u8 port); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b02d027ebf3b..76ea756d846b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5449,7 +5449,7 @@ static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) MLX5_CMD_OP_DEALLOC_Q_COUNTER); for (i = 0; i < num_cnt_ports; i++) { - if (dev->port[i].cnts.set_id_valid) { + if (dev->port[i].cnts.set_id) { MLX5_SET(dealloc_q_counter_in, in, counter_set_id, dev->port[i].cnts.set_id); mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in); @@ -5562,11 +5562,14 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) { + u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; int num_cnt_ports; int err = 0; int i; bool is_shared; + MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0; num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports; @@ -5578,17 +5581,19 @@ static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) mlx5_ib_fill_counters(dev, dev->port[i].cnts.names, dev->port[i].cnts.offsets); - err = mlx5_cmd_alloc_q_counter(dev->mdev, - &dev->port[i].cnts.set_id, - is_shared ? - MLX5_SHARED_RESOURCE_UID : 0); + MLX5_SET(alloc_q_counter_in, in, uid, + is_shared ? MLX5_SHARED_RESOURCE_UID : 0); + + err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out); if (err) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d, err %d\n", i + 1, err); goto err_alloc; } - dev->port[i].cnts.set_id_valid = true; + + dev->port[i].cnts.set_id = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); } return 0; @@ -5785,16 +5790,20 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp) { struct mlx5_ib_dev *dev = to_mdev(qp->device); - u16 cnt_set_id = 0; int err; if (!counter->id) { - err = mlx5_cmd_alloc_q_counter(dev->mdev, - &cnt_set_id, - MLX5_SHARED_RESOURCE_UID); + u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; + + MLX5_SET(alloc_q_counter_in, in, opcode, + MLX5_CMD_OP_ALLOC_Q_COUNTER); + MLX5_SET(alloc_q_counter_in, in, uid, MLX5_SHARED_RESOURCE_UID); + err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out); if (err) return err; - counter->id = cnt_set_id; + counter->id = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); } err = mlx5_ib_qp_set_counter(qp, counter); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a4e522385de0..cb2a021aa93c 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -780,7 +780,6 @@ struct mlx5_ib_counters { u32 num_cong_counters; u32 num_ext_ppcnt_counters; u16 set_id; - bool set_id_valid; }; struct mlx5_ib_multiport_info; -- cgit v1.2.3-59-g8ed1b From 9c275ee4ad82aeb1f51981fdc9ee16b74d4b101a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 7 Apr 2020 16:09:15 +0300 Subject: net/mlx5: Delete not-used cmd header The structures defined in the cmd header are not used and can be safely removed from the driver. This patch removes that file and deletes all relevant includes. Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mad.c | 1 - drivers/infiniband/hw/mlx5/srq_cmd.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/cq.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/eq.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/fw.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/health.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/mcg.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/mr.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/pd.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/qp.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/rl.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/uar.c | 1 - 15 files changed, 15 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 14e0c17de6a9..f0ab6d7d8497 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include #include #include #include diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c index 8fc3630a9d4c..88c0388f9fc6 100644 --- a/drivers/infiniband/hw/mlx5/srq_cmd.c +++ b/drivers/infiniband/hw/mlx5/srq_cmd.c @@ -5,7 +5,6 @@ #include #include -#include #include "mlx5_ib.h" #include "srq.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 818edc63e428..4477a590b308 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include "mlx5_core.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index cccea3a8eddd..bee419d01af2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -36,7 +36,6 @@ #include #include #include -#include #ifdef CONFIG_RFS_ACCEL #include #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c index c0fd2212e890..09769401c313 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c @@ -31,7 +31,6 @@ */ #include -#include #include #include diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 90e3d0233101..3040e0466681 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -31,7 +31,6 @@ */ #include -#include #include #include #include "mlx5_core.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index fa1665caac46..3ae355453464 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -36,7 +36,6 @@ #include #include #include -#include #include "mlx5_core.h" #include "lib/eq.h" #include "lib/mlx5.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c index 48b5c847b642..8809a65ecefb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c @@ -4,7 +4,6 @@ #include #include #include -#include #include "mlx5_core.h" #include "lib/port_tun.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c index ba2b09cc192f..6789fe658037 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include "mlx5_core.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 366f2cbfc6db..1feedf335dea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -33,7 +33,6 @@ #include #include #include -#include #include "mlx5_core.h" int mlx5_core_create_mkey(struct mlx5_core_dev *dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 91bd258ecf1b..a3959754b927 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -35,7 +35,6 @@ #include #include #include -#include #include "mlx5_core.h" #include "lib/eq.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pd.c b/drivers/net/ethernet/mellanox/mlx5/core/pd.c index bd830d8d6c5f..b92d6f621c83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pd.c @@ -33,7 +33,6 @@ #include #include #include -#include #include "mlx5_core.h" int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index e36790ad5256..d9df3a5dd532 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -32,7 +32,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index f3b29d9ade1f..c9599f7c5696 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -33,7 +33,6 @@ #include #include #include -#include #include "mlx5_core.h" /* Scheduling element fw management */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index 0d006224d7b0..816f9c434359 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -34,7 +34,6 @@ #include #include #include -#include #include "mlx5_core.h" int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn) -- cgit v1.2.3-59-g8ed1b From 42f9bbd11278d0a270a75998f5c0d21e7b37c521 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 1 Apr 2020 11:48:47 +0300 Subject: RDMA/mlx5: Alphabetically sort build artifacts Sort .o objects in makefile to make addition of new object less cumbersome. Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/Makefile | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 2a334800f109..375b341be8a1 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -1,11 +1,24 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o + +mlx5_ib-y := ah.o \ + cmd.o \ + cong.o \ + cq.o \ + doorbell.o \ + gsi.o \ + ib_virt.o \ + mad.o \ + main.o \ + mem.o \ + mr.o \ + qp.o \ + restrack.o \ + srq.o \ + srq_cmd.o -mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq_cmd.o \ - srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o \ - cong.o restrack.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o -mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o -mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += flow.o -mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += qos.o +mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \ + flow.o \ + qos.o -- cgit v1.2.3-59-g8ed1b From 333fbaa0255b8d471fc7ae767ef3a1766c732d6d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 4 Apr 2020 10:40:24 +0300 Subject: net/mlx5: Move QP logic to mlx5_ib The mlx5_core doesn't need any functionality coded in qp.c, so move that file to drivers/infiniband/ be under mlx5_ib responsibility. Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/Makefile | 1 + drivers/infiniband/hw/mlx5/cq.c | 3 +- drivers/infiniband/hw/mlx5/devx.c | 10 +- drivers/infiniband/hw/mlx5/main.c | 10 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + drivers/infiniband/hw/mlx5/odp.c | 3 +- drivers/infiniband/hw/mlx5/qp.c | 47 +- drivers/infiniband/hw/mlx5/qp.h | 46 ++ drivers/infiniband/hw/mlx5/qpc.c | 605 +++++++++++++++++++ drivers/infiniband/hw/mlx5/srq_cmd.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 - drivers/net/ethernet/mellanox/mlx5/core/qp.c | 697 ---------------------- include/linux/mlx5/cmd.h | 51 -- include/linux/mlx5/driver.h | 2 - include/linux/mlx5/qp.h | 45 -- 17 files changed, 699 insertions(+), 836 deletions(-) create mode 100644 drivers/infiniband/hw/mlx5/qp.h create mode 100644 drivers/infiniband/hw/mlx5/qpc.c delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/qp.c delete mode 100644 include/linux/mlx5/cmd.h diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 375b341be8a1..228be05fbaf8 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -13,6 +13,7 @@ mlx5_ib-y := ah.o \ mem.o \ mr.o \ qp.o \ + qpc.o \ restrack.o \ srq.o \ srq_cmd.o diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 146ba2966744..32c05730dfe9 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -36,6 +36,7 @@ #include #include "mlx5_ib.h" #include "srq.h" +#include "qp.h" static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) { @@ -484,7 +485,7 @@ repoll: * because CQs will be locked while QPs are removed * from the table. */ - mqp = __mlx5_qp_lookup(dev->mdev, qpn); + mqp = radix_tree_lookup(&dev->qp_table.tree, qpn); *cur_qp = to_mibqp(mqp); } diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 46e1ab771f10..35b98c2d64d5 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -14,6 +14,7 @@ #include #include #include "mlx5_ib.h" +#include "qp.h" #include #define UVERBS_MODULE_NAME mlx5_ib @@ -1356,7 +1357,7 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, } if (obj->flags & DEVX_OBJ_FLAGS_DCT) - ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct); + ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); else @@ -1450,9 +1451,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( if (opcode == MLX5_CMD_OP_CREATE_DCT) { obj->flags |= DEVX_OBJ_FLAGS_DCT; - err = mlx5_core_create_dct(dev->mdev, &obj->core_dct, - cmd_in, cmd_in_len, - cmd_out, cmd_out_len); + err = mlx5_core_create_dct(dev, &obj->core_dct, cmd_in, + cmd_in_len, cmd_out, cmd_out_len); } else if (opcode == MLX5_CMD_OP_CREATE_CQ) { obj->flags |= DEVX_OBJ_FLAGS_CQ; obj->core_cq.comp = devx_cq_comp; @@ -1499,7 +1499,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( obj_destroy: if (obj->flags & DEVX_OBJ_FLAGS_DCT) - mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct); + mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); else diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 76ea756d846b..f10675213115 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -59,6 +59,7 @@ #include "ib_rep.h" #include "cmd.h" #include "srq.h" +#include "qp.h" #include #include #include @@ -4632,8 +4633,7 @@ static void delay_drop_handler(struct work_struct *work) atomic_inc(&delay_drop->events_cnt); mutex_lock(&delay_drop->lock); - err = mlx5_core_set_delay_drop(delay_drop->dev->mdev, - delay_drop->timeout); + err = mlx5_core_set_delay_drop(delay_drop->dev, delay_drop->timeout); if (err) { mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n", delay_drop->timeout); @@ -7193,6 +7193,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_ROCE, mlx5_ib_stage_roce_init, mlx5_ib_stage_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), STAGE_CREATE(MLX5_IB_STAGE_SRQ, mlx5_init_srq_table, mlx5_cleanup_srq_table), @@ -7250,6 +7253,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_ROCE, mlx5_ib_stage_raw_eth_roce_init, mlx5_ib_stage_raw_eth_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), STAGE_CREATE(MLX5_IB_STAGE_SRQ, mlx5_init_srq_table, mlx5_cleanup_srq_table), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index cb2a021aa93c..aaabb8a98eed 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -869,6 +869,7 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_CAPS, MLX5_IB_STAGE_NON_DEFAULT_CB, MLX5_IB_STAGE_ROCE, + MLX5_IB_STAGE_QP, MLX5_IB_STAGE_SRQ, MLX5_IB_STAGE_DEVICE_RESOURCES, MLX5_IB_STAGE_DEVICE_NOTIFIER, @@ -1064,6 +1065,7 @@ struct mlx5_ib_dev { struct mlx5_dm dm; u16 devx_whitelist_uid; struct mlx5_srq_table srq_table; + struct mlx5_qp_table qp_table; struct mlx5_async_ctx async_ctx; struct mlx5_devx_event_table devx_event_table; struct mlx5_var_table var_table; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 3de7606d4a1a..16af1105cfcf 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -36,6 +36,7 @@ #include "mlx5_ib.h" #include "cmd.h" +#include "qp.h" #include @@ -1219,7 +1220,7 @@ static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev, case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE: case MLX5_WQE_PF_TYPE_RESP: case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC: - common = mlx5_core_res_hold(dev->mdev, wq_num, MLX5_RES_QP); + common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP); break; default: break; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1456db4b6295..3ecd1864b3c8 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -39,6 +39,7 @@ #include "mlx5_ib.h" #include "ib_rep.h" #include "cmd.h" +#include "qp.h" /* not supported currently */ static int wq_signature; @@ -1336,7 +1337,7 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0); - err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp); + err = mlx5_core_create_sq_tracked(dev, in, inlen, &sq->base.mqp); kvfree(in); @@ -1356,7 +1357,7 @@ static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev, struct mlx5_ib_sq *sq) { destroy_flow_rule_vport_sq(sq); - mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + mlx5_core_destroy_sq_tracked(dev, &sq->base.mqp); ib_umem_release(sq->ubuffer.umem); } @@ -1426,7 +1427,7 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas); memcpy(pas, qp_pas, rq_pas_size); - err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp); + err = mlx5_core_create_rq_tracked(dev, in, inlen, &rq->base.mqp); kvfree(in); @@ -1436,7 +1437,7 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq) { - mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp); + mlx5_core_destroy_rq_tracked(dev, &rq->base.mqp); } static bool tunnel_offload_supported(struct mlx5_core_dev *dev) @@ -2347,7 +2348,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, &resp); } else { - err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); + err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); } if (err) { @@ -2513,8 +2514,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp->state != IB_QPS_RESET) { if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && !(qp->flags & MLX5_IB_QP_UNDERLAY)) { - err = mlx5_core_qp_modify(dev->mdev, - MLX5_CMD_OP_2RST_QP, 0, + err = mlx5_core_qp_modify(dev, MLX5_CMD_OP_2RST_QP, 0, NULL, &base->mqp); } else { struct mlx5_modify_raw_qp_param raw_qp_param = { @@ -2555,7 +2555,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, qp->flags & MLX5_IB_QP_UNDERLAY) { destroy_raw_packet_qp(dev, qp); } else { - err = mlx5_core_destroy_qp(dev->mdev, &base->mqp); + err = mlx5_core_destroy_qp(dev, &base->mqp); if (err) mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", base->mqp.qpn); @@ -2818,7 +2818,7 @@ static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) if (mqp->state == IB_QPS_RTR) { int err; - err = mlx5_core_destroy_dct(dev->mdev, &mqp->dct.mdct); + err = mlx5_core_destroy_dct(dev, &mqp->dct.mdct); if (err) { mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err); return err; @@ -3462,10 +3462,9 @@ static int __mlx5_ib_qp_set_counter(struct ib_qp *qp, base = &mqp->trans_qp.base; context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff); context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24); - return mlx5_core_qp_modify(dev->mdev, - MLX5_CMD_OP_RTS2RTS_QP, - MLX5_QP_OPTPAR_COUNTER_SET_ID, - &context, &base->mqp); + return mlx5_core_qp_modify(dev, MLX5_CMD_OP_RTS2RTS_QP, + MLX5_QP_OPTPAR_COUNTER_SET_ID, &context, + &base->mqp); } static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, @@ -3752,8 +3751,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); } else { - err = mlx5_core_qp_modify(dev->mdev, op, optpar, context, - &base->mqp); + err = mlx5_core_qp_modify(dev, op, optpar, context, &base->mqp); } if (err) @@ -3927,7 +3925,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index); MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); - err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in, + err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in, MLX5_ST_SZ_BYTES(create_dct_in), out, sizeof(out)); if (err) @@ -3935,7 +3933,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, resp.dctn = qp->dct.mdct.mqp.qpn; err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) { - mlx5_core_destroy_dct(dev->mdev, &qp->dct.mdct); + mlx5_core_destroy_dct(dev, &qp->dct.mdct); return err; } } else { @@ -5697,8 +5695,7 @@ static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!outb) return -ENOMEM; - err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb, - outlen); + err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen); if (err) goto out; @@ -5776,7 +5773,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, if (!out) return -ENOMEM; - err = mlx5_core_dct_query(dev->mdev, dct, out, outlen); + err = mlx5_core_dct_query(dev, dct, out, outlen); if (err) goto out; @@ -5962,7 +5959,7 @@ static int set_delay_drop(struct mlx5_ib_dev *dev) if (dev->delay_drop.activate) goto out; - err = mlx5_core_set_delay_drop(dev->mdev, dev->delay_drop.timeout); + err = mlx5_core_set_delay_drop(dev, dev->delay_drop.timeout); if (err) goto out; @@ -6068,13 +6065,13 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, } rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); - err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); + err = mlx5_core_create_rq_tracked(dev, in, inlen, &rwq->core_qp); if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { err = set_delay_drop(dev); if (err) { mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n", err); - mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp); } else { rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP; } @@ -6256,7 +6253,7 @@ struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, return &rwq->ibwq; err_copy: - mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp); err_user_rq: destroy_user_rq(dev, pd, rwq, udata); err: @@ -6269,7 +6266,7 @@ void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) struct mlx5_ib_dev *dev = to_mdev(wq->device); struct mlx5_ib_rwq *rwq = to_mrwq(wq); - mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp); destroy_user_rq(dev, wq->pd, rwq, udata); kfree(rwq); } diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h new file mode 100644 index 000000000000..ad9d76e3e18a --- /dev/null +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + */ + +#ifndef _MLX5_IB_QP_H +#define _MLX5_IB_QP_H + +#include "mlx5_ib.h" + +int mlx5_init_qp_table(struct mlx5_ib_dev *dev); +void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev); + +int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *qp, + u32 *in, int inlen, u32 *out, int outlen); +int mlx5_core_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen); +int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, + void *qpc, struct mlx5_core_qp *qp); +int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp); +int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct); +int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *out, int outlen); +int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen); + +int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev, u32 timeout_usec); + +void mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *rq); +int mlx5_core_create_sq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq); +void mlx5_core_destroy_sq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *sq); + +int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq); + +struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_ib_dev *dev, + int res_num, + enum mlx5_res_type res_type); +void mlx5_core_res_put(struct mlx5_core_rsc_common *res); + +int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn); +int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); +#endif /* _MLX5_IB_QP_H */ diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c new file mode 100644 index 000000000000..ea62735042f0 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include +#include +#include "mlx5_ib.h" +#include "qp.h" + +static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct); + +static struct mlx5_core_rsc_common * +mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) +{ + struct mlx5_core_rsc_common *common; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + + common = radix_tree_lookup(&table->tree, rsn); + if (common) + refcount_inc(&common->refcount); + + spin_unlock_irqrestore(&table->lock, flags); + + return common; +} + +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) +{ + if (refcount_dec_and_test(&common->refcount)) + complete(&common->free); +} + +static u64 qp_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) | + BIT(MLX5_EVENT_TYPE_COMM_EST) | + BIT(MLX5_EVENT_TYPE_SQ_DRAINED) | + BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | + BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) | + BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | + BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR); + + return mask; +} + +static u64 rq_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); + + return mask; +} + +static u64 sq_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); +} + +static u64 dct_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_DCT_DRAINED); +} + +static bool is_event_type_allowed(int rsc_type, int event_type) +{ + switch (rsc_type) { + case MLX5_EVENT_QUEUE_TYPE_QP: + return BIT(event_type) & qp_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_RQ: + return BIT(event_type) & rq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_SQ: + return BIT(event_type) & sq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_DCT: + return BIT(event_type) & dct_allowed_event_types(); + default: + WARN(1, "Event arrived for unknown resource type"); + return false; + } +} + +static int rsc_event_notifier(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_core_rsc_common *common; + struct mlx5_qp_table *table; + struct mlx5_core_dct *dct; + u8 event_type = (u8)type; + struct mlx5_core_qp *qp; + struct mlx5_eqe *eqe; + u32 rsn; + + switch (event_type) { + case MLX5_EVENT_TYPE_DCT_DRAINED: + eqe = data; + rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); + break; + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + eqe = data; + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); + break; + default: + return NOTIFY_DONE; + } + + table = container_of(nb, struct mlx5_qp_table, nb); + common = mlx5_get_rsc(table, rsn); + if (!common) + return NOTIFY_OK; + + if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) + goto out; + + switch (common->res) { + case MLX5_RES_QP: + case MLX5_RES_RQ: + case MLX5_RES_SQ: + qp = (struct mlx5_core_qp *)common; + qp->event(qp, event_type); + break; + case MLX5_RES_DCT: + dct = (struct mlx5_core_dct *)common; + if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) + complete(&dct->drained); + break; + default: + break; + } +out: + mlx5_core_put_rsc(common); + + return NOTIFY_OK; +} + +static int create_resource_common(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *qp, int rsc_type) +{ + struct mlx5_qp_table *table = &dev->qp_table; + int err; + + qp->common.res = rsc_type; + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, + qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN), + qp); + spin_unlock_irq(&table->lock); + if (err) + return err; + + refcount_set(&qp->common.refcount, 1); + init_completion(&qp->common.free); + qp->pid = current->pid; + + return 0; +} + +static void destroy_resource_common(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *qp) +{ + struct mlx5_qp_table *table = &dev->qp_table; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + radix_tree_delete(&table->tree, + qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN)); + spin_unlock_irqrestore(&table->lock, flags); + mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); + wait_for_completion(&qp->common.free); +} + +static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct, bool need_cleanup) +{ + u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + err = mlx5_core_drain_dct(dev, dct); + if (err) { + if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto destroy; + + return err; + } + wait_for_completion(&dct->drained); +destroy: + if (need_cleanup) + destroy_resource_common(dev, &dct->mqp); + MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); + MLX5_SET(destroy_dct_in, in, uid, qp->uid); + err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in); + return err; +} + +int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, + u32 *in, int inlen, u32 *out, int outlen) +{ + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + init_completion(&dct->drained); + MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); + + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, outlen); + if (err) + return err; + + qp->qpn = MLX5_GET(create_dct_out, out, dctn); + qp->uid = MLX5_GET(create_dct_in, in, uid); + err = create_resource_common(dev, qp, MLX5_RES_DCT); + if (err) + goto err_cmd; + + return 0; +err_cmd: + _mlx5_core_destroy_dct(dev, dct, false); + return err; +} + +int mlx5_core_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + u32 din[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + int err; + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, sizeof(out)); + if (err) + return err; + + qp->uid = MLX5_GET(create_qp_in, in, uid); + qp->qpn = MLX5_GET(create_qp_out, out, qpn); + + err = create_resource_common(dev, qp, MLX5_RES_QP); + if (err) + goto err_cmd; + + mlx5_debug_qp_add(dev->mdev, qp); + + return 0; + +err_cmd: + MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, din, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, din, uid, qp->uid); + mlx5_cmd_exec_in(dev->mdev, destroy_qp, din); + return err; +} + +static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct) +{ + u32 in[MLX5_ST_SZ_DW(drain_dct_in)] = {}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT); + MLX5_SET(drain_dct_in, in, dctn, qp->qpn); + MLX5_SET(drain_dct_in, in, uid, qp->uid); + return mlx5_cmd_exec_in(dev->mdev, drain_dct, in); +} + +int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct) +{ + return _mlx5_core_destroy_dct(dev, dct, true); +} + +int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) +{ + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + mlx5_debug_qp_remove(dev->mdev, qp); + + destroy_resource_common(dev, qp); + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, in, uid, qp->uid); + mlx5_cmd_exec_in(dev->mdev, destroy_qp, in); + return 0; +} + +int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev, + u32 timeout_usec) +{ + u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {}; + + MLX5_SET(set_delay_drop_params_in, in, opcode, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); + MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, + timeout_usec / 100); + return mlx5_cmd_exec_in(dev->mdev, set_delay_drop_params, in); +} + +struct mbox_info { + u32 *in; + u32 *out; + int inlen; + int outlen; +}; + +static int mbox_alloc(struct mbox_info *mbox, int inlen, int outlen) +{ + mbox->inlen = inlen; + mbox->outlen = outlen; + mbox->in = kzalloc(mbox->inlen, GFP_KERNEL); + mbox->out = kzalloc(mbox->outlen, GFP_KERNEL); + if (!mbox->in || !mbox->out) { + kfree(mbox->in); + kfree(mbox->out); + return -ENOMEM; + } + + return 0; +} + +static void mbox_free(struct mbox_info *mbox) +{ + kfree(mbox->in); + kfree(mbox->out); +} + +static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, + u32 opt_param_mask, void *qpc, + struct mbox_info *mbox, u16 uid) +{ + mbox->out = NULL; + mbox->in = NULL; + +#define MBOX_ALLOC(mbox, typ) \ + mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out)) + +#define MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid) \ + do { \ + MLX5_SET(typ##_in, in, opcode, _opcode); \ + MLX5_SET(typ##_in, in, qpn, _qpn); \ + MLX5_SET(typ##_in, in, uid, _uid); \ + } while (0) + +#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc, _uid) \ + do { \ + MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid); \ + MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \ + memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, \ + MLX5_ST_SZ_BYTES(qpc)); \ + } while (0) + + switch (opcode) { + /* 2RST & 2ERR */ + case MLX5_CMD_OP_2RST_QP: + if (MBOX_ALLOC(mbox, qp_2rst)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn, uid); + break; + case MLX5_CMD_OP_2ERR_QP: + if (MBOX_ALLOC(mbox, qp_2err)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn, uid); + break; + + /* MODIFY with QPC */ + case MLX5_CMD_OP_RST2INIT_QP: + if (MBOX_ALLOC(mbox, rst2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + if (MBOX_ALLOC(mbox, init2rtr_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + if (MBOX_ALLOC(mbox, rtr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + if (MBOX_ALLOC(mbox, rts2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_SQERR2RTS_QP: + if (MBOX_ALLOC(mbox, sqerr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_INIT2INIT_QP: + if (MBOX_ALLOC(mbox, init2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + default: + return -EINVAL; + } + return 0; +} + +int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, + void *qpc, struct mlx5_core_qp *qp) +{ + struct mbox_info mbox; + int err; + + err = modify_qp_mbox_alloc(dev->mdev, opcode, qp->qpn, + opt_param_mask, qpc, &mbox, qp->uid); + if (err) + return err; + + err = mlx5_cmd_exec(dev->mdev, mbox.in, mbox.inlen, mbox.out, + mbox.outlen); + mbox_free(&mbox); + return err; +} + +int mlx5_init_qp_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_qp_table *table = &dev->qp_table; + + spin_lock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); + mlx5_qp_debugfs_init(dev->mdev); + + table->nb.notifier_call = rsc_event_notifier; + mlx5_notifier_register(dev->mdev, &table->nb); + + return 0; +} + +void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_qp_table *table = &dev->qp_table; + + mlx5_notifier_unregister(dev->mdev, &table->nb); + mlx5_qp_debugfs_cleanup(dev->mdev); +} + +int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {}; + + MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); + MLX5_SET(query_qp_in, in, qpn, qp->qpn); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, outlen); +} + +int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT); + MLX5_SET(query_dct_in, in, dctn, qp->qpn); + + return mlx5_cmd_exec(dev->mdev, (void *)&in, sizeof(in), (void *)out, + outlen); +} + +int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {}; + int err; + + MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); + err = mlx5_cmd_exec_inout(dev->mdev, alloc_xrcd, in, out); + if (!err) + *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); + return err; +} + +int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {}; + + MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); + return mlx5_cmd_exec_in(dev->mdev, dealloc_xrcd, in); +} + +static void destroy_rq_tracked(struct mlx5_ib_dev *dev, u32 rqn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; + + MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); + MLX5_SET(destroy_rq_in, in, rqn, rqn); + MLX5_SET(destroy_rq_in, in, uid, uid); + mlx5_cmd_exec_in(dev->mdev, destroy_rq, in); +} + +int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq) +{ + int err; + u32 rqn; + + err = mlx5_core_create_rq(dev->mdev, in, inlen, &rqn); + if (err) + return err; + + rq->uid = MLX5_GET(create_rq_in, in, uid); + rq->qpn = rqn; + err = create_resource_common(dev, rq, MLX5_RES_RQ); + if (err) + goto err_destroy_rq; + + return 0; + +err_destroy_rq: + destroy_rq_tracked(dev, rq->qpn, rq->uid); + + return err; +} + +void mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *rq) +{ + destroy_resource_common(dev, rq); + destroy_rq_tracked(dev, rq->qpn, rq->uid); +} + +static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {}; + + MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); + MLX5_SET(destroy_sq_in, in, sqn, sqn); + MLX5_SET(destroy_sq_in, in, uid, uid); + mlx5_cmd_exec_in(dev->mdev, destroy_sq, in); +} + +int mlx5_core_create_sq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq) +{ + u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {}; + int err; + + MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ); + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, sizeof(out)); + if (err) + return err; + + sq->qpn = MLX5_GET(create_sq_out, out, sqn); + sq->uid = MLX5_GET(create_sq_in, in, uid); + err = create_resource_common(dev, sq, MLX5_RES_SQ); + if (err) + goto err_destroy_sq; + + return 0; + +err_destroy_sq: + destroy_sq_tracked(dev, sq->qpn, sq->uid); + + return err; +} + +void mlx5_core_destroy_sq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *sq) +{ + destroy_resource_common(dev, sq); + destroy_sq_tracked(dev, sq->qpn, sq->uid); +} + +struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_ib_dev *dev, + int res_num, + enum mlx5_res_type res_type) +{ + u32 rsn = res_num | (res_type << MLX5_USER_INDEX_LEN); + struct mlx5_qp_table *table = &dev->qp_table; + + return mlx5_get_rsc(table, rsn); +} + +void mlx5_core_res_put(struct mlx5_core_rsc_common *res) +{ + mlx5_core_put_rsc(res); +} diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c index 88c0388f9fc6..c851570791af 100644 --- a/drivers/infiniband/hw/mlx5/srq_cmd.c +++ b/drivers/infiniband/hw/mlx5/srq_cmd.c @@ -7,6 +7,7 @@ #include #include "mlx5_ib.h" #include "srq.h" +#include "qp.h" static int get_pas_size(struct mlx5_srq_attr *in) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 6d32915000fc..d3c7dbd7f1d5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_MLX5_CORE) += mlx5_core.o # mlx5 core basic # mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ - health.o mcg.o cq.o alloc.o qp.o port.o mr.o pd.o \ + health.o mcg.o cq.o alloc.o port.o mr.o pd.o \ transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \ fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o diag/fs_tracepoint.o \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index d40c3d5bd496..65fef5a86644 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -101,15 +101,15 @@ void mlx5_unregister_debugfs(void) void mlx5_qp_debugfs_init(struct mlx5_core_dev *dev) { - atomic_set(&dev->num_qps, 0); - dev->priv.qp_debugfs = debugfs_create_dir("QPs", dev->priv.dbg_root); } +EXPORT_SYMBOL(mlx5_qp_debugfs_init); void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev) { debugfs_remove_recursive(dev->priv.qp_debugfs); } +EXPORT_SYMBOL(mlx5_qp_debugfs_cleanup); void mlx5_eq_debugfs_init(struct mlx5_core_dev *dev) { @@ -450,6 +450,7 @@ int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) return err; } +EXPORT_SYMBOL(mlx5_debug_qp_add); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) { @@ -459,6 +460,7 @@ void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) if (qp->dbg) rem_res_tree(qp->dbg); } +EXPORT_SYMBOL(mlx5_debug_qp_remove); int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 7af4210c1b96..6e19fa4d1310 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -836,8 +836,6 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) mlx5_cq_debugfs_init(dev); - mlx5_init_qp_table(dev); - mlx5_init_reserved_gids(dev); mlx5_init_clock(dev); @@ -896,7 +894,6 @@ err_rl_cleanup: err_tables_cleanup: mlx5_geneve_destroy(dev->geneve); mlx5_vxlan_destroy(dev->vxlan); - mlx5_cleanup_qp_table(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_events_cleanup(dev); err_eq_cleanup: @@ -924,7 +921,6 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_vxlan_destroy(dev->vxlan); mlx5_cleanup_clock(dev); mlx5_cleanup_reserved_gids(dev); - mlx5_cleanup_qp_table(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_events_cleanup(dev); mlx5_eq_table_cleanup(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c deleted file mode 100644 index d9df3a5dd532..000000000000 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ /dev/null @@ -1,697 +0,0 @@ -/* - * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include "mlx5_core.h" -#include "lib/eq.h" - -static int mlx5_core_drain_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct); - -static struct mlx5_core_rsc_common * -mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) -{ - struct mlx5_core_rsc_common *common; - unsigned long flags; - - spin_lock_irqsave(&table->lock, flags); - - common = radix_tree_lookup(&table->tree, rsn); - if (common) - refcount_inc(&common->refcount); - - spin_unlock_irqrestore(&table->lock, flags); - - return common; -} - -void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) -{ - if (refcount_dec_and_test(&common->refcount)) - complete(&common->free); -} - -static u64 qp_allowed_event_types(void) -{ - u64 mask; - - mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) | - BIT(MLX5_EVENT_TYPE_COMM_EST) | - BIT(MLX5_EVENT_TYPE_SQ_DRAINED) | - BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | - BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | - BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) | - BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | - BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR); - - return mask; -} - -static u64 rq_allowed_event_types(void) -{ - u64 mask; - - mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | - BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); - - return mask; -} - -static u64 sq_allowed_event_types(void) -{ - return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); -} - -static u64 dct_allowed_event_types(void) -{ - return BIT(MLX5_EVENT_TYPE_DCT_DRAINED); -} - -static bool is_event_type_allowed(int rsc_type, int event_type) -{ - switch (rsc_type) { - case MLX5_EVENT_QUEUE_TYPE_QP: - return BIT(event_type) & qp_allowed_event_types(); - case MLX5_EVENT_QUEUE_TYPE_RQ: - return BIT(event_type) & rq_allowed_event_types(); - case MLX5_EVENT_QUEUE_TYPE_SQ: - return BIT(event_type) & sq_allowed_event_types(); - case MLX5_EVENT_QUEUE_TYPE_DCT: - return BIT(event_type) & dct_allowed_event_types(); - default: - WARN(1, "Event arrived for unknown resource type"); - return false; - } -} - -static int rsc_event_notifier(struct notifier_block *nb, - unsigned long type, void *data) -{ - struct mlx5_core_rsc_common *common; - struct mlx5_qp_table *table; - struct mlx5_core_dev *dev; - struct mlx5_core_dct *dct; - u8 event_type = (u8)type; - struct mlx5_core_qp *qp; - struct mlx5_priv *priv; - struct mlx5_eqe *eqe; - u32 rsn; - - switch (event_type) { - case MLX5_EVENT_TYPE_DCT_DRAINED: - eqe = data; - rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; - rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); - break; - case MLX5_EVENT_TYPE_PATH_MIG: - case MLX5_EVENT_TYPE_COMM_EST: - case MLX5_EVENT_TYPE_SQ_DRAINED: - case MLX5_EVENT_TYPE_SRQ_LAST_WQE: - case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: - case MLX5_EVENT_TYPE_PATH_MIG_FAILED: - case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: - case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: - eqe = data; - rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; - rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); - break; - default: - return NOTIFY_DONE; - } - - table = container_of(nb, struct mlx5_qp_table, nb); - priv = container_of(table, struct mlx5_priv, qp_table); - dev = container_of(priv, struct mlx5_core_dev, priv); - - mlx5_core_dbg(dev, "event (%d) arrived on resource 0x%x\n", eqe->type, rsn); - - common = mlx5_get_rsc(table, rsn); - if (!common) { - mlx5_core_dbg(dev, "Async event for unknown resource 0x%x\n", rsn); - return NOTIFY_OK; - } - - if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) { - mlx5_core_warn(dev, "event 0x%.2x is not allowed on resource 0x%.8x\n", - event_type, rsn); - goto out; - } - - switch (common->res) { - case MLX5_RES_QP: - case MLX5_RES_RQ: - case MLX5_RES_SQ: - qp = (struct mlx5_core_qp *)common; - qp->event(qp, event_type); - break; - case MLX5_RES_DCT: - dct = (struct mlx5_core_dct *)common; - if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) - complete(&dct->drained); - break; - default: - mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn); - } -out: - mlx5_core_put_rsc(common); - - return NOTIFY_OK; -} - -static int create_resource_common(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp, - int rsc_type) -{ - struct mlx5_qp_table *table = &dev->priv.qp_table; - int err; - - qp->common.res = rsc_type; - spin_lock_irq(&table->lock); - err = radix_tree_insert(&table->tree, - qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN), - qp); - spin_unlock_irq(&table->lock); - if (err) - return err; - - refcount_set(&qp->common.refcount, 1); - init_completion(&qp->common.free); - qp->pid = current->pid; - - return 0; -} - -static void destroy_resource_common(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp) -{ - struct mlx5_qp_table *table = &dev->priv.qp_table; - unsigned long flags; - - spin_lock_irqsave(&table->lock, flags); - radix_tree_delete(&table->tree, - qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN)); - spin_unlock_irqrestore(&table->lock, flags); - mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); - wait_for_completion(&qp->common.free); -} - -static int _mlx5_core_destroy_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct, bool need_cleanup) -{ - u32 out[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; - struct mlx5_core_qp *qp = &dct->mqp; - int err; - - err = mlx5_core_drain_dct(dev, dct); - if (err) { - if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { - goto destroy; - } else { - mlx5_core_warn( - dev, "failed drain DCT 0x%x with error 0x%x\n", - qp->qpn, err); - return err; - } - } - wait_for_completion(&dct->drained); -destroy: - if (need_cleanup) - destroy_resource_common(dev, &dct->mqp); - MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); - MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); - MLX5_SET(destroy_dct_in, in, uid, qp->uid); - err = mlx5_cmd_exec(dev, (void *)&in, sizeof(in), - (void *)&out, sizeof(out)); - return err; -} - -int mlx5_core_create_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct, - u32 *in, int inlen, - u32 *out, int outlen) -{ - struct mlx5_core_qp *qp = &dct->mqp; - int err; - - init_completion(&dct->drained); - MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); - - err = mlx5_cmd_exec(dev, in, inlen, out, outlen); - if (err) { - mlx5_core_warn(dev, "create DCT failed, ret %d\n", err); - return err; - } - - qp->qpn = MLX5_GET(create_dct_out, out, dctn); - qp->uid = MLX5_GET(create_dct_in, in, uid); - err = create_resource_common(dev, qp, MLX5_RES_DCT); - if (err) - goto err_cmd; - - return 0; -err_cmd: - _mlx5_core_destroy_dct(dev, dct, false); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_create_dct); - -int mlx5_core_create_qp(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp, - u32 *in, int inlen) -{ - u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {0}; - u32 dout[MLX5_ST_SZ_DW(destroy_qp_out)]; - u32 din[MLX5_ST_SZ_DW(destroy_qp_in)]; - int err; - - MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); - - err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); - if (err) - return err; - - qp->uid = MLX5_GET(create_qp_in, in, uid); - qp->qpn = MLX5_GET(create_qp_out, out, qpn); - mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); - - err = create_resource_common(dev, qp, MLX5_RES_QP); - if (err) - goto err_cmd; - - err = mlx5_debug_qp_add(dev, qp); - if (err) - mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n", - qp->qpn); - - atomic_inc(&dev->num_qps); - - return 0; - -err_cmd: - memset(din, 0, sizeof(din)); - memset(dout, 0, sizeof(dout)); - MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP); - MLX5_SET(destroy_qp_in, din, qpn, qp->qpn); - MLX5_SET(destroy_qp_in, din, uid, qp->uid); - mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_create_qp); - -static int mlx5_core_drain_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct) -{ - u32 out[MLX5_ST_SZ_DW(drain_dct_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(drain_dct_in)] = {0}; - struct mlx5_core_qp *qp = &dct->mqp; - - MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT); - MLX5_SET(drain_dct_in, in, dctn, qp->qpn); - MLX5_SET(drain_dct_in, in, uid, qp->uid); - return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), - (void *)&out, sizeof(out)); -} - -int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct) -{ - return _mlx5_core_destroy_dct(dev, dct, true); -} -EXPORT_SYMBOL_GPL(mlx5_core_destroy_dct); - -int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp) -{ - u32 out[MLX5_ST_SZ_DW(destroy_qp_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {0}; - int err; - - mlx5_debug_qp_remove(dev, qp); - - destroy_resource_common(dev, qp); - - MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); - MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); - MLX5_SET(destroy_qp_in, in, uid, qp->uid); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (err) - return err; - - atomic_dec(&dev->num_qps); - return 0; -} -EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); - -int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, - u32 timeout_usec) -{ - u32 out[MLX5_ST_SZ_DW(set_delay_drop_params_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {0}; - - MLX5_SET(set_delay_drop_params_in, in, opcode, - MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); - MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, - timeout_usec / 100); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} -EXPORT_SYMBOL_GPL(mlx5_core_set_delay_drop); - -struct mbox_info { - u32 *in; - u32 *out; - int inlen; - int outlen; -}; - -static int mbox_alloc(struct mbox_info *mbox, int inlen, int outlen) -{ - mbox->inlen = inlen; - mbox->outlen = outlen; - mbox->in = kzalloc(mbox->inlen, GFP_KERNEL); - mbox->out = kzalloc(mbox->outlen, GFP_KERNEL); - if (!mbox->in || !mbox->out) { - kfree(mbox->in); - kfree(mbox->out); - return -ENOMEM; - } - - return 0; -} - -static void mbox_free(struct mbox_info *mbox) -{ - kfree(mbox->in); - kfree(mbox->out); -} - -static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, - u32 opt_param_mask, void *qpc, - struct mbox_info *mbox, u16 uid) -{ - mbox->out = NULL; - mbox->in = NULL; - -#define MBOX_ALLOC(mbox, typ) \ - mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out)) - -#define MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid) \ - do { \ - MLX5_SET(typ##_in, in, opcode, _opcode); \ - MLX5_SET(typ##_in, in, qpn, _qpn); \ - MLX5_SET(typ##_in, in, uid, _uid); \ - } while (0) - -#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc, _uid) \ - do { \ - MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid); \ - MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \ - memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, \ - MLX5_ST_SZ_BYTES(qpc)); \ - } while (0) - - switch (opcode) { - /* 2RST & 2ERR */ - case MLX5_CMD_OP_2RST_QP: - if (MBOX_ALLOC(mbox, qp_2rst)) - return -ENOMEM; - MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn, uid); - break; - case MLX5_CMD_OP_2ERR_QP: - if (MBOX_ALLOC(mbox, qp_2err)) - return -ENOMEM; - MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn, uid); - break; - - /* MODIFY with QPC */ - case MLX5_CMD_OP_RST2INIT_QP: - if (MBOX_ALLOC(mbox, rst2init_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_INIT2RTR_QP: - if (MBOX_ALLOC(mbox, init2rtr_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_RTR2RTS_QP: - if (MBOX_ALLOC(mbox, rtr2rts_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_RTS2RTS_QP: - if (MBOX_ALLOC(mbox, rts2rts_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_SQERR2RTS_QP: - if (MBOX_ALLOC(mbox, sqerr2rts_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - case MLX5_CMD_OP_INIT2INIT_QP: - if (MBOX_ALLOC(mbox, init2init_qp)) - return -ENOMEM; - MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn, - opt_param_mask, qpc, uid); - break; - default: - mlx5_core_err(dev, "Unknown transition for modify QP: OP(0x%x) QPN(0x%x)\n", - opcode, qpn); - return -EINVAL; - } - return 0; -} - -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, - u32 opt_param_mask, void *qpc, - struct mlx5_core_qp *qp) -{ - struct mbox_info mbox; - int err; - - err = modify_qp_mbox_alloc(dev, opcode, qp->qpn, - opt_param_mask, qpc, &mbox, qp->uid); - if (err) - return err; - - err = mlx5_cmd_exec(dev, mbox.in, mbox.inlen, mbox.out, mbox.outlen); - mbox_free(&mbox); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_qp_modify); - -void mlx5_init_qp_table(struct mlx5_core_dev *dev) -{ - struct mlx5_qp_table *table = &dev->priv.qp_table; - - memset(table, 0, sizeof(*table)); - spin_lock_init(&table->lock); - INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); - mlx5_qp_debugfs_init(dev); - - table->nb.notifier_call = rsc_event_notifier; - mlx5_notifier_register(dev, &table->nb); -} - -void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev) -{ - struct mlx5_qp_table *table = &dev->priv.qp_table; - - mlx5_notifier_unregister(dev, &table->nb); - mlx5_qp_debugfs_cleanup(dev); -} - -int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - u32 *out, int outlen) -{ - u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {0}; - - MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); - MLX5_SET(query_qp_in, in, qpn, qp->qpn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); -} -EXPORT_SYMBOL_GPL(mlx5_core_qp_query); - -int mlx5_core_dct_query(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, - u32 *out, int outlen) -{ - u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {0}; - struct mlx5_core_qp *qp = &dct->mqp; - - MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT); - MLX5_SET(query_dct_in, in, dctn, qp->qpn); - - return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), - (void *)out, outlen); -} -EXPORT_SYMBOL_GPL(mlx5_core_dct_query); - -int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn) -{ - u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {0}; - int err; - - MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (!err) - *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); - return err; -} -EXPORT_SYMBOL_GPL(mlx5_core_xrcd_alloc); - -int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) -{ - u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {0}; - - MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); - MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} -EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); - -static void destroy_rq_tracked(struct mlx5_core_dev *dev, u32 rqn, u16 uid) -{ - u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; - u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {}; - - MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); - MLX5_SET(destroy_rq_in, in, rqn, rqn); - MLX5_SET(destroy_rq_in, in, uid, uid); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} - -int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, - struct mlx5_core_qp *rq) -{ - int err; - u32 rqn; - - err = mlx5_core_create_rq(dev, in, inlen, &rqn); - if (err) - return err; - - rq->uid = MLX5_GET(create_rq_in, in, uid); - rq->qpn = rqn; - err = create_resource_common(dev, rq, MLX5_RES_RQ); - if (err) - goto err_destroy_rq; - - return 0; - -err_destroy_rq: - destroy_rq_tracked(dev, rq->qpn, rq->uid); - - return err; -} -EXPORT_SYMBOL(mlx5_core_create_rq_tracked); - -void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, - struct mlx5_core_qp *rq) -{ - destroy_resource_common(dev, rq); - destroy_rq_tracked(dev, rq->qpn, rq->uid); -} -EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked); - -static void destroy_sq_tracked(struct mlx5_core_dev *dev, u32 sqn, u16 uid) -{ - u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {}; - u32 out[MLX5_ST_SZ_DW(destroy_sq_out)] = {}; - - MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); - MLX5_SET(destroy_sq_in, in, sqn, sqn); - MLX5_SET(destroy_sq_in, in, uid, uid); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} - -int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, - struct mlx5_core_qp *sq) -{ - int err; - u32 sqn; - - err = mlx5_core_create_sq(dev, in, inlen, &sqn); - if (err) - return err; - - sq->uid = MLX5_GET(create_sq_in, in, uid); - sq->qpn = sqn; - err = create_resource_common(dev, sq, MLX5_RES_SQ); - if (err) - goto err_destroy_sq; - - return 0; - -err_destroy_sq: - destroy_sq_tracked(dev, sq->qpn, sq->uid); - - return err; -} -EXPORT_SYMBOL(mlx5_core_create_sq_tracked); - -void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, - struct mlx5_core_qp *sq) -{ - destroy_resource_common(dev, sq); - destroy_sq_tracked(dev, sq->qpn, sq->uid); -} -EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); - -struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_core_dev *dev, - int res_num, - enum mlx5_res_type res_type) -{ - u32 rsn = res_num | (res_type << MLX5_USER_INDEX_LEN); - struct mlx5_qp_table *table = &dev->priv.qp_table; - - return mlx5_get_rsc(table, rsn); -} -EXPORT_SYMBOL_GPL(mlx5_core_res_hold); - -void mlx5_core_res_put(struct mlx5_core_rsc_common *res) -{ - mlx5_core_put_rsc(res); -} -EXPORT_SYMBOL_GPL(mlx5_core_res_put); diff --git a/include/linux/mlx5/cmd.h b/include/linux/mlx5/cmd.h deleted file mode 100644 index 68cd08f02c2f..000000000000 --- a/include/linux/mlx5/cmd.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef MLX5_CMD_H -#define MLX5_CMD_H - -#include - -struct manage_pages_layout { - u64 ptr; - u32 reserved; - u16 num_entries; - u16 func_id; -}; - - -struct mlx5_cmd_alloc_uar_imm_out { - u32 rsvd[3]; - u32 uarn; -}; - -#endif /* MLX5_CMD_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1caddfa85c4d..b60e5ab7906b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -541,7 +541,6 @@ struct mlx5_priv { struct mlx5_core_health health; /* start: qp staff */ - struct mlx5_qp_table qp_table; struct dentry *qp_debugfs; struct dentry *eq_debugfs; struct dentry *cq_debugfs; @@ -687,7 +686,6 @@ struct mlx5_core_dev { unsigned long intf_state; struct mlx5_priv priv; struct mlx5_profile *profile; - atomic_t num_qps; u32 issi; struct mlx5e_resources mlx5e_res; struct mlx5_dm *dm; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 4d25a3d24182..ef127a156a62 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -553,53 +553,8 @@ struct mlx5_qp_context { u8 rsvd1[24]; }; -static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn) -{ - return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); -} - -int mlx5_core_create_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *qp, - u32 *in, int inlen, - u32 *out, int outlen); -int mlx5_core_create_qp(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp, - u32 *in, - int inlen); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, - u32 opt_param_mask, void *qpc, - struct mlx5_core_qp *qp); -int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp); -int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, - struct mlx5_core_dct *dct); -int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - u32 *out, int outlen); -int mlx5_core_dct_query(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, - u32 *out, int outlen); - -int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, - u32 timeout_usec); - -int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn); -int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn); -void mlx5_init_qp_table(struct mlx5_core_dev *dev); -void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); -int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, - struct mlx5_core_qp *rq); -void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, - struct mlx5_core_qp *rq); -int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, - struct mlx5_core_qp *sq); -void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, - struct mlx5_core_qp *sq); - -struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_core_dev *dev, - int res_num, - enum mlx5_res_type res_type); -void mlx5_core_res_put(struct mlx5_core_rsc_common *res); static inline const char *mlx5_qp_type_str(int type) { -- cgit v1.2.3-59-g8ed1b From a2a322f447b91a9b85d332b345a3b508d97506a9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 19 Mar 2020 11:43:59 +0200 Subject: net/mlx5: Refactor HCA capability set flow Reduce the amount of kzalloc/kfree cycles by allocating command structure in the parent function and leverage the knowledge that set_caps() is called for HCA capabilities only with specific HW structure as parameter to calculate mailbox size. Acked-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 83 ++++++++++---------------- 1 file changed, 31 insertions(+), 52 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 6e19fa4d1310..a000cd820ace 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -407,30 +407,28 @@ int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type) return mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_MAX); } -static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod) +static int set_caps(struct mlx5_core_dev *dev, void *in, int opmod) { - u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)] = {}; MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1); - return mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + return mlx5_cmd_exec(dev, in, MLX5_ST_SZ_BYTES(set_hca_cap_in), out, + sizeof(out)); } -static int handle_hca_cap_atomic(struct mlx5_core_dev *dev) +static int handle_hca_cap_atomic(struct mlx5_core_dev *dev, void *set_ctx) { - void *set_ctx; void *set_hca_cap; - int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); int req_endianness; int err; - if (MLX5_CAP_GEN(dev, atomic)) { - err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC); - if (err) - return err; - } else { + if (!MLX5_CAP_GEN(dev, atomic)) return 0; - } + + err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC); + if (err) + return err; req_endianness = MLX5_CAP_ATOMIC(dev, @@ -439,27 +437,18 @@ static int handle_hca_cap_atomic(struct mlx5_core_dev *dev) if (req_endianness != MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS) return 0; - set_ctx = kzalloc(set_sz, GFP_KERNEL); - if (!set_ctx) - return -ENOMEM; - set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); /* Set requestor to host endianness */ MLX5_SET(atomic_caps, set_hca_cap, atomic_req_8B_endianness_mode, MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS); - err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC); - - kfree(set_ctx); - return err; + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC); } -static int handle_hca_cap_odp(struct mlx5_core_dev *dev) +static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx) { void *set_hca_cap; - void *set_ctx; - int set_sz; bool do_set = false; int err; @@ -471,11 +460,6 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev) if (err) return err; - set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); - set_ctx = kzalloc(set_sz, GFP_KERNEL); - if (!set_ctx) - return -ENOMEM; - set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_ODP], MLX5_ST_SZ_BYTES(odp_cap)); @@ -504,30 +488,21 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev) ODP_CAP_SET_MAX(dev, dc_odp_caps.read); ODP_CAP_SET_MAX(dev, dc_odp_caps.atomic); - if (do_set) - err = set_caps(dev, set_ctx, set_sz, - MLX5_SET_HCA_CAP_OP_MOD_ODP); - - kfree(set_ctx); + if (!do_set) + return 0; - return err; + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ODP); } -static int handle_hca_cap(struct mlx5_core_dev *dev) +static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) { - void *set_ctx = NULL; struct mlx5_profile *prof = dev->profile; - int err = -ENOMEM; - int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); void *set_hca_cap; - - set_ctx = kzalloc(set_sz, GFP_KERNEL); - if (!set_ctx) - goto query_ex; + int err; err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL); if (err) - goto query_ex; + return err; set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); @@ -578,37 +553,41 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) num_vhca_ports, MLX5_CAP_GEN_MAX(dev, num_vhca_ports)); - err = set_caps(dev, set_ctx, set_sz, - MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); - -query_ex: - kfree(set_ctx); - return err; + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); } static int set_hca_cap(struct mlx5_core_dev *dev) { + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + void *set_ctx; int err; - err = handle_hca_cap(dev); + set_ctx = kzalloc(set_sz, GFP_KERNEL); + if (!set_ctx) + return -ENOMEM; + + err = handle_hca_cap(dev, set_ctx); if (err) { mlx5_core_err(dev, "handle_hca_cap failed\n"); goto out; } - err = handle_hca_cap_atomic(dev); + memset(set_ctx, 0, set_sz); + err = handle_hca_cap_atomic(dev, set_ctx); if (err) { mlx5_core_err(dev, "handle_hca_cap_atomic failed\n"); goto out; } - err = handle_hca_cap_odp(dev); + memset(set_ctx, 0, set_sz); + err = handle_hca_cap_odp(dev, set_ctx); if (err) { mlx5_core_err(dev, "handle_hca_cap_odp failed\n"); goto out; } out: + kfree(set_ctx); return err; } -- cgit v1.2.3-59-g8ed1b From 59e9e8e4fe83f68e599b87c06aaf239dcc64887b Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 14 Jan 2020 05:06:25 +0200 Subject: net/mlx5: Enable SW-defined RoCEv2 UDP source port When this is enabled, UDP source port for RoCEv2 packets are defined by software instead of firmware. Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 32 ++++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 5 +++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a000cd820ace..0044aa5cc676 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -556,6 +556,31 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); } +static int handle_hca_cap_roce(struct mlx5_core_dev *dev, void *set_ctx) +{ + void *set_hca_cap; + int err; + + if (!MLX5_CAP_GEN(dev, roce)) + return 0; + + err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE); + if (err) + return err; + + if (MLX5_CAP_ROCE(dev, sw_r_roce_src_udp_port) || + !MLX5_CAP_ROCE_MAX(dev, sw_r_roce_src_udp_port)) + return 0; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_ROCE], + MLX5_ST_SZ_BYTES(roce_cap)); + MLX5_SET(roce_cap, set_hca_cap, sw_r_roce_src_udp_port, 1); + + err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ROCE); + return err; +} + static int set_hca_cap(struct mlx5_core_dev *dev) { int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); @@ -586,6 +611,13 @@ static int set_hca_cap(struct mlx5_core_dev *dev) goto out; } + memset(set_ctx, 0, set_sz); + err = handle_hca_cap_roce(dev, set_ctx); + if (err) { + mlx5_core_err(dev, "handle_hca_cap_roce failed\n"); + goto out; + } + out: kfree(set_ctx); return err; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 69b27c7dfc3e..6fa24918eade 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -74,6 +74,7 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0, MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, + MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4, }; enum { @@ -903,7 +904,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { struct mlx5_ifc_roce_cap_bits { u8 roce_apm[0x1]; - u8 reserved_at_1[0x1f]; + u8 reserved_at_1[0x3]; + u8 sw_r_roce_src_udp_port[0x1]; + u8 reserved_at_5[0x1b]; u8 reserved_at_20[0x60]; -- cgit v1.2.3-59-g8ed1b From c2a3f8febc69f222d9fc3248bf774c8f0c5725f3 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Wed, 11 Mar 2020 11:35:06 +0200 Subject: igc: Add new device IDs for i225 part Add new device IDs for the next step of i225 Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_base.c | 3 +++ drivers/net/ethernet/intel/igc/igc_hw.h | 3 +++ drivers/net/ethernet/intel/igc/igc_main.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_base.c b/drivers/net/ethernet/intel/igc/igc_base.c index 5a506440560a..f7fb18d8d8f5 100644 --- a/drivers/net/ethernet/intel/igc/igc_base.c +++ b/drivers/net/ethernet/intel/igc/igc_base.c @@ -212,6 +212,9 @@ static s32 igc_get_invariants_base(struct igc_hw *hw) case IGC_DEV_ID_I225_I: case IGC_DEV_ID_I220_V: case IGC_DEV_ID_I225_K: + case IGC_DEV_ID_I225_K2: + case IGC_DEV_ID_I225_LMVP: + case IGC_DEV_ID_I225_IT: case IGC_DEV_ID_I225_BLANK_NVM: mac->type = igc_i225; break; diff --git a/drivers/net/ethernet/intel/igc/igc_hw.h b/drivers/net/ethernet/intel/igc/igc_hw.h index 90ac0e0144d8..af34ae310327 100644 --- a/drivers/net/ethernet/intel/igc/igc_hw.h +++ b/drivers/net/ethernet/intel/igc/igc_hw.h @@ -21,6 +21,9 @@ #define IGC_DEV_ID_I225_I 0x15F8 #define IGC_DEV_ID_I220_V 0x15F7 #define IGC_DEV_ID_I225_K 0x3100 +#define IGC_DEV_ID_I225_K2 0x3101 +#define IGC_DEV_ID_I225_LMVP 0x5502 +#define IGC_DEV_ID_I225_IT 0x0D9F #define IGC_DEV_ID_I225_BLANK_NVM 0x15FD /* Function pointers for the MAC. */ diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 9d1792e80e2e..6a7c1b081792 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -47,6 +47,9 @@ static const struct pci_device_id igc_pci_tbl[] = { { PCI_VDEVICE(INTEL, IGC_DEV_ID_I225_I), board_base }, { PCI_VDEVICE(INTEL, IGC_DEV_ID_I220_V), board_base }, { PCI_VDEVICE(INTEL, IGC_DEV_ID_I225_K), board_base }, + { PCI_VDEVICE(INTEL, IGC_DEV_ID_I225_K2), board_base }, + { PCI_VDEVICE(INTEL, IGC_DEV_ID_I225_LMVP), board_base }, + { PCI_VDEVICE(INTEL, IGC_DEV_ID_I225_IT), board_base }, { PCI_VDEVICE(INTEL, IGC_DEV_ID_I225_BLANK_NVM), board_base }, /* required last entry */ {0, } -- cgit v1.2.3-59-g8ed1b From 632fbd5eb5b0e01f03f1acb90a2b9ac1352b5dc7 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Thu, 12 Mar 2020 13:57:07 +0200 Subject: e1000e: fix S0ix flows for cable connected case Added a fix to S0ix entry and exit flows for TGP and above MAC types, to the case when the Ethernet cable is connected and the link is up. With that the system is able to reach SLP_S0 when going to freeze power state. Signed-off-by: Vitaly Lifshits Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/e1000e/netdev.c | 54 ++++++++++++++++++++++++++++++ drivers/net/ethernet/intel/e1000e/regs.h | 3 ++ 2 files changed, 57 insertions(+) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 177c6da80c57..e0b074820b47 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6404,6 +6404,31 @@ static void e1000e_s0ix_entry_flow(struct e1000_adapter *adapter) mac_data |= BIT(3); ew32(CTRL_EXT, mac_data); + /* Disable disconnected cable conditioning for Power Gating */ + mac_data = er32(DPGFR); + mac_data |= BIT(2); + ew32(DPGFR, mac_data); + + /* Don't wake from dynamic Power Gating with clock request */ + mac_data = er32(FEXTNVM12); + mac_data |= BIT(12); + ew32(FEXTNVM12, mac_data); + + /* Ungate PGCB clock */ + mac_data = er32(FEXTNVM9); + mac_data |= BIT(28); + ew32(FEXTNVM9, mac_data); + + /* Enable K1 off to enable mPHY Power Gating */ + mac_data = er32(FEXTNVM6); + mac_data |= BIT(31); + ew32(FEXTNVM12, mac_data); + + /* Enable mPHY power gating for any link and speed */ + mac_data = er32(FEXTNVM8); + mac_data |= BIT(9); + ew32(FEXTNVM8, mac_data); + /* Enable the Dynamic Clock Gating in the DMA and MAC */ mac_data = er32(CTRL_EXT); mac_data |= E1000_CTRL_EXT_DMA_DYN_CLK_EN; @@ -6433,6 +6458,35 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter) mac_data |= BIT(0); ew32(FEXTNVM7, mac_data); + /* Disable mPHY power gating for any link and speed */ + mac_data = er32(FEXTNVM8); + mac_data &= ~BIT(9); + ew32(FEXTNVM8, mac_data); + + /* Disable K1 off */ + mac_data = er32(FEXTNVM6); + mac_data &= ~BIT(31); + ew32(FEXTNVM12, mac_data); + + /* Disable Ungate PGCB clock */ + mac_data = er32(FEXTNVM9); + mac_data &= ~BIT(28); + ew32(FEXTNVM9, mac_data); + + /* Cancel not waking from dynamic + * Power Gating with clock request + */ + mac_data = er32(FEXTNVM12); + mac_data &= ~BIT(12); + ew32(FEXTNVM12, mac_data); + + /* Cancel disable disconnected cable conditioning + * for Power Gating + */ + mac_data = er32(DPGFR); + mac_data &= ~BIT(2); + ew32(DPGFR, mac_data); + /* Disable Dynamic Power Gating */ mac_data = er32(CTRL_EXT); mac_data &= 0xFFFFFFF7; diff --git a/drivers/net/ethernet/intel/e1000e/regs.h b/drivers/net/ethernet/intel/e1000e/regs.h index df59fd1d660c..8165ba2619a4 100644 --- a/drivers/net/ethernet/intel/e1000e/regs.h +++ b/drivers/net/ethernet/intel/e1000e/regs.h @@ -21,9 +21,12 @@ #define E1000_FEXTNVM5 0x00014 /* Future Extended NVM 5 - RW */ #define E1000_FEXTNVM6 0x00010 /* Future Extended NVM 6 - RW */ #define E1000_FEXTNVM7 0x000E4 /* Future Extended NVM 7 - RW */ +#define E1000_FEXTNVM8 0x5BB0 /* Future Extended NVM 8 - RW */ #define E1000_FEXTNVM9 0x5BB4 /* Future Extended NVM 9 - RW */ #define E1000_FEXTNVM11 0x5BBC /* Future Extended NVM 11 - RW */ +#define E1000_FEXTNVM12 0x5BC0 /* Future Extended NVM 12 - RW */ #define E1000_PCIEANACFG 0x00F18 /* PCIE Analog Config */ +#define E1000_DPGFR 0x00FAC /* Dynamic Power Gate Force Control Register */ #define E1000_FCT 0x00030 /* Flow Control Type - RW */ #define E1000_VET 0x00038 /* VLAN Ether Type - RW */ #define E1000_ICR 0x000C0 /* Interrupt Cause Read - R/clr */ -- cgit v1.2.3-59-g8ed1b From 0c2e060859aa1cb8b13376554d802a251f750fa9 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 18 Mar 2020 16:00:51 -0700 Subject: igc: Remove duplicate code in MAC filtering logic This patch does a code refactoring in the MAC address filtering logic to get rid of some duplicate code. IGC driver has two functions to add MAC address filters that are pretty much the same: igc_add_mac_filter() and igc_add_mac_filter_flags(). The only difference is that the latter allows the callee to specify the 'flags' parameter while the former has it hard coded as zero. The same rationale applies to filter deletion counterparts. So this patch refactors igc_add_mac_filter() and igc_del_mac_filter() so they handle the 'flags' parameters, removes the _flags() functions, and fixes callees accordingly. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 112 ++++-------------------------- 1 file changed, 13 insertions(+), 99 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 6a7c1b081792..ade460f08ed1 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2191,8 +2191,8 @@ static bool igc_mac_entry_can_be_used(const struct igc_mac_addr *entry, * default for the destination address, if matching by source address * is desired the flag IGC_MAC_STATE_SRC_ADDR can be used. */ -static int igc_add_mac_filter(struct igc_adapter *adapter, - const u8 *addr, const u8 queue) +static int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, + const u8 queue, const u8 flags) { struct igc_hw *hw = &adapter->hw; int rar_entries = hw->mac.rar_entry_count; @@ -2207,12 +2207,12 @@ static int igc_add_mac_filter(struct igc_adapter *adapter, */ for (i = 0; i < rar_entries; i++) { if (!igc_mac_entry_can_be_used(&adapter->mac_table[i], - addr, 0)) + addr, flags)) continue; ether_addr_copy(adapter->mac_table[i].addr, addr); adapter->mac_table[i].queue = queue; - adapter->mac_table[i].state |= IGC_MAC_STATE_IN_USE; + adapter->mac_table[i].state |= IGC_MAC_STATE_IN_USE | flags; igc_rar_set_index(adapter, i); return i; @@ -2227,8 +2227,8 @@ static int igc_add_mac_filter(struct igc_adapter *adapter, * matching by source address is to be removed the flag * IGC_MAC_STATE_SRC_ADDR can be used. */ -static int igc_del_mac_filter(struct igc_adapter *adapter, - const u8 *addr, const u8 queue) +static int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, + const u8 queue, const u8 flags) { struct igc_hw *hw = &adapter->hw; int rar_entries = hw->mac.rar_entry_count; @@ -2244,7 +2244,7 @@ static int igc_del_mac_filter(struct igc_adapter *adapter, for (i = 0; i < rar_entries; i++) { if (!(adapter->mac_table[i].state & IGC_MAC_STATE_IN_USE)) continue; - if (adapter->mac_table[i].state != 0) + if (flags && (adapter->mac_table[i].state & flags) != flags) continue; if (adapter->mac_table[i].queue != queue) continue; @@ -2276,7 +2276,7 @@ static int igc_uc_sync(struct net_device *netdev, const unsigned char *addr) struct igc_adapter *adapter = netdev_priv(netdev); int ret; - ret = igc_add_mac_filter(adapter, addr, adapter->num_rx_queues); + ret = igc_add_mac_filter(adapter, addr, adapter->num_rx_queues, 0); return min_t(int, ret, 0); } @@ -2285,7 +2285,7 @@ static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr) { struct igc_adapter *adapter = netdev_priv(netdev); - igc_del_mac_filter(adapter, addr, adapter->num_rx_queues); + igc_del_mac_filter(adapter, addr, adapter->num_rx_queues, 0); return 0; } @@ -3720,104 +3720,18 @@ igc_features_check(struct sk_buff *skb, struct net_device *dev, return features; } -/* Add a MAC filter for 'addr' directing matching traffic to 'queue', - * 'flags' is used to indicate what kind of match is made, match is by - * default for the destination address, if matching by source address - * is desired the flag IGC_MAC_STATE_SRC_ADDR can be used. - */ -static int igc_add_mac_filter_flags(struct igc_adapter *adapter, - const u8 *addr, const u8 queue, - const u8 flags) -{ - struct igc_hw *hw = &adapter->hw; - int rar_entries = hw->mac.rar_entry_count; - int i; - - if (is_zero_ether_addr(addr)) - return -EINVAL; - - /* Search for the first empty entry in the MAC table. - * Do not touch entries at the end of the table reserved for the VF MAC - * addresses. - */ - for (i = 0; i < rar_entries; i++) { - if (!igc_mac_entry_can_be_used(&adapter->mac_table[i], - addr, flags)) - continue; - - ether_addr_copy(adapter->mac_table[i].addr, addr); - adapter->mac_table[i].queue = queue; - adapter->mac_table[i].state |= IGC_MAC_STATE_IN_USE | flags; - - igc_rar_set_index(adapter, i); - return i; - } - - return -ENOSPC; -} - int igc_add_mac_steering_filter(struct igc_adapter *adapter, const u8 *addr, u8 queue, u8 flags) { - return igc_add_mac_filter_flags(adapter, addr, queue, - IGC_MAC_STATE_QUEUE_STEERING | flags); -} - -/* Remove a MAC filter for 'addr' directing matching traffic to - * 'queue', 'flags' is used to indicate what kind of match need to be - * removed, match is by default for the destination address, if - * matching by source address is to be removed the flag - * IGC_MAC_STATE_SRC_ADDR can be used. - */ -static int igc_del_mac_filter_flags(struct igc_adapter *adapter, - const u8 *addr, const u8 queue, - const u8 flags) -{ - struct igc_hw *hw = &adapter->hw; - int rar_entries = hw->mac.rar_entry_count; - int i; - - if (is_zero_ether_addr(addr)) - return -EINVAL; - - /* Search for matching entry in the MAC table based on given address - * and queue. Do not touch entries at the end of the table reserved - * for the VF MAC addresses. - */ - for (i = 0; i < rar_entries; i++) { - if (!(adapter->mac_table[i].state & IGC_MAC_STATE_IN_USE)) - continue; - if ((adapter->mac_table[i].state & flags) != flags) - continue; - if (adapter->mac_table[i].queue != queue) - continue; - if (!ether_addr_equal(adapter->mac_table[i].addr, addr)) - continue; - - /* When a filter for the default address is "deleted", - * we return it to its initial configuration - */ - if (adapter->mac_table[i].state & IGC_MAC_STATE_DEFAULT) { - adapter->mac_table[i].state = - IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; - } else { - adapter->mac_table[i].state = 0; - adapter->mac_table[i].queue = 0; - memset(adapter->mac_table[i].addr, 0, ETH_ALEN); - } - - igc_rar_set_index(adapter, i); - return 0; - } - - return -ENOENT; + return igc_add_mac_filter(adapter, addr, queue, + IGC_MAC_STATE_QUEUE_STEERING | flags); } int igc_del_mac_steering_filter(struct igc_adapter *adapter, const u8 *addr, u8 queue, u8 flags) { - return igc_del_mac_filter_flags(adapter, addr, queue, - IGC_MAC_STATE_QUEUE_STEERING | flags); + return igc_del_mac_filter(adapter, addr, queue, + IGC_MAC_STATE_QUEUE_STEERING | flags); } static void igc_tsync_interrupt(struct igc_adapter *adapter) -- cgit v1.2.3-59-g8ed1b From 23b7b511675669702ad32bf7f92dcf2ae05015ba Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 18 Mar 2020 16:00:52 -0700 Subject: igc: Check unsupported flag in igc_add_mac_filter() The IGC_MAC_STATE_SRC_ADDR flags is not supported by igc_add_mac_ filter() so this patch adds a check for it and returns -ENOTSUPP in case it is set. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index ade460f08ed1..66b3a689bb05 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2200,6 +2200,8 @@ static int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, if (is_zero_ether_addr(addr)) return -EINVAL; + if (flags & IGC_MAC_STATE_SRC_ADDR) + return -ENOTSUPP; /* Search for the first empty entry in the MAC table. * Do not touch entries at the end of the table reserved for the VF MAC -- cgit v1.2.3-59-g8ed1b From 58184b8ff0786b219772016a50ce07c6b3020846 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 18 Mar 2020 16:00:53 -0700 Subject: igc: Change igc_add_mac_filter() returning value In case of success, igc_add_mac_filter() returns the index in adapter->mac_table where the requested filter was added. This information, however, is not used by any caller of that function. In fact, callers have extra code just to handle this returning index as 0 (success). So this patch changes the function to return 0 on success instead, and cleans up the extra code. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 2 -- drivers/net/ethernet/intel/igc/igc_main.c | 7 ++----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index ff2a40496e4e..c9f4552c018b 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1269,7 +1269,6 @@ int igc_add_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input) err = igc_add_mac_steering_filter(adapter, input->filter.dst_addr, input->action, 0); - err = min_t(int, err, 0); if (err) return err; } @@ -1279,7 +1278,6 @@ int igc_add_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input) input->filter.src_addr, input->action, IGC_MAC_STATE_SRC_ADDR); - err = min_t(int, err, 0); if (err) return err; } diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 66b3a689bb05..7c060c731a7e 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2217,7 +2217,7 @@ static int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, adapter->mac_table[i].state |= IGC_MAC_STATE_IN_USE | flags; igc_rar_set_index(adapter, i); - return i; + return 0; } return -ENOSPC; @@ -2276,11 +2276,8 @@ static int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, static int igc_uc_sync(struct net_device *netdev, const unsigned char *addr) { struct igc_adapter *adapter = netdev_priv(netdev); - int ret; - - ret = igc_add_mac_filter(adapter, addr, adapter->num_rx_queues, 0); - return min_t(int, ret, 0); + return igc_add_mac_filter(adapter, addr, adapter->num_rx_queues, 0); } static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr) -- cgit v1.2.3-59-g8ed1b From ec00f1090735259151568f6b74d19c3230d65787 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 18 Mar 2020 16:00:54 -0700 Subject: igc: Fix igc_uc_unsync() In case igc_del_mac_filter() returns error, that error is masked since the functions always return 0 (success). This patch fixes igc_uc_unsync() so it returns whatever value igc_del_mac_filter() returns (0 on success, negative number on error). Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 7c060c731a7e..dc4632428117 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2284,9 +2284,7 @@ static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr) { struct igc_adapter *adapter = netdev_priv(netdev); - igc_del_mac_filter(adapter, addr, adapter->num_rx_queues, 0); - - return 0; + return igc_del_mac_filter(adapter, addr, adapter->num_rx_queues, 0); } /** -- cgit v1.2.3-59-g8ed1b From 424045bec085575fe4818428e6a68dac7cca48f3 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 1 Apr 2020 14:36:45 -0700 Subject: igc: Refactor igc_rar_set_index() Current igc_rar_set_index() implementation is a bit convoluted so this patch does some code refactoring to improve it. The helper igc_rar_set_index() is about writing MAC filter settings into hardware registers. Logic such as address validation belongs to functions upper in the call chain such as igc_set_mac() and igc_add_mac_filter(). So this patch moves the is_valid_ether_addr() call to igc_add_mac_filter(). No need to touch igc_set_mac() since it already checks it. The variables 'rar_low' and 'rar_high' represent the value in registers RAL and RAH so we rename them to 'ral' and 'rah', respectively, to match the registers names. To make it explicit, filter settings are passed as arguments to the function instead of reading them from adapter->mac_table "under the hood". Also, the function was renamed to igc_set_mac_filter_hw to make it more clear what it does. Finally, the patch removes some wrfl() calls and comments not needed. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 73 ++++++++++++++++++------------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index dc4632428117..2f6c8f7fa6f4 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -765,42 +765,52 @@ static void igc_setup_tctl(struct igc_adapter *adapter) } /** - * igc_rar_set_index - Sync RAL[index] and RAH[index] registers with MAC table - * @adapter: address of board private structure - * @index: Index of the RAR entry which need to be synced with MAC table + * igc_set_mac_filter_hw() - Set MAC address filter in hardware + * @adapter: Pointer to adapter where the filter should be set + * @index: Filter index + * @addr: Destination MAC address + * @queue: If non-negative, queue assignment feature is enabled and frames + * matching the filter are enqueued onto 'queue'. Otherwise, queue + * assignment is disabled. */ -static void igc_rar_set_index(struct igc_adapter *adapter, u32 index) +static void igc_set_mac_filter_hw(struct igc_adapter *adapter, int index, + const u8 *addr, int queue) { - u8 *addr = adapter->mac_table[index].addr; struct igc_hw *hw = &adapter->hw; - u32 rar_low, rar_high; + u32 ral, rah; - /* HW expects these to be in network order when they are plugged - * into the registers which are little endian. In order to guarantee - * that ordering we need to do an leXX_to_cpup here in order to be - * ready for the byteswap that occurs with writel - */ - rar_low = le32_to_cpup((__le32 *)(addr)); - rar_high = le16_to_cpup((__le16 *)(addr + 4)); + if (WARN_ON(index >= hw->mac.rar_entry_count)) + return; - if (adapter->mac_table[index].state & IGC_MAC_STATE_QUEUE_STEERING) { - u8 queue = adapter->mac_table[index].queue; - u32 qsel = IGC_RAH_QSEL_MASK & (queue << IGC_RAH_QSEL_SHIFT); + ral = le32_to_cpup((__le32 *)(addr)); + rah = le16_to_cpup((__le16 *)(addr + 4)); - rar_high |= qsel; - rar_high |= IGC_RAH_QSEL_ENABLE; + if (queue >= 0) { + rah &= ~IGC_RAH_QSEL_MASK; + rah |= (queue << IGC_RAH_QSEL_SHIFT); + rah |= IGC_RAH_QSEL_ENABLE; } - /* Indicate to hardware the Address is Valid. */ - if (adapter->mac_table[index].state & IGC_MAC_STATE_IN_USE) { - if (is_valid_ether_addr(addr)) - rar_high |= IGC_RAH_AV; - } + rah |= IGC_RAH_AV; - wr32(IGC_RAL(index), rar_low); - wrfl(); - wr32(IGC_RAH(index), rar_high); - wrfl(); + wr32(IGC_RAL(index), ral); + wr32(IGC_RAH(index), rah); +} + +/** + * igc_clear_mac_filter_hw() - Clear MAC address filter in hardware + * @adapter: Pointer to adapter where the filter should be cleared + * @index: Filter index + */ +static void igc_clear_mac_filter_hw(struct igc_adapter *adapter, int index) +{ + struct igc_hw *hw = &adapter->hw; + + if (WARN_ON(index >= hw->mac.rar_entry_count)) + return; + + wr32(IGC_RAL(index), 0); + wr32(IGC_RAH(index), 0); } /* Set default MAC address for the PF in the first RAR entry */ @@ -811,7 +821,7 @@ static void igc_set_default_mac_filter(struct igc_adapter *adapter) ether_addr_copy(mac_table->addr, adapter->hw.mac.addr); mac_table->state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; - igc_rar_set_index(adapter, 0); + igc_set_mac_filter_hw(adapter, 0, mac_table->addr, -1); } /** @@ -2198,7 +2208,7 @@ static int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, int rar_entries = hw->mac.rar_entry_count; int i; - if (is_zero_ether_addr(addr)) + if (!is_valid_ether_addr(addr)) return -EINVAL; if (flags & IGC_MAC_STATE_SRC_ADDR) return -ENOTSUPP; @@ -2216,7 +2226,7 @@ static int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, adapter->mac_table[i].queue = queue; adapter->mac_table[i].state |= IGC_MAC_STATE_IN_USE | flags; - igc_rar_set_index(adapter, i); + igc_set_mac_filter_hw(adapter, i, addr, queue); return 0; } @@ -2260,13 +2270,14 @@ static int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, adapter->mac_table[i].state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; adapter->mac_table[i].queue = 0; + igc_set_mac_filter_hw(adapter, 0, addr, -1); } else { adapter->mac_table[i].state = 0; adapter->mac_table[i].queue = 0; memset(adapter->mac_table[i].addr, 0, ETH_ALEN); + igc_clear_mac_filter_hw(adapter, i); } - igc_rar_set_index(adapter, i); return 0; } -- cgit v1.2.3-59-g8ed1b From a73eb651005eef079a6bf3b448d1b6eb607bc80b Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 18 Mar 2020 16:00:56 -0700 Subject: igc: Improve address check in igc_del_mac_filter() igc_add_mac_filter() doesn't allow filters with invalid MAC address to be added to adapter->mac_table so, in igc_del_mac_filter(), we can early return if MAC address is invalid. No need to traverse the table. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 2f6c8f7fa6f4..070df92bb4e9 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2246,7 +2246,7 @@ static int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, int rar_entries = hw->mac.rar_entry_count; int i; - if (is_zero_ether_addr(addr)) + if (!is_valid_ether_addr(addr)) return -EINVAL; /* Search for matching entry in the MAC table based on given address -- cgit v1.2.3-59-g8ed1b From c6aae5917b8a244dcb4114be806ba3ac52e3480a Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 1 Apr 2020 14:41:43 -0700 Subject: igc: Remove 'queue' check in igc_del_mac_filter() igc_add_mac_filter() doesn't allow us to have more than one entry with the same address and address type in adapter->mac_table so checking if 'queue' matches in igc_del_mac_filter() isn't necessary. This patch removes that check. This patch also takes the opportunity to improve the igc_del_mac_filter documentation and remove comment which is not applicable to this I225 controller. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 070df92bb4e9..badb8ecf38dc 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2233,14 +2233,17 @@ static int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, return -ENOSPC; } -/* Remove a MAC filter for 'addr' directing matching traffic to - * 'queue', 'flags' is used to indicate what kind of match need to be - * removed, match is by default for the destination address, if - * matching by source address is to be removed the flag - * IGC_MAC_STATE_SRC_ADDR can be used. +/** + * igc_del_mac_filter() - Delete MAC address filter + * @adapter: Pointer to adapter where the filter should be deleted from + * @addr: MAC address + * @flags: Set IGC_MAC_STATE_SRC_ADDR bit to indicate @address is a source + * address + * + * Return: 0 in case of success, negative errno code otherwise. */ static int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, - const u8 queue, const u8 flags) + const u8 flags) { struct igc_hw *hw = &adapter->hw; int rar_entries = hw->mac.rar_entry_count; @@ -2249,17 +2252,11 @@ static int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, if (!is_valid_ether_addr(addr)) return -EINVAL; - /* Search for matching entry in the MAC table based on given address - * and queue. Do not touch entries at the end of the table reserved - * for the VF MAC addresses. - */ for (i = 0; i < rar_entries; i++) { if (!(adapter->mac_table[i].state & IGC_MAC_STATE_IN_USE)) continue; if (flags && (adapter->mac_table[i].state & flags) != flags) continue; - if (adapter->mac_table[i].queue != queue) - continue; if (!ether_addr_equal(adapter->mac_table[i].addr, addr)) continue; @@ -2295,7 +2292,7 @@ static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr) { struct igc_adapter *adapter = netdev_priv(netdev); - return igc_del_mac_filter(adapter, addr, adapter->num_rx_queues, 0); + return igc_del_mac_filter(adapter, addr, 0); } /** @@ -3738,7 +3735,7 @@ int igc_add_mac_steering_filter(struct igc_adapter *adapter, int igc_del_mac_steering_filter(struct igc_adapter *adapter, const u8 *addr, u8 queue, u8 flags) { - return igc_del_mac_filter(adapter, addr, queue, + return igc_del_mac_filter(adapter, addr, IGC_MAC_STATE_QUEUE_STEERING | flags); } -- cgit v1.2.3-59-g8ed1b From e9736fa407e53224e9f23a092b17d8f2d1eb705d Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 1 Apr 2020 14:43:58 -0700 Subject: igc: Remove IGC_MAC_STATE_QUEUE_STEERING The IGC_MAC_STATE_QUEUE_STEERING bit in mac_table[i].state is utilized to indicate that frames matching the filter are assigned to mac_table[i].queue. This bit is not strictly necessary since we can convey the same information as follows: queue == -1 means queue assignment is disabled, otherwise it is enabled. In addition to make the code simpler, this change fixes some awkward situations where we pass a complete misleading 'queue' value such as in igc_uc_sync(). So this patch removes IGC_MAC_STATE_QUEUE_STEERING and also takes the opportunity to improve the igc_add_mac_filter documentation. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc.h | 3 +-- drivers/net/ethernet/intel/igc/igc_main.c | 32 ++++++++++++++++++------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 5f21dcfe99ce..8d5ebe2103ee 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -466,14 +466,13 @@ struct igc_nfc_filter { struct igc_mac_addr { u8 addr[ETH_ALEN]; - u8 queue; + s8 queue; u8 state; /* bitmask */ }; #define IGC_MAC_STATE_DEFAULT 0x1 #define IGC_MAC_STATE_IN_USE 0x2 #define IGC_MAC_STATE_SRC_ADDR 0x4 -#define IGC_MAC_STATE_QUEUE_STEERING 0x8 #define IGC_MAX_RXNFC_FILTERS 16 diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index badb8ecf38dc..e195400cd490 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -820,8 +820,9 @@ static void igc_set_default_mac_filter(struct igc_adapter *adapter) ether_addr_copy(mac_table->addr, adapter->hw.mac.addr); mac_table->state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; + mac_table->queue = -1; - igc_set_mac_filter_hw(adapter, 0, mac_table->addr, -1); + igc_set_mac_filter_hw(adapter, 0, mac_table->addr, mac_table->queue); } /** @@ -2196,13 +2197,20 @@ static bool igc_mac_entry_can_be_used(const struct igc_mac_addr *entry, return true; } -/* Add a MAC filter for 'addr' directing matching traffic to 'queue', - * 'flags' is used to indicate what kind of match is made, match is by - * default for the destination address, if matching by source address - * is desired the flag IGC_MAC_STATE_SRC_ADDR can be used. +/** + * igc_add_mac_filter() - Add MAC address filter + * @adapter: Pointer to adapter where the filter should be added + * @addr: MAC address + * @queue: If non-negative, queue assignment feature is enabled and frames + * matching the filter are enqueued onto 'queue'. Otherwise, queue + * assignment is disabled. + * @flags: Set IGC_MAC_STATE_SRC_ADDR bit to indicate @address is a source + * address + * + * Return: 0 in case of success, negative errno code otherwise. */ static int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, - const u8 queue, const u8 flags) + const s8 queue, const u8 flags) { struct igc_hw *hw = &adapter->hw; int rar_entries = hw->mac.rar_entry_count; @@ -2266,11 +2274,11 @@ static int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, if (adapter->mac_table[i].state & IGC_MAC_STATE_DEFAULT) { adapter->mac_table[i].state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; - adapter->mac_table[i].queue = 0; + adapter->mac_table[i].queue = -1; igc_set_mac_filter_hw(adapter, 0, addr, -1); } else { adapter->mac_table[i].state = 0; - adapter->mac_table[i].queue = 0; + adapter->mac_table[i].queue = -1; memset(adapter->mac_table[i].addr, 0, ETH_ALEN); igc_clear_mac_filter_hw(adapter, i); } @@ -2285,7 +2293,7 @@ static int igc_uc_sync(struct net_device *netdev, const unsigned char *addr) { struct igc_adapter *adapter = netdev_priv(netdev); - return igc_add_mac_filter(adapter, addr, adapter->num_rx_queues, 0); + return igc_add_mac_filter(adapter, addr, -1, 0); } static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr) @@ -3728,15 +3736,13 @@ igc_features_check(struct sk_buff *skb, struct net_device *dev, int igc_add_mac_steering_filter(struct igc_adapter *adapter, const u8 *addr, u8 queue, u8 flags) { - return igc_add_mac_filter(adapter, addr, queue, - IGC_MAC_STATE_QUEUE_STEERING | flags); + return igc_add_mac_filter(adapter, addr, queue, flags); } int igc_del_mac_steering_filter(struct igc_adapter *adapter, const u8 *addr, u8 queue, u8 flags) { - return igc_del_mac_filter(adapter, addr, - IGC_MAC_STATE_QUEUE_STEERING | flags); + return igc_del_mac_filter(adapter, addr, flags); } static void igc_tsync_interrupt(struct igc_adapter *adapter) -- cgit v1.2.3-59-g8ed1b From 83ba21b9ef7706413bdaf9fa8357b93c4986d8a0 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 18 Mar 2020 16:00:59 -0700 Subject: igc: Remove igc_*_mac_steering_filter() wrappers With the previous two patches, igc_add_mac_steering_filter() and igc_del_mac_steering_filter() became a pointless wrapper of igc_add_mac_filter() and igc_del_mac_filter(). This patch removes these wrappers and update callers to call igc_add_mac_filter() and igc_del_mac_filter() directly. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc.h | 8 ++++---- drivers/net/ethernet/intel/igc/igc_ethtool.c | 20 ++++++++------------ drivers/net/ethernet/intel/igc/igc_main.c | 20 ++++---------------- 3 files changed, 16 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 8d5ebe2103ee..8ddc39482a8e 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -227,10 +227,10 @@ void igc_write_rss_indir_tbl(struct igc_adapter *adapter); bool igc_has_link(struct igc_adapter *adapter); void igc_reset(struct igc_adapter *adapter); int igc_set_spd_dplx(struct igc_adapter *adapter, u32 spd, u8 dplx); -int igc_add_mac_steering_filter(struct igc_adapter *adapter, - const u8 *addr, u8 queue, u8 flags); -int igc_del_mac_steering_filter(struct igc_adapter *adapter, - const u8 *addr, u8 queue, u8 flags); +int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, + const s8 queue, const u8 flags); +int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, + const u8 flags); void igc_update_stats(struct igc_adapter *adapter); /* igc_dump declarations */ diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index c9f4552c018b..0a8c4a7412a4 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1266,18 +1266,16 @@ int igc_add_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input) } if (input->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) { - err = igc_add_mac_steering_filter(adapter, - input->filter.dst_addr, - input->action, 0); + err = igc_add_mac_filter(adapter, input->filter.dst_addr, + input->action, 0); if (err) return err; } if (input->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) { - err = igc_add_mac_steering_filter(adapter, - input->filter.src_addr, - input->action, - IGC_MAC_STATE_SRC_ADDR); + err = igc_add_mac_filter(adapter, input->filter.src_addr, + input->action, + IGC_MAC_STATE_SRC_ADDR); if (err) return err; } @@ -1331,13 +1329,11 @@ int igc_erase_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input) ntohs(input->filter.vlan_tci)); if (input->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) - igc_del_mac_steering_filter(adapter, input->filter.src_addr, - input->action, - IGC_MAC_STATE_SRC_ADDR); + igc_del_mac_filter(adapter, input->filter.src_addr, + IGC_MAC_STATE_SRC_ADDR); if (input->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) - igc_del_mac_steering_filter(adapter, input->filter.dst_addr, - input->action, 0); + igc_del_mac_filter(adapter, input->filter.dst_addr, 0); return 0; } diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index e195400cd490..3af6ce1712d5 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2209,8 +2209,8 @@ static bool igc_mac_entry_can_be_used(const struct igc_mac_addr *entry, * * Return: 0 in case of success, negative errno code otherwise. */ -static int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, - const s8 queue, const u8 flags) +int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, + const s8 queue, const u8 flags) { struct igc_hw *hw = &adapter->hw; int rar_entries = hw->mac.rar_entry_count; @@ -2250,8 +2250,8 @@ static int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, * * Return: 0 in case of success, negative errno code otherwise. */ -static int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, - const u8 flags) +int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, + const u8 flags) { struct igc_hw *hw = &adapter->hw; int rar_entries = hw->mac.rar_entry_count; @@ -3733,18 +3733,6 @@ igc_features_check(struct sk_buff *skb, struct net_device *dev, return features; } -int igc_add_mac_steering_filter(struct igc_adapter *adapter, - const u8 *addr, u8 queue, u8 flags) -{ - return igc_add_mac_filter(adapter, addr, queue, flags); -} - -int igc_del_mac_steering_filter(struct igc_adapter *adapter, - const u8 *addr, u8 queue, u8 flags) -{ - return igc_del_mac_filter(adapter, addr, flags); -} - static void igc_tsync_interrupt(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; -- cgit v1.2.3-59-g8ed1b From 794e5bc817bcc16ca955691e957e23908edbef9c Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 18 Mar 2020 16:01:00 -0700 Subject: igc: Refactor igc_mac_entry_can_be_used() The helper igc_mac_entry_can_be_used() implementation is a bit convoluted since it does two different things: find a not-in-use slot in mac_table or find an in-use slot where the address and address type match. This patch does a code refactoring and break it up into two helper functions. With this patch we might traverse mac_table twice in some situations, but this is not harmful performance-wise (mac_table has only 16 entries and adding mac filters is not hot-path), and it improves igc_add_mac_ filter() readability considerably. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 80 ++++++++++++++++++------------- 1 file changed, 47 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 3af6ce1712d5..79a9875e0767 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2176,25 +2176,44 @@ static void igc_nfc_filter_restore(struct igc_adapter *adapter) spin_unlock(&adapter->nfc_lock); } -/* If the filter to be added and an already existing filter express - * the same address and address type, it should be possible to only - * override the other configurations, for example the queue to steer - * traffic. - */ -static bool igc_mac_entry_can_be_used(const struct igc_mac_addr *entry, - const u8 *addr, const u8 flags) +static int igc_find_mac_filter(struct igc_adapter *adapter, const u8 *addr, + u8 flags) { - if (!(entry->state & IGC_MAC_STATE_IN_USE)) - return true; + int max_entries = adapter->hw.mac.rar_entry_count; + struct igc_mac_addr *entry; + int i; - if ((entry->state & IGC_MAC_STATE_SRC_ADDR) != - (flags & IGC_MAC_STATE_SRC_ADDR)) - return false; + for (i = 0; i < max_entries; i++) { + entry = &adapter->mac_table[i]; - if (!ether_addr_equal(addr, entry->addr)) - return false; + if (!(entry->state & IGC_MAC_STATE_IN_USE)) + continue; + if (!ether_addr_equal(addr, entry->addr)) + continue; + if ((entry->state & IGC_MAC_STATE_SRC_ADDR) != + (flags & IGC_MAC_STATE_SRC_ADDR)) + continue; - return true; + return i; + } + + return -1; +} + +static int igc_get_avail_mac_filter_slot(struct igc_adapter *adapter) +{ + int max_entries = adapter->hw.mac.rar_entry_count; + struct igc_mac_addr *entry; + int i; + + for (i = 0; i < max_entries; i++) { + entry = &adapter->mac_table[i]; + + if (!(entry->state & IGC_MAC_STATE_IN_USE)) + return i; + } + + return -1; } /** @@ -2212,33 +2231,28 @@ static bool igc_mac_entry_can_be_used(const struct igc_mac_addr *entry, int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, const s8 queue, const u8 flags) { - struct igc_hw *hw = &adapter->hw; - int rar_entries = hw->mac.rar_entry_count; - int i; + int index; if (!is_valid_ether_addr(addr)) return -EINVAL; if (flags & IGC_MAC_STATE_SRC_ADDR) return -ENOTSUPP; - /* Search for the first empty entry in the MAC table. - * Do not touch entries at the end of the table reserved for the VF MAC - * addresses. - */ - for (i = 0; i < rar_entries; i++) { - if (!igc_mac_entry_can_be_used(&adapter->mac_table[i], - addr, flags)) - continue; + index = igc_find_mac_filter(adapter, addr, flags); + if (index >= 0) + goto update_queue_assignment; - ether_addr_copy(adapter->mac_table[i].addr, addr); - adapter->mac_table[i].queue = queue; - adapter->mac_table[i].state |= IGC_MAC_STATE_IN_USE | flags; + index = igc_get_avail_mac_filter_slot(adapter); + if (index < 0) + return -ENOSPC; - igc_set_mac_filter_hw(adapter, i, addr, queue); - return 0; - } + ether_addr_copy(adapter->mac_table[index].addr, addr); + adapter->mac_table[index].state |= IGC_MAC_STATE_IN_USE | flags; +update_queue_assignment: + adapter->mac_table[index].queue = queue; - return -ENOSPC; + igc_set_mac_filter_hw(adapter, index, addr, queue); + return 0; } /** -- cgit v1.2.3-59-g8ed1b From 5f930713728b5551f5c07b918cc8030dacb75fe1 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 18 Mar 2020 16:01:01 -0700 Subject: igc: Refactor igc_del_mac_filter() This patch does a code refactoring in igc_del_mac_filter() so it uses the new helper igc_find_mac_filter() and improves the comment about the special handling when deleting the default filter. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 45 +++++++++++++------------------ 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 79a9875e0767..78753f12b8a0 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2267,40 +2267,33 @@ update_queue_assignment: int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, const u8 flags) { - struct igc_hw *hw = &adapter->hw; - int rar_entries = hw->mac.rar_entry_count; - int i; + struct igc_mac_addr *entry; + int index; if (!is_valid_ether_addr(addr)) return -EINVAL; - for (i = 0; i < rar_entries; i++) { - if (!(adapter->mac_table[i].state & IGC_MAC_STATE_IN_USE)) - continue; - if (flags && (adapter->mac_table[i].state & flags) != flags) - continue; - if (!ether_addr_equal(adapter->mac_table[i].addr, addr)) - continue; + index = igc_find_mac_filter(adapter, addr, flags); + if (index < 0) + return -ENOENT; - /* When a filter for the default address is "deleted", - * we return it to its initial configuration - */ - if (adapter->mac_table[i].state & IGC_MAC_STATE_DEFAULT) { - adapter->mac_table[i].state = - IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; - adapter->mac_table[i].queue = -1; - igc_set_mac_filter_hw(adapter, 0, addr, -1); - } else { - adapter->mac_table[i].state = 0; - adapter->mac_table[i].queue = -1; - memset(adapter->mac_table[i].addr, 0, ETH_ALEN); - igc_clear_mac_filter_hw(adapter, i); - } + entry = &adapter->mac_table[index]; - return 0; + if (entry->state & IGC_MAC_STATE_DEFAULT) { + /* If this is the default filter, we don't actually delete it. + * We just reset to its default value i.e. disable queue + * assignment. + */ + entry->queue = -1; + igc_set_mac_filter_hw(adapter, 0, addr, entry->queue); + } else { + entry->state = 0; + entry->queue = -1; + memset(entry->addr, 0, ETH_ALEN); + igc_clear_mac_filter_hw(adapter, index); } - return -ENOENT; + return 0; } static int igc_uc_sync(struct net_device *netdev, const unsigned char *addr) -- cgit v1.2.3-59-g8ed1b From 949b922e8b1b2bebfec90c68edd888723ab8bc23 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 18 Mar 2020 16:01:02 -0700 Subject: igc: Add debug messages to MAC filter code This patch adds log messages to functions related to the MAC address filtering code to ease debugging. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_main.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 78753f12b8a0..9d5f8287c704 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -776,6 +776,7 @@ static void igc_setup_tctl(struct igc_adapter *adapter) static void igc_set_mac_filter_hw(struct igc_adapter *adapter, int index, const u8 *addr, int queue) { + struct net_device *dev = adapter->netdev; struct igc_hw *hw = &adapter->hw; u32 ral, rah; @@ -795,6 +796,8 @@ static void igc_set_mac_filter_hw(struct igc_adapter *adapter, int index, wr32(IGC_RAL(index), ral); wr32(IGC_RAH(index), rah); + + netdev_dbg(dev, "MAC address filter set in HW: index %d", index); } /** @@ -804,6 +807,7 @@ static void igc_set_mac_filter_hw(struct igc_adapter *adapter, int index, */ static void igc_clear_mac_filter_hw(struct igc_adapter *adapter, int index) { + struct net_device *dev = adapter->netdev; struct igc_hw *hw = &adapter->hw; if (WARN_ON(index >= hw->mac.rar_entry_count)) @@ -811,18 +815,24 @@ static void igc_clear_mac_filter_hw(struct igc_adapter *adapter, int index) wr32(IGC_RAL(index), 0); wr32(IGC_RAH(index), 0); + + netdev_dbg(dev, "MAC address filter cleared in HW: index %d", index); } /* Set default MAC address for the PF in the first RAR entry */ static void igc_set_default_mac_filter(struct igc_adapter *adapter) { struct igc_mac_addr *mac_table = &adapter->mac_table[0]; + struct net_device *dev = adapter->netdev; + u8 *addr = adapter->hw.mac.addr; + + netdev_dbg(dev, "Set default MAC address filter: address %pM", addr); - ether_addr_copy(mac_table->addr, adapter->hw.mac.addr); + ether_addr_copy(mac_table->addr, addr); mac_table->state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; mac_table->queue = -1; - igc_set_mac_filter_hw(adapter, 0, mac_table->addr, mac_table->queue); + igc_set_mac_filter_hw(adapter, 0, addr, mac_table->queue); } /** @@ -2231,6 +2241,7 @@ static int igc_get_avail_mac_filter_slot(struct igc_adapter *adapter) int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, const s8 queue, const u8 flags) { + struct net_device *dev = adapter->netdev; int index; if (!is_valid_ether_addr(addr)) @@ -2246,6 +2257,9 @@ int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, if (index < 0) return -ENOSPC; + netdev_dbg(dev, "Add MAC address filter: index %d address %pM queue %d", + index, addr, queue); + ether_addr_copy(adapter->mac_table[index].addr, addr); adapter->mac_table[index].state |= IGC_MAC_STATE_IN_USE | flags; update_queue_assignment: @@ -2267,6 +2281,7 @@ update_queue_assignment: int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, const u8 flags) { + struct net_device *dev = adapter->netdev; struct igc_mac_addr *entry; int index; @@ -2284,9 +2299,14 @@ int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, * We just reset to its default value i.e. disable queue * assignment. */ + netdev_dbg(dev, "Disable default MAC filter queue assignment"); + entry->queue = -1; igc_set_mac_filter_hw(adapter, 0, addr, entry->queue); } else { + netdev_dbg(dev, "Delete MAC address filter: index %d address %pM", + index, addr); + entry->state = 0; entry->queue = -1; memset(entry->addr, 0, ETH_ALEN); -- cgit v1.2.3-59-g8ed1b From 135e30180ff4f654794960aba04785fe7eef2e90 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Apr 2020 22:50:52 +0300 Subject: net: dsa: sja1105: enable internal pull-down for RX_DV/CRS_DV/RX_CTL and RX_ER Some boards do not have the RX_ER MII signal connected. Normally in such situation, those pins would be grounded, but then again, some boards left it electrically floating. When sending traffic to those switch ports, one can see that the N_SOFERR statistics counter is incrementing once per each packet. The user manual states for this counter that it may count the number of frames "that have the MII error input being asserted prior to or up to the SOF delimiter byte". So the switch MAC is sampling an electrically floating signal, and preventing proper traffic reception because of that. As a workaround, enable the internal weak pull-downs on the input pads for the MII control signals. This way, a floating signal would be internally tied to ground. The logic levels of signals which _are_ externally driven should not be bothered by this 40-50 KOhm internal resistor. So it is not an issue to enable the internal pull-down unconditionally, irrespective of PHY interface type (MII, RMII, RGMII, SGMII) and of board layout. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105.h | 1 + drivers/net/dsa/sja1105/sja1105_clocking.c | 58 ++++++++++++++++++++++++++---- drivers/net/dsa/sja1105/sja1105_spi.c | 2 ++ 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 8b60dbd567f2..2f62942692ec 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -49,6 +49,7 @@ struct sja1105_regs { u64 ptpschtm; u64 ptpegr_ts[SJA1105_NUM_PORTS]; u64 pad_mii_tx[SJA1105_NUM_PORTS]; + u64 pad_mii_rx[SJA1105_NUM_PORTS]; u64 pad_mii_id[SJA1105_NUM_PORTS]; u64 cgu_idiv[SJA1105_NUM_PORTS]; u64 mii_tx_clk[SJA1105_NUM_PORTS]; diff --git a/drivers/net/dsa/sja1105/sja1105_clocking.c b/drivers/net/dsa/sja1105/sja1105_clocking.c index 0fdc2d55fff6..2a9b8a6a5306 100644 --- a/drivers/net/dsa/sja1105/sja1105_clocking.c +++ b/drivers/net/dsa/sja1105/sja1105_clocking.c @@ -7,12 +7,16 @@ #define SJA1105_SIZE_CGU_CMD 4 -struct sja1105_cfg_pad_mii_tx { +/* Common structure for CFG_PAD_MIIx_RX and CFG_PAD_MIIx_TX */ +struct sja1105_cfg_pad_mii { u64 d32_os; + u64 d32_ih; u64 d32_ipud; + u64 d10_ih; u64 d10_os; u64 d10_ipud; u64 ctrl_os; + u64 ctrl_ih; u64 ctrl_ipud; u64 clk_os; u64 clk_ih; @@ -338,16 +342,19 @@ static int sja1105_cgu_rgmii_tx_clk_config(struct sja1105_private *priv, /* AGU */ static void -sja1105_cfg_pad_mii_tx_packing(void *buf, struct sja1105_cfg_pad_mii_tx *cmd, - enum packing_op op) +sja1105_cfg_pad_mii_packing(void *buf, struct sja1105_cfg_pad_mii *cmd, + enum packing_op op) { const int size = 4; sja1105_packing(buf, &cmd->d32_os, 28, 27, size, op); + sja1105_packing(buf, &cmd->d32_ih, 26, 26, size, op); sja1105_packing(buf, &cmd->d32_ipud, 25, 24, size, op); sja1105_packing(buf, &cmd->d10_os, 20, 19, size, op); + sja1105_packing(buf, &cmd->d10_ih, 18, 18, size, op); sja1105_packing(buf, &cmd->d10_ipud, 17, 16, size, op); sja1105_packing(buf, &cmd->ctrl_os, 12, 11, size, op); + sja1105_packing(buf, &cmd->ctrl_ih, 10, 10, size, op); sja1105_packing(buf, &cmd->ctrl_ipud, 9, 8, size, op); sja1105_packing(buf, &cmd->clk_os, 4, 3, size, op); sja1105_packing(buf, &cmd->clk_ih, 2, 2, size, op); @@ -358,7 +365,7 @@ static int sja1105_rgmii_cfg_pad_tx_config(struct sja1105_private *priv, int port) { const struct sja1105_regs *regs = priv->info->regs; - struct sja1105_cfg_pad_mii_tx pad_mii_tx; + struct sja1105_cfg_pad_mii pad_mii_tx = {0}; u8 packed_buf[SJA1105_SIZE_CGU_CMD] = {0}; /* Payload */ @@ -375,12 +382,45 @@ static int sja1105_rgmii_cfg_pad_tx_config(struct sja1105_private *priv, pad_mii_tx.clk_os = 3; /* TX_CLK output stage */ pad_mii_tx.clk_ih = 0; /* TX_CLK input hysteresis (default) */ pad_mii_tx.clk_ipud = 2; /* TX_CLK input stage (default) */ - sja1105_cfg_pad_mii_tx_packing(packed_buf, &pad_mii_tx, PACK); + sja1105_cfg_pad_mii_packing(packed_buf, &pad_mii_tx, PACK); return sja1105_xfer_buf(priv, SPI_WRITE, regs->pad_mii_tx[port], packed_buf, SJA1105_SIZE_CGU_CMD); } +static int sja1105_cfg_pad_rx_config(struct sja1105_private *priv, int port) +{ + const struct sja1105_regs *regs = priv->info->regs; + struct sja1105_cfg_pad_mii pad_mii_rx = {0}; + u8 packed_buf[SJA1105_SIZE_CGU_CMD] = {0}; + + /* Payload */ + pad_mii_rx.d32_ih = 0; /* RXD[3:2] input stage hysteresis: */ + /* non-Schmitt (default) */ + pad_mii_rx.d32_ipud = 2; /* RXD[3:2] input weak pull-up/down */ + /* plain input (default) */ + pad_mii_rx.d10_ih = 0; /* RXD[1:0] input stage hysteresis: */ + /* non-Schmitt (default) */ + pad_mii_rx.d10_ipud = 2; /* RXD[1:0] input weak pull-up/down */ + /* plain input (default) */ + pad_mii_rx.ctrl_ih = 0; /* RX_DV/CRS_DV/RX_CTL and RX_ER */ + /* input stage hysteresis: */ + /* non-Schmitt (default) */ + pad_mii_rx.ctrl_ipud = 3; /* RX_DV/CRS_DV/RX_CTL and RX_ER */ + /* input stage weak pull-up/down: */ + /* pull-down */ + pad_mii_rx.clk_os = 2; /* RX_CLK/RXC output stage: */ + /* medium noise/fast speed (default) */ + pad_mii_rx.clk_ih = 0; /* RX_CLK/RXC input hysteresis: */ + /* non-Schmitt (default) */ + pad_mii_rx.clk_ipud = 2; /* RX_CLK/RXC input pull-up/down: */ + /* plain input (default) */ + sja1105_cfg_pad_mii_packing(packed_buf, &pad_mii_rx, PACK); + + return sja1105_xfer_buf(priv, SPI_WRITE, regs->pad_mii_rx[port], + packed_buf, SJA1105_SIZE_CGU_CMD); +} + static void sja1105_cfg_pad_mii_id_packing(void *buf, struct sja1105_cfg_pad_mii_id *cmd, enum packing_op op) @@ -669,10 +709,14 @@ int sja1105_clocking_setup_port(struct sja1105_private *priv, int port) phy_mode); return -EINVAL; } - if (rc) + if (rc) { dev_err(dev, "Clocking setup for port %d failed: %d\n", port, rc); - return rc; + return rc; + } + + /* Internally pull down the RX_DV/CRS_DV/RX_CTL and RX_ER inputs */ + return sja1105_cfg_pad_rx_config(priv, port); } int sja1105_clocking_setup(struct sja1105_private *priv) diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index 04bdb72ae6b6..43f14a5c2718 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -443,6 +443,7 @@ static struct sja1105_regs sja1105et_regs = { .rgu = 0x100440, /* UM10944.pdf, Table 86, ACU Register overview */ .pad_mii_tx = {0x100800, 0x100802, 0x100804, 0x100806, 0x100808}, + .pad_mii_rx = {0x100801, 0x100803, 0x100805, 0x100807, 0x100809}, .rmii_pll1 = 0x10000A, .cgu_idiv = {0x10000B, 0x10000C, 0x10000D, 0x10000E, 0x10000F}, .mac = {0x200, 0x202, 0x204, 0x206, 0x208}, @@ -475,6 +476,7 @@ static struct sja1105_regs sja1105pqrs_regs = { .rgu = 0x100440, /* UM10944.pdf, Table 86, ACU Register overview */ .pad_mii_tx = {0x100800, 0x100802, 0x100804, 0x100806, 0x100808}, + .pad_mii_rx = {0x100801, 0x100803, 0x100805, 0x100807, 0x100809}, .pad_mii_id = {0x100810, 0x100811, 0x100812, 0x100813, 0x100814}, .sgmii = 0x1F0000, .rmii_pll1 = 0x10000A, -- cgit v1.2.3-59-g8ed1b From 0673f976285e0570437612329962c339300d013e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 18 Apr 2020 09:51:54 +0800 Subject: ptp_kvm: Make kvm_ptp_lock static Fix sparse warning: drivers/ptp/ptp_kvm.c:25:1: warning: symbol 'kvm_ptp_lock' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c index fc7d0b77e118..658d33fc3195 100644 --- a/drivers/ptp/ptp_kvm.c +++ b/drivers/ptp/ptp_kvm.c @@ -22,7 +22,7 @@ struct kvm_ptp_clock { struct ptp_clock_info caps; }; -DEFINE_SPINLOCK(kvm_ptp_lock); +static DEFINE_SPINLOCK(kvm_ptp_lock); static struct pvclock_vsyscall_time_info *hv_clock; -- cgit v1.2.3-59-g8ed1b From d30e1c3db96467f1f444d84023c2c37821202aeb Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 18 Apr 2020 10:01:49 +0800 Subject: ptp: idt82p33: Make two variables static Fix sparse warnings: drivers/ptp/ptp_idt82p33.c:26:5: warning: symbol 'sync_tod_timeout' was not declared. Should it be static? drivers/ptp/ptp_idt82p33.c:31:5: warning: symbol 'phase_snap_threshold' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_idt82p33.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_idt82p33.c b/drivers/ptp/ptp_idt82p33.c index b63ac240308b..31ea811b6d5f 100644 --- a/drivers/ptp/ptp_idt82p33.c +++ b/drivers/ptp/ptp_idt82p33.c @@ -23,12 +23,12 @@ MODULE_VERSION("1.0"); MODULE_LICENSE("GPL"); /* Module Parameters */ -u32 sync_tod_timeout = SYNC_TOD_TIMEOUT_SEC; +static u32 sync_tod_timeout = SYNC_TOD_TIMEOUT_SEC; module_param(sync_tod_timeout, uint, 0); MODULE_PARM_DESC(sync_tod_timeout, "duration in second to keep SYNC_TOD on (set to 0 to keep it always on)"); -u32 phase_snap_threshold = SNAP_THRESHOLD_NS; +static u32 phase_snap_threshold = SNAP_THRESHOLD_NS; module_param(phase_snap_threshold, uint, 0); MODULE_PARM_DESC(phase_snap_threshold, "threshold (150000ns by default) below which adjtime would ignore"); -- cgit v1.2.3-59-g8ed1b From 6d92797716003a462bd75bbda3740718ce54bcdd Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Sat, 18 Apr 2020 16:42:12 +0800 Subject: net: hns: use true,false for bool variables Fix the following coccicheck warning: drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c:700:2-8: WARNING: Assignment of 0/1 to bool variable drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c:702:2-8: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c index 8aace2de0cc9..9a907947ba19 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c @@ -697,9 +697,9 @@ hns_mac_register_phydev(struct mii_bus *mdio, struct hns_mac_cb *mac_cb, return rc; if (!strcmp(phy_type, phy_modes(PHY_INTERFACE_MODE_XGMII))) - is_c45 = 1; + is_c45 = true; else if (!strcmp(phy_type, phy_modes(PHY_INTERFACE_MODE_SGMII))) - is_c45 = 0; + is_c45 = false; else return -ENODATA; -- cgit v1.2.3-59-g8ed1b From fee698d62b3b0621dba8b33454042a4fe82521f3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 18 Apr 2020 22:08:51 +0200 Subject: net: phy: realtek: add delay to resume path of certain internal PHY's Internal PHY's from RTL8168h up may not be instantly ready after calling genphy_resume(). So far r8169 network driver adds the needed delay, but better handle this in the PHY driver. The network driver may miss other places where the PHY is resumed. Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/realtek.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index 2d99e9de6ee1..c7229d022a27 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -11,6 +11,7 @@ #include #include #include +#include #define RTL821x_PHYSR 0x11 #define RTL821x_PHYSR_DUPLEX BIT(13) @@ -526,6 +527,16 @@ static int rtl8125_match_phy_device(struct phy_device *phydev) rtlgen_supports_2_5gbps(phydev); } +static int rtlgen_resume(struct phy_device *phydev) +{ + int ret = genphy_resume(phydev); + + /* Internal PHY's from RTL8168h up may not be instantly ready */ + msleep(20); + + return ret; +} + static struct phy_driver realtek_drvs[] = { { PHY_ID_MATCH_EXACT(0x00008201), @@ -609,7 +620,7 @@ static struct phy_driver realtek_drvs[] = { .match_phy_device = rtlgen_match_phy_device, .read_status = rtlgen_read_status, .suspend = genphy_suspend, - .resume = genphy_resume, + .resume = rtlgen_resume, .read_page = rtl821x_read_page, .write_page = rtl821x_write_page, .read_mmd = rtlgen_read_mmd, @@ -621,7 +632,7 @@ static struct phy_driver realtek_drvs[] = { .config_aneg = rtl8125_config_aneg, .read_status = rtl8125_read_status, .suspend = genphy_suspend, - .resume = genphy_resume, + .resume = rtlgen_resume, .read_page = rtl821x_read_page, .write_page = rtl821x_write_page, .read_mmd = rtl8125_read_mmd, -- cgit v1.2.3-59-g8ed1b From 109f0cf23b094b0b780a22ce1c9ea267ebcd3974 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 18 Apr 2020 22:09:42 +0200 Subject: r8169: remove PHY resume delay that is handled in the PHY driver now The Realtek PHY driver takes care of adding the needed delay now, therefore we can remove the delay here. Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index a8696d958cd1..1bc415d00cb8 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2391,8 +2391,6 @@ static void rtl_pll_power_up(struct rtl8169_private *tp) } phy_resume(tp->phydev); - /* give MAC/PHY some time to resume */ - msleep(20); } static void rtl_init_rxcfg(struct rtl8169_private *tp) -- cgit v1.2.3-59-g8ed1b From c290d1ab12d3385da1d7be909e6b09caea3325bb Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 18 Apr 2020 20:17:13 -0700 Subject: net: phy: Propagate error from bus->reset If a bus->reset() call for the mii_bus structure returns an error (e.g.: -EPROE_DEFER) we should propagate it accordingly. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 7a4eb3f2cb74..346e88435d29 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -627,8 +627,11 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) gpiod_set_value_cansleep(gpiod, 0); } - if (bus->reset) - bus->reset(bus); + if (bus->reset) { + err = bus->reset(bus); + if (err) + goto error_reset_gpiod; + } for (i = 0; i < PHY_MAX_ADDR; i++) { if ((bus->phy_mask & (1 << i)) == 0) { @@ -657,7 +660,7 @@ error: mdiodev->device_remove(mdiodev); mdiodev->device_free(mdiodev); } - +error_reset_gpiod: /* Put PHYs in RESET to save power */ if (bus->reset_gpiod) gpiod_set_value_cansleep(bus->reset_gpiod, 1); -- cgit v1.2.3-59-g8ed1b From cec2500d44751a7782d16ce99fca104e77e324a5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 19 Apr 2020 10:01:05 +0300 Subject: mlxsw: spectrum_router: Re-increase scale of IPv6 nexthop groups As explained in commit fc25996e6f46 ("mlxsw: spectrum_router: Increase scale of IPv6 nexthop groups"), each nexthop group is hashed by XOR-ing the interface indexes of all the member nexthop devices. To avoid many different nexthop groups ending up using the same key, the above commit started hashing the interface indexes themselves before they are XOR-ed. However, in cases in which there are many nexthop groups that all use the same nexthop device and only differ in the gateway IP, we can still end up in a situation in which all the groups are using the same key. This eventually leads to -EBUSY error from rhashtable during insertion. Improve the situation by also making the gateway IP part of the key. Signed-off-by: Ido Schimmel Reported-by: Alex Veber Reviewed-by: Jiri Pirko Tested-by: Alex Veber Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index d5bca1be3ef5..71aee4914619 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2999,6 +2999,7 @@ static u32 mlxsw_sp_nexthop_group_hash_obj(const void *data, u32 len, u32 seed) for (i = 0; i < nh_grp->count; i++) { nh = &nh_grp->nexthops[i]; val ^= jhash(&nh->ifindex, sizeof(nh->ifindex), seed); + val ^= jhash(&nh->gw_addr, sizeof(nh->gw_addr), seed); } return jhash(&val, sizeof(val), seed); default: @@ -3012,11 +3013,14 @@ mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed) { unsigned int val = fib6_entry->nrt6; struct mlxsw_sp_rt6 *mlxsw_sp_rt6; - struct net_device *dev; list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { - dev = mlxsw_sp_rt6->rt->fib6_nh->fib_nh_dev; + struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh; + struct net_device *dev = fib6_nh->fib_nh_dev; + struct in6_addr *gw = &fib6_nh->fib_nh_gw6; + val ^= jhash(&dev->ifindex, sizeof(dev->ifindex), seed); + val ^= jhash(gw, sizeof(*gw), seed); } return jhash(&val, sizeof(val), seed); -- cgit v1.2.3-59-g8ed1b From b7f03b0b2a213474f452a802769ecd9fcee209a9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 19 Apr 2020 10:01:06 +0300 Subject: mlxsw: reg: Increase register field length to 13 bits The Infrastructure Entry Delete Register (IEDR) is used to delete entries stored in the KVD linear database. Currently, it is only possible to delete entries of size up to 2048. Future firmware versions will support deletion of entries of size up to 4096. Increase the size of the field so that the driver will be able to perform such deletions in the future, when required. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 9b39b8e70519..3c3db1c874b6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -3203,7 +3203,7 @@ MLXSW_ITEM32_INDEXED(reg, iedr, rec_type, MLXSW_REG_IEDR_BASE_LEN, 24, 8, * Size of entries do be deleted. The unit is 1 entry, regardless of entry type. * Access: OP */ -MLXSW_ITEM32_INDEXED(reg, iedr, rec_size, MLXSW_REG_IEDR_BASE_LEN, 0, 11, +MLXSW_ITEM32_INDEXED(reg, iedr, rec_size, MLXSW_REG_IEDR_BASE_LEN, 0, 13, MLXSW_REG_IEDR_REC_LEN, 0x00, false); /* reg_iedr_rec_index_start -- cgit v1.2.3-59-g8ed1b From cceadc831e728fde74c21813519962c648f5ca7c Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Sun, 19 Apr 2020 10:27:57 +0200 Subject: net: phy: mscc: use mdiobus_get_phy() Don't use internal knowledge of the mdio bus core, instead use mdiobus_get_phy() which does the same thing. Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mscc/mscc_main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index acddef79f4e8..5391acdece05 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -1292,7 +1292,7 @@ out: */ static bool vsc8584_is_pkg_init(struct phy_device *phydev, bool reversed) { - struct mdio_device **map = phydev->mdio.bus->mdio_map; + struct mii_bus *bus = phydev->mdio.bus; struct vsc8531_private *vsc8531; struct phy_device *phy; int i, addr; @@ -1306,11 +1306,10 @@ static bool vsc8584_is_pkg_init(struct phy_device *phydev, bool reversed) else addr = vsc8531->base_addr + i; - if (!map[addr]) + phy = mdiobus_get_phy(bus, addr); + if (!phy) continue; - phy = container_of(map[addr], struct phy_device, mdio); - if ((phy->phy_id & phydev->drv->phy_id_mask) != (phydev->drv->phy_id & phydev->drv->phy_id_mask)) continue; -- cgit v1.2.3-59-g8ed1b From b66c9b8de22b666718c2fcb0ae84ce620f9b81c0 Mon Sep 17 00:00:00 2001 From: Lourdes Pedrajas Date: Sun, 19 Apr 2020 11:16:51 +0200 Subject: selftests: pmtu: implement IPIP, SIT and ip6tnl PMTU discovery tests Add PMTU discovery tests for these encapsulations: - IPIP - SIT, mode ip6ip - ip6tnl, modes ip6ip6 and ipip6 Signed-off-by: Lourdes Pedrajas Reviewed-by: Stefano Brivio Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 122 ++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 71a62e7e35b1..77c09cd339c3 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -67,6 +67,10 @@ # Same as pmtu_ipv4_vxlan4, but using a generic UDP IPv4/IPv6 # encapsulation (GUE) over IPv4/IPv6, instead of VXLAN # +# - pmtu_ipv{4,6}_ipv{4,6}_exception +# Same as pmtu_ipv4_vxlan4, but using a IPv4/IPv6 tunnel over IPv4/IPv6, +# instead of VXLAN +# # - pmtu_vti4_exception # Set up vti tunnel on top of veth, with xfrm states and policies, in two # namespaces with matching endpoints. Check that route exception is not @@ -151,6 +155,10 @@ tests=" pmtu_ipv6_gue4_exception IPv6 over gue4: PMTU exceptions 1 pmtu_ipv4_gue6_exception IPv4 over gue6: PMTU exceptions 1 pmtu_ipv6_gue6_exception IPv6 over gue6: PMTU exceptions 1 + pmtu_ipv4_ipv4_exception IPv4 over IPv4: PMTU exceptions 1 + pmtu_ipv6_ipv4_exception IPv6 over IPv4: PMTU exceptions 1 + pmtu_ipv4_ipv6_exception IPv4 over IPv6: PMTU exceptions 1 + pmtu_ipv6_ipv6_exception IPv6 over IPv6: PMTU exceptions 1 pmtu_vti6_exception vti6: PMTU exceptions 0 pmtu_vti4_exception vti4: PMTU exceptions 0 pmtu_vti4_default_mtu vti4: default MTU assignment 0 @@ -363,6 +371,62 @@ setup_gue66() { setup_fou_or_gue 6 6 gue } +setup_ipvX_over_ipvY() { + inner=${1} + outer=${2} + + if [ "${outer}" -eq 4 ]; then + a_addr="${prefix4}.${a_r1}.1" + b_addr="${prefix4}.${b_r1}.1" + if [ "${inner}" -eq 4 ]; then + type="ipip" + mode="ipip" + else + type="sit" + mode="ip6ip" + fi + else + a_addr="${prefix6}:${a_r1}::1" + b_addr="${prefix6}:${b_r1}::1" + type="ip6tnl" + if [ "${inner}" -eq 4 ]; then + mode="ipip6" + else + mode="ip6ip6" + fi + fi + + run_cmd ${ns_a} ip link add ip_a type ${type} local ${a_addr} remote ${b_addr} mode ${mode} || return 2 + run_cmd ${ns_b} ip link add ip_b type ${type} local ${b_addr} remote ${a_addr} mode ${mode} + + run_cmd ${ns_a} ip link set ip_a up + run_cmd ${ns_b} ip link set ip_b up + + if [ "${inner}" = "4" ]; then + run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ip_a + run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ip_b + else + run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ip_a + run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ip_b + fi +} + +setup_ip4ip4() { + setup_ipvX_over_ipvY 4 4 +} + +setup_ip6ip4() { + setup_ipvX_over_ipvY 6 4 +} + +setup_ip4ip6() { + setup_ipvX_over_ipvY 4 6 +} + +setup_ip6ip6() { + setup_ipvX_over_ipvY 6 6 +} + setup_namespaces() { for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do ip netns add ${n} || return 1 @@ -908,6 +972,64 @@ test_pmtu_ipv6_gue6_exception() { test_pmtu_ipvX_over_fouY_or_gueY 6 6 gue } +test_pmtu_ipvX_over_ipvY_exception() { + inner=${1} + outer=${2} + ll_mtu=4000 + + setup namespaces routing ip${inner}ip${outer} || return 2 + + trace "${ns_a}" ip_a "${ns_b}" ip_b \ + "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B + + if [ ${inner} -eq 4 ]; then + ping=ping + dst=${tunnel4_b_addr} + else + ping=${ping6} + dst=${tunnel6_b_addr} + fi + + if [ ${outer} -eq 4 ]; then + # IPv4 header + exp_mtu=$((${ll_mtu} - 20)) + else + # IPv6 header Option 4 + exp_mtu=$((${ll_mtu} - 40 - 8)) + fi + + # Create route exception by exceeding link layer MTU + mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000)) + mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000)) + mtu "${ns_b}" veth_B-R1 ${ll_mtu} + mtu "${ns_r1}" veth_R1-B ${ll_mtu} + + mtu "${ns_a}" ip_a $((${ll_mtu} + 1000)) || return + mtu "${ns_b}" ip_b $((${ll_mtu} + 1000)) || return + run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} + + # Check that exception was created + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" + check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ip${inner}ip${outer} interface" +} + +test_pmtu_ipv4_ipv4_exception() { + test_pmtu_ipvX_over_ipvY_exception 4 4 +} + +test_pmtu_ipv6_ipv4_exception() { + test_pmtu_ipvX_over_ipvY_exception 6 4 +} + +test_pmtu_ipv4_ipv6_exception() { + test_pmtu_ipvX_over_ipvY_exception 4 6 +} + +test_pmtu_ipv6_ipv6_exception() { + test_pmtu_ipvX_over_ipvY_exception 6 6 +} + test_pmtu_vti4_exception() { setup namespaces veth vti4 xfrm4 || return 2 trace "${ns_a}" veth_a "${ns_b}" veth_b \ -- cgit v1.2.3-59-g8ed1b From 92efe48e8fe2b6848ca56e14436c2b7d20c986d4 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Sun, 19 Apr 2020 20:02:53 +0800 Subject: net: ethernet: dnet: convert to devm_platform_get_and_ioremap_resource use devm_platform_get_and_ioremap_resource() to simplify code, which contains platform_get_resource() and devm_ioremap_resource(), it also get the resource for use by the following code. Signed-off-by: Dejin Zheng Signed-off-by: David S. Miller --- drivers/net/ethernet/dnet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/dnet.c b/drivers/net/ethernet/dnet.c index 057a508dd6e2..db98274501a0 100644 --- a/drivers/net/ethernet/dnet.c +++ b/drivers/net/ethernet/dnet.c @@ -776,8 +776,7 @@ static int dnet_probe(struct platform_device *pdev) spin_lock_init(&bp->lock); - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - bp->regs = devm_ioremap_resource(&pdev->dev, res); + bp->regs = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(bp->regs)) { err = PTR_ERR(bp->regs); goto err_out_free_dev; -- cgit v1.2.3-59-g8ed1b From 5333fdbed0c860c7a17dc12f328203d29947eff7 Mon Sep 17 00:00:00 2001 From: Aishwarya Ramakrishnan Date: Sun, 19 Apr 2020 21:14:43 +0530 Subject: net: sun: Remove unneeded cast from memory allocation Remove casting the values returned by memory allocation function. Coccinelle emits WARNING: casting value returned by memory allocation function to (struct cas_init_block *) is useless. This issue was detected by using the Coccinelle software. Signed-off-by: Aishwarya Ramakrishnan Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/cassini.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index e6d1aa882fa5..3ee6ab104cb9 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -5059,7 +5059,7 @@ static int cas_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (cp->cas_flags & CAS_FLAG_SATURN) cas_saturn_firmware_init(cp); - cp->init_block = (struct cas_init_block *) + cp->init_block = pci_alloc_consistent(pdev, sizeof(struct cas_init_block), &cp->block_dvma); if (!cp->init_block) { -- cgit v1.2.3-59-g8ed1b From 745e5ad5084db32e093a143de324afa4ed95f14d Mon Sep 17 00:00:00 2001 From: Aishwarya Ramakrishnan Date: Sun, 19 Apr 2020 21:59:17 +0530 Subject: net: qed: Remove unneeded cast from memory allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove casting the values returned by memory allocation function. Coccinelle emits WARNING: casting value returned by memory allocation function to struct pointer is useless. This issue was detected by using the Coccinelle. Signed-off-by: Aishwarya Ramakrishnan Acked-by: Michal Kalderon  Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_roce.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 37e70562a964..475b89903f46 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -736,9 +736,9 @@ static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn, p_ramrod = &p_ent->ramrod.roce_destroy_qp_resp; - p_ramrod_res = (struct roce_destroy_qp_resp_output_params *) - dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_ramrod_res), - &ramrod_res_phys, GFP_KERNEL); + p_ramrod_res = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_ramrod_res), + &ramrod_res_phys, GFP_KERNEL); if (!p_ramrod_res) { rc = -ENOMEM; @@ -872,10 +872,10 @@ int qed_roce_query_qp(struct qed_hwfn *p_hwfn, } /* Send a query responder ramrod to FW to get RQ-PSN and state */ - p_resp_ramrod_res = (struct roce_query_qp_resp_output_params *) - dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, - sizeof(*p_resp_ramrod_res), - &resp_ramrod_res_phys, GFP_KERNEL); + p_resp_ramrod_res = + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_resp_ramrod_res), + &resp_ramrod_res_phys, GFP_KERNEL); if (!p_resp_ramrod_res) { DP_NOTICE(p_hwfn, "qed query qp failed: cannot allocate memory (ramrod)\n"); @@ -920,8 +920,7 @@ int qed_roce_query_qp(struct qed_hwfn *p_hwfn, } /* Send a query requester ramrod to FW to get SQ-PSN and state */ - p_req_ramrod_res = (struct roce_query_qp_req_output_params *) - dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + p_req_ramrod_res = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_req_ramrod_res), &req_ramrod_res_phys, GFP_KERNEL); -- cgit v1.2.3-59-g8ed1b From 2ac1fa439ee97aaaa124c37340a316918bb0a8bc Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 19 Apr 2020 23:07:39 +0200 Subject: r8169: inline rtl8169_mark_as_last_descriptor rtl8169_mark_as_last_descriptor() has just one user, so inline it. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 1bc415d00cb8..3b8ae49c3ea2 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -3939,11 +3939,6 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp) } } -static inline void rtl8169_mark_as_last_descriptor(struct RxDesc *desc) -{ - desc->opts1 |= cpu_to_le32(RingEnd); -} - static int rtl8169_rx_fill(struct rtl8169_private *tp) { unsigned int i; @@ -3959,7 +3954,8 @@ static int rtl8169_rx_fill(struct rtl8169_private *tp) tp->Rx_databuff[i] = data; } - rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1); + /* mark as last descriptor in the ring */ + tp->RxDescArray[NUM_RX_DESC - 1].opts1 |= cpu_to_le32(RingEnd); return 0; } -- cgit v1.2.3-59-g8ed1b From 9d3679fe0f30b5a4c1ae6303611c9f2bbeb9961d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 19 Apr 2020 23:16:55 +0200 Subject: r8169: inline rtl8169_make_unusable_by_asic Inline rtl8169_make_unusable_by_asic() and simplify it: - Address field doesn't need to be poisoned because descriptor is owned by CPU now - desc->opts1 is set by rtl8169_mark_to_asic() and rtl8169_rx_fill(), therefore we don't have to preserve any field parts. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 3b8ae49c3ea2..b8cc064ee9f5 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -3882,12 +3882,6 @@ static int rtl8169_change_mtu(struct net_device *dev, int new_mtu) return 0; } -static inline void rtl8169_make_unusable_by_asic(struct RxDesc *desc) -{ - desc->addr = cpu_to_le64(0x0badbadbadbadbadull); - desc->opts1 &= ~cpu_to_le32(DescOwn | RsvdMask); -} - static inline void rtl8169_mark_to_asic(struct RxDesc *desc) { u32 eor = le32_to_cpu(desc->opts1) & RingEnd; @@ -3935,7 +3929,8 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp) R8169_RX_BUF_SIZE, DMA_FROM_DEVICE); __free_pages(tp->Rx_databuff[i], get_order(R8169_RX_BUF_SIZE)); tp->Rx_databuff[i] = NULL; - rtl8169_make_unusable_by_asic(tp->RxDescArray + i); + tp->RxDescArray[i].addr = 0; + tp->RxDescArray[i].opts1 = 0; } } -- cgit v1.2.3-59-g8ed1b From 29ae6bd1b0d8a57d7c00ab12cbb949fc41986eef Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:04:00 +0200 Subject: net: ethernet: fec: Replace interrupt driven MDIO with polled IO Measurements of the MDIO bus have shown that driving the MDIO bus using interrupts is slow. Back to back MDIO transactions take about 90us, with 25us spent performing the transaction, and the remainder of the time the bus is idle. Replacing the completion interrupt with polled IO results in back to back transactions of 40us. The polling loop waiting for the hardware to complete the transaction takes around 28us. Which suggests interrupt handling has an overhead of 50us, and polled IO nearly halves this overhead, and doubles the MDIO performance. Suggested-by: Chris Heally Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec.h | 4 +- drivers/net/ethernet/freescale/fec_main.c | 67 ++++++++++++++++--------------- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index e74dd1f86bba..a6cdd5b61921 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -376,8 +376,7 @@ struct bufdesc_ex { #define FEC_ENET_TS_AVAIL ((uint)0x00010000) #define FEC_ENET_TS_TIMER ((uint)0x00008000) -#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII) -#define FEC_NAPI_IMASK FEC_ENET_MII +#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF) #define FEC_RX_DISABLED_IMASK (FEC_DEFAULT_IMASK & (~FEC_ENET_RXF)) /* ENET interrupt coalescing macro define */ @@ -543,7 +542,6 @@ struct fec_enet_private { int link; int full_duplex; int speed; - struct completion mdio_done; int irq[FEC_IRQ_NUM]; bool bufdesc_ex; int pause_flag; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index dc6f8763a5d4..2267bf75784e 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -976,8 +976,8 @@ fec_restart(struct net_device *ndev) writel((__force u32)cpu_to_be32(temp_mac[1]), fep->hwp + FEC_ADDR_HIGH); - /* Clear any outstanding interrupt. */ - writel(0xffffffff, fep->hwp + FEC_IEVENT); + /* Clear any outstanding interrupt, except MDIO. */ + writel((0xffffffff & ~FEC_ENET_MII), fep->hwp + FEC_IEVENT); fec_enet_bd_init(ndev); @@ -1123,7 +1123,7 @@ fec_restart(struct net_device *ndev) if (fep->link) writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK); else - writel(FEC_ENET_MII, fep->hwp + FEC_IMASK); + writel(0, fep->hwp + FEC_IMASK); /* Init the interrupt coalescing */ fec_enet_itr_coal_init(ndev); @@ -1652,6 +1652,10 @@ fec_enet_interrupt(int irq, void *dev_id) irqreturn_t ret = IRQ_NONE; int_events = readl(fep->hwp + FEC_IEVENT); + + /* Don't clear MDIO events, we poll for those */ + int_events &= ~FEC_ENET_MII; + writel(int_events, fep->hwp + FEC_IEVENT); fec_enet_collect_events(fep, int_events); @@ -1659,16 +1663,12 @@ fec_enet_interrupt(int irq, void *dev_id) ret = IRQ_HANDLED; if (napi_schedule_prep(&fep->napi)) { - /* Disable the NAPI interrupts */ - writel(FEC_NAPI_IMASK, fep->hwp + FEC_IMASK); + /* Disable interrupts */ + writel(0, fep->hwp + FEC_IMASK); __napi_schedule(&fep->napi); } } - if (int_events & FEC_ENET_MII) { - ret = IRQ_HANDLED; - complete(&fep->mdio_done); - } return ret; } @@ -1818,11 +1818,24 @@ static void fec_enet_adjust_link(struct net_device *ndev) phy_print_status(phy_dev); } +static int fec_enet_mdio_wait(struct fec_enet_private *fep) +{ + uint ievent; + int ret; + + ret = readl_poll_timeout_atomic(fep->hwp + FEC_IEVENT, ievent, + ievent & FEC_ENET_MII, 2, 30000); + + if (!ret) + writel(FEC_ENET_MII, fep->hwp + FEC_IEVENT); + + return ret; +} + static int fec_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum) { struct fec_enet_private *fep = bus->priv; struct device *dev = &fep->pdev->dev; - unsigned long time_left; int ret = 0, frame_start, frame_addr, frame_op; bool is_c45 = !!(regnum & MII_ADDR_C45); @@ -1830,8 +1843,6 @@ static int fec_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum) if (ret < 0) return ret; - reinit_completion(&fep->mdio_done); - if (is_c45) { frame_start = FEC_MMFR_ST_C45; @@ -1843,11 +1854,9 @@ static int fec_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum) fep->hwp + FEC_MII_DATA); /* wait for end of transfer */ - time_left = wait_for_completion_timeout(&fep->mdio_done, - usecs_to_jiffies(FEC_MII_TIMEOUT)); - if (time_left == 0) { + ret = fec_enet_mdio_wait(fep); + if (ret) { netdev_err(fep->netdev, "MDIO address write timeout\n"); - ret = -ETIMEDOUT; goto out; } @@ -1866,11 +1875,9 @@ static int fec_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum) FEC_MMFR_TA, fep->hwp + FEC_MII_DATA); /* wait for end of transfer */ - time_left = wait_for_completion_timeout(&fep->mdio_done, - usecs_to_jiffies(FEC_MII_TIMEOUT)); - if (time_left == 0) { + ret = fec_enet_mdio_wait(fep); + if (ret) { netdev_err(fep->netdev, "MDIO read timeout\n"); - ret = -ETIMEDOUT; goto out; } @@ -1888,7 +1895,6 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, { struct fec_enet_private *fep = bus->priv; struct device *dev = &fep->pdev->dev; - unsigned long time_left; int ret, frame_start, frame_addr; bool is_c45 = !!(regnum & MII_ADDR_C45); @@ -1898,8 +1904,6 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, else ret = 0; - reinit_completion(&fep->mdio_done); - if (is_c45) { frame_start = FEC_MMFR_ST_C45; @@ -1911,11 +1915,9 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, fep->hwp + FEC_MII_DATA); /* wait for end of transfer */ - time_left = wait_for_completion_timeout(&fep->mdio_done, - usecs_to_jiffies(FEC_MII_TIMEOUT)); - if (time_left == 0) { + ret = fec_enet_mdio_wait(fep); + if (ret) { netdev_err(fep->netdev, "MDIO address write timeout\n"); - ret = -ETIMEDOUT; goto out; } } else { @@ -1931,12 +1933,9 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, fep->hwp + FEC_MII_DATA); /* wait for end of transfer */ - time_left = wait_for_completion_timeout(&fep->mdio_done, - usecs_to_jiffies(FEC_MII_TIMEOUT)); - if (time_left == 0) { + ret = fec_enet_mdio_wait(fep); + if (ret) netdev_err(fep->netdev, "MDIO write timeout\n"); - ret = -ETIMEDOUT; - } out: pm_runtime_mark_last_busy(dev); @@ -2132,6 +2131,9 @@ static int fec_enet_mii_init(struct platform_device *pdev) writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); + /* Clear any pending transaction complete indication */ + writel(FEC_ENET_MII, fep->hwp + FEC_IEVENT); + fep->mii_bus = mdiobus_alloc(); if (fep->mii_bus == NULL) { err = -ENOMEM; @@ -3674,7 +3676,6 @@ fec_probe(struct platform_device *pdev) fep->irq[i] = irq; } - init_completion(&fep->mdio_done); ret = fec_enet_mii_init(pdev); if (ret) goto failed_mii_init; -- cgit v1.2.3-59-g8ed1b From 3e782985cb3ce00a32c372b37d8feefdae18ddf1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:04:01 +0200 Subject: net: ethernet: fec: Allow configuration of MDIO bus speed MDIO busses typically operate at 2.5MHz. However many devices can operate at faster speeds. This then allows more MDIO transactions per second, useful for Ethernet switch statistics, or Ethernet PHY TDR data. Allow the bus speed to be configured, using the standard "clock-frequency" property, which i2c busses use to indicate the bus speed. Before using this property, ensure all devices on the bus do actually support the requested clock speed. Suggested-by: Chris Healy Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/fsl-fec.txt | 1 + Documentation/devicetree/bindings/net/mdio.yaml | 6 ++++++ drivers/net/ethernet/freescale/fec_main.c | 11 ++++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/net/fsl-fec.txt b/Documentation/devicetree/bindings/net/fsl-fec.txt index ff8b0f211aa1..26c492a2e0e1 100644 --- a/Documentation/devicetree/bindings/net/fsl-fec.txt +++ b/Documentation/devicetree/bindings/net/fsl-fec.txt @@ -82,6 +82,7 @@ ethernet@83fec000 { phy-supply = <®_fec_supply>; phy-handle = <ðphy>; mdio { + clock-frequency = <5000000>; ethphy: ethernet-phy@6 { compatible = "ethernet-phy-ieee802.3-c22"; reg = <6>; diff --git a/Documentation/devicetree/bindings/net/mdio.yaml b/Documentation/devicetree/bindings/net/mdio.yaml index 50c3397a82bc..ab4a9df8b8e2 100644 --- a/Documentation/devicetree/bindings/net/mdio.yaml +++ b/Documentation/devicetree/bindings/net/mdio.yaml @@ -39,6 +39,12 @@ properties: and must therefore be appropriately determined based on all PHY requirements (maximum value of all per-PHY RESET pulse widths). + clock-frequency: + description: + Desired MDIO bus clock frequency in Hz. Values greater than IEEE 802.3 + defined 2.5MHz should only be used when all devices on the bus support + the given clock speed. + patternProperties: "^ethernet-phy@[0-9a-f]+$": type: object diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 2267bf75784e..832a24e2805c 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2067,6 +2067,7 @@ static int fec_enet_mii_init(struct platform_device *pdev) struct device_node *node; int err = -ENXIO; u32 mii_speed, holdtime; + u32 bus_freq; /* * The i.MX28 dual fec interfaces are not equal. @@ -2094,15 +2095,20 @@ static int fec_enet_mii_init(struct platform_device *pdev) return -ENOENT; } + bus_freq = 2500000; /* 2.5MHz by default */ + node = of_get_child_by_name(pdev->dev.of_node, "mdio"); + if (node) + of_property_read_u32(node, "clock-frequency", &bus_freq); + /* - * Set MII speed to 2.5 MHz (= clk_get_rate() / 2 * phy_speed) + * Set MII speed (= clk_get_rate() / 2 * phy_speed) * * The formula for FEC MDC is 'ref_freq / (MII_SPEED x 2)' while * for ENET-MAC is 'ref_freq / ((MII_SPEED + 1) x 2)'. The i.MX28 * Reference Manual has an error on this, and gets fixed on i.MX6Q * document. */ - mii_speed = DIV_ROUND_UP(clk_get_rate(fep->clk_ipg), 5000000); + mii_speed = DIV_ROUND_UP(clk_get_rate(fep->clk_ipg), bus_freq * 2); if (fep->quirks & FEC_QUIRK_ENET_MAC) mii_speed--; if (mii_speed > 63) { @@ -2148,7 +2154,6 @@ static int fec_enet_mii_init(struct platform_device *pdev) fep->mii_bus->priv = fep; fep->mii_bus->parent = &pdev->dev; - node = of_get_child_by_name(pdev->dev.of_node, "mdio"); err = of_mdiobus_register(fep->mii_bus, node); of_node_put(node); if (err) -- cgit v1.2.3-59-g8ed1b From 3c01eb62d1bd85a5dd1d22d74339728666ae2c45 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:04:02 +0200 Subject: net: ethernet: fec: Allow the MDIO preamble to be disabled An MDIO transaction normally starts with 32 1s as a preamble. However not all devices requires such a preamble. Add a device tree property which allows the preamble to be suppressed. This will half the size of the MDIO transaction, allowing faster transactions. But it should only be used when all devices on the bus support suppressed preamble. Suggested-by: Chris Healy Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/mdio.yaml | 6 ++++++ drivers/net/ethernet/freescale/fec_main.c | 9 ++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/mdio.yaml b/Documentation/devicetree/bindings/net/mdio.yaml index ab4a9df8b8e2..cd6c6ae6dabb 100644 --- a/Documentation/devicetree/bindings/net/mdio.yaml +++ b/Documentation/devicetree/bindings/net/mdio.yaml @@ -45,6 +45,12 @@ properties: defined 2.5MHz should only be used when all devices on the bus support the given clock speed. + suppress-preamble: + description: + The 32 bit preamble should be suppressed. In order for this to + work, all devices on the bus must support suppressed preamble. + type: boolean + patternProperties: "^ethernet-phy@[0-9a-f]+$": type: object diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 832a24e2805c..1ae075a246a3 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2064,6 +2064,7 @@ static int fec_enet_mii_init(struct platform_device *pdev) static struct mii_bus *fec0_mii_bus; struct net_device *ndev = platform_get_drvdata(pdev); struct fec_enet_private *fep = netdev_priv(ndev); + bool suppress_preamble = false; struct device_node *node; int err = -ENXIO; u32 mii_speed, holdtime; @@ -2097,8 +2098,11 @@ static int fec_enet_mii_init(struct platform_device *pdev) bus_freq = 2500000; /* 2.5MHz by default */ node = of_get_child_by_name(pdev->dev.of_node, "mdio"); - if (node) + if (node) { of_property_read_u32(node, "clock-frequency", &bus_freq); + suppress_preamble = of_property_read_bool(node, + "suppress-preamble"); + } /* * Set MII speed (= clk_get_rate() / 2 * phy_speed) @@ -2135,6 +2139,9 @@ static int fec_enet_mii_init(struct platform_device *pdev) fep->phy_speed = mii_speed << 1 | holdtime << 8; + if (suppress_preamble) + fep->phy_speed |= BIT(7); + writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); /* Clear any pending transaction complete indication */ -- cgit v1.2.3-59-g8ed1b From eec517cdb4810b3843eb7707971de3164088bff1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:50 +0200 Subject: net: Add IF_OPER_TESTING RFC 2863 defines the operational state testing. Add support for this state, both as a IF_LINK_MODE_ and __LINK_STATE_. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/if.h | 1 + net/core/dev.c | 5 +++++ net/core/link_watch.c | 12 ++++++++++-- net/core/rtnetlink.c | 9 ++++++++- 5 files changed, 65 insertions(+), 3 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..0750b54b3765 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -288,6 +288,7 @@ enum netdev_state_t { __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, + __LINK_STATE_TESTING, }; @@ -3907,6 +3908,46 @@ static inline bool netif_dormant(const struct net_device *dev) } +/** + * netif_testing_on - mark device as under test. + * @dev: network device + * + * Mark device as under test (as per RFC2863). + * + * The testing state indicates that some test(s) must be performed on + * the interface. After completion, of the test, the interface state + * will change to up, dormant, or down, as appropriate. + */ +static inline void netif_testing_on(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing_off - set device as not under test. + * @dev: network device + * + * Device is not in testing state. + */ +static inline void netif_testing_off(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing - test if device is under test + * @dev: network device + * + * Check if device is under test + */ +static inline bool netif_testing(const struct net_device *dev) +{ + return test_bit(__LINK_STATE_TESTING, &dev->state); +} + + /** * netif_oper_up - test if device is operational * @dev: network device diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index be714cd8c826..797ba2c1562a 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -178,6 +178,7 @@ enum { enum { IF_LINK_MODE_DEFAULT, IF_LINK_MODE_DORMANT, /* limit upward transition to dormant */ + IF_LINK_MODE_TESTING, /* limit upward transition to testing */ }; /* diff --git a/net/core/dev.c b/net/core/dev.c index 522288177bbd..fb61522b1ce1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9136,6 +9136,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else diff --git a/net/core/link_watch.c b/net/core/link_watch.c index f153e0601838..75431ca9300f 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -34,6 +34,9 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { + if (netif_testing(dev)) + return IF_OPER_TESTING; + if (!netif_carrier_ok(dev)) return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); @@ -55,11 +58,15 @@ static void rfc2863_policy(struct net_device *dev) write_lock_bh(&dev_base_lock); switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; - case IF_LINK_MODE_DEFAULT: default: break; @@ -74,7 +81,8 @@ static void rfc2863_policy(struct net_device *dev) void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ - if (!netif_carrier_ok(dev) || netif_dormant(dev)) + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) rfc2863_policy(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 709ebbf8ab5b..d6f4f4a9e8ba 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -829,11 +829,18 @@ static void set_operstate(struct net_device *dev, unsigned char transition) switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) + !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; + case IF_OPER_TESTING: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_TESTING; + break; + case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) -- cgit v1.2.3-59-g8ed1b From db30a57779b18b7cef092c21887ed2d23ad2bd35 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:51 +0200 Subject: net: Add testing sysfs attribute Similar to speed, duplex and dorment, report the testing status in sysfs. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 13 +++++++++++++ net/core/net-sysfs.c | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 664a8f6a634f..3b404577f380 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -124,6 +124,19 @@ Description: authentication is performed (e.g: 802.1x). 'link_mode' attribute will also reflect the dormant state. +What: /sys/class/net//testing +Date: April 2002 +KernelVersion: 5.8 +Contact: netdev@vger.kernel.org +Description: + Indicates whether the interface is under test. Possible + values are: + 0: interface is not being tested + 1: interface is being tested + + When an interface is under test, it cannot be expected + to pass packets as normal. + What: /sys/clas/net//duplex Date: October 2009 KernelVersion: 2.6.33 diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4773ad6ec111..0d9e46de205e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -243,6 +243,18 @@ static ssize_t duplex_show(struct device *dev, } static DEVICE_ATTR_RO(duplex); +static ssize_t testing_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sprintf(buf, fmt_dec, !!netif_testing(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RO(testing); + static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,7 +272,7 @@ static const char *const operstates[] = { "notpresent", /* currently unused */ "down", "lowerlayerdown", - "testing", /* currently unused */ + "testing", "dormant", "up" }; @@ -524,6 +536,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, + &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, -- cgit v1.2.3-59-g8ed1b From 77e9b2ab451d32c81468ef679c377b2f831cd720 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:52 +0200 Subject: net: ethtool: self_test: Mark interface in testing operative status When an interface is executing a self test, put the interface into operative status testing. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 89d0b1827aaf..593fa665f820 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1746,7 +1746,9 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) if (!data) return -ENOMEM; + netif_testing_on(dev); ops->self_test(dev, &test, data); + netif_testing_off(dev); ret = -EFAULT; if (copy_to_user(useraddr, &test, sizeof(test))) -- cgit v1.2.3-59-g8ed1b From 736fc0e17fade807e59cd9001af88ec4bcca62ef Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 20 Apr 2020 10:17:26 +0800 Subject: net: hns3: split out hclge_fd_check_ether_tuple() For readability and maintainability, this patch separates the handling part of each flow type in hclge_fd_check_ether_tuple() into standalone functions. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 285 +++++++++++++-------- 1 file changed, 173 insertions(+), 112 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index a758f9ae32be..80d0651145df 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -5244,157 +5244,158 @@ static int hclge_config_action(struct hclge_dev *hdev, u8 stage, return hclge_fd_ad_config(hdev, stage, ad_data.ad_id, &ad_data); } -static int hclge_fd_check_spec(struct hclge_dev *hdev, - struct ethtool_rx_flow_spec *fs, u32 *unused) +static int hclge_fd_check_tcpip4_tuple(struct ethtool_tcpip4_spec *spec, + u32 *unused_tuple) { - struct ethtool_tcpip4_spec *tcp_ip4_spec; - struct ethtool_usrip4_spec *usr_ip4_spec; - struct ethtool_tcpip6_spec *tcp_ip6_spec; - struct ethtool_usrip6_spec *usr_ip6_spec; - struct ethhdr *ether_spec; - - if (fs->location >= hdev->fd_cfg.rule_num[HCLGE_FD_STAGE_1]) + if (!spec || !unused_tuple) return -EINVAL; - if (!(fs->flow_type & hdev->fd_cfg.proto_support)) - return -EOPNOTSUPP; - - if ((fs->flow_type & FLOW_EXT) && - (fs->h_ext.data[0] != 0 || fs->h_ext.data[1] != 0)) { - dev_err(&hdev->pdev->dev, "user-def bytes are not supported\n"); - return -EOPNOTSUPP; - } + *unused_tuple |= BIT(INNER_SRC_MAC) | BIT(INNER_DST_MAC); - switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { - case SCTP_V4_FLOW: - case TCP_V4_FLOW: - case UDP_V4_FLOW: - tcp_ip4_spec = &fs->h_u.tcp_ip4_spec; - *unused |= BIT(INNER_SRC_MAC) | BIT(INNER_DST_MAC); + if (!spec->ip4src) + *unused_tuple |= BIT(INNER_SRC_IP); - if (!tcp_ip4_spec->ip4src) - *unused |= BIT(INNER_SRC_IP); + if (!spec->ip4dst) + *unused_tuple |= BIT(INNER_DST_IP); - if (!tcp_ip4_spec->ip4dst) - *unused |= BIT(INNER_DST_IP); + if (!spec->psrc) + *unused_tuple |= BIT(INNER_SRC_PORT); - if (!tcp_ip4_spec->psrc) - *unused |= BIT(INNER_SRC_PORT); + if (!spec->pdst) + *unused_tuple |= BIT(INNER_DST_PORT); - if (!tcp_ip4_spec->pdst) - *unused |= BIT(INNER_DST_PORT); + if (!spec->tos) + *unused_tuple |= BIT(INNER_IP_TOS); - if (!tcp_ip4_spec->tos) - *unused |= BIT(INNER_IP_TOS); + return 0; +} - break; - case IP_USER_FLOW: - usr_ip4_spec = &fs->h_u.usr_ip4_spec; - *unused |= BIT(INNER_SRC_MAC) | BIT(INNER_DST_MAC) | - BIT(INNER_SRC_PORT) | BIT(INNER_DST_PORT); +static int hclge_fd_check_ip4_tuple(struct ethtool_usrip4_spec *spec, + u32 *unused_tuple) +{ + if (!spec || !unused_tuple) + return -EINVAL; - if (!usr_ip4_spec->ip4src) - *unused |= BIT(INNER_SRC_IP); + *unused_tuple |= BIT(INNER_SRC_MAC) | BIT(INNER_DST_MAC) | + BIT(INNER_SRC_PORT) | BIT(INNER_DST_PORT); - if (!usr_ip4_spec->ip4dst) - *unused |= BIT(INNER_DST_IP); + if (!spec->ip4src) + *unused_tuple |= BIT(INNER_SRC_IP); - if (!usr_ip4_spec->tos) - *unused |= BIT(INNER_IP_TOS); + if (!spec->ip4dst) + *unused_tuple |= BIT(INNER_DST_IP); - if (!usr_ip4_spec->proto) - *unused |= BIT(INNER_IP_PROTO); + if (!spec->tos) + *unused_tuple |= BIT(INNER_IP_TOS); - if (usr_ip4_spec->l4_4_bytes) - return -EOPNOTSUPP; + if (!spec->proto) + *unused_tuple |= BIT(INNER_IP_PROTO); - if (usr_ip4_spec->ip_ver != ETH_RX_NFC_IP4) - return -EOPNOTSUPP; + if (spec->l4_4_bytes) + return -EOPNOTSUPP; - break; - case SCTP_V6_FLOW: - case TCP_V6_FLOW: - case UDP_V6_FLOW: - tcp_ip6_spec = &fs->h_u.tcp_ip6_spec; - *unused |= BIT(INNER_SRC_MAC) | BIT(INNER_DST_MAC) | - BIT(INNER_IP_TOS); + if (spec->ip_ver != ETH_RX_NFC_IP4) + return -EOPNOTSUPP; - /* check whether src/dst ip address used */ - if (!tcp_ip6_spec->ip6src[0] && !tcp_ip6_spec->ip6src[1] && - !tcp_ip6_spec->ip6src[2] && !tcp_ip6_spec->ip6src[3]) - *unused |= BIT(INNER_SRC_IP); + return 0; +} - if (!tcp_ip6_spec->ip6dst[0] && !tcp_ip6_spec->ip6dst[1] && - !tcp_ip6_spec->ip6dst[2] && !tcp_ip6_spec->ip6dst[3]) - *unused |= BIT(INNER_DST_IP); +static int hclge_fd_check_tcpip6_tuple(struct ethtool_tcpip6_spec *spec, + u32 *unused_tuple) +{ + if (!spec || !unused_tuple) + return -EINVAL; - if (!tcp_ip6_spec->psrc) - *unused |= BIT(INNER_SRC_PORT); + *unused_tuple |= BIT(INNER_SRC_MAC) | BIT(INNER_DST_MAC) | + BIT(INNER_IP_TOS); - if (!tcp_ip6_spec->pdst) - *unused |= BIT(INNER_DST_PORT); + /* check whether src/dst ip address used */ + if (!spec->ip6src[0] && !spec->ip6src[1] && + !spec->ip6src[2] && !spec->ip6src[3]) + *unused_tuple |= BIT(INNER_SRC_IP); - if (tcp_ip6_spec->tclass) - return -EOPNOTSUPP; + if (!spec->ip6dst[0] && !spec->ip6dst[1] && + !spec->ip6dst[2] && !spec->ip6dst[3]) + *unused_tuple |= BIT(INNER_DST_IP); - break; - case IPV6_USER_FLOW: - usr_ip6_spec = &fs->h_u.usr_ip6_spec; - *unused |= BIT(INNER_SRC_MAC) | BIT(INNER_DST_MAC) | - BIT(INNER_IP_TOS) | BIT(INNER_SRC_PORT) | - BIT(INNER_DST_PORT); + if (!spec->psrc) + *unused_tuple |= BIT(INNER_SRC_PORT); - /* check whether src/dst ip address used */ - if (!usr_ip6_spec->ip6src[0] && !usr_ip6_spec->ip6src[1] && - !usr_ip6_spec->ip6src[2] && !usr_ip6_spec->ip6src[3]) - *unused |= BIT(INNER_SRC_IP); + if (!spec->pdst) + *unused_tuple |= BIT(INNER_DST_PORT); - if (!usr_ip6_spec->ip6dst[0] && !usr_ip6_spec->ip6dst[1] && - !usr_ip6_spec->ip6dst[2] && !usr_ip6_spec->ip6dst[3]) - *unused |= BIT(INNER_DST_IP); + if (spec->tclass) + return -EOPNOTSUPP; - if (!usr_ip6_spec->l4_proto) - *unused |= BIT(INNER_IP_PROTO); + return 0; +} - if (usr_ip6_spec->tclass) - return -EOPNOTSUPP; +static int hclge_fd_check_ip6_tuple(struct ethtool_usrip6_spec *spec, + u32 *unused_tuple) +{ + if (!spec || !unused_tuple) + return -EINVAL; - if (usr_ip6_spec->l4_4_bytes) - return -EOPNOTSUPP; + *unused_tuple |= BIT(INNER_SRC_MAC) | BIT(INNER_DST_MAC) | + BIT(INNER_IP_TOS) | BIT(INNER_SRC_PORT) | BIT(INNER_DST_PORT); - break; - case ETHER_FLOW: - ether_spec = &fs->h_u.ether_spec; - *unused |= BIT(INNER_SRC_IP) | BIT(INNER_DST_IP) | - BIT(INNER_SRC_PORT) | BIT(INNER_DST_PORT) | - BIT(INNER_IP_TOS) | BIT(INNER_IP_PROTO); + /* check whether src/dst ip address used */ + if (!spec->ip6src[0] && !spec->ip6src[1] && + !spec->ip6src[2] && !spec->ip6src[3]) + *unused_tuple |= BIT(INNER_SRC_IP); - if (is_zero_ether_addr(ether_spec->h_source)) - *unused |= BIT(INNER_SRC_MAC); + if (!spec->ip6dst[0] && !spec->ip6dst[1] && + !spec->ip6dst[2] && !spec->ip6dst[3]) + *unused_tuple |= BIT(INNER_DST_IP); - if (is_zero_ether_addr(ether_spec->h_dest)) - *unused |= BIT(INNER_DST_MAC); + if (!spec->l4_proto) + *unused_tuple |= BIT(INNER_IP_PROTO); - if (!ether_spec->h_proto) - *unused |= BIT(INNER_ETH_TYPE); + if (spec->tclass) + return -EOPNOTSUPP; - break; - default: + if (spec->l4_4_bytes) return -EOPNOTSUPP; - } + return 0; +} + +static int hclge_fd_check_ether_tuple(struct ethhdr *spec, u32 *unused_tuple) +{ + if (!spec || !unused_tuple) + return -EINVAL; + + *unused_tuple |= BIT(INNER_SRC_IP) | BIT(INNER_DST_IP) | + BIT(INNER_SRC_PORT) | BIT(INNER_DST_PORT) | + BIT(INNER_IP_TOS) | BIT(INNER_IP_PROTO); + + if (is_zero_ether_addr(spec->h_source)) + *unused_tuple |= BIT(INNER_SRC_MAC); + + if (is_zero_ether_addr(spec->h_dest)) + *unused_tuple |= BIT(INNER_DST_MAC); + + if (!spec->h_proto) + *unused_tuple |= BIT(INNER_ETH_TYPE); + + return 0; +} + +static int hclge_fd_check_ext_tuple(struct hclge_dev *hdev, + struct ethtool_rx_flow_spec *fs, + u32 *unused_tuple) +{ if ((fs->flow_type & FLOW_EXT)) { if (fs->h_ext.vlan_etype) return -EOPNOTSUPP; if (!fs->h_ext.vlan_tci) - *unused |= BIT(INNER_VLAN_TAG_FST); + *unused_tuple |= BIT(INNER_VLAN_TAG_FST); - if (fs->m_ext.vlan_tci) { - if (be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID) - return -EINVAL; - } + if (fs->m_ext.vlan_tci && + be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID) + return -EINVAL; } else { - *unused |= BIT(INNER_VLAN_TAG_FST); + *unused_tuple |= BIT(INNER_VLAN_TAG_FST); } if (fs->flow_type & FLOW_MAC_EXT) { @@ -5402,14 +5403,74 @@ static int hclge_fd_check_spec(struct hclge_dev *hdev, return -EOPNOTSUPP; if (is_zero_ether_addr(fs->h_ext.h_dest)) - *unused |= BIT(INNER_DST_MAC); + *unused_tuple |= BIT(INNER_DST_MAC); else - *unused &= ~(BIT(INNER_DST_MAC)); + *unused_tuple &= ~(BIT(INNER_DST_MAC)); } return 0; } +static int hclge_fd_check_spec(struct hclge_dev *hdev, + struct ethtool_rx_flow_spec *fs, + u32 *unused_tuple) +{ + int ret; + + if (fs->location >= hdev->fd_cfg.rule_num[HCLGE_FD_STAGE_1]) + return -EINVAL; + + if (!(fs->flow_type & hdev->fd_cfg.proto_support)) + return -EOPNOTSUPP; + + if ((fs->flow_type & FLOW_EXT) && + (fs->h_ext.data[0] != 0 || fs->h_ext.data[1] != 0)) { + dev_err(&hdev->pdev->dev, "user-def bytes are not supported\n"); + return -EOPNOTSUPP; + } + + switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + case SCTP_V4_FLOW: + case TCP_V4_FLOW: + case UDP_V4_FLOW: + ret = hclge_fd_check_tcpip4_tuple(&fs->h_u.tcp_ip4_spec, + unused_tuple); + break; + case IP_USER_FLOW: + ret = hclge_fd_check_ip4_tuple(&fs->h_u.usr_ip4_spec, + unused_tuple); + break; + case SCTP_V6_FLOW: + case TCP_V6_FLOW: + case UDP_V6_FLOW: + ret = hclge_fd_check_tcpip6_tuple(&fs->h_u.tcp_ip6_spec, + unused_tuple); + break; + case IPV6_USER_FLOW: + ret = hclge_fd_check_ip6_tuple(&fs->h_u.usr_ip6_spec, + unused_tuple); + break; + case ETHER_FLOW: + if (hdev->fd_cfg.fd_mode != + HCLGE_FD_MODE_DEPTH_2K_WIDTH_400B_STAGE_1) { + dev_err(&hdev->pdev->dev, + "ETHER_FLOW is not supported in current fd mode!\n"); + return -EOPNOTSUPP; + } + + ret = hclge_fd_check_ether_tuple(&fs->h_u.ether_spec, + unused_tuple); + break; + default: + return -EOPNOTSUPP; + } + + if (ret) + return ret; + + return hclge_fd_check_ext_tuple(hdev, fs, unused_tuple); +} + static bool hclge_fd_rule_exist(struct hclge_dev *hdev, u16 location) { struct hclge_fd_rule *rule = NULL; -- cgit v1.2.3-59-g8ed1b From fa663c096052ec8c5dfef29fc1cd30190e5aba93 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 20 Apr 2020 10:17:27 +0800 Subject: net: hns3: split out hclge_get_fd_rule_info() hclge_get_fd_rule_info() is bloated, this patch separates it into several standalone functions for readability and maintainability. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 303 +++++++++++---------- 1 file changed, 159 insertions(+), 144 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 80d0651145df..b1fe204c9e40 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -5938,6 +5938,149 @@ static int hclge_get_fd_rule_cnt(struct hnae3_handle *handle, return 0; } +static void hclge_fd_get_tcpip4_info(struct hclge_fd_rule *rule, + struct ethtool_tcpip4_spec *spec, + struct ethtool_tcpip4_spec *spec_mask) +{ + spec->ip4src = cpu_to_be32(rule->tuples.src_ip[IPV4_INDEX]); + spec_mask->ip4src = rule->unused_tuple & BIT(INNER_SRC_IP) ? + 0 : cpu_to_be32(rule->tuples_mask.src_ip[IPV4_INDEX]); + + spec->ip4dst = cpu_to_be32(rule->tuples.dst_ip[IPV4_INDEX]); + spec_mask->ip4dst = rule->unused_tuple & BIT(INNER_DST_IP) ? + 0 : cpu_to_be32(rule->tuples_mask.dst_ip[IPV4_INDEX]); + + spec->psrc = cpu_to_be16(rule->tuples.src_port); + spec_mask->psrc = rule->unused_tuple & BIT(INNER_SRC_PORT) ? + 0 : cpu_to_be16(rule->tuples_mask.src_port); + + spec->pdst = cpu_to_be16(rule->tuples.dst_port); + spec_mask->pdst = rule->unused_tuple & BIT(INNER_DST_PORT) ? + 0 : cpu_to_be16(rule->tuples_mask.dst_port); + + spec->tos = rule->tuples.ip_tos; + spec_mask->tos = rule->unused_tuple & BIT(INNER_IP_TOS) ? + 0 : rule->tuples_mask.ip_tos; +} + +static void hclge_fd_get_ip4_info(struct hclge_fd_rule *rule, + struct ethtool_usrip4_spec *spec, + struct ethtool_usrip4_spec *spec_mask) +{ + spec->ip4src = cpu_to_be32(rule->tuples.src_ip[IPV4_INDEX]); + spec_mask->ip4src = rule->unused_tuple & BIT(INNER_SRC_IP) ? + 0 : cpu_to_be32(rule->tuples_mask.src_ip[IPV4_INDEX]); + + spec->ip4dst = cpu_to_be32(rule->tuples.dst_ip[IPV4_INDEX]); + spec_mask->ip4dst = rule->unused_tuple & BIT(INNER_DST_IP) ? + 0 : cpu_to_be32(rule->tuples_mask.dst_ip[IPV4_INDEX]); + + spec->tos = rule->tuples.ip_tos; + spec_mask->tos = rule->unused_tuple & BIT(INNER_IP_TOS) ? + 0 : rule->tuples_mask.ip_tos; + + spec->proto = rule->tuples.ip_proto; + spec_mask->proto = rule->unused_tuple & BIT(INNER_IP_PROTO) ? + 0 : rule->tuples_mask.ip_proto; + + spec->ip_ver = ETH_RX_NFC_IP4; +} + +static void hclge_fd_get_tcpip6_info(struct hclge_fd_rule *rule, + struct ethtool_tcpip6_spec *spec, + struct ethtool_tcpip6_spec *spec_mask) +{ + cpu_to_be32_array(spec->ip6src, + rule->tuples.src_ip, IPV6_SIZE); + cpu_to_be32_array(spec->ip6dst, + rule->tuples.dst_ip, IPV6_SIZE); + if (rule->unused_tuple & BIT(INNER_SRC_IP)) + memset(spec_mask->ip6src, 0, sizeof(spec_mask->ip6src)); + else + cpu_to_be32_array(spec_mask->ip6src, rule->tuples_mask.src_ip, + IPV6_SIZE); + + if (rule->unused_tuple & BIT(INNER_DST_IP)) + memset(spec_mask->ip6dst, 0, sizeof(spec_mask->ip6dst)); + else + cpu_to_be32_array(spec_mask->ip6dst, rule->tuples_mask.dst_ip, + IPV6_SIZE); + + spec->psrc = cpu_to_be16(rule->tuples.src_port); + spec_mask->psrc = rule->unused_tuple & BIT(INNER_SRC_PORT) ? + 0 : cpu_to_be16(rule->tuples_mask.src_port); + + spec->pdst = cpu_to_be16(rule->tuples.dst_port); + spec_mask->pdst = rule->unused_tuple & BIT(INNER_DST_PORT) ? + 0 : cpu_to_be16(rule->tuples_mask.dst_port); +} + +static void hclge_fd_get_ip6_info(struct hclge_fd_rule *rule, + struct ethtool_usrip6_spec *spec, + struct ethtool_usrip6_spec *spec_mask) +{ + cpu_to_be32_array(spec->ip6src, rule->tuples.src_ip, IPV6_SIZE); + cpu_to_be32_array(spec->ip6dst, rule->tuples.dst_ip, IPV6_SIZE); + if (rule->unused_tuple & BIT(INNER_SRC_IP)) + memset(spec_mask->ip6src, 0, sizeof(spec_mask->ip6src)); + else + cpu_to_be32_array(spec_mask->ip6src, + rule->tuples_mask.src_ip, IPV6_SIZE); + + if (rule->unused_tuple & BIT(INNER_DST_IP)) + memset(spec_mask->ip6dst, 0, sizeof(spec_mask->ip6dst)); + else + cpu_to_be32_array(spec_mask->ip6dst, + rule->tuples_mask.dst_ip, IPV6_SIZE); + + spec->l4_proto = rule->tuples.ip_proto; + spec_mask->l4_proto = rule->unused_tuple & BIT(INNER_IP_PROTO) ? + 0 : rule->tuples_mask.ip_proto; +} + +static void hclge_fd_get_ether_info(struct hclge_fd_rule *rule, + struct ethhdr *spec, + struct ethhdr *spec_mask) +{ + ether_addr_copy(spec->h_source, rule->tuples.src_mac); + ether_addr_copy(spec->h_dest, rule->tuples.dst_mac); + + if (rule->unused_tuple & BIT(INNER_SRC_MAC)) + eth_zero_addr(spec_mask->h_source); + else + ether_addr_copy(spec_mask->h_source, rule->tuples_mask.src_mac); + + if (rule->unused_tuple & BIT(INNER_DST_MAC)) + eth_zero_addr(spec_mask->h_dest); + else + ether_addr_copy(spec_mask->h_dest, rule->tuples_mask.dst_mac); + + spec->h_proto = cpu_to_be16(rule->tuples.ether_proto); + spec_mask->h_proto = rule->unused_tuple & BIT(INNER_ETH_TYPE) ? + 0 : cpu_to_be16(rule->tuples_mask.ether_proto); +} + +static void hclge_fd_get_ext_info(struct ethtool_rx_flow_spec *fs, + struct hclge_fd_rule *rule) +{ + if (fs->flow_type & FLOW_EXT) { + fs->h_ext.vlan_tci = cpu_to_be16(rule->tuples.vlan_tag1); + fs->m_ext.vlan_tci = + rule->unused_tuple & BIT(INNER_VLAN_TAG_FST) ? + cpu_to_be16(VLAN_VID_MASK) : + cpu_to_be16(rule->tuples_mask.vlan_tag1); + } + + if (fs->flow_type & FLOW_MAC_EXT) { + ether_addr_copy(fs->h_ext.h_dest, rule->tuples.dst_mac); + if (rule->unused_tuple & BIT(INNER_DST_MAC)) + eth_zero_addr(fs->m_u.ether_spec.h_dest); + else + ether_addr_copy(fs->m_u.ether_spec.h_dest, + rule->tuples_mask.dst_mac); + } +} + static int hclge_get_fd_rule_info(struct hnae3_handle *handle, struct ethtool_rxnfc *cmd) { @@ -5970,162 +6113,34 @@ static int hclge_get_fd_rule_info(struct hnae3_handle *handle, case SCTP_V4_FLOW: case TCP_V4_FLOW: case UDP_V4_FLOW: - fs->h_u.tcp_ip4_spec.ip4src = - cpu_to_be32(rule->tuples.src_ip[IPV4_INDEX]); - fs->m_u.tcp_ip4_spec.ip4src = - rule->unused_tuple & BIT(INNER_SRC_IP) ? - 0 : cpu_to_be32(rule->tuples_mask.src_ip[IPV4_INDEX]); - - fs->h_u.tcp_ip4_spec.ip4dst = - cpu_to_be32(rule->tuples.dst_ip[IPV4_INDEX]); - fs->m_u.tcp_ip4_spec.ip4dst = - rule->unused_tuple & BIT(INNER_DST_IP) ? - 0 : cpu_to_be32(rule->tuples_mask.dst_ip[IPV4_INDEX]); - - fs->h_u.tcp_ip4_spec.psrc = cpu_to_be16(rule->tuples.src_port); - fs->m_u.tcp_ip4_spec.psrc = - rule->unused_tuple & BIT(INNER_SRC_PORT) ? - 0 : cpu_to_be16(rule->tuples_mask.src_port); - - fs->h_u.tcp_ip4_spec.pdst = cpu_to_be16(rule->tuples.dst_port); - fs->m_u.tcp_ip4_spec.pdst = - rule->unused_tuple & BIT(INNER_DST_PORT) ? - 0 : cpu_to_be16(rule->tuples_mask.dst_port); - - fs->h_u.tcp_ip4_spec.tos = rule->tuples.ip_tos; - fs->m_u.tcp_ip4_spec.tos = - rule->unused_tuple & BIT(INNER_IP_TOS) ? - 0 : rule->tuples_mask.ip_tos; - + hclge_fd_get_tcpip4_info(rule, &fs->h_u.tcp_ip4_spec, + &fs->m_u.tcp_ip4_spec); break; case IP_USER_FLOW: - fs->h_u.usr_ip4_spec.ip4src = - cpu_to_be32(rule->tuples.src_ip[IPV4_INDEX]); - fs->m_u.tcp_ip4_spec.ip4src = - rule->unused_tuple & BIT(INNER_SRC_IP) ? - 0 : cpu_to_be32(rule->tuples_mask.src_ip[IPV4_INDEX]); - - fs->h_u.usr_ip4_spec.ip4dst = - cpu_to_be32(rule->tuples.dst_ip[IPV4_INDEX]); - fs->m_u.usr_ip4_spec.ip4dst = - rule->unused_tuple & BIT(INNER_DST_IP) ? - 0 : cpu_to_be32(rule->tuples_mask.dst_ip[IPV4_INDEX]); - - fs->h_u.usr_ip4_spec.tos = rule->tuples.ip_tos; - fs->m_u.usr_ip4_spec.tos = - rule->unused_tuple & BIT(INNER_IP_TOS) ? - 0 : rule->tuples_mask.ip_tos; - - fs->h_u.usr_ip4_spec.proto = rule->tuples.ip_proto; - fs->m_u.usr_ip4_spec.proto = - rule->unused_tuple & BIT(INNER_IP_PROTO) ? - 0 : rule->tuples_mask.ip_proto; - - fs->h_u.usr_ip4_spec.ip_ver = ETH_RX_NFC_IP4; - + hclge_fd_get_ip4_info(rule, &fs->h_u.usr_ip4_spec, + &fs->m_u.usr_ip4_spec); break; case SCTP_V6_FLOW: case TCP_V6_FLOW: case UDP_V6_FLOW: - cpu_to_be32_array(fs->h_u.tcp_ip6_spec.ip6src, - rule->tuples.src_ip, IPV6_SIZE); - if (rule->unused_tuple & BIT(INNER_SRC_IP)) - memset(fs->m_u.tcp_ip6_spec.ip6src, 0, - sizeof(int) * IPV6_SIZE); - else - cpu_to_be32_array(fs->m_u.tcp_ip6_spec.ip6src, - rule->tuples_mask.src_ip, IPV6_SIZE); - - cpu_to_be32_array(fs->h_u.tcp_ip6_spec.ip6dst, - rule->tuples.dst_ip, IPV6_SIZE); - if (rule->unused_tuple & BIT(INNER_DST_IP)) - memset(fs->m_u.tcp_ip6_spec.ip6dst, 0, - sizeof(int) * IPV6_SIZE); - else - cpu_to_be32_array(fs->m_u.tcp_ip6_spec.ip6dst, - rule->tuples_mask.dst_ip, IPV6_SIZE); - - fs->h_u.tcp_ip6_spec.psrc = cpu_to_be16(rule->tuples.src_port); - fs->m_u.tcp_ip6_spec.psrc = - rule->unused_tuple & BIT(INNER_SRC_PORT) ? - 0 : cpu_to_be16(rule->tuples_mask.src_port); - - fs->h_u.tcp_ip6_spec.pdst = cpu_to_be16(rule->tuples.dst_port); - fs->m_u.tcp_ip6_spec.pdst = - rule->unused_tuple & BIT(INNER_DST_PORT) ? - 0 : cpu_to_be16(rule->tuples_mask.dst_port); - + hclge_fd_get_tcpip6_info(rule, &fs->h_u.tcp_ip6_spec, + &fs->m_u.tcp_ip6_spec); break; case IPV6_USER_FLOW: - cpu_to_be32_array(fs->h_u.usr_ip6_spec.ip6src, - rule->tuples.src_ip, IPV6_SIZE); - if (rule->unused_tuple & BIT(INNER_SRC_IP)) - memset(fs->m_u.usr_ip6_spec.ip6src, 0, - sizeof(int) * IPV6_SIZE); - else - cpu_to_be32_array(fs->m_u.usr_ip6_spec.ip6src, - rule->tuples_mask.src_ip, IPV6_SIZE); - - cpu_to_be32_array(fs->h_u.usr_ip6_spec.ip6dst, - rule->tuples.dst_ip, IPV6_SIZE); - if (rule->unused_tuple & BIT(INNER_DST_IP)) - memset(fs->m_u.usr_ip6_spec.ip6dst, 0, - sizeof(int) * IPV6_SIZE); - else - cpu_to_be32_array(fs->m_u.usr_ip6_spec.ip6dst, - rule->tuples_mask.dst_ip, IPV6_SIZE); - - fs->h_u.usr_ip6_spec.l4_proto = rule->tuples.ip_proto; - fs->m_u.usr_ip6_spec.l4_proto = - rule->unused_tuple & BIT(INNER_IP_PROTO) ? - 0 : rule->tuples_mask.ip_proto; - - break; - case ETHER_FLOW: - ether_addr_copy(fs->h_u.ether_spec.h_source, - rule->tuples.src_mac); - if (rule->unused_tuple & BIT(INNER_SRC_MAC)) - eth_zero_addr(fs->m_u.ether_spec.h_source); - else - ether_addr_copy(fs->m_u.ether_spec.h_source, - rule->tuples_mask.src_mac); - - ether_addr_copy(fs->h_u.ether_spec.h_dest, - rule->tuples.dst_mac); - if (rule->unused_tuple & BIT(INNER_DST_MAC)) - eth_zero_addr(fs->m_u.ether_spec.h_dest); - else - ether_addr_copy(fs->m_u.ether_spec.h_dest, - rule->tuples_mask.dst_mac); - - fs->h_u.ether_spec.h_proto = - cpu_to_be16(rule->tuples.ether_proto); - fs->m_u.ether_spec.h_proto = - rule->unused_tuple & BIT(INNER_ETH_TYPE) ? - 0 : cpu_to_be16(rule->tuples_mask.ether_proto); - + hclge_fd_get_ip6_info(rule, &fs->h_u.usr_ip6_spec, + &fs->m_u.usr_ip6_spec); break; + /* The flow type of fd rule has been checked before adding in to rule + * list. As other flow types have been handled, it must be ETHER_FLOW + * for the default case + */ default: - spin_unlock_bh(&hdev->fd_rule_lock); - return -EOPNOTSUPP; - } - - if (fs->flow_type & FLOW_EXT) { - fs->h_ext.vlan_tci = cpu_to_be16(rule->tuples.vlan_tag1); - fs->m_ext.vlan_tci = - rule->unused_tuple & BIT(INNER_VLAN_TAG_FST) ? - cpu_to_be16(VLAN_VID_MASK) : - cpu_to_be16(rule->tuples_mask.vlan_tag1); + hclge_fd_get_ether_info(rule, &fs->h_u.ether_spec, + &fs->m_u.ether_spec); + break; } - if (fs->flow_type & FLOW_MAC_EXT) { - ether_addr_copy(fs->h_ext.h_dest, rule->tuples.dst_mac); - if (rule->unused_tuple & BIT(INNER_DST_MAC)) - eth_zero_addr(fs->m_u.ether_spec.h_dest); - else - ether_addr_copy(fs->m_u.ether_spec.h_dest, - rule->tuples_mask.dst_mac); - } + hclge_fd_get_ext_info(fs, rule); if (rule->action == HCLGE_FD_ACTION_DROP_PACKET) { fs->ring_cookie = RX_CLS_FLOW_DISC; -- cgit v1.2.3-59-g8ed1b From e9368c4094f54c1af44a842814c1d87d4365c684 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Mon, 20 Apr 2020 10:17:28 +0800 Subject: net: hns3: remove an unnecessary case 0 in hclge_fd_convert_tuple() Since case default has included case 0, so removes this redundant case 0. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index b1fe204c9e40..999f05686f06 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -5006,8 +5006,6 @@ static bool hclge_fd_convert_tuple(u32 tuple_bit, u8 *key_x, u8 *key_y, return true; switch (tuple_bit) { - case 0: - return false; case BIT(INNER_DST_MAC): for (i = 0; i < ETH_ALEN; i++) { calc_x(key_x[ETH_ALEN - 1 - i], rule->tuples.dst_mac[i], -- cgit v1.2.3-59-g8ed1b From 16505f878e3065f2c0457e38ec867cdd6d3ce243 Mon Sep 17 00:00:00 2001 From: Guojia Liao Date: Mon, 20 Apr 2020 10:17:29 +0800 Subject: net: hns3: remove useless proto_support field in struct hclge_fd_cfg proto_support field in struct hclge_fd_cfg shows what protocols in flow direct table are supported now. It is unnecessary since checking which one is unsupported will be more efficient, so this patch removes it. Signed-off-by: Guojia Liao Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 17 ++++++----------- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 1 - 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 999f05686f06..e238a9df6282 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -4876,9 +4876,6 @@ static int hclge_init_fd_config(struct hclge_dev *hdev) return -EOPNOTSUPP; } - hdev->fd_cfg.proto_support = - TCP_V4_FLOW | UDP_V4_FLOW | SCTP_V4_FLOW | TCP_V6_FLOW | - UDP_V6_FLOW | SCTP_V6_FLOW | IPV4_USER_FLOW | IPV6_USER_FLOW; key_cfg = &hdev->fd_cfg.key_cfg[HCLGE_FD_STAGE_1]; key_cfg->key_sel = HCLGE_FD_KEY_BASE_ON_TUPLE, key_cfg->inner_sipv6_word_en = LOW_2_WORDS; @@ -4892,11 +4889,9 @@ static int hclge_init_fd_config(struct hclge_dev *hdev) BIT(INNER_SRC_PORT) | BIT(INNER_DST_PORT); /* If use max 400bit key, we can support tuples for ether type */ - if (hdev->fd_cfg.max_key_length == MAX_KEY_LENGTH) { - hdev->fd_cfg.proto_support |= ETHER_FLOW; + if (hdev->fd_cfg.fd_mode == HCLGE_FD_MODE_DEPTH_2K_WIDTH_400B_STAGE_1) key_cfg->tuple_active |= BIT(INNER_DST_MAC) | BIT(INNER_SRC_MAC); - } /* roce_type is used to filter roce frames * dst_vport is used to specify the rule @@ -5397,7 +5392,8 @@ static int hclge_fd_check_ext_tuple(struct hclge_dev *hdev, } if (fs->flow_type & FLOW_MAC_EXT) { - if (!(hdev->fd_cfg.proto_support & ETHER_FLOW)) + if (hdev->fd_cfg.fd_mode != + HCLGE_FD_MODE_DEPTH_2K_WIDTH_400B_STAGE_1) return -EOPNOTSUPP; if (is_zero_ether_addr(fs->h_ext.h_dest)) @@ -5413,21 +5409,20 @@ static int hclge_fd_check_spec(struct hclge_dev *hdev, struct ethtool_rx_flow_spec *fs, u32 *unused_tuple) { + u32 flow_type; int ret; if (fs->location >= hdev->fd_cfg.rule_num[HCLGE_FD_STAGE_1]) return -EINVAL; - if (!(fs->flow_type & hdev->fd_cfg.proto_support)) - return -EOPNOTSUPP; - if ((fs->flow_type & FLOW_EXT) && (fs->h_ext.data[0] != 0 || fs->h_ext.data[1] != 0)) { dev_err(&hdev->pdev->dev, "user-def bytes are not supported\n"); return -EOPNOTSUPP; } - switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + flow_type = fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT); + switch (flow_type) { case SCTP_V4_FLOW: case TCP_V4_FLOW: case UDP_V4_FLOW: diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 71df23d5f1b4..a58c26200ea0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -580,7 +580,6 @@ struct hclge_fd_key_cfg { struct hclge_fd_cfg { u8 fd_mode; u16 max_key_length; /* use bit as unit */ - u32 proto_support; u32 rule_num[MAX_STAGE_NUM]; /* rule entry number */ u16 cnt_num[MAX_STAGE_NUM]; /* rule hit counter number */ struct hclge_fd_key_cfg key_cfg[MAX_STAGE_NUM]; -- cgit v1.2.3-59-g8ed1b From f84f6a8634f3424244e91fc3fd460152dae689cf Mon Sep 17 00:00:00 2001 From: Guojia Liao Date: Mon, 20 Apr 2020 10:17:30 +0800 Subject: net: hns3: remove two unused structures in hclge_cmd.h struct hclge_mac_vlan_remove_cmd and hclge_mac_vlan_add_cmd are unused. So removes them from hclge_cmd.h. Signed-off-by: Guojia Liao Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 25 ---------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 96498d9b4754..90e422efe590 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -733,31 +733,6 @@ struct hclge_mac_mgr_tbl_entry_cmd { u8 rsv3[2]; }; -struct hclge_mac_vlan_add_cmd { - __le16 flags; - __le16 mac_addr_hi16; - __le32 mac_addr_lo32; - __le32 mac_addr_msk_hi32; - __le16 mac_addr_msk_lo16; - __le16 vlan_tag; - __le16 ingress_port; - __le16 egress_port; - u8 rsv[4]; -}; - -#define HNS3_MAC_VLAN_CFG_FLAG_BIT 0 -struct hclge_mac_vlan_remove_cmd { - __le16 flags; - __le16 mac_addr_hi16; - __le32 mac_addr_lo32; - __le32 mac_addr_msk_hi32; - __le16 mac_addr_msk_lo16; - __le16 vlan_tag; - __le16 ingress_port; - __le16 egress_port; - u8 rsv[4]; -}; - struct hclge_vlan_filter_ctrl_cmd { u8 vlan_type; u8 vlan_fe; -- cgit v1.2.3-59-g8ed1b From 84944d5c4797df7aa0b2677283db5064a2560633 Mon Sep 17 00:00:00 2001 From: Guojia Liao Date: Mon, 20 Apr 2020 10:17:31 +0800 Subject: net: hns3: modify some unsuitable type declaration In hclge_set_fd_key_config(), parameter 'stage' should be as enum HCLGE_FD_STAGE, and in hclge_config_key(), 'tuple_size' should be type u8, also simplify unsigned int with u32 for 'i'. Signed-off-by: Guojia Liao Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e238a9df6282..9edee7d94bbf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -4822,7 +4822,8 @@ static int hclge_get_fd_allocation(struct hclge_dev *hdev, return ret; } -static int hclge_set_fd_key_config(struct hclge_dev *hdev, int stage_num) +static int hclge_set_fd_key_config(struct hclge_dev *hdev, + enum HCLGE_FD_STAGE stage_num) { struct hclge_set_fd_key_config_cmd *req; struct hclge_fd_key_cfg *stage; @@ -5158,9 +5159,10 @@ static int hclge_config_key(struct hclge_dev *hdev, u8 stage, struct hclge_fd_key_cfg *key_cfg = &hdev->fd_cfg.key_cfg[stage]; u8 key_x[MAX_KEY_BYTES], key_y[MAX_KEY_BYTES]; u8 *cur_key_x, *cur_key_y; - unsigned int i; - int ret, tuple_size; u8 meta_data_region; + u8 tuple_size; + int ret; + u32 i; memset(key_x, 0, sizeof(key_x)); memset(key_y, 0, sizeof(key_y)); -- cgit v1.2.3-59-g8ed1b From 0b4bdc55df6163f2861fe935755e892963dc9512 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Mon, 20 Apr 2020 10:17:32 +0800 Subject: net: hns3: clean up some coding style issue This patch removes some unnecessary blank lines, redundant parentheses, and changes one tab to blank in hclge_dbg_dump_reg_common(). Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 17228288d4df..cfc9300ff715 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -143,7 +143,7 @@ static void hclge_dbg_dump_reg_common(struct hclge_dev *hdev, return; } - buf_len = sizeof(struct hclge_desc) * bd_num; + buf_len = sizeof(struct hclge_desc) * bd_num; desc_src = kzalloc(buf_len, GFP_KERNEL); if (!desc_src) return; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 9edee7d94bbf..635aec2ffba4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -5380,7 +5380,7 @@ static int hclge_fd_check_ext_tuple(struct hclge_dev *hdev, struct ethtool_rx_flow_spec *fs, u32 *unused_tuple) { - if ((fs->flow_type & FLOW_EXT)) { + if (fs->flow_type & FLOW_EXT) { if (fs->h_ext.vlan_etype) return -EOPNOTSUPP; if (!fs->h_ext.vlan_tci) @@ -5401,7 +5401,7 @@ static int hclge_fd_check_ext_tuple(struct hclge_dev *hdev, if (is_zero_ether_addr(fs->h_ext.h_dest)) *unused_tuple |= BIT(INNER_DST_MAC); else - *unused_tuple &= ~(BIT(INNER_DST_MAC)); + *unused_tuple &= ~BIT(INNER_DST_MAC); } return 0; @@ -5674,7 +5674,7 @@ static int hclge_fd_get_tuple(struct hclge_dev *hdev, break; } - if ((fs->flow_type & FLOW_EXT)) { + if (fs->flow_type & FLOW_EXT) { rule->tuples.vlan_tag1 = be16_to_cpu(fs->h_ext.vlan_tci); rule->tuples_mask.vlan_tag1 = be16_to_cpu(fs->m_ext.vlan_tci); } @@ -5785,7 +5785,6 @@ static int hclge_add_fd_entry(struct hnae3_handle *handle, } rule->flow_type = fs->flow_type; - rule->location = fs->location; rule->unused_tuple = unused; rule->vf_id = dst_vport_id; @@ -6273,7 +6272,6 @@ static int hclge_add_fd_entry_by_arfs(struct hnae3_handle *handle, u16 queue_id, */ if (hdev->fd_active_type == HCLGE_FD_EP_ACTIVE) { spin_unlock_bh(&hdev->fd_rule_lock); - return -EOPNOTSUPP; } @@ -6287,14 +6285,12 @@ static int hclge_add_fd_entry_by_arfs(struct hnae3_handle *handle, u16 queue_id, bit_id = find_first_zero_bit(hdev->fd_bmap, MAX_FD_FILTER_NUM); if (bit_id >= hdev->fd_cfg.rule_num[HCLGE_FD_STAGE_1]) { spin_unlock_bh(&hdev->fd_rule_lock); - return -ENOSPC; } rule = kzalloc(sizeof(*rule), GFP_ATOMIC); if (!rule) { spin_unlock_bh(&hdev->fd_rule_lock); - return -ENOMEM; } -- cgit v1.2.3-59-g8ed1b From a3ca5e9048be07576259d2ba9628dc8187f7690e Mon Sep 17 00:00:00 2001 From: Guojia Liao Date: Mon, 20 Apr 2020 10:17:33 +0800 Subject: net: hns3: add debug information for flow table when failed Adds some debug information for failures of processing flow table, removes the redundant printing when hclge_fd_check_spec() returns error, and modifies the printing level for FD not enable error. Signed-off-by: Guojia Liao Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 45 ++++++++++++++++------ 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 635aec2ffba4..0618f22e6f14 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -5381,22 +5381,32 @@ static int hclge_fd_check_ext_tuple(struct hclge_dev *hdev, u32 *unused_tuple) { if (fs->flow_type & FLOW_EXT) { - if (fs->h_ext.vlan_etype) + if (fs->h_ext.vlan_etype) { + dev_err(&hdev->pdev->dev, "vlan-etype is not supported!\n"); return -EOPNOTSUPP; + } + if (!fs->h_ext.vlan_tci) *unused_tuple |= BIT(INNER_VLAN_TAG_FST); if (fs->m_ext.vlan_tci && - be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID) + be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID) { + dev_err(&hdev->pdev->dev, + "failed to config vlan_tci, invalid vlan_tci: %u, max is %u.\n", + ntohs(fs->h_ext.vlan_tci), VLAN_N_VID - 1); return -EINVAL; + } } else { *unused_tuple |= BIT(INNER_VLAN_TAG_FST); } if (fs->flow_type & FLOW_MAC_EXT) { if (hdev->fd_cfg.fd_mode != - HCLGE_FD_MODE_DEPTH_2K_WIDTH_400B_STAGE_1) + HCLGE_FD_MODE_DEPTH_2K_WIDTH_400B_STAGE_1) { + dev_err(&hdev->pdev->dev, + "FLOW_MAC_EXT is not supported in current fd mode!\n"); return -EOPNOTSUPP; + } if (is_zero_ether_addr(fs->h_ext.h_dest)) *unused_tuple |= BIT(INNER_DST_MAC); @@ -5414,8 +5424,13 @@ static int hclge_fd_check_spec(struct hclge_dev *hdev, u32 flow_type; int ret; - if (fs->location >= hdev->fd_cfg.rule_num[HCLGE_FD_STAGE_1]) + if (fs->location >= hdev->fd_cfg.rule_num[HCLGE_FD_STAGE_1]) { + dev_err(&hdev->pdev->dev, + "failed to config fd rules, invalid rule location: %u, max is %u\n.", + fs->location, + hdev->fd_cfg.rule_num[HCLGE_FD_STAGE_1] - 1); return -EINVAL; + } if ((fs->flow_type & FLOW_EXT) && (fs->h_ext.data[0] != 0 || fs->h_ext.data[1] != 0)) { @@ -5457,11 +5472,18 @@ static int hclge_fd_check_spec(struct hclge_dev *hdev, unused_tuple); break; default: + dev_err(&hdev->pdev->dev, + "unsupported protocol type, protocol type = %#x\n", + flow_type); return -EOPNOTSUPP; } - if (ret) + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to check flow union tuple, ret = %d\n", + ret); return ret; + } return hclge_fd_check_ext_tuple(hdev, fs, unused_tuple); } @@ -5729,22 +5751,23 @@ static int hclge_add_fd_entry(struct hnae3_handle *handle, u8 action; int ret; - if (!hnae3_dev_fd_supported(hdev)) + if (!hnae3_dev_fd_supported(hdev)) { + dev_err(&hdev->pdev->dev, + "flow table director is not supported\n"); return -EOPNOTSUPP; + } if (!hdev->fd_en) { - dev_warn(&hdev->pdev->dev, - "Please enable flow director first\n"); + dev_err(&hdev->pdev->dev, + "please enable flow director first\n"); return -EOPNOTSUPP; } fs = (struct ethtool_rx_flow_spec *)&cmd->fs; ret = hclge_fd_check_spec(hdev, fs, &unused); - if (ret) { - dev_err(&hdev->pdev->dev, "Check fd spec failed\n"); + if (ret) return ret; - } if (fs->ring_cookie == RX_CLS_FLOW_DISC) { action = HCLGE_FD_ACTION_DROP_PACKET; -- cgit v1.2.3-59-g8ed1b From 5cb51cfe8ad65117d4404b82fb8531768b149ad9 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Mon, 20 Apr 2020 10:17:34 +0800 Subject: net: hns3: add support for dumping MAC reg in debugfs This patch adds support for dumping MAC reg in debugfs, which will be helpful for debugging. Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 2 +- .../ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 113 +++++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index e1d88095a77e..c934f328c040 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -270,7 +270,7 @@ static void hns3_dbg_help(struct hnae3_handle *h) " [igu egu ] [rpu ]", HNS3_DBG_BUF_LEN - strlen(printf_buf) - 1); strncat(printf_buf + strlen(printf_buf), - " [rtc] [ppp] [rcb] [tqp ]]\n", + " [rtc] [ppp] [rcb] [tqp ] [mac]]\n", HNS3_DBG_BUF_LEN - strlen(printf_buf) - 1); dev_info(&h->pdev->dev, "%s", printf_buf); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index cfc9300ff715..66c1ad3a156b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -173,6 +173,114 @@ static void hclge_dbg_dump_reg_common(struct hclge_dev *hdev, kfree(desc_src); } +static void hclge_dbg_dump_mac_enable_status(struct hclge_dev *hdev) +{ + struct hclge_config_mac_mode_cmd *req; + struct hclge_desc desc; + u32 loop_en; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAC_MODE, true); + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to dump mac enable status, ret = %d\n", ret); + return; + } + + req = (struct hclge_config_mac_mode_cmd *)desc.data; + loop_en = le32_to_cpu(req->txrx_pad_fcs_loop_en); + + dev_info(&hdev->pdev->dev, "config_mac_trans_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_TX_EN_B)); + dev_info(&hdev->pdev->dev, "config_mac_rcv_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_RX_EN_B)); + dev_info(&hdev->pdev->dev, "config_pad_trans_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_PAD_TX_B)); + dev_info(&hdev->pdev->dev, "config_pad_rcv_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_PAD_RX_B)); + dev_info(&hdev->pdev->dev, "config_1588_trans_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_1588_TX_B)); + dev_info(&hdev->pdev->dev, "config_1588_rcv_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_1588_RX_B)); + dev_info(&hdev->pdev->dev, "config_mac_app_loop_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_APP_LP_B)); + dev_info(&hdev->pdev->dev, "config_mac_line_loop_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_LINE_LP_B)); + dev_info(&hdev->pdev->dev, "config_mac_fcs_tx_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_FCS_TX_B)); + dev_info(&hdev->pdev->dev, "config_mac_rx_oversize_truncate_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_RX_OVERSIZE_TRUNCATE_B)); + dev_info(&hdev->pdev->dev, "config_mac_rx_fcs_strip_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_RX_FCS_STRIP_B)); + dev_info(&hdev->pdev->dev, "config_mac_rx_fcs_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_RX_FCS_B)); + dev_info(&hdev->pdev->dev, "config_mac_tx_under_min_err_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_TX_UNDER_MIN_ERR_B)); + dev_info(&hdev->pdev->dev, "config_mac_tx_oversize_truncate_en: %#x\n", + hnae3_get_bit(loop_en, HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B)); +} + +static void hclge_dbg_dump_mac_frame_size(struct hclge_dev *hdev) +{ + struct hclge_config_max_frm_size_cmd *req; + struct hclge_desc desc; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAX_FRM_SIZE, true); + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to dump mac frame size, ret = %d\n", ret); + return; + } + + req = (struct hclge_config_max_frm_size_cmd *)desc.data; + + dev_info(&hdev->pdev->dev, "max_frame_size: %u\n", + le16_to_cpu(req->max_frm_size)); + dev_info(&hdev->pdev->dev, "min_frame_size: %u\n", req->min_frm_size); +} + +static void hclge_dbg_dump_mac_speed_duplex(struct hclge_dev *hdev) +{ +#define HCLGE_MAC_SPEED_SHIFT 0 +#define HCLGE_MAC_SPEED_MASK GENMASK(5, 0) +#define HCLGE_MAC_DUPLEX_SHIFT 7 + + struct hclge_config_mac_speed_dup_cmd *req; + struct hclge_desc desc; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_SPEED_DUP, true); + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to dump mac speed duplex, ret = %d\n", ret); + return; + } + + req = (struct hclge_config_mac_speed_dup_cmd *)desc.data; + + dev_info(&hdev->pdev->dev, "speed: %#lx\n", + hnae3_get_field(req->speed_dup, HCLGE_MAC_SPEED_MASK, + HCLGE_MAC_SPEED_SHIFT)); + dev_info(&hdev->pdev->dev, "duplex: %#x\n", + hnae3_get_bit(req->speed_dup, HCLGE_MAC_DUPLEX_SHIFT)); +} + +static void hclge_dbg_dump_mac(struct hclge_dev *hdev) +{ + hclge_dbg_dump_mac_enable_status(hdev); + + hclge_dbg_dump_mac_frame_size(hdev); + + hclge_dbg_dump_mac_speed_duplex(hdev); +} + static void hclge_dbg_dump_dcb(struct hclge_dev *hdev, const char *cmd_buf) { struct device *dev = &hdev->pdev->dev; @@ -304,6 +412,11 @@ static void hclge_dbg_dump_reg_cmd(struct hclge_dev *hdev, const char *cmd_buf) } } + if (strncmp(cmd_buf, "mac", strlen("mac")) == 0) { + hclge_dbg_dump_mac(hdev); + has_dump = true; + } + if (strncmp(cmd_buf, "dcb", 3) == 0) { hclge_dbg_dump_dcb(hdev, &cmd_buf[sizeof("dcb")]); has_dump = true; -- cgit v1.2.3-59-g8ed1b From d8355240cf8fb8b9e002b5c8458578435cea85c2 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Mon, 20 Apr 2020 10:17:35 +0800 Subject: net: hns3: add trace event support for PF/VF mailbox This patch adds trace event support for PF/VF mailbox. Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/Makefile | 1 + .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 7 ++ .../ethernet/hisilicon/hns3/hns3pf/hclge_trace.h | 87 ++++++++++++++++++++++ .../net/ethernet/hisilicon/hns3/hns3vf/Makefile | 1 + .../ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c | 7 ++ .../ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h | 87 ++++++++++++++++++++++ 6 files changed, 190 insertions(+) create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile b/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile index 0fb61d440d3b..6c28c8f6292c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile @@ -4,6 +4,7 @@ # ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 +ccflags-y += -I $(srctree)/$(src) obj-$(CONFIG_HNS3_HCLGE) += hclge.o hclge-objs = hclge_main.o hclge_cmd.o hclge_mdio.o hclge_tm.o hclge_mbx.o hclge_err.o hclge_debugfs.o diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 7f24fcb4f96a..103c2ec777b0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -5,6 +5,9 @@ #include "hclge_mbx.h" #include "hnae3.h" +#define CREATE_TRACE_POINTS +#include "hclge_trace.h" + static u16 hclge_errno_to_resp(int errno) { return abs(errno); @@ -90,6 +93,8 @@ static int hclge_send_mbx_msg(struct hclge_vport *vport, u8 *msg, u16 msg_len, memcpy(&resp_pf_to_vf->msg.vf_mbx_msg_code, msg, msg_len); + trace_hclge_pf_mbx_send(hdev, resp_pf_to_vf); + status = hclge_cmd_send(&hdev->hw, &desc, 1); if (status) dev_err(&hdev->pdev->dev, @@ -674,6 +679,8 @@ void hclge_mbx_handler(struct hclge_dev *hdev) vport = &hdev->vport[req->mbx_src_vfid]; + trace_hclge_pf_mbx_get(hdev, req); + switch (req->msg.code) { case HCLGE_MBX_MAP_RING_TO_VECTOR: ret = hclge_map_unmap_ring_to_vf_vector(vport, true, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h new file mode 100644 index 000000000000..5b0b71bd6120 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (c) 2018-2020 Hisilicon Limited. */ + +/* This must be outside ifdef _HCLGE_TRACE_H */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hns3 + +#if !defined(_HCLGE_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _HCLGE_TRACE_H_ + +#include + +#define PF_GET_MBX_LEN (sizeof(struct hclge_mbx_vf_to_pf_cmd) / sizeof(u32)) +#define PF_SEND_MBX_LEN (sizeof(struct hclge_mbx_pf_to_vf_cmd) / sizeof(u32)) + +TRACE_EVENT(hclge_pf_mbx_get, + TP_PROTO( + struct hclge_dev *hdev, + struct hclge_mbx_vf_to_pf_cmd *req), + TP_ARGS(hdev, req), + + TP_STRUCT__entry( + __field(u8, vfid) + __field(u8, code) + __field(u8, subcode) + __string(pciname, pci_name(hdev->pdev)) + __string(devname, &hdev->vport[0].nic.kinfo.netdev->name) + __array(u32, mbx_data, PF_GET_MBX_LEN) + ), + + TP_fast_assign( + __entry->vfid = req->mbx_src_vfid; + __entry->code = req->msg.code; + __entry->subcode = req->msg.subcode; + __assign_str(pciname, pci_name(hdev->pdev)); + __assign_str(devname, &hdev->vport[0].nic.kinfo.netdev->name); + memcpy(__entry->mbx_data, req, + sizeof(struct hclge_mbx_vf_to_pf_cmd)); + ), + + TP_printk( + "%s %s vfid:%u code:%u subcode:%u data:%s", + __get_str(pciname), __get_str(devname), __entry->vfid, + __entry->code, __entry->subcode, + __print_array(__entry->mbx_data, PF_GET_MBX_LEN, sizeof(u32)) + ) +); + +TRACE_EVENT(hclge_pf_mbx_send, + TP_PROTO( + struct hclge_dev *hdev, + struct hclge_mbx_pf_to_vf_cmd *req), + TP_ARGS(hdev, req), + + TP_STRUCT__entry( + __field(u8, vfid) + __field(u16, code) + __string(pciname, pci_name(hdev->pdev)) + __string(devname, &hdev->vport[0].nic.kinfo.netdev->name) + __array(u32, mbx_data, PF_SEND_MBX_LEN) + ), + + TP_fast_assign( + __entry->vfid = req->dest_vfid; + __entry->code = req->msg.code; + __assign_str(pciname, pci_name(hdev->pdev)); + __assign_str(devname, &hdev->vport[0].nic.kinfo.netdev->name); + memcpy(__entry->mbx_data, req, + sizeof(struct hclge_mbx_pf_to_vf_cmd)); + ), + + TP_printk( + "%s %s vfid:%u code:%u data:%s", + __get_str(pciname), __get_str(devname), __entry->vfid, + __entry->code, + __print_array(__entry->mbx_data, PF_SEND_MBX_LEN, sizeof(u32)) + ) +); + +#endif /* _HCLGE_TRACE_H_ */ + +/* This must be outside ifdef _HCLGE_TRACE_H */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hclge_trace +#include diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/Makefile b/drivers/net/ethernet/hisilicon/hns3/hns3vf/Makefile index 53804d95ea90..2c26ea607a53 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/Makefile +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/Makefile @@ -4,6 +4,7 @@ # ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 +ccflags-y += -I $(srctree)/$(src) obj-$(CONFIG_HNS3_HCLGEVF) += hclgevf.o hclgevf-objs = hclgevf_main.o hclgevf_cmd.o hclgevf_mbx.o diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c index 9b8154955f91..5b2dcd97c107 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c @@ -5,6 +5,9 @@ #include "hclgevf_main.h" #include "hnae3.h" +#define CREATE_TRACE_POINTS +#include "hclgevf_trace.h" + static int hclgevf_resp_to_errno(u16 resp_code) { return resp_code ? -resp_code : 0; @@ -106,6 +109,8 @@ int hclgevf_send_mbx_msg(struct hclgevf_dev *hdev, memcpy(&req->msg, send_msg, sizeof(struct hclge_vf_to_pf_msg)); + trace_hclge_vf_mbx_send(hdev, req); + /* synchronous send */ if (need_resp) { mutex_lock(&hdev->mbx_resp.mbx_mutex); @@ -179,6 +184,8 @@ void hclgevf_mbx_handler(struct hclgevf_dev *hdev) continue; } + trace_hclge_vf_mbx_get(hdev, req); + /* synchronous messages are time critical and need preferential * treatment. Therefore, we need to acknowledge all the sync * responses as quickly as possible so that waiting tasks do not diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h new file mode 100644 index 000000000000..e4bfb6191fef --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (c) 2018-2019 Hisilicon Limited. */ + +/* This must be outside ifdef _HCLGEVF_TRACE_H */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hns3 + +#if !defined(_HCLGEVF_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _HCLGEVF_TRACE_H_ + +#include + +#define VF_GET_MBX_LEN (sizeof(struct hclge_mbx_pf_to_vf_cmd) / sizeof(u32)) +#define VF_SEND_MBX_LEN (sizeof(struct hclge_mbx_vf_to_pf_cmd) / sizeof(u32)) + +TRACE_EVENT(hclge_vf_mbx_get, + TP_PROTO( + struct hclgevf_dev *hdev, + struct hclge_mbx_pf_to_vf_cmd *req), + TP_ARGS(hdev, req), + + TP_STRUCT__entry( + __field(u8, vfid) + __field(u16, code) + __string(pciname, pci_name(hdev->pdev)) + __string(devname, &hdev->nic.kinfo.netdev->name) + __array(u32, mbx_data, VF_GET_MBX_LEN) + ), + + TP_fast_assign( + __entry->vfid = req->dest_vfid; + __entry->code = req->msg.code; + __assign_str(pciname, pci_name(hdev->pdev)); + __assign_str(devname, &hdev->nic.kinfo.netdev->name); + memcpy(__entry->mbx_data, req, + sizeof(struct hclge_mbx_pf_to_vf_cmd)); + ), + + TP_printk( + "%s %s vfid:%u code:%u data:%s", + __get_str(pciname), __get_str(devname), __entry->vfid, + __entry->code, + __print_array(__entry->mbx_data, VF_GET_MBX_LEN, sizeof(u32)) + ) +); + +TRACE_EVENT(hclge_vf_mbx_send, + TP_PROTO( + struct hclgevf_dev *hdev, + struct hclge_mbx_vf_to_pf_cmd *req), + TP_ARGS(hdev, req), + + TP_STRUCT__entry( + __field(u8, vfid) + __field(u8, code) + __field(u8, subcode) + __string(pciname, pci_name(hdev->pdev)) + __string(devname, &hdev->nic.kinfo.netdev->name) + __array(u32, mbx_data, VF_SEND_MBX_LEN) + ), + + TP_fast_assign( + __entry->vfid = req->mbx_src_vfid; + __entry->code = req->msg.code; + __entry->subcode = req->msg.subcode; + __assign_str(pciname, pci_name(hdev->pdev)); + __assign_str(devname, &hdev->nic.kinfo.netdev->name); + memcpy(__entry->mbx_data, req, + sizeof(struct hclge_mbx_vf_to_pf_cmd)); + ), + + TP_printk( + "%s %s vfid:%u code:%u subcode:%u data:%s", + __get_str(pciname), __get_str(devname), __entry->vfid, + __entry->code, __entry->subcode, + __print_array(__entry->mbx_data, VF_SEND_MBX_LEN, sizeof(u32)) + ) +); + +#endif /* _HCLGEVF_TRACE_H_ */ + +/* This must be outside ifdef _HCLGEVF_TRACE_H */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hclgevf_trace +#include -- cgit v1.2.3-59-g8ed1b From 82ebc889091a488b4dd95e682b3c3b889a50713c Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 20 Apr 2020 12:27:20 +0800 Subject: qed: use true,false for bool variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following coccicheck warning: drivers/net/ethernet/qlogic/qed/qed_dev.c:4395:2-34: WARNING: Assignment of 0/1 to bool variable drivers/net/ethernet/qlogic/qed/qed_dev.c:1975:2-34: WARNING: Assignment of 0/1 to bool variable Signed-off-by: Jason Yan Acked-by: Michal Kalderon  Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 38a65b984e47..7119a18af19e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -1972,7 +1972,7 @@ static int qed_init_qm_sanity(struct qed_hwfn *p_hwfn) return 0; if (QED_IS_ROCE_PERSONALITY(p_hwfn)) { - p_hwfn->hw_info.multi_tc_roce_en = 0; + p_hwfn->hw_info.multi_tc_roce_en = false; DP_NOTICE(p_hwfn, "multi-tc roce was disabled to reduce requested amount of pqs\n"); if (qed_init_qm_get_num_pqs(p_hwfn) <= RESC_NUM(p_hwfn, QED_PQ)) @@ -4392,7 +4392,7 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn, } if (QED_IS_ROCE_PERSONALITY(p_hwfn)) - p_hwfn->hw_info.multi_tc_roce_en = 1; + p_hwfn->hw_info.multi_tc_roce_en = true; p_hwfn->hw_info.num_hw_tc = NUM_PHYS_TCS_4PORT_K2; p_hwfn->hw_info.num_active_tc = 1; -- cgit v1.2.3-59-g8ed1b From 8c8eea07c1fd46455b3a275a03c27326ddc42b20 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Wed, 23 Oct 2019 16:03:12 +0300 Subject: net/mlx5: Use the correct IPsec capability function for FPGA ops Currently the IPsec acceleration capability function is also used at IPsec fpga capable device code. This could cause a future bug as the acceleration layer is agnostic to the device implementing its API. Fix by using the IPsec FPGA capability function instead of acceleration layer capability function in case of FPGA IPsec only related operations. Downstream patches will add support for Connect-X IPsec, this can avoid a future bug. Signed-off-by: Raed Salem Reviewed-by: Boris Pismenny Reviewed-by: Huy Nguyen Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h | 15 ++++++++++++++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 5 +++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c index b794888fa3ba..c8736b6b4172 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -602,7 +602,7 @@ static bool mlx5_is_fpga_ipsec_rule(struct mlx5_core_dev *dev, const u32 *match_c, const u32 *match_v) { - u32 ipsec_dev_caps = mlx5_accel_ipsec_device_caps(dev); + u32 ipsec_dev_caps = mlx5_fpga_ipsec_device_caps(dev); bool ipv6_flow; ipv6_flow = mlx5_fs_is_outer_ipv6_flow(dev, match_c, match_v); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h index 382985e65b48..d01b1fc8e11b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h @@ -37,6 +37,7 @@ #include "accel/ipsec.h" #include "fs_cmd.h" +#ifdef CONFIG_MLX5_FPGA_IPSEC u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev); unsigned int mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev); int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, @@ -63,5 +64,17 @@ int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, const struct mlx5_flow_cmds * mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type); +#else +static inline u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev) +{ + return 0; +} -#endif /* __MLX5_FPGA_SADB_H__ */ +static inline const struct mlx5_flow_cmds * +mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type) +{ + return mlx5_fs_cmd_get_default(type); +} + +#endif /* CONFIG_MLX5_FPGA_IPSEC */ +#endif /* __MLX5_FPGA_IPSEC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index d5defe09339a..2da45e9b9b6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2359,7 +2359,7 @@ static struct mlx5_flow_root_namespace struct mlx5_flow_root_namespace *root_ns; struct mlx5_flow_namespace *ns; - if (mlx5_accel_ipsec_device_caps(steering->dev) & MLX5_ACCEL_IPSEC_CAP_DEVICE && + if (mlx5_fpga_ipsec_device_caps(steering->dev) & MLX5_ACCEL_IPSEC_CAP_DEVICE && (table_type == FS_FT_NIC_RX || table_type == FS_FT_NIC_TX)) cmds = mlx5_fs_cmd_get_default_ipsec_fpga_cmds(table_type); @@ -2943,7 +2943,8 @@ int mlx5_init_fs(struct mlx5_core_dev *dev) goto err; } - if (MLX5_IPSEC_DEV(dev) || MLX5_CAP_FLOWTABLE_NIC_TX(dev, ft_support)) { + if (mlx5_fpga_ipsec_device_caps(steering->dev) & MLX5_ACCEL_IPSEC_CAP_DEVICE || + MLX5_CAP_FLOWTABLE_NIC_TX(dev, ft_support)) { err = init_egress_root_ns(steering); if (err) goto err; -- cgit v1.2.3-59-g8ed1b From 9425c595bd513948537ef355c07a65595dd2c771 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Mon, 20 Jan 2020 15:03:00 +0200 Subject: net/mlx5e: en_accel, Add missing net/geneve.h include The cited commit relies on include being included implicitly prior to include "en_accel/en_accel.h". This mandates that all files that needs to include en_accel.h to redantantly include net/geneve.h. Include net/geneve.h explicitly at "en_accel/en_accel.h" to avoid undesired constrain as above. Fixes: e3cfc7e6b7bd ("net/mlx5e: TX, Add geneve tunnel stateless offload support") Signed-off-by: Raed Salem Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index 3022463f2284..a6f65d4b2f36 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -42,6 +42,8 @@ #include "en/txrx.h" #if IS_ENABLED(CONFIG_GENEVE) +#include + static inline bool mlx5_geneve_tx_allowed(struct mlx5_core_dev *mdev) { return mlx5_tx_swp_supported(mdev); -- cgit v1.2.3-59-g8ed1b From 1dbd51d0a71a561056579e2d4f406e5ce5343af0 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Tue, 10 Dec 2019 13:20:55 +0200 Subject: net/mlx5: Refactor mlx5_accel_esp_create_hw_context parameter list Currently the FPGA IPsec is the only hw implementation of the IPsec acceleration api, and so the mlx5_accel_esp_create_hw_context was wrongly made to suit this HW api, among other in its parameter list and some of its parameter endianness. This implementation might not be suitable for different HW. Refactor by group and pass all function arguments of mlx5_accel_esp_create_hw_context in common mlx5_accel_esp_xfrm_attrs struct field of mlx5_accel_esp_xfrm struct and correct the endianness according to the HW being called. Signed-off-by: Raed Salem Reviewed-by: Boris Pismenny Reviewed-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/accel/ipsec.c | 20 +++++++++++++------ .../net/ethernet/mellanox/mlx5/core/accel/ipsec.h | 10 ++-------- .../ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 23 +++++++++------------- include/linux/mlx5/accel.h | 12 +++++++++++ 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c index eddc34e4a762..a92cd88d369c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c @@ -57,13 +57,21 @@ int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, } void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, - struct mlx5_accel_esp_xfrm *xfrm, - const __be32 saddr[4], - const __be32 daddr[4], - const __be32 spi, bool is_ipv6) + struct mlx5_accel_esp_xfrm *xfrm) { - return mlx5_fpga_ipsec_create_sa_ctx(mdev, xfrm, saddr, daddr, - spi, is_ipv6); + __be32 saddr[4] = {}, daddr[4] = {}; + + if (!xfrm->attrs.is_ipv6) { + saddr[3] = xfrm->attrs.saddr.a4; + daddr[3] = xfrm->attrs.daddr.a4; + } else { + memcpy(saddr, xfrm->attrs.saddr.a6, sizeof(saddr)); + memcpy(daddr, xfrm->attrs.daddr.a6, sizeof(daddr)); + } + + return mlx5_fpga_ipsec_create_sa_ctx(mdev, xfrm, saddr, + daddr, xfrm->attrs.spi, + xfrm->attrs.is_ipv6); } void mlx5_accel_esp_free_hw_context(void *context) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h index 530e428d46ab..f9b8e2a041c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h @@ -47,10 +47,7 @@ int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, unsigned int count); void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, - struct mlx5_accel_esp_xfrm *xfrm, - const __be32 saddr[4], - const __be32 daddr[4], - const __be32 spi, bool is_ipv6); + struct mlx5_accel_esp_xfrm *xfrm); void mlx5_accel_esp_free_hw_context(void *context); int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev); @@ -63,10 +60,7 @@ void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev); static inline void * mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, - struct mlx5_accel_esp_xfrm *xfrm, - const __be32 saddr[4], - const __be32 daddr[4], - const __be32 spi, bool is_ipv6) + struct mlx5_accel_esp_xfrm *xfrm) { return NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 29626c6c9c25..9e6c2216c93e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -199,6 +199,14 @@ mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, attrs->flags |= (x->props.mode == XFRM_MODE_TRANSPORT) ? MLX5_ACCEL_ESP_FLAGS_TRANSPORT : MLX5_ACCEL_ESP_FLAGS_TUNNEL; + + /* spi */ + attrs->spi = x->id.spi; + + /* source , destination ips */ + memcpy(&attrs->saddr, x->props.saddr.a6, sizeof(attrs->saddr)); + memcpy(&attrs->daddr, x->id.daddr.a6, sizeof(attrs->daddr)); + attrs->is_ipv6 = (x->props.family != AF_INET); } static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x) @@ -284,8 +292,6 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x) struct net_device *netdev = x->xso.dev; struct mlx5_accel_esp_xfrm_attrs attrs; struct mlx5e_priv *priv; - __be32 saddr[4] = {0}, daddr[4] = {0}, spi; - bool is_ipv6 = false; int err; priv = netdev_priv(netdev); @@ -331,20 +337,9 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x) } /* create hw context */ - if (x->props.family == AF_INET) { - saddr[3] = x->props.saddr.a4; - daddr[3] = x->id.daddr.a4; - } else { - memcpy(saddr, x->props.saddr.a6, sizeof(saddr)); - memcpy(daddr, x->id.daddr.a6, sizeof(daddr)); - is_ipv6 = true; - } - spi = x->id.spi; sa_entry->hw_context = mlx5_accel_esp_create_hw_context(priv->mdev, - sa_entry->xfrm, - saddr, daddr, spi, - is_ipv6); + sa_entry->xfrm); if (IS_ERR(sa_entry->hw_context)) { err = PTR_ERR(sa_entry->hw_context); goto err_xfrm; diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 5613e677a5f9..b919d143a9a6 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -92,6 +92,18 @@ struct mlx5_accel_esp_xfrm_attrs { union { struct aes_gcm_keymat aes_gcm; } keymat; + + union { + __be32 a4; + __be32 a6[4]; + } saddr; + + union { + __be32 a4; + __be32 a6[4]; + } daddr; + + u8 is_ipv6; }; struct mlx5_accel_esp_xfrm { -- cgit v1.2.3-59-g8ed1b From 0aab3e1b04aeeb5682c1ae7c862f107334ab79c0 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Wed, 8 Jan 2020 11:48:37 +0200 Subject: net/mlx5e: IPSec, Expose IPsec HW stat only for supporting HW The current HW counters are supported only by Innova, split the ipsec stats group into two groups, one for HW and one for SW. And expose the HW counters to ethtool only if Innova HW is used for IPsec offload. Signed-off-by: Raed Salem Reviewed-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_accel/ipsec.h | 25 ------ .../mellanox/mlx5/core/en_accel/ipsec_stats.c | 88 +++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 29 ++----- drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 2 + 4 files changed, 58 insertions(+), 86 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index 93bf10e6508c..c85151a1e008 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -109,11 +109,6 @@ int mlx5e_ipsec_init(struct mlx5e_priv *priv); void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv); void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv); -int mlx5e_ipsec_get_count(struct mlx5e_priv *priv); -int mlx5e_ipsec_get_strings(struct mlx5e_priv *priv, uint8_t *data); -void mlx5e_ipsec_update_stats(struct mlx5e_priv *priv); -int mlx5e_ipsec_get_stats(struct mlx5e_priv *priv, u64 *data); - struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *dev, unsigned int handle); @@ -136,26 +131,6 @@ static inline void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv) { } -static inline int mlx5e_ipsec_get_count(struct mlx5e_priv *priv) -{ - return 0; -} - -static inline int mlx5e_ipsec_get_strings(struct mlx5e_priv *priv, - uint8_t *data) -{ - return 0; -} - -static inline void mlx5e_ipsec_update_stats(struct mlx5e_priv *priv) -{ -} - -static inline int mlx5e_ipsec_get_stats(struct mlx5e_priv *priv, u64 *data) -{ - return 0; -} - #endif #endif /* __MLX5E_IPSEC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c index 6fea59223dc4..6c5c54bcd9be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c @@ -38,6 +38,7 @@ #include "accel/ipsec.h" #include "fpga/sdk.h" #include "en_accel/ipsec.h" +#include "fpga/ipsec.h" static const struct counter_desc mlx5e_ipsec_hw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_in_packets) }, @@ -73,61 +74,74 @@ static const struct counter_desc mlx5e_ipsec_sw_stats_desc[] = { #define NUM_IPSEC_HW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_hw_stats_desc) #define NUM_IPSEC_SW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_sw_stats_desc) -#define NUM_IPSEC_COUNTERS (NUM_IPSEC_HW_COUNTERS + NUM_IPSEC_SW_COUNTERS) - -int mlx5e_ipsec_get_count(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ipsec_sw) { - if (!priv->ipsec) - return 0; - - return NUM_IPSEC_COUNTERS; + return NUM_IPSEC_SW_COUNTERS; } -int mlx5e_ipsec_get_strings(struct mlx5e_priv *priv, uint8_t *data) -{ - unsigned int i, idx = 0; +static inline MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ipsec_sw) {} - if (!priv->ipsec) - return 0; +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ipsec_sw) +{ + unsigned int i; - for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++) - strcpy(data + (idx++) * ETH_GSTRING_LEN, - mlx5e_ipsec_hw_stats_desc[i].format); + if (priv->ipsec) + for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + mlx5e_ipsec_sw_stats_desc[i].format); + return idx; +} - for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++) - strcpy(data + (idx++) * ETH_GSTRING_LEN, - mlx5e_ipsec_sw_stats_desc[i].format); +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(ipsec_sw) +{ + int i; - return NUM_IPSEC_COUNTERS; + if (priv->ipsec) + for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR_ATOMIC64(&priv->ipsec->sw_stats, + mlx5e_ipsec_sw_stats_desc, i); + return idx; } -void mlx5e_ipsec_update_stats(struct mlx5e_priv *priv) +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ipsec_hw) { - int ret; + return (mlx5_fpga_ipsec_device_caps(priv->mdev)) ? NUM_IPSEC_HW_COUNTERS : 0; +} - if (!priv->ipsec) - return; +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ipsec_hw) +{ + int ret = 0; - ret = mlx5_accel_ipsec_counters_read(priv->mdev, (u64 *)&priv->ipsec->stats, - NUM_IPSEC_HW_COUNTERS); + if (priv->ipsec) + ret = mlx5_accel_ipsec_counters_read(priv->mdev, (u64 *)&priv->ipsec->stats, + NUM_IPSEC_HW_COUNTERS); if (ret) memset(&priv->ipsec->stats, 0, sizeof(priv->ipsec->stats)); } -int mlx5e_ipsec_get_stats(struct mlx5e_priv *priv, u64 *data) +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ipsec_hw) { - int i, idx = 0; - - if (!priv->ipsec) - return 0; + unsigned int i; - for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++) - data[idx++] = MLX5E_READ_CTR64_CPU(&priv->ipsec->stats, - mlx5e_ipsec_hw_stats_desc, i); + if (priv->ipsec && mlx5_fpga_ipsec_device_caps(priv->mdev)) + for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + mlx5e_ipsec_hw_stats_desc[i].format); - for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++) - data[idx++] = MLX5E_READ_CTR_ATOMIC64(&priv->ipsec->sw_stats, - mlx5e_ipsec_sw_stats_desc, i); + return idx; +} - return NUM_IPSEC_COUNTERS; +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(ipsec_hw) +{ + int i; + + if (priv->ipsec && mlx5_fpga_ipsec_device_caps(priv->mdev)) + for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(&priv->ipsec->stats, + mlx5e_ipsec_hw_stats_desc, + i); + return idx; } + +MLX5E_DEFINE_STATS_GRP(ipsec_sw, 0); +MLX5E_DEFINE_STATS_GRP(ipsec_hw, 0); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 30b216d9284c..6eb0e8236bbf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -32,8 +32,8 @@ #include "lib/mlx5.h" #include "en.h" -#include "en_accel/ipsec.h" #include "en_accel/tls.h" +#include "en_accel/en_accel.h" static unsigned int stats_grps_num(struct mlx5e_priv *priv) { @@ -1424,27 +1424,6 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(pme) static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(pme) { return; } -static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ipsec) -{ - return mlx5e_ipsec_get_count(priv); -} - -static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ipsec) -{ - return idx + mlx5e_ipsec_get_strings(priv, - data + idx * ETH_GSTRING_LEN); -} - -static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(ipsec) -{ - return idx + mlx5e_ipsec_get_stats(priv, data + idx); -} - -static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ipsec) -{ - mlx5e_ipsec_update_stats(priv); -} - static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(tls) { return mlx5e_tls_get_count(priv); @@ -1714,7 +1693,6 @@ MLX5E_DEFINE_STATS_GRP(pme, 0); MLX5E_DEFINE_STATS_GRP(channels, 0); MLX5E_DEFINE_STATS_GRP(per_port_buff_congest, 0); MLX5E_DEFINE_STATS_GRP(eth_ext, 0); -static MLX5E_DEFINE_STATS_GRP(ipsec, 0); static MLX5E_DEFINE_STATS_GRP(tls, 0); /* The stats groups order is opposite to the update_stats() order calls */ @@ -1731,7 +1709,10 @@ mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = { &MLX5E_STATS_GRP(pcie), &MLX5E_STATS_GRP(per_prio), &MLX5E_STATS_GRP(pme), - &MLX5E_STATS_GRP(ipsec), +#ifdef CONFIG_MLX5_EN_IPSEC + &MLX5E_STATS_GRP(ipsec_sw), + &MLX5E_STATS_GRP(ipsec_hw), +#endif &MLX5E_STATS_GRP(tls), &MLX5E_STATS_GRP(channels), &MLX5E_STATS_GRP(per_port_buff_congest), diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 092b39ffa32a..2b83ba990714 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -390,5 +390,7 @@ extern MLX5E_DECLARE_STATS_GRP(per_prio); extern MLX5E_DECLARE_STATS_GRP(pme); extern MLX5E_DECLARE_STATS_GRP(channels); extern MLX5E_DECLARE_STATS_GRP(per_port_buff_congest); +extern MLX5E_DECLARE_STATS_GRP(ipsec_hw); +extern MLX5E_DECLARE_STATS_GRP(ipsec_sw); #endif /* __MLX5_EN_STATS_H__ */ -- cgit v1.2.3-59-g8ed1b From 7dfee4b1d79e1800818abcfb47747b162c9a2d31 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Wed, 23 Oct 2019 17:04:13 +0300 Subject: net/mlx5: IPsec, Refactor SA handle creation and destruction Currently the SA handle is created and managed as part of the common code for different IPsec supporting HW, this handle is passed to HW to be used on Rx to identify the SA handle that was used to return the xfrm state to stack. The above implementation pose a limitation on managing this handle. Refactor by moving management of this field to the specific HW code. Downstream patches will introduce the Connect-X support for IPsec that will use this handle differently than current implementation. Signed-off-by: Raed Salem Reviewed-by: Boris Pismenny Reviewed-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/accel/ipsec.c | 5 +- .../net/ethernet/mellanox/mlx5/core/accel/ipsec.h | 6 +- .../ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 68 +++++++++------------- .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 29 ++++++++- .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.h | 3 +- 5 files changed, 63 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c index a92cd88d369c..8a4985d8cbfe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c @@ -57,7 +57,8 @@ int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, } void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, - struct mlx5_accel_esp_xfrm *xfrm) + struct mlx5_accel_esp_xfrm *xfrm, + u32 *sa_handle) { __be32 saddr[4] = {}, daddr[4] = {}; @@ -71,7 +72,7 @@ void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, return mlx5_fpga_ipsec_create_sa_ctx(mdev, xfrm, saddr, daddr, xfrm->attrs.spi, - xfrm->attrs.is_ipv6); + xfrm->attrs.is_ipv6, sa_handle); } void mlx5_accel_esp_free_hw_context(void *context) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h index f9b8e2a041c1..e89747674712 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h @@ -47,7 +47,8 @@ int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, unsigned int count); void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, - struct mlx5_accel_esp_xfrm *xfrm); + struct mlx5_accel_esp_xfrm *xfrm, + u32 *sa_handle); void mlx5_accel_esp_free_hw_context(void *context); int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev); @@ -60,7 +61,8 @@ void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev); static inline void * mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, - struct mlx5_accel_esp_xfrm *xfrm) + struct mlx5_accel_esp_xfrm *xfrm, + u32 *sa_handle) { return NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 9e6c2216c93e..92eb3bad4acd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -75,18 +75,23 @@ struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *ipsec, return ret; } -static int mlx5e_ipsec_sadb_rx_add(struct mlx5e_ipsec_sa_entry *sa_entry) +static int mlx5e_ipsec_sadb_rx_add(struct mlx5e_ipsec_sa_entry *sa_entry, + unsigned int handle) { struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + struct mlx5e_ipsec_sa_entry *_sa_entry; unsigned long flags; - int ret; - ret = ida_simple_get(&ipsec->halloc, 1, 0, GFP_KERNEL); - if (ret < 0) - return ret; + rcu_read_lock(); + hash_for_each_possible_rcu(ipsec->sadb_rx, _sa_entry, hlist, handle) + if (_sa_entry->handle == handle) { + rcu_read_unlock(); + return -EEXIST; + } + rcu_read_unlock(); spin_lock_irqsave(&ipsec->sadb_rx_lock, flags); - sa_entry->handle = ret; + sa_entry->handle = handle; hash_add_rcu(ipsec->sadb_rx, &sa_entry->hlist, sa_entry->handle); spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags); @@ -103,15 +108,6 @@ static void mlx5e_ipsec_sadb_rx_del(struct mlx5e_ipsec_sa_entry *sa_entry) spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags); } -static void mlx5e_ipsec_sadb_rx_free(struct mlx5e_ipsec_sa_entry *sa_entry) -{ - struct mlx5e_ipsec *ipsec = sa_entry->ipsec; - - /* xfrm already doing sync rcu between del and free callbacks */ - - ida_simple_remove(&ipsec->halloc, sa_entry->handle); -} - static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry) { struct xfrm_replay_state_esn *replay_esn; @@ -292,6 +288,7 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x) struct net_device *netdev = x->xso.dev; struct mlx5_accel_esp_xfrm_attrs attrs; struct mlx5e_priv *priv; + unsigned int sa_handle; int err; priv = netdev_priv(netdev); @@ -309,20 +306,6 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x) sa_entry->x = x; sa_entry->ipsec = priv->ipsec; - /* Add the SA to handle processed incoming packets before the add SA - * completion was received - */ - if (x->xso.flags & XFRM_OFFLOAD_INBOUND) { - err = mlx5e_ipsec_sadb_rx_add(sa_entry); - if (err) { - netdev_info(netdev, "Failed adding to SADB_RX: %d\n", err); - goto err_entry; - } - } else { - sa_entry->set_iv_op = (x->props.flags & XFRM_STATE_ESN) ? - mlx5e_ipsec_set_iv_esn : mlx5e_ipsec_set_iv; - } - /* check esn */ mlx5e_ipsec_update_esn_state(sa_entry); @@ -333,30 +316,38 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x) MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA); if (IS_ERR(sa_entry->xfrm)) { err = PTR_ERR(sa_entry->xfrm); - goto err_sadb_rx; + goto err_sa_entry; } /* create hw context */ sa_entry->hw_context = mlx5_accel_esp_create_hw_context(priv->mdev, - sa_entry->xfrm); + sa_entry->xfrm, + &sa_handle); if (IS_ERR(sa_entry->hw_context)) { err = PTR_ERR(sa_entry->hw_context); goto err_xfrm; } + if (x->xso.flags & XFRM_OFFLOAD_INBOUND) { + err = mlx5e_ipsec_sadb_rx_add(sa_entry, sa_handle); + if (err) + goto err_hw_ctx; + } else { + sa_entry->set_iv_op = (x->props.flags & XFRM_STATE_ESN) ? + mlx5e_ipsec_set_iv_esn : mlx5e_ipsec_set_iv; + } + x->xso.offload_handle = (unsigned long)sa_entry; goto out; +err_hw_ctx: + mlx5_accel_esp_free_hw_context(sa_entry->hw_context); err_xfrm: mlx5_accel_esp_destroy_xfrm(sa_entry->xfrm); -err_sadb_rx: - if (x->xso.flags & XFRM_OFFLOAD_INBOUND) { - mlx5e_ipsec_sadb_rx_del(sa_entry); - mlx5e_ipsec_sadb_rx_free(sa_entry); - } -err_entry: +err_sa_entry: kfree(sa_entry); + out: return err; } @@ -385,9 +376,6 @@ static void mlx5e_xfrm_free_state(struct xfrm_state *x) mlx5_accel_esp_destroy_xfrm(sa_entry->xfrm); } - if (x->xso.flags & XFRM_OFFLOAD_INBOUND) - mlx5e_ipsec_sadb_rx_free(sa_entry); - kfree(sa_entry); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c index c8736b6b4172..0604216eb94f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -65,6 +65,7 @@ struct mlx5_fpga_esp_xfrm; struct mlx5_fpga_ipsec_sa_ctx { struct rhash_head hash; struct mlx5_ifc_fpga_ipsec_sa hw_sa; + u32 sa_handle; struct mlx5_core_dev *dev; struct mlx5_fpga_esp_xfrm *fpga_xfrm; }; @@ -119,6 +120,8 @@ struct mlx5_fpga_ipsec { */ struct rb_root rules_rb; struct mutex rules_rb_lock; /* rules lock */ + + struct ida halloc; }; static bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) @@ -666,7 +669,8 @@ void *mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev, struct mlx5_accel_esp_xfrm *accel_xfrm, const __be32 saddr[4], const __be32 daddr[4], - const __be32 spi, bool is_ipv6) + const __be32 spi, bool is_ipv6, + u32 *sa_handle) { struct mlx5_fpga_ipsec_sa_ctx *sa_ctx; struct mlx5_fpga_esp_xfrm *fpga_xfrm = @@ -704,6 +708,17 @@ void *mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev, goto exists; } + if (accel_xfrm->attrs.action & MLX5_ACCEL_ESP_ACTION_DECRYPT) { + err = ida_simple_get(&fipsec->halloc, 1, 0, GFP_KERNEL); + if (err < 0) { + context = ERR_PTR(err); + goto exists; + } + + sa_ctx->sa_handle = err; + if (sa_handle) + *sa_handle = sa_ctx->sa_handle; + } /* This is unbounded fpga_xfrm, try to add to hash */ mutex_lock(&fipsec->sa_hash_lock); @@ -744,7 +759,8 @@ delete_hash: rhash_sa)); unlock_hash: mutex_unlock(&fipsec->sa_hash_lock); - + if (accel_xfrm->attrs.action & MLX5_ACCEL_ESP_ACTION_DECRYPT) + ida_simple_remove(&fipsec->halloc, sa_ctx->sa_handle); exists: mutex_unlock(&fpga_xfrm->lock); kfree(sa_ctx); @@ -816,7 +832,7 @@ mlx5_fpga_ipsec_fs_create_sa_ctx(struct mlx5_core_dev *mdev, /* create */ return mlx5_fpga_ipsec_create_sa_ctx(mdev, accel_xfrm, saddr, daddr, - spi, is_ipv6); + spi, is_ipv6, NULL); } static void @@ -836,6 +852,10 @@ mlx5_fpga_ipsec_release_sa_ctx(struct mlx5_fpga_ipsec_sa_ctx *sa_ctx) return; } + if (sa_ctx->fpga_xfrm->accel_xfrm.attrs.action & + MLX5_ACCEL_ESP_ACTION_DECRYPT) + ida_simple_remove(&fipsec->halloc, sa_ctx->sa_handle); + mutex_lock(&fipsec->sa_hash_lock); WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, &sa_ctx->hash, rhash_sa)); @@ -1299,6 +1319,8 @@ int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev) goto err_destroy_hash; } + ida_init(&fdev->ipsec->halloc); + return 0; err_destroy_hash: @@ -1331,6 +1353,7 @@ void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev) if (!mlx5_fpga_is_ipsec_device(mdev)) return; + ida_destroy(&fdev->ipsec->halloc); destroy_rules_rb(&fdev->ipsec->rules_rb); rhashtable_destroy(&fdev->ipsec->sa_hash); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h index d01b1fc8e11b..9ba637f0f0f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h @@ -47,7 +47,8 @@ void *mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev, struct mlx5_accel_esp_xfrm *accel_xfrm, const __be32 saddr[4], const __be32 daddr[4], - const __be32 spi, bool is_ipv6); + const __be32 spi, bool is_ipv6, + u32 *sa_handle); void mlx5_fpga_ipsec_delete_sa_ctx(void *context); int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev); -- cgit v1.2.3-59-g8ed1b From 82fe2996419830b0bb2c7e1f2fed2d3a8a1a65cd Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 18 Feb 2020 12:27:25 +0200 Subject: net/mlx5e: Set of completion request bit should not clear other adjacent bits In notify HW (ring doorbell) flow, we set the bit to request a completion on the TX descriptor. When doing so, we should not unset other bits in the same byte. Currently, this does not fix a real issue, as we still don't have a flow where both MLX5_WQE_CTRL_CQ_UPDATE and any adjacent bit are set together. Fixes: 542578c67936 ("net/mlx5e: Move helper functions to a new txrx datapath header") Fixes: 864b2d715300 ("net/mlx5e: Generalize tx helper functions for different SQ types") Signed-off-by: Tariq Toukan Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index f07b1399744e..9f6967d76053 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -102,7 +102,7 @@ static inline void mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map, struct mlx5_wqe_ctrl_seg *ctrl) { - ctrl->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + ctrl->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; /* ensure wqe is visible to device before updating doorbell record */ dma_wmb(); -- cgit v1.2.3-59-g8ed1b From d7a42ad062cc6b20b2c2a8c09dc61df2d4f5751f Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 25 Mar 2020 11:32:56 +0200 Subject: net/mlx5e: Allow partial data mask for tunnel options We use mapping to save and restore the tunnel options. Save also the tunnel options mask. Signed-off-by: Roi Dayan Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 32 +++++++++++++++++-------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a574c588269a..7d2b05576f44 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -171,6 +171,11 @@ struct tunnel_match_key { int filter_ifindex; }; +struct tunnel_match_enc_opts { + struct flow_dissector_key_enc_opts key; + struct flow_dissector_key_enc_opts mask; +}; + /* Tunnel_id mapping is TUNNEL_INFO_BITS + ENC_OPTS_BITS. * Upper TUNNEL_INFO_BITS for general tunnel info. * Lower ENC_OPTS_BITS bits for enc_opts. @@ -1824,9 +1829,7 @@ enc_opts_is_dont_care_or_full_match(struct mlx5e_priv *priv, *dont_care = false; if (opt->opt_class != U16_MAX || - opt->type != U8_MAX || - memchr_inv(opt->opt_data, 0xFF, - opt->length * 4)) { + opt->type != U8_MAX) { NL_SET_ERR_MSG(extack, "Partial match of tunnel options in chain > 0 isn't supported"); netdev_warn(priv->netdev, @@ -1863,6 +1866,7 @@ static int mlx5e_get_flow_tunnel_id(struct mlx5e_priv *priv, struct mlx5_esw_flow_attr *attr = flow->esw_attr; struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts; struct flow_match_enc_opts enc_opts_match; + struct tunnel_match_enc_opts tun_enc_opts; struct mlx5_rep_uplink_priv *uplink_priv; struct mlx5e_rep_priv *uplink_rpriv; struct tunnel_match_key tunnel_key; @@ -1905,8 +1909,14 @@ static int mlx5e_get_flow_tunnel_id(struct mlx5e_priv *priv, goto err_enc_opts; if (!enc_opts_is_dont_care) { + memset(&tun_enc_opts, 0, sizeof(tun_enc_opts)); + memcpy(&tun_enc_opts.key, enc_opts_match.key, + sizeof(*enc_opts_match.key)); + memcpy(&tun_enc_opts.mask, enc_opts_match.mask, + sizeof(*enc_opts_match.mask)); + err = mapping_add(uplink_priv->tunnel_enc_opts_mapping, - enc_opts_match.key, &enc_opts_id); + &tun_enc_opts, &enc_opts_id); if (err) goto err_enc_opts; } @@ -4707,7 +4717,7 @@ void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) int mlx5e_tc_esw_init(struct rhashtable *tc_ht) { - const size_t sz_enc_opts = sizeof(struct flow_dissector_key_enc_opts); + const size_t sz_enc_opts = sizeof(struct tunnel_match_enc_opts); struct mlx5_rep_uplink_priv *uplink_priv; struct mlx5e_rep_priv *priv; struct mapping_ctx *mapping; @@ -4802,7 +4812,7 @@ static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb, u32 tunnel_id) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct flow_dissector_key_enc_opts enc_opts = {}; + struct tunnel_match_enc_opts enc_opts = {}; struct mlx5_rep_uplink_priv *uplink_priv; struct mlx5e_rep_priv *uplink_rpriv; struct metadata_dst *tun_dst; @@ -4840,7 +4850,7 @@ static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb, } } - tun_dst = tun_rx_dst(enc_opts.len); + tun_dst = tun_rx_dst(enc_opts.key.len); if (!tun_dst) { WARN_ON_ONCE(true); return false; @@ -4854,9 +4864,11 @@ static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb, key32_to_tunnel_id(key.enc_key_id.keyid), TUNNEL_KEY); - if (enc_opts.len) - ip_tunnel_info_opts_set(&tun_dst->u.tun_info, enc_opts.data, - enc_opts.len, enc_opts.dst_opt_type); + if (enc_opts.key.len) + ip_tunnel_info_opts_set(&tun_dst->u.tun_info, + enc_opts.key.data, + enc_opts.key.len, + enc_opts.key.dst_opt_type); skb_dst_set(skb, (struct dst_entry *)tun_dst); dev = dev_get_by_index(&init_net, key.filter_ifindex); -- cgit v1.2.3-59-g8ed1b From fa3748775b92692331cfcab6f7b09a04a23694d9 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 19 Mar 2020 17:32:27 +0200 Subject: net/mlx5e: Handle errors from netif_set_real_num_{tx,rx}_queues netif_set_real_num_tx_queues and netif_set_real_num_rx_queues may fail. Now that mlx5e supports handling errors in the preactivate hook, this commit leverages that functionality to handle errors from those functions and roll back all changes on failure. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 59 +++++++++++++++++----- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 6d703ddee4e2..4ab78b5c2393 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -432,7 +432,7 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { *cur_params = new_channels.params; - mlx5e_num_channels_changed(priv); + err = mlx5e_num_channels_changed(priv); goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f02150a97ac8..e057822898f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2839,11 +2839,8 @@ void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv) ETH_MAX_MTU); } -static void mlx5e_netdev_set_tcs(struct net_device *netdev) +static void mlx5e_netdev_set_tcs(struct net_device *netdev, u16 nch, u8 ntc) { - struct mlx5e_priv *priv = netdev_priv(netdev); - int nch = priv->channels.params.num_channels; - int ntc = priv->channels.params.num_tc; int tc; netdev_reset_tc(netdev); @@ -2860,15 +2857,47 @@ static void mlx5e_netdev_set_tcs(struct net_device *netdev) netdev_set_tc_queue(netdev, tc, nch, 0); } -static void mlx5e_update_netdev_queues(struct mlx5e_priv *priv, u16 count) +static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) { - int num_txqs = count * priv->channels.params.num_tc; - int num_rxqs = count * priv->profile->rq_groups; struct net_device *netdev = priv->netdev; + int num_txqs, num_rxqs, nch, ntc; + int old_num_txqs, old_ntc; + int err; + + old_num_txqs = netdev->real_num_tx_queues; + old_ntc = netdev->num_tc; - mlx5e_netdev_set_tcs(netdev); - netif_set_real_num_tx_queues(netdev, num_txqs); - netif_set_real_num_rx_queues(netdev, num_rxqs); + nch = priv->channels.params.num_channels; + ntc = priv->channels.params.num_tc; + num_txqs = nch * ntc; + num_rxqs = nch * priv->profile->rq_groups; + + mlx5e_netdev_set_tcs(netdev, nch, ntc); + + err = netif_set_real_num_tx_queues(netdev, num_txqs); + if (err) { + netdev_warn(netdev, "netif_set_real_num_tx_queues failed, %d\n", err); + goto err_tcs; + } + err = netif_set_real_num_rx_queues(netdev, num_rxqs); + if (err) { + netdev_warn(netdev, "netif_set_real_num_rx_queues failed, %d\n", err); + goto err_txqs; + } + + return 0; + +err_txqs: + /* netif_set_real_num_rx_queues could fail only when nch increased. Only + * one of nch and ntc is changed in this function. That means, the call + * to netif_set_real_num_tx_queues below should not fail, because it + * decreases the number of TX queues. + */ + WARN_ON_ONCE(netif_set_real_num_tx_queues(netdev, old_num_txqs)); + +err_tcs: + mlx5e_netdev_set_tcs(netdev, old_num_txqs / old_ntc, old_ntc); + return err; } static void mlx5e_set_default_xps_cpumasks(struct mlx5e_priv *priv, @@ -2895,8 +2924,12 @@ static void mlx5e_set_default_xps_cpumasks(struct mlx5e_priv *priv, int mlx5e_num_channels_changed(struct mlx5e_priv *priv) { u16 count = priv->channels.params.num_channels; + int err; + + err = mlx5e_update_netdev_queues(priv); + if (err) + return err; - mlx5e_update_netdev_queues(priv, count); mlx5e_set_default_xps_cpumasks(priv, &priv->channels.params); if (!netif_is_rxfh_configured(priv->netdev)) @@ -5358,9 +5391,11 @@ int mlx5e_attach_netdev(struct mlx5e_priv *priv) */ if (take_rtnl) rtnl_lock(); - mlx5e_num_channels_changed(priv); + err = mlx5e_num_channels_changed(priv); if (take_rtnl) rtnl_unlock(); + if (err) + goto out; err = profile->init_tx(priv); if (err) -- cgit v1.2.3-59-g8ed1b From c89da067a2e4d0f94f0f314c2918dca50348789c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 8 Mar 2020 21:41:22 -0500 Subject: net/mlx5: Read embedded cpu bit only once Embedded CPU bit doesn't change with PCI resume/suspend. Hence read it only once while probing the PCI device. Signed-off-by: Parav Pandit Reviewed-by: Bodong Wang Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 7af4210c1b96..5a97e98e937c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -782,7 +782,7 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev, } mlx5_pci_vsc_init(dev); - + dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev); return 0; err_clr_master: @@ -1180,7 +1180,6 @@ int mlx5_load_one(struct mlx5_core_dev *dev, bool boot) { int err = 0; - dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev); mutex_lock(&dev->intf_state_mutex); if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { mlx5_core_warn(dev, "interface is up, NOP\n"); -- cgit v1.2.3-59-g8ed1b From 6533380dfd003ea7636cb5672f4f85124b56328b Mon Sep 17 00:00:00 2001 From: Hu Haowen Date: Wed, 1 Apr 2020 20:57:20 +0800 Subject: net/mlx5: improve some comments Replaced "its" with "it's". Signed-off-by: Hu Haowen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index c9c9b479bda5..0a8adda073c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -684,7 +684,7 @@ static void mlx5_fw_tracer_handle_traces(struct work_struct *work) get_block_timestamp(tracer, &tmp_trace_block[TRACES_PER_BLOCK - 1]); while (block_timestamp > tracer->last_timestamp) { - /* Check block override if its not the first block */ + /* Check block override if it's not the first block */ if (!tracer->last_timestamp) { u64 *ts_event; /* To avoid block override be the HW in case of buffer -- cgit v1.2.3-59-g8ed1b From 794867ee6730fe3f0d419d3911f35a725cafe3a8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 20 Apr 2020 22:52:59 +0200 Subject: r8169: change wmb to smb_wmb in rtl8169_start_xmit A barrier is needed here to ensure that rtl_tx sees the descriptor changes (DescOwn set) before the updated tp->cur_tx value. Else it may wrongly assume that the transfer has been finished already. For this purpose smp_wmb() is sufficient. No separate barrier is needed for ordering the descriptor changes with the MMIO doorbell write. The needed barrier is included in the non-relaxed writel() used by rtl8169_doorbell(). Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index b8cc064ee9f5..b7a2853e7396 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4249,8 +4249,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb, txd_first->opts1 |= cpu_to_le32(DescOwn | FirstFrag); - /* Force all memory writes to complete before notifying device */ - wmb(); + /* rtl_tx needs to see descriptor changes before updated tp->cur_tx */ + smp_wmb(); tp->cur_tx += frags + 1; -- cgit v1.2.3-59-g8ed1b From d4d9b47e4b10679d5c794e62f9866ee5ee204c0d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 21 Apr 2020 00:51:17 +0300 Subject: net: bcmgenet: Drop ACPI_PTR() to avoid compiler warning When compiled with CONFIG_ACPI=n, ACPI_PTR() will be no-op, and thus genet_acpi_match table defined, but not used. Compiler is not happy about such data. Drop ACPI_PTR() for good. Signed-off-by: Andy Shevchenko Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index d975338bf78d..9f2f0e681656 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3740,7 +3740,7 @@ static struct platform_driver bcmgenet_driver = { .name = "bcmgenet", .of_match_table = bcmgenet_match, .pm = &bcmgenet_pm_ops, - .acpi_match_table = ACPI_PTR(genet_acpi_match), + .acpi_match_table = genet_acpi_match, }, }; module_platform_driver(bcmgenet_driver); -- cgit v1.2.3-59-g8ed1b From 9a965942a9cabb2cdca1d7e99c985d6978ece96c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 21 Apr 2020 00:51:18 +0300 Subject: net: bcmgenet: Drop useless OF code There is nothing which needs a set of OF headers, followed by redundant OF node ID check. Drop them for good. Signed-off-by: Andy Shevchenko Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 9f2f0e681656..ef275db018f7 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -23,11 +23,6 @@ #include #include #include -#include -#include -#include -#include -#include #include #include @@ -3417,8 +3412,6 @@ MODULE_DEVICE_TABLE(of, bcmgenet_match); static int bcmgenet_probe(struct platform_device *pdev) { struct bcmgenet_platform_data *pd = pdev->dev.platform_data; - struct device_node *dn = pdev->dev.of_node; - const struct of_device_id *of_id = NULL; const struct bcmgenet_plat_data *pdata; struct bcmgenet_priv *priv; struct net_device *dev; @@ -3433,12 +3426,6 @@ static int bcmgenet_probe(struct platform_device *pdev) return -ENOMEM; } - if (dn) { - of_id = of_match_node(bcmgenet_match, dn); - if (!of_id) - return -EINVAL; - } - priv = netdev_priv(dev); priv->irq0 = platform_get_irq(pdev, 0); if (priv->irq0 < 0) { -- cgit v1.2.3-59-g8ed1b From c80d36ff63a5fb447616e03d4339d2eef7884d28 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 21 Apr 2020 00:51:19 +0300 Subject: net: bcmgenet: Use devm_clk_get_optional() to get the clocks Conversion to devm_clk_get_optional() makes it explicit that clocks are optional. This change allows to handle deferred probe in case clocks are defined, but not yet probed. Due to above changes bail out in error case. While here, check potential error when enable main clock. Signed-off-by: Andy Shevchenko Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index ef275db018f7..86666e9ab3e7 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3487,13 +3487,16 @@ static int bcmgenet_probe(struct platform_device *pdev) priv->dma_max_burst_length = DMA_MAX_BURST_LENGTH; } - priv->clk = devm_clk_get(&priv->pdev->dev, "enet"); + priv->clk = devm_clk_get_optional(&priv->pdev->dev, "enet"); if (IS_ERR(priv->clk)) { dev_dbg(&priv->pdev->dev, "failed to get enet clock\n"); - priv->clk = NULL; + err = PTR_ERR(priv->clk); + goto err; } - clk_prepare_enable(priv->clk); + err = clk_prepare_enable(priv->clk); + if (err) + goto err; bcmgenet_set_hw_params(priv); @@ -3511,16 +3514,18 @@ static int bcmgenet_probe(struct platform_device *pdev) priv->rx_buf_len = RX_BUF_LENGTH; INIT_WORK(&priv->bcmgenet_irq_work, bcmgenet_irq_task); - priv->clk_wol = devm_clk_get(&priv->pdev->dev, "enet-wol"); + priv->clk_wol = devm_clk_get_optional(&priv->pdev->dev, "enet-wol"); if (IS_ERR(priv->clk_wol)) { dev_dbg(&priv->pdev->dev, "failed to get enet-wol clock\n"); - priv->clk_wol = NULL; + err = PTR_ERR(priv->clk_wol); + goto err; } - priv->clk_eee = devm_clk_get(&priv->pdev->dev, "enet-eee"); + priv->clk_eee = devm_clk_get_optional(&priv->pdev->dev, "enet-eee"); if (IS_ERR(priv->clk_eee)) { dev_dbg(&priv->pdev->dev, "failed to get enet-eee clock\n"); - priv->clk_eee = NULL; + err = PTR_ERR(priv->clk_eee); + goto err; } /* If this is an internal GPHY, power it on now, before UniMAC is -- cgit v1.2.3-59-g8ed1b From d2af1420cbc84f5eabcca6b667d7fb2339c398e2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 21 Apr 2020 00:51:20 +0300 Subject: net: bcmgenet: Use get_unligned_beXX() and put_unaligned_beXX() It's convenient to use get_unligned_beXX() and put_unaligned_beXX() helpers to get or set MAC instead of open-coded variants. Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 86666e9ab3e7..2c9881032a24 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2702,9 +2702,8 @@ static void bcmgenet_umac_reset(struct bcmgenet_priv *priv) static void bcmgenet_set_hw_addr(struct bcmgenet_priv *priv, unsigned char *addr) { - bcmgenet_umac_writel(priv, (addr[0] << 24) | (addr[1] << 16) | - (addr[2] << 8) | addr[3], UMAC_MAC0); - bcmgenet_umac_writel(priv, (addr[4] << 8) | addr[5], UMAC_MAC1); + bcmgenet_umac_writel(priv, get_unaligned_be32(&addr[0]), UMAC_MAC0); + bcmgenet_umac_writel(priv, get_unaligned_be16(&addr[4]), UMAC_MAC1); } static void bcmgenet_get_hw_addr(struct bcmgenet_priv *priv, @@ -2713,13 +2712,9 @@ static void bcmgenet_get_hw_addr(struct bcmgenet_priv *priv, u32 addr_tmp; addr_tmp = bcmgenet_umac_readl(priv, UMAC_MAC0); - addr[0] = addr_tmp >> 24; - addr[1] = (addr_tmp >> 16) & 0xff; - addr[2] = (addr_tmp >> 8) & 0xff; - addr[3] = addr_tmp & 0xff; + put_unaligned_be32(addr_tmp, &addr[0]); addr_tmp = bcmgenet_umac_readl(priv, UMAC_MAC1); - addr[4] = (addr_tmp >> 8) & 0xff; - addr[5] = addr_tmp & 0xff; + put_unaligned_be16(addr_tmp, &addr[4]); } /* Returns a reusable dma control register value */ -- cgit v1.2.3-59-g8ed1b From 7d3cca75c169710048b0fea08522f635efcea893 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 21 Apr 2020 00:51:21 +0300 Subject: net: bcmgenet: Drop too many parentheses in bcmgenet_probe() No need to have parentheses around plain pointer variable or negation operator. Drop them for good. Signed-off-by: Andy Shevchenko Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 2c9881032a24..20aba79becce 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3529,7 +3529,7 @@ static int bcmgenet_probe(struct platform_device *pdev) if (device_get_phy_mode(&pdev->dev) == PHY_INTERFACE_MODE_INTERNAL) bcmgenet_power_up(priv, GENET_POWER_PASSIVE); - if ((pd) && (!IS_ERR_OR_NULL(pd->mac_address))) + if (pd && !IS_ERR_OR_NULL(pd->mac_address)) ether_addr_copy(dev->dev_addr, pd->mac_address); else if (!device_get_mac_address(&pdev->dev, dev->dev_addr, ETH_ALEN)) -- cgit v1.2.3-59-g8ed1b From b6246f4d8d0778fd045b84dbd7fc5aadd8f3136e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 20 Apr 2020 22:51:49 +0100 Subject: net: ipv4: remove redundant assignment to variable rc The variable rc is being assigned with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cf58e29cf746..c618e242490f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1914,7 +1914,7 @@ static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; - int rc = -EINVAL; + int rc; sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); -- cgit v1.2.3-59-g8ed1b From 2a7e978625e887cbb80cb2188cf5f955e4223fa1 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 13 Apr 2020 20:40:20 +0200 Subject: batman-adv: Start new development cycle Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2a234d0ad445..61d8dbe8c954 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2020.1" +#define BATADV_SOURCE_VERSION "2020.2" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3-59-g8ed1b From c08dd06b3d25e8450a2e187ee6b7412d07c460af Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 26 Mar 2020 18:37:07 +0100 Subject: batman-adv: Fix spelling error in term buffer checkpatch warns about a typo in the word bufFer which was introduced in commit 2191c1bcbc64 ("batman-adv: kernel doc for types.h"). Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 4a17a66cc572..d152b8e81f61 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1086,7 +1086,7 @@ struct batadv_priv_bla { * struct batadv_priv_debug_log - debug logging data */ struct batadv_priv_debug_log { - /** @log_buff: buffer holding the logs (ring bufer) */ + /** @log_buff: buffer holding the logs (ring buffer) */ char log_buff[BATADV_LOG_BUF_LEN]; /** @log_start: index of next character to read */ -- cgit v1.2.3-59-g8ed1b From 9204a4f876b229505f4ab947124d1160b80bbb4c Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 13 Apr 2020 20:26:07 +0200 Subject: batman-adv: trace: Drop unneeded types.h include The commit 04ae87a52074 ("ftrace: Rework event_create_dir()") restructured various macros in the ftrace framework. These changes also had the nice side effect that the linux/types.h include is no longer necessary to define some of the types used by these macros. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/trace.h | 1 - 1 file changed, 1 deletion(-) diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index f631b1e01b89..a87547570b4e 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -15,7 +15,6 @@ #include #include #include -#include #undef TRACE_SYSTEM #define TRACE_SYSTEM batadv -- cgit v1.2.3-59-g8ed1b From 26893e7e928e1790852b072edb9bff3c1309e111 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 13 Apr 2020 21:23:29 +0200 Subject: batman-adv: Utilize prandom_u32_max for random [0, max) values The kernel provides a function to create random values from 0 - (max-1) since commit f337db64af05 ("random32: add prandom_u32_max and convert open coded users"). Simply use this function to replace code sections which use prandom_u32 and a handcrafted method to map it to the correct range. Signed-off-by: Sven Eckelmann --- net/batman-adv/bat_iv_ogm.c | 4 ++-- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_ogm.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a7c8dd7ae513..e87f19c82e8d 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -280,7 +280,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } @@ -288,7 +288,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { - return jiffies + msecs_to_jiffies(prandom_u32() % (BATADV_JITTER / 2)); + return jiffies + msecs_to_jiffies(prandom_u32_max(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1e3172db7492..353e49c40e7f 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -49,7 +49,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs; msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq, msecs_to_jiffies(msecs)); diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 969466218999..0959d32be65c 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -88,7 +88,7 @@ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ - msecs += prandom_u32() % (msecs / 5) - (msecs / 10); + msecs += prandom_u32_max(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } @@ -107,7 +107,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } -- cgit v1.2.3-59-g8ed1b From 557e171434eb9bb43dbe71361775ae21ae95d4ed Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Thu, 16 Apr 2020 14:50:56 +0300 Subject: ath10k: rename ath10k_hif_swap_mailbox() to ath10k_hif_start_post() Convert ath10k_hif_swap_mailbox() to a more generic op so that bus drivers can do more than just swap the mailbox, for example set power save settings like in the following sdio patch. No functional changes, compile tested only. Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1587037859-28873-2-git-send-email-kvalo@codeaurora.org --- drivers/net/wireless/ath/ath10k/core.c | 2 +- drivers/net/wireless/ath/ath10k/hif.h | 8 ++++---- drivers/net/wireless/ath/ath10k/sdio.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 52472bbcee1f..5926281c7e05 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -2714,7 +2714,7 @@ int ath10k_core_start(struct ath10k *ar, enum ath10k_firmware_mode mode, goto err_hif_stop; } - status = ath10k_hif_swap_mailbox(ar); + status = ath10k_hif_start_post(ar); if (status) { ath10k_err(ar, "failed to swap mailbox: %d\n", status); goto err_hif_stop; diff --git a/drivers/net/wireless/ath/ath10k/hif.h b/drivers/net/wireless/ath/ath10k/hif.h index 0dd8973d0acf..2c5d61d98337 100644 --- a/drivers/net/wireless/ath/ath10k/hif.h +++ b/drivers/net/wireless/ath/ath10k/hif.h @@ -54,7 +54,7 @@ struct ath10k_hif_ops { */ void (*stop)(struct ath10k *ar); - int (*swap_mailbox)(struct ath10k *ar); + int (*start_post)(struct ath10k *ar); int (*get_htt_tx_complete)(struct ath10k *ar); @@ -139,10 +139,10 @@ static inline void ath10k_hif_stop(struct ath10k *ar) return ar->hif.ops->stop(ar); } -static inline int ath10k_hif_swap_mailbox(struct ath10k *ar) +static inline int ath10k_hif_start_post(struct ath10k *ar) { - if (ar->hif.ops->swap_mailbox) - return ar->hif.ops->swap_mailbox(ar); + if (ar->hif.ops->start_post) + return ar->hif.ops->start_post(ar); return 0; } diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c index 943db9f401d8..184b3545324e 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.c +++ b/drivers/net/wireless/ath/ath10k/sdio.c @@ -1725,7 +1725,7 @@ static int ath10k_sdio_hif_diag_write_mem(struct ath10k *ar, u32 address, return 0; } -static int ath10k_sdio_hif_swap_mailbox(struct ath10k *ar) +static int ath10k_sdio_hif_start_post(struct ath10k *ar) { struct ath10k_sdio *ar_sdio = ath10k_sdio_priv(ar); u32 addr, val; @@ -2047,7 +2047,7 @@ static const struct ath10k_hif_ops ath10k_sdio_hif_ops = { .exchange_bmi_msg = ath10k_sdio_bmi_exchange_msg, .start = ath10k_sdio_hif_start, .stop = ath10k_sdio_hif_stop, - .swap_mailbox = ath10k_sdio_hif_swap_mailbox, + .start_post = ath10k_sdio_hif_start_post, .get_htt_tx_complete = ath10k_sdio_get_htt_tx_complete, .map_service_to_pipe = ath10k_sdio_hif_map_service_to_pipe, .get_default_pipe = ath10k_sdio_hif_get_default_pipe, -- cgit v1.2.3-59-g8ed1b From 22f28076b6c3f86107424b3b1ddfd90f2628f354 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Thu, 16 Apr 2020 14:50:57 +0300 Subject: ath10k: improve power save performance for sdio This patch is to set register to allow the mbox enter sleep status if it does not have tx traffic and wakeup it if tx traffic arrive. After mbox enter sleep status, the soc will enter sleep status by firmware, this will save power. The power consume drops from about 90mW to about 10mW with this patch. This patch only effect sdio chip. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00029. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1587037859-28873-3-git-send-email-kvalo@codeaurora.org --- drivers/net/wireless/ath/ath10k/sdio.c | 130 ++++++++++++++++++++++++++------- drivers/net/wireless/ath/ath10k/sdio.h | 16 ++++ 2 files changed, 119 insertions(+), 27 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c index 184b3545324e..1626976293c7 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.c +++ b/drivers/net/wireless/ath/ath10k/sdio.c @@ -1361,23 +1361,117 @@ static void ath10k_rx_indication_async_work(struct work_struct *work) napi_schedule(&ar->napi); } +static int ath10k_sdio_read_rtc_state(struct ath10k_sdio *ar_sdio, unsigned char *state) +{ + struct ath10k *ar = ar_sdio->ar; + unsigned char rtc_state = 0; + int ret = 0; + + rtc_state = sdio_f0_readb(ar_sdio->func, ATH10K_CIS_RTC_STATE_ADDR, &ret); + if (ret) { + ath10k_warn(ar, "failed to read rtc state: %d\n", ret); + return ret; + } + + *state = rtc_state & 0x3; + + return ret; +} + +static int ath10k_sdio_hif_set_mbox_sleep(struct ath10k *ar, bool enable_sleep) +{ + struct ath10k_sdio *ar_sdio = ath10k_sdio_priv(ar); + u32 val; + int retry = ATH10K_CIS_READ_RETRY, ret = 0; + unsigned char rtc_state = 0; + + sdio_claim_host(ar_sdio->func); + + ret = ath10k_sdio_read32(ar, ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL, &val); + if (ret) { + ath10k_warn(ar, "failed to read fifo/chip control register: %d\n", + ret); + goto release; + } + + if (enable_sleep) { + val &= ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL_DISABLE_SLEEP_OFF; + ar_sdio->mbox_state = SDIO_MBOX_SLEEP_STATE; + } else { + val |= ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL_DISABLE_SLEEP_ON; + ar_sdio->mbox_state = SDIO_MBOX_AWAKE_STATE; + } + + ret = ath10k_sdio_write32(ar, ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL, val); + if (ret) { + ath10k_warn(ar, "failed to write to FIFO_TIMEOUT_AND_CHIP_CONTROL: %d", + ret); + } + + if (!enable_sleep) { + do { + udelay(ATH10K_CIS_READ_WAIT_4_RTC_CYCLE_IN_US); + ret = ath10k_sdio_read_rtc_state(ar_sdio, &rtc_state); + + if (ret) { + ath10k_warn(ar, "failed to disable mbox sleep: %d", ret); + break; + } + + ath10k_dbg(ar, ATH10K_DBG_SDIO, "sdio read rtc state: %d\n", + rtc_state); + + if (rtc_state == ATH10K_CIS_RTC_STATE_ON) + break; + + udelay(ATH10K_CIS_XTAL_SETTLE_DURATION_IN_US); + retry--; + } while (retry > 0); + } + +release: + sdio_release_host(ar_sdio->func); + + return ret; +} + +static void ath10k_sdio_sleep_timer_handler(struct timer_list *t) +{ + struct ath10k_sdio *ar_sdio = from_timer(ar_sdio, t, sleep_timer); + + ar_sdio->mbox_state = SDIO_MBOX_REQUEST_TO_SLEEP_STATE; + queue_work(ar_sdio->workqueue, &ar_sdio->wr_async_work); +} + static void ath10k_sdio_write_async_work(struct work_struct *work) { struct ath10k_sdio *ar_sdio = container_of(work, struct ath10k_sdio, wr_async_work); struct ath10k *ar = ar_sdio->ar; struct ath10k_sdio_bus_request *req, *tmp_req; + struct ath10k_mbox_info *mbox_info = &ar_sdio->mbox_info; spin_lock_bh(&ar_sdio->wr_async_lock); list_for_each_entry_safe(req, tmp_req, &ar_sdio->wr_asyncq, list) { list_del(&req->list); spin_unlock_bh(&ar_sdio->wr_async_lock); + + if (req->address >= mbox_info->htc_addr && + ar_sdio->mbox_state == SDIO_MBOX_SLEEP_STATE) { + ath10k_sdio_hif_set_mbox_sleep(ar, false); + mod_timer(&ar_sdio->sleep_timer, jiffies + + msecs_to_jiffies(ATH10K_MIN_SLEEP_INACTIVITY_TIME_MS)); + } + __ath10k_sdio_write_async(ar, req); spin_lock_bh(&ar_sdio->wr_async_lock); } spin_unlock_bh(&ar_sdio->wr_async_lock); + + if (ar_sdio->mbox_state == SDIO_MBOX_REQUEST_TO_SLEEP_STATE) + ath10k_sdio_hif_set_mbox_sleep(ar, true); } static int ath10k_sdio_prep_async_req(struct ath10k *ar, u32 addr, @@ -1517,6 +1611,9 @@ static void ath10k_sdio_hif_power_down(struct ath10k *ar) ath10k_dbg(ar, ATH10K_DBG_BOOT, "sdio power off\n"); + del_timer_sync(&ar_sdio->sleep_timer); + ath10k_sdio_hif_set_mbox_sleep(ar, true); + /* Disable the card */ sdio_claim_host(ar_sdio->func); @@ -1617,33 +1714,6 @@ static int ath10k_sdio_hif_enable_intrs(struct ath10k *ar) return ret; } -static int ath10k_sdio_hif_set_mbox_sleep(struct ath10k *ar, bool enable_sleep) -{ - u32 val; - int ret; - - ret = ath10k_sdio_read32(ar, ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL, &val); - if (ret) { - ath10k_warn(ar, "failed to read fifo/chip control register: %d\n", - ret); - return ret; - } - - if (enable_sleep) - val &= ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL_DISABLE_SLEEP_OFF; - else - val |= ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL_DISABLE_SLEEP_ON; - - ret = ath10k_sdio_write32(ar, ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL, val); - if (ret) { - ath10k_warn(ar, "failed to write to FIFO_TIMEOUT_AND_CHIP_CONTROL: %d", - ret); - return ret; - } - - return 0; -} - /* HIF diagnostics */ static int ath10k_sdio_hif_diag_read(struct ath10k *ar, u32 address, void *buf, @@ -1749,6 +1819,8 @@ static int ath10k_sdio_hif_start_post(struct ath10k *ar) ar_sdio->swap_mbox = false; } + ath10k_sdio_hif_set_mbox_sleep(ar, true); + return 0; } @@ -2076,6 +2148,8 @@ static int ath10k_sdio_pm_suspend(struct device *device) if (!device_may_wakeup(ar->dev)) return 0; + ath10k_sdio_hif_set_mbox_sleep(ar, true); + pm_flag = MMC_PM_KEEP_POWER; ret = sdio_set_host_pm_flags(func, pm_flag); @@ -2239,6 +2313,8 @@ static int ath10k_sdio_probe(struct sdio_func *func, goto err_free_wq; } + timer_setup(&ar_sdio->sleep_timer, ath10k_sdio_sleep_timer_handler, 0); + return 0; err_free_wq: diff --git a/drivers/net/wireless/ath/ath10k/sdio.h b/drivers/net/wireless/ath/ath10k/sdio.h index 1c987494ad22..29523600887d 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.h +++ b/drivers/net/wireless/ath/ath10k/sdio.h @@ -98,6 +98,20 @@ #define ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL_DISABLE_SLEEP_OFF 0xFFFEFFFF #define ATH10K_FIFO_TIMEOUT_AND_CHIP_CONTROL_DISABLE_SLEEP_ON 0x10000 +enum sdio_mbox_state { + SDIO_MBOX_UNKNOWN_STATE = 0, + SDIO_MBOX_REQUEST_TO_SLEEP_STATE = 1, + SDIO_MBOX_SLEEP_STATE = 2, + SDIO_MBOX_AWAKE_STATE = 3, +}; + +#define ATH10K_CIS_READ_WAIT_4_RTC_CYCLE_IN_US 125 +#define ATH10K_CIS_RTC_STATE_ADDR 0x1138 +#define ATH10K_CIS_RTC_STATE_ON 0x01 +#define ATH10K_CIS_XTAL_SETTLE_DURATION_IN_US 1500 +#define ATH10K_CIS_READ_RETRY 10 +#define ATH10K_MIN_SLEEP_INACTIVITY_TIME_MS 50 + /* TODO: remove this and use skb->cb instead, much cleaner approach */ struct ath10k_sdio_bus_request { struct list_head list; @@ -218,6 +232,8 @@ struct ath10k_sdio { spinlock_t wr_async_lock; struct work_struct async_work_rx; + struct timer_list sleep_timer; + enum sdio_mbox_state mbox_state; }; static inline struct ath10k_sdio *ath10k_sdio_priv(struct ath10k *ar) -- cgit v1.2.3-59-g8ed1b From 58921763210315fe96f590d9edb4f3952f8526ce Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Thu, 16 Apr 2020 14:50:58 +0300 Subject: ath10k: sdio: remove _hif_ prefix from functions not part of hif interface The _hif_ prefix should be used only on functions part of ath10k_hif_ops, so remove it from functions which should not have it. No functional changes, compile tested only. Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1587037859-28873-4-git-send-email-kvalo@codeaurora.org --- drivers/net/wireless/ath/ath10k/sdio.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c index 1626976293c7..884e1a85e29f 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.c +++ b/drivers/net/wireless/ath/ath10k/sdio.c @@ -1378,7 +1378,7 @@ static int ath10k_sdio_read_rtc_state(struct ath10k_sdio *ar_sdio, unsigned char return ret; } -static int ath10k_sdio_hif_set_mbox_sleep(struct ath10k *ar, bool enable_sleep) +static int ath10k_sdio_set_mbox_sleep(struct ath10k *ar, bool enable_sleep) { struct ath10k_sdio *ar_sdio = ath10k_sdio_priv(ar); u32 val; @@ -1459,7 +1459,7 @@ static void ath10k_sdio_write_async_work(struct work_struct *work) if (req->address >= mbox_info->htc_addr && ar_sdio->mbox_state == SDIO_MBOX_SLEEP_STATE) { - ath10k_sdio_hif_set_mbox_sleep(ar, false); + ath10k_sdio_set_mbox_sleep(ar, false); mod_timer(&ar_sdio->sleep_timer, jiffies + msecs_to_jiffies(ATH10K_MIN_SLEEP_INACTIVITY_TIME_MS)); } @@ -1471,7 +1471,7 @@ static void ath10k_sdio_write_async_work(struct work_struct *work) spin_unlock_bh(&ar_sdio->wr_async_lock); if (ar_sdio->mbox_state == SDIO_MBOX_REQUEST_TO_SLEEP_STATE) - ath10k_sdio_hif_set_mbox_sleep(ar, true); + ath10k_sdio_set_mbox_sleep(ar, true); } static int ath10k_sdio_prep_async_req(struct ath10k *ar, u32 addr, @@ -1538,7 +1538,7 @@ static void ath10k_sdio_irq_handler(struct sdio_func *func) /* sdio HIF functions */ -static int ath10k_sdio_hif_disable_intrs(struct ath10k *ar) +static int ath10k_sdio_disable_intrs(struct ath10k *ar) { struct ath10k_sdio *ar_sdio = ath10k_sdio_priv(ar); struct ath10k_sdio_irq_data *irq_data = &ar_sdio->irq_data; @@ -1594,7 +1594,7 @@ static int ath10k_sdio_hif_power_up(struct ath10k *ar, ar_sdio->is_disabled = false; - ret = ath10k_sdio_hif_disable_intrs(ar); + ret = ath10k_sdio_disable_intrs(ar); if (ret) return ret; @@ -1612,7 +1612,7 @@ static void ath10k_sdio_hif_power_down(struct ath10k *ar) ath10k_dbg(ar, ATH10K_DBG_BOOT, "sdio power off\n"); del_timer_sync(&ar_sdio->sleep_timer); - ath10k_sdio_hif_set_mbox_sleep(ar, true); + ath10k_sdio_set_mbox_sleep(ar, true); /* Disable the card */ sdio_claim_host(ar_sdio->func); @@ -1666,7 +1666,7 @@ static int ath10k_sdio_hif_tx_sg(struct ath10k *ar, u8 pipe_id, return 0; } -static int ath10k_sdio_hif_enable_intrs(struct ath10k *ar) +static int ath10k_sdio_enable_intrs(struct ath10k *ar) { struct ath10k_sdio *ar_sdio = ath10k_sdio_priv(ar); struct ath10k_sdio_irq_data *irq_data = &ar_sdio->irq_data; @@ -1749,8 +1749,8 @@ out: return ret; } -static int ath10k_sdio_hif_diag_read32(struct ath10k *ar, u32 address, - u32 *value) +static int ath10k_sdio_diag_read32(struct ath10k *ar, u32 address, + u32 *value) { __le32 *val; int ret; @@ -1803,7 +1803,7 @@ static int ath10k_sdio_hif_start_post(struct ath10k *ar) addr = host_interest_item_address(HI_ITEM(hi_acs_flags)); - ret = ath10k_sdio_hif_diag_read32(ar, addr, &val); + ret = ath10k_sdio_diag_read32(ar, addr, &val); if (ret) { ath10k_warn(ar, "unable to read hi_acs_flags : %d\n", ret); return ret; @@ -1819,7 +1819,7 @@ static int ath10k_sdio_hif_start_post(struct ath10k *ar) ar_sdio->swap_mbox = false; } - ath10k_sdio_hif_set_mbox_sleep(ar, true); + ath10k_sdio_set_mbox_sleep(ar, true); return 0; } @@ -1831,7 +1831,7 @@ static int ath10k_sdio_get_htt_tx_complete(struct ath10k *ar) addr = host_interest_item_address(HI_ITEM(hi_acs_flags)); - ret = ath10k_sdio_hif_diag_read32(ar, addr, &val); + ret = ath10k_sdio_diag_read32(ar, addr, &val); if (ret) { ath10k_warn(ar, "unable to read hi_acs_flags for htt tx comple : %d\n", ret); @@ -1860,7 +1860,7 @@ static int ath10k_sdio_hif_start(struct ath10k *ar) * request before interrupts are disabled. */ msleep(20); - ret = ath10k_sdio_hif_disable_intrs(ar); + ret = ath10k_sdio_disable_intrs(ar); if (ret) return ret; @@ -1882,19 +1882,19 @@ static int ath10k_sdio_hif_start(struct ath10k *ar) sdio_release_host(ar_sdio->func); - ret = ath10k_sdio_hif_enable_intrs(ar); + ret = ath10k_sdio_enable_intrs(ar); if (ret) ath10k_warn(ar, "failed to enable sdio interrupts: %d\n", ret); /* Enable sleep and then disable it again */ - ret = ath10k_sdio_hif_set_mbox_sleep(ar, true); + ret = ath10k_sdio_set_mbox_sleep(ar, true); if (ret) return ret; /* Wait for 20ms for the written value to take effect */ msleep(20); - ret = ath10k_sdio_hif_set_mbox_sleep(ar, false); + ret = ath10k_sdio_set_mbox_sleep(ar, false); if (ret) return ret; @@ -2148,7 +2148,7 @@ static int ath10k_sdio_pm_suspend(struct device *device) if (!device_may_wakeup(ar->dev)) return 0; - ath10k_sdio_hif_set_mbox_sleep(ar, true); + ath10k_sdio_set_mbox_sleep(ar, true); pm_flag = MMC_PM_KEEP_POWER; -- cgit v1.2.3-59-g8ed1b From 96c64857983fdc623fa5899afdb0310bef196f68 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Thu, 16 Apr 2020 14:50:59 +0300 Subject: ath10k: hif: make send_complete_check op optional That way we don't need to have an empty function in sdio.c. No functional changes, compile tested only. Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1587037859-28873-5-git-send-email-kvalo@codeaurora.org --- drivers/net/wireless/ath/ath10k/hif.h | 3 ++- drivers/net/wireless/ath/ath10k/sdio.c | 12 ------------ drivers/net/wireless/ath/ath10k/usb.c | 12 ------------ 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/hif.h b/drivers/net/wireless/ath/ath10k/hif.h index 2c5d61d98337..9e45fd9073a6 100644 --- a/drivers/net/wireless/ath/ath10k/hif.h +++ b/drivers/net/wireless/ath/ath10k/hif.h @@ -170,7 +170,8 @@ static inline void ath10k_hif_get_default_pipe(struct ath10k *ar, static inline void ath10k_hif_send_complete_check(struct ath10k *ar, u8 pipe_id, int force) { - ar->hif.ops->send_complete_check(ar, pipe_id, force); + if (ar->hif.ops->send_complete_check) + ar->hif.ops->send_complete_check(ar, pipe_id, force); } static inline u16 ath10k_hif_get_free_queue_number(struct ath10k *ar, diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c index 884e1a85e29f..e2aff2254a40 100644 --- a/drivers/net/wireless/ath/ath10k/sdio.c +++ b/drivers/net/wireless/ath/ath10k/sdio.c @@ -2101,17 +2101,6 @@ static void ath10k_sdio_hif_get_default_pipe(struct ath10k *ar, *dl_pipe = 0; } -/* This op is currently only used by htc_wait_target if the HTC ready - * message times out. It is not applicable for SDIO since there is nothing - * we can do if the HTC ready message does not arrive in time. - * TODO: Make this op non mandatory by introducing a NULL check in the - * hif op wrapper. - */ -static void ath10k_sdio_hif_send_complete_check(struct ath10k *ar, - u8 pipe, int force) -{ -} - static const struct ath10k_hif_ops ath10k_sdio_hif_ops = { .tx_sg = ath10k_sdio_hif_tx_sg, .diag_read = ath10k_sdio_hif_diag_read, @@ -2123,7 +2112,6 @@ static const struct ath10k_hif_ops ath10k_sdio_hif_ops = { .get_htt_tx_complete = ath10k_sdio_get_htt_tx_complete, .map_service_to_pipe = ath10k_sdio_hif_map_service_to_pipe, .get_default_pipe = ath10k_sdio_hif_get_default_pipe, - .send_complete_check = ath10k_sdio_hif_send_complete_check, .power_up = ath10k_sdio_hif_power_up, .power_down = ath10k_sdio_hif_power_down, #ifdef CONFIG_PM diff --git a/drivers/net/wireless/ath/ath10k/usb.c b/drivers/net/wireless/ath/ath10k/usb.c index 1e0343081be9..b7daf344d012 100644 --- a/drivers/net/wireless/ath/ath10k/usb.c +++ b/drivers/net/wireless/ath/ath10k/usb.c @@ -693,17 +693,6 @@ static int ath10k_usb_hif_map_service_to_pipe(struct ath10k *ar, u16 svc_id, return 0; } -/* This op is currently only used by htc_wait_target if the HTC ready - * message times out. It is not applicable for USB since there is nothing - * we can do if the HTC ready message does not arrive in time. - * TODO: Make this op non mandatory by introducing a NULL check in the - * hif op wrapper. - */ -static void ath10k_usb_hif_send_complete_check(struct ath10k *ar, - u8 pipe, int force) -{ -} - static int ath10k_usb_hif_power_up(struct ath10k *ar, enum ath10k_firmware_mode fw_mode) { @@ -737,7 +726,6 @@ static const struct ath10k_hif_ops ath10k_usb_hif_ops = { .stop = ath10k_usb_hif_stop, .map_service_to_pipe = ath10k_usb_hif_map_service_to_pipe, .get_default_pipe = ath10k_usb_hif_get_default_pipe, - .send_complete_check = ath10k_usb_hif_send_complete_check, .get_free_queue_number = ath10k_usb_hif_get_free_queue_number, .power_up = ath10k_usb_hif_power_up, .power_down = ath10k_usb_hif_power_down, -- cgit v1.2.3-59-g8ed1b From bec095ab477dcc11fbe448c6fae6c2c61a876f37 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Sat, 18 Apr 2020 15:02:32 +0800 Subject: rtlwifi: rtl8188ee: use true,false for bool variables Fix the following coccicheck warning: drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c:70:1-34: WARNING: Assignment of 0/1 to bool variable drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c:72:1-34: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200418070236.9620-2-yanaijie@huawei.com --- drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c index 4865639ac9ea..02b77521b5cd 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c @@ -67,9 +67,9 @@ static int rtl88e_init_sw_vars(struct ieee80211_hw *hw) char *fw_name; rtl8188ee_bt_reg_init(hw); - rtlpriv->dm.dm_initialgain_enable = 1; + rtlpriv->dm.dm_initialgain_enable = true; rtlpriv->dm.dm_flag = 0; - rtlpriv->dm.disable_framebursting = 0; + rtlpriv->dm.disable_framebursting = false; rtlpriv->dm.thermalvalue = 0; rtlpci->transmit_config = CFENDFORM | BIT(15); -- cgit v1.2.3-59-g8ed1b From 23c2ddb574c621cc2c5d9be0bb99a59f18f5863c Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Sat, 18 Apr 2020 15:02:33 +0800 Subject: rtlwifi: rtl8723ae: use true,false for bool variables Fix the following coccicheck warning: drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c:81:1-34: WARNING: Assignment of 0/1 to bool variable drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c:83:1-34: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200418070236.9620-3-yanaijie@huawei.com --- drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c index ea86d5bf33d2..7828acb1de3f 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c @@ -78,9 +78,9 @@ static int rtl8723e_init_sw_vars(struct ieee80211_hw *hw) rtlpriv->btcoexist.btc_ops = rtl_btc_get_ops_pointer(); - rtlpriv->dm.dm_initialgain_enable = 1; + rtlpriv->dm.dm_initialgain_enable = true; rtlpriv->dm.dm_flag = 0; - rtlpriv->dm.disable_framebursting = 0; + rtlpriv->dm.disable_framebursting = false; rtlpriv->dm.thermalvalue = 0; rtlpci->transmit_config = CFENDFORM | BIT(12) | BIT(13); -- cgit v1.2.3-59-g8ed1b From c13a83b01010c94ad7fe68161fd4dae3767d3ffe Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Sat, 18 Apr 2020 15:02:34 +0800 Subject: rtlwifi: rtl8192ee: use true,false for bool variables Fix the following coccicheck warning: drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c:78:1-34: WARNING: Assignment of 0/1 to bool variable drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c:80:1-34: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200418070236.9620-4-yanaijie@huawei.com --- drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c index b337d599b6f4..7a16563b3a5d 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c @@ -75,9 +75,9 @@ static int rtl92ee_init_sw_vars(struct ieee80211_hw *hw) rtlpci->msi_support = rtlpriv->cfg->mod_params->msi_support; rtlpriv->btcoexist.btc_ops = rtl_btc_get_ops_pointer(); - rtlpriv->dm.dm_initialgain_enable = 1; + rtlpriv->dm.dm_initialgain_enable = true; rtlpriv->dm.dm_flag = 0; - rtlpriv->dm.disable_framebursting = 0; + rtlpriv->dm.disable_framebursting = false; rtlpci->transmit_config = CFENDFORM | BIT(15); /*just 2.4G band*/ -- cgit v1.2.3-59-g8ed1b From 47361089d987c367bedd778eba66843601d347df Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Sat, 18 Apr 2020 15:02:35 +0800 Subject: rtlwifi: rtl8723be: use true,false for bool variables Fix the following coccicheck warning: drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c:77:1-34: WARNING: Assignment of 0/1 to bool variable drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c:79:1-34: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200418070236.9620-5-yanaijie@huawei.com --- drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c index 36209ac5b208..d220e8955e37 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c @@ -74,9 +74,9 @@ static int rtl8723be_init_sw_vars(struct ieee80211_hw *hw) rtl8723be_bt_reg_init(hw); rtlpriv->btcoexist.btc_ops = rtl_btc_get_ops_pointer(); - rtlpriv->dm.dm_initialgain_enable = 1; + rtlpriv->dm.dm_initialgain_enable = true; rtlpriv->dm.dm_flag = 0; - rtlpriv->dm.disable_framebursting = 0; + rtlpriv->dm.disable_framebursting = false; rtlpriv->dm.thermalvalue = 0; rtlpci->transmit_config = CFENDFORM | BIT(15) | BIT(24) | BIT(25); -- cgit v1.2.3-59-g8ed1b From e8277abd453d6824a92b42989a248969f4fbc988 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Sat, 18 Apr 2020 15:02:36 +0800 Subject: rtlwifi: rtl8821ae: use true,false for bool variables Fix the following coccicheck warning: drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c:79:1-34: WARNING: Assignment of 0/1 to bool variable drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c:81:1-34: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200418070236.9620-6-yanaijie@huawei.com --- drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c index d8df816753cb..950542a24e31 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c @@ -76,9 +76,9 @@ static int rtl8821ae_init_sw_vars(struct ieee80211_hw *hw) rtl8821ae_bt_reg_init(hw); rtlpriv->btcoexist.btc_ops = rtl_btc_get_ops_pointer(); - rtlpriv->dm.dm_initialgain_enable = 1; + rtlpriv->dm.dm_initialgain_enable = true; rtlpriv->dm.dm_flag = 0; - rtlpriv->dm.disable_framebursting = 0; + rtlpriv->dm.disable_framebursting = false; rtlpriv->dm.thermalvalue = 0; rtlpci->transmit_config = CFENDFORM | BIT(15) | BIT(24) | BIT(25); -- cgit v1.2.3-59-g8ed1b From 887e74239805217c9c583584a382e66583e0556b Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 20 Apr 2020 12:26:58 +0800 Subject: rtlwifi: rtl8723ae: fix warning comparison to bool Fix the following coccicheck warning: drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c:617:14-20: WARNING: Comparison to bool drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c:622:13-19: WARNING: Comparison to bool drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c:627:14-20: WARNING: Comparison to bool drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c:632:13-19: WARNING: Comparison to bool drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c:937:5-13: WARNING: Comparison to bool Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420042658.18733-1-yanaijie@huawei.com --- drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c index 655460f61bbc..7a46c6a9deae 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c @@ -614,22 +614,22 @@ static bool _rtl8723e_llt_table_init(struct ieee80211_hw *hw) for (i = 0; i < (txpktbuf_bndy - 1); i++) { status = _rtl8723e_llt_write(hw, i, i + 1); - if (true != status) + if (!status) return status; } status = _rtl8723e_llt_write(hw, (txpktbuf_bndy - 1), 0xFF); - if (true != status) + if (!status) return status; for (i = txpktbuf_bndy; i < maxpage; i++) { status = _rtl8723e_llt_write(hw, i, (i + 1)); - if (true != status) + if (!status) return status; } status = _rtl8723e_llt_write(hw, maxpage, txpktbuf_bndy); - if (true != status) + if (!status) return status; rtl_write_byte(rtlpriv, REG_CR, 0xff); @@ -934,7 +934,7 @@ int rtl8723e_hw_init(struct ieee80211_hw *hw) rtlpriv->intf_ops->disable_aspm(hw); rtstatus = _rtl8712e_init_mac(hw); - if (rtstatus != true) { + if (!rtstatus) { pr_err("Init MAC failed\n"); err = 1; goto exit; -- cgit v1.2.3-59-g8ed1b From 811853da541a6a9be335c1f9dc9f20ca8bde65ed Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 20 Apr 2020 13:50:47 +0800 Subject: rtw88: 8723d: Add basic chip capabilities RTL8723DE is an 11n 1x1 2.4G single band chip with the following capabilities: - TX/RX BD size: 16/8 - TX/RX desc size: 40/24 - physical/logical/protected efuse size: 512/512/96 - TX gain index factor: 1 - max TX power index: 0x3F - band: 2G - HT: support - VHT: Not support Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420055054.14592-2-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/main.h | 2 + drivers/net/wireless/realtek/rtw88/pci.c | 3 ++ drivers/net/wireless/realtek/rtw88/rtw8723d.c | 44 ++++++++++++++++++++++ drivers/net/wireless/realtek/rtw88/rtw8723d.h | 8 ++++ .../net/wireless/realtek/rtw88/rtw8723d_table.c | 7 ++++ .../net/wireless/realtek/rtw88/rtw8723d_table.h | 8 ++++ 6 files changed, 72 insertions(+) create mode 100644 drivers/net/wireless/realtek/rtw88/rtw8723d.c create mode 100644 drivers/net/wireless/realtek/rtw88/rtw8723d.h create mode 100644 drivers/net/wireless/realtek/rtw88/rtw8723d_table.c create mode 100644 drivers/net/wireless/realtek/rtw88/rtw8723d_table.h diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index c9edcabd7c42..be74533320ad 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -41,6 +41,7 @@ extern unsigned int rtw_debug_mask; extern const struct ieee80211_ops rtw_ops; extern struct rtw_chip_info rtw8822b_hw_spec; extern struct rtw_chip_info rtw8822c_hw_spec; +extern struct rtw_chip_info rtw8723d_hw_spec; #define RTW_MAX_CHANNEL_NUM_2G 14 #define RTW_MAX_CHANNEL_NUM_5G 49 @@ -183,6 +184,7 @@ enum rtw_wireless_set { enum rtw_chip_type { RTW_CHIP_TYPE_8822B, RTW_CHIP_TYPE_8822C, + RTW_CHIP_TYPE_8723D, }; enum rtw_tx_queue_type { diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c index e37c71495c0d..b3e76b579af9 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.c +++ b/drivers/net/wireless/realtek/rtw88/pci.c @@ -1572,6 +1572,9 @@ static const struct pci_device_id rtw_pci_id_table[] = { #endif #ifdef CONFIG_RTW88_8822CE { RTK_PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0xC822, rtw8822c_hw_spec) }, +#endif +#ifdef CONFIG_RTW88_8723DE + { RTK_PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0xD723, rtw8723d_hw_spec) }, #endif {}, }; diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c new file mode 100644 index 000000000000..cccf05ee6807 --- /dev/null +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright(c) 2018-2019 Realtek Corporation + */ + +#include "main.h" +#include "coex.h" +#include "fw.h" +#include "tx.h" +#include "rx.h" +#include "phy.h" +#include "rtw8723d.h" +#include "rtw8723d_table.h" +#include "mac.h" +#include "reg.h" +#include "debug.h" + +static struct rtw_chip_ops rtw8723d_ops = { + .set_antenna = NULL, +}; + +struct rtw_chip_info rtw8723d_hw_spec = { + .ops = &rtw8723d_ops, + .id = RTW_CHIP_TYPE_8723D, + .fw_name = "rtw88/rtw8723d_fw.bin", + .tx_pkt_desc_sz = 40, + .tx_buf_desc_sz = 16, + .rx_pkt_desc_sz = 24, + .rx_buf_desc_sz = 8, + .phy_efuse_size = 512, + .log_efuse_size = 512, + .ptct_efuse_size = 96 + 1, + .txgi_factor = 1, + .is_pwr_by_rate_dec = true, + .max_power_index = 0x3f, + .csi_buf_pg_num = 0, + .band = RTW_BAND_2G, + .ht_supported = true, + .vht_supported = false, + .lps_deep_mode_supported = 0, + .sys_func_en = 0xFD, +}; +EXPORT_SYMBOL(rtw8723d_hw_spec); + +MODULE_FIRMWARE("rtw88/rtw8723d_fw.bin"); diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.h b/drivers/net/wireless/realtek/rtw88/rtw8723d.h new file mode 100644 index 000000000000..0b784cfc34c6 --- /dev/null +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright(c) 2018-2019 Realtek Corporation + */ + +#ifndef __RTW8723D_H__ +#define __RTW8723D_H__ + +#endif diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d_table.c b/drivers/net/wireless/realtek/rtw88/rtw8723d_table.c new file mode 100644 index 000000000000..b22b4b0f2fcf --- /dev/null +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d_table.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright(c) 2018-2019 Realtek Corporation + */ + +#include "main.h" +#include "phy.h" +#include "rtw8723d_table.h" diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d_table.h b/drivers/net/wireless/realtek/rtw88/rtw8723d_table.h new file mode 100644 index 000000000000..ea5933ffd043 --- /dev/null +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d_table.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright(c) 2018-2019 Realtek Corporation + */ + +#ifndef __RTW8723D_TABLE_H__ +#define __RTW8723D_TABLE_H__ + +#endif -- cgit v1.2.3-59-g8ed1b From 93ae973fb47df112326e9a3657302f990934b327 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 20 Apr 2020 13:50:48 +0800 Subject: rtw88: 8723d: add beamform wrapper functions 8723D doesn't support beamform because rtw88 only supports VHT beamform but 8723d doesn't have VHT capability. Though 8723d doesn't support beamform, BSS_CHANGED_MU_GROUPS is still marked as changed when doing disassociation. So, add wrapper functions for all beamform ops to make sure they aren't NULL before calling. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420055054.14592-3-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/bf.c | 7 +++---- drivers/net/wireless/realtek/rtw88/bf.h | 22 ++++++++++++++++++++++ drivers/net/wireless/realtek/rtw88/mac80211.c | 7 ++----- drivers/net/wireless/realtek/rtw88/main.c | 7 +++---- drivers/net/wireless/realtek/rtw88/rtw8723d.c | 3 +++ 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/bf.c b/drivers/net/wireless/realtek/rtw88/bf.c index b6d1d71f4d30..a5912da327e2 100644 --- a/drivers/net/wireless/realtek/rtw88/bf.c +++ b/drivers/net/wireless/realtek/rtw88/bf.c @@ -10,7 +10,6 @@ void rtw_bf_disassoc(struct rtw_dev *rtwdev, struct ieee80211_vif *vif, struct ieee80211_bss_conf *bss_conf) { - struct rtw_chip_info *chip = rtwdev->chip; struct rtw_vif *rtwvif = (struct rtw_vif *)vif->drv_priv; struct rtw_bfee *bfee = &rtwvif->bfee; struct rtw_bf_info *bfinfo = &rtwdev->bf_info; @@ -23,7 +22,7 @@ void rtw_bf_disassoc(struct rtw_dev *rtwdev, struct ieee80211_vif *vif, else if (bfee->role == RTW_BFEE_SU) bfinfo->bfer_su_cnt--; - chip->ops->config_bfee(rtwdev, rtwvif, bfee, false); + rtw_chip_config_bfee(rtwdev, rtwvif, bfee, false); bfee->role = RTW_BFEE_NONE; } @@ -71,7 +70,7 @@ void rtw_bf_assoc(struct rtw_dev *rtwdev, struct ieee80211_vif *vif, bfee->aid = bss_conf->aid; bfinfo->bfer_mu_cnt++; - chip->ops->config_bfee(rtwdev, rtwvif, bfee, true); + rtw_chip_config_bfee(rtwdev, rtwvif, bfee, true); } else if ((ic_vht_cap->cap & IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE) && (vht_cap->cap & IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE)) { if (bfinfo->bfer_su_cnt >= chip->bfer_su_max_num) { @@ -97,7 +96,7 @@ void rtw_bf_assoc(struct rtw_dev *rtwdev, struct ieee80211_vif *vif, } } - chip->ops->config_bfee(rtwdev, rtwvif, bfee, true); + rtw_chip_config_bfee(rtwdev, rtwvif, bfee, true); } out_unlock: diff --git a/drivers/net/wireless/realtek/rtw88/bf.h b/drivers/net/wireless/realtek/rtw88/bf.h index 96a8216dd11f..17855edb5006 100644 --- a/drivers/net/wireless/realtek/rtw88/bf.h +++ b/drivers/net/wireless/realtek/rtw88/bf.h @@ -89,4 +89,26 @@ void rtw_bf_set_gid_table(struct rtw_dev *rtwdev, struct ieee80211_vif *vif, void rtw_bf_phy_init(struct rtw_dev *rtwdev); void rtw_bf_cfg_csi_rate(struct rtw_dev *rtwdev, u8 rssi, u8 cur_rate, u8 fixrate_en, u8 *new_rate); +static inline void rtw_chip_config_bfee(struct rtw_dev *rtwdev, struct rtw_vif *vif, + struct rtw_bfee *bfee, bool enable) +{ + if (rtwdev->chip->ops->config_bfee) + rtwdev->chip->ops->config_bfee(rtwdev, vif, bfee, enable); +} + +static inline void rtw_chip_set_gid_table(struct rtw_dev *rtwdev, + struct ieee80211_vif *vif, + struct ieee80211_bss_conf *conf) +{ + if (rtwdev->chip->ops->set_gid_table) + rtwdev->chip->ops->set_gid_table(rtwdev, vif, conf); +} + +static inline void rtw_chip_cfg_csi_rate(struct rtw_dev *rtwdev, u8 rssi, u8 cur_rate, + u8 fixrate_en, u8 *new_rate) +{ + if (rtwdev->chip->ops->cfg_csi_rate) + rtwdev->chip->ops->cfg_csi_rate(rtwdev, rssi, cur_rate, + fixrate_en, new_rate); +} #endif diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c index a2e6ef4ad9ee..98d2ac22f6f6 100644 --- a/drivers/net/wireless/realtek/rtw88/mac80211.c +++ b/drivers/net/wireless/realtek/rtw88/mac80211.c @@ -375,11 +375,8 @@ static void rtw_ops_bss_info_changed(struct ieee80211_hw *hw, if (changed & BSS_CHANGED_BEACON) rtw_fw_download_rsvd_page(rtwdev); - if (changed & BSS_CHANGED_MU_GROUPS) { - struct rtw_chip_info *chip = rtwdev->chip; - - chip->ops->set_gid_table(rtwdev, vif, conf); - } + if (changed & BSS_CHANGED_MU_GROUPS) + rtw_chip_set_gid_table(rtwdev, vif, conf); if (changed & BSS_CHANGED_ERP_SLOT) rtw_conf_tx(rtwdev, rtwvif); diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index 1e1d2c774287..6dfe4895c352 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -137,7 +137,6 @@ struct rtw_watch_dog_iter_data { static void rtw_dynamic_csi_rate(struct rtw_dev *rtwdev, struct rtw_vif *rtwvif) { struct rtw_bf_info *bf_info = &rtwdev->bf_info; - struct rtw_chip_info *chip = rtwdev->chip; u8 fix_rate_enable = 0; u8 new_csi_rate_idx; @@ -145,9 +144,9 @@ static void rtw_dynamic_csi_rate(struct rtw_dev *rtwdev, struct rtw_vif *rtwvif) rtwvif->bfee.role != RTW_BFEE_MU) return; - chip->ops->cfg_csi_rate(rtwdev, rtwdev->dm_info.min_rssi, - bf_info->cur_csi_rpt_rate, - fix_rate_enable, &new_csi_rate_idx); + rtw_chip_cfg_csi_rate(rtwdev, rtwdev->dm_info.min_rssi, + bf_info->cur_csi_rpt_rate, + fix_rate_enable, &new_csi_rate_idx); if (new_csi_rate_idx != bf_info->cur_csi_rpt_rate) bf_info->cur_csi_rpt_rate = new_csi_rate_idx; diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index cccf05ee6807..5798a5804af3 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -16,6 +16,9 @@ static struct rtw_chip_ops rtw8723d_ops = { .set_antenna = NULL, + .config_bfee = NULL, + .set_gid_table = NULL, + .cfg_csi_rate = NULL, }; struct rtw_chip_info rtw8723d_hw_spec = { -- cgit v1.2.3-59-g8ed1b From c57bd7c3af9974ad432c46c0373a70d75a2d9e08 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 20 Apr 2020 13:50:49 +0800 Subject: rtw88: 8723d: Add power sequence Add corresponding power sequence for 8723D devices Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420055054.14592-4-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/main.h | 1 + drivers/net/wireless/realtek/rtw88/rtw8723d.c | 403 ++++++++++++++++++++++++++ 2 files changed, 404 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index be74533320ad..e852ab194315 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -847,6 +847,7 @@ struct rtw_chip_ops { #define RTW_PWR_INTF_PCI_MSK BIT(2) #define RTW_PWR_INTF_ALL_MSK (BIT(0) | BIT(1) | BIT(2) | BIT(3)) +#define RTW_PWR_CUT_TEST_MSK BIT(0) #define RTW_PWR_CUT_A_MSK BIT(1) #define RTW_PWR_CUT_B_MSK BIT(2) #define RTW_PWR_CUT_C_MSK BIT(3) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 5798a5804af3..5b97730f1407 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -21,6 +21,407 @@ static struct rtw_chip_ops rtw8723d_ops = { .cfg_csi_rate = NULL, }; +static const struct rtw_pwr_seq_cmd trans_carddis_to_cardemu_8723d[] = { + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(3) | BIT(7), 0}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_POLLING, BIT(1), BIT(1)}, + {0x004A, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_USB_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(3) | BIT(4), 0}, + {0x0023, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(4), 0}, + {0x0301, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_PCI_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0}, + {0xFFFF, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + 0, + RTW_PWR_CMD_END, 0, 0}, +}; + +static const struct rtw_pwr_seq_cmd trans_cardemu_to_act_8723d[] = { + {0x0020, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x0001, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_DELAY, 1, RTW_PWR_DELAY_MS}, + {0x0000, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(5), 0}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, (BIT(4) | BIT(3) | BIT(2)), 0}, + {0x0075, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_PCI_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x0006, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_POLLING, BIT(1), BIT(1)}, + {0x0075, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_PCI_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0006, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_POLLING, (BIT(1) | BIT(0)), 0}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(7), 0}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, (BIT(4) | BIT(3)), 0}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_POLLING, BIT(0), 0}, + {0x0010, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(6), BIT(6)}, + {0x0049, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(1), BIT(1)}, + {0x0063, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(1), BIT(1)}, + {0x0062, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(1), 0}, + {0x0058, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x005A, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(1), BIT(1)}, + {0x0068, + RTW_PWR_CUT_TEST_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(3), BIT(3)}, + {0x0069, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(6), BIT(6)}, + {0x001f, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x00}, + {0x0077, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x00}, + {0x001f, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x07}, + {0x0077, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x07}, + {0xFFFF, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + 0, + RTW_PWR_CMD_END, 0, 0}, +}; + +static const struct rtw_pwr_seq_cmd *card_enable_flow_8723d[] = { + trans_carddis_to_cardemu_8723d, + trans_cardemu_to_act_8723d, + NULL +}; + +static const struct rtw_pwr_seq_cmd trans_act_to_lps_8723d[] = { + {0x0301, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_PCI_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0xFF}, + {0x0522, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0xFF}, + {0x05F8, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_POLLING, 0xFF, 0}, + {0x05F9, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_POLLING, 0xFF, 0}, + {0x05FA, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_POLLING, 0xFF, 0}, + {0x05FB, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_POLLING, 0xFF, 0}, + {0x0002, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0002, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_DELAY, 0, RTW_PWR_DELAY_US}, + {0x0002, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(1), 0}, + {0x0100, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x03}, + {0x0101, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(1), 0}, + {0x0093, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x00}, + {0x0553, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(5), BIT(5)}, + {0xFFFF, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + 0, + RTW_PWR_CMD_END, 0, 0}, +}; + +static const struct rtw_pwr_seq_cmd trans_act_to_pre_carddis_8723d[] = { + {0x0003, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(2), 0}, + {0x0080, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0}, + {0xFFFF, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + 0, + RTW_PWR_CMD_END, 0, 0}, +}; + +static const struct rtw_pwr_seq_cmd trans_act_to_cardemu_8723d[] = { + {0x0002, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0049, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(1), 0}, + {0x0006, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(1), BIT(1)}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_POLLING, BIT(1), 0}, + {0x0010, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(6), 0}, + {0x0000, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(5), BIT(5)}, + {0x0020, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0xFFFF, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + 0, + RTW_PWR_CMD_END, 0, 0}, +}; + +static const struct rtw_pwr_seq_cmd trans_cardemu_to_carddis_8723d[] = { + {0x0007, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x20}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(3) | BIT(4), BIT(3)}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_PCI_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(2), BIT(2)}, + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_PCI_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(3) | BIT(4), BIT(3) | BIT(4)}, + {0x004A, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_USB_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), 1}, + {0x0023, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(4), BIT(4)}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_POLLING, BIT(1), 0}, + {0xFFFF, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + 0, + RTW_PWR_CMD_END, 0, 0}, +}; + +static const struct rtw_pwr_seq_cmd trans_act_to_post_carddis_8723d[] = { + {0x001D, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x001D, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x001C, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x0E}, + {0xFFFF, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_ALL_MSK, + 0, + RTW_PWR_CMD_END, 0, 0}, +}; + +static const struct rtw_pwr_seq_cmd *card_disable_flow_8723d[] = { + trans_act_to_lps_8723d, + trans_act_to_pre_carddis_8723d, + trans_act_to_cardemu_8723d, + trans_cardemu_to_carddis_8723d, + trans_act_to_post_carddis_8723d, + NULL +}; + struct rtw_chip_info rtw8723d_hw_spec = { .ops = &rtw8723d_ops, .id = RTW_CHIP_TYPE_8723D, @@ -41,6 +442,8 @@ struct rtw_chip_info rtw8723d_hw_spec = { .vht_supported = false, .lps_deep_mode_supported = 0, .sys_func_en = 0xFD, + .pwr_on_seq = card_enable_flow_8723d, + .pwr_off_seq = card_disable_flow_8723d, }; EXPORT_SYMBOL(rtw8723d_hw_spec); -- cgit v1.2.3-59-g8ed1b From e0c27cdbbd414877864773152ad0291913e18eae Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 20 Apr 2020 13:50:50 +0800 Subject: rtw88: 8723d: Add RF read/write ops 8723D use SIPI to indirectly read RF register instead of directly read, so introduce a new struct rtw_rf_sipi_addr and new function rtw_phy_read_rf_sipi(). Since other chips don't use the new function, only 8723D needs to fill struct rtw_rf_sipi_addr in rtw_chip_info. Because there are two kinds of functions for reading RF registers now, change rtw_phy_read_rf() to chip->ops->read_rf() in rtw_phy_write_rf_reg_sipi() so that we can switch tp proper RF read functions depends on the type of the chip. Though 8723D is an 1x1 chip, it has two RF PHY and we can switch to one of them, and that should be configured properly. Hence, add a fix_rf_phy_num to struct rtw_chip_info to allow driver to set one of the PHY's registers for 8723D, even it is only 1x1. Another variable rf_phy_num is introduced to keep the constraint number of RF path we can access, and its value is: rf_phy_num = (fix_rf_phy_num ? fix_rf_phy_num : rf_path_num) Signed-off-by: Ping-Ke Shih Signed-off-by: Zong-Zhe Yang Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420055054.14592-5-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/main.c | 5 +++ drivers/net/wireless/realtek/rtw88/main.h | 10 +++++ drivers/net/wireless/realtek/rtw88/phy.c | 56 +++++++++++++++++++++++++-- drivers/net/wireless/realtek/rtw88/phy.h | 6 +++ drivers/net/wireless/realtek/rtw88/rtw8723d.c | 12 ++++++ 5 files changed, 85 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index 6dfe4895c352..c851830132d0 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -472,6 +472,7 @@ static u8 hw_bw_cap_to_bitamp(u8 bw_cap) static void rtw_hw_config_rf_ant_num(struct rtw_dev *rtwdev, u8 hw_ant_num) { struct rtw_hal *hal = &rtwdev->hal; + struct rtw_chip_info *chip = rtwdev->chip; if (hw_ant_num == EFUSE_HW_CAP_IGNORE || hw_ant_num >= hal->rf_path_num) @@ -481,6 +482,8 @@ static void rtw_hw_config_rf_ant_num(struct rtw_dev *rtwdev, u8 hw_ant_num) case 1: hal->rf_type = RF_1T1R; hal->rf_path_num = 1; + if (!chip->fix_rf_phy_num) + hal->rf_phy_num = hal->rf_path_num; hal->antenna_tx = BB_PATH_A; hal->antenna_rx = BB_PATH_A; break; @@ -1130,6 +1133,8 @@ static int rtw_chip_parameter_setup(struct rtw_dev *rtwdev) hal->antenna_tx = BB_PATH_A; hal->antenna_rx = BB_PATH_A; } + hal->rf_phy_num = chip->fix_rf_phy_num ? chip->fix_rf_phy_num : + hal->rf_path_num; efuse->physical_size = chip->phy_efuse_size; efuse->logical_size = chip->log_efuse_size; diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index e852ab194315..8f15fc113af0 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -529,6 +529,13 @@ struct rtw_reg_domain { u8 domain; }; +struct rtw_rf_sipi_addr { + u32 hssi_1; + u32 hssi_2; + u32 lssi_read; + u32 lssi_read_pi; +}; + struct rtw_backup_info { u8 len; u32 reg; @@ -1087,6 +1094,8 @@ struct rtw_chip_info { const struct rtw_hw_reg *dig; u32 rf_base_addr[2]; u32 rf_sipi_addr[2]; + const struct rtw_rf_sipi_addr *rf_sipi_read_addr; + u8 fix_rf_phy_num; const struct rtw_table *mac_tbl; const struct rtw_table *agc_tbl; @@ -1571,6 +1580,7 @@ struct rtw_hal { u8 sec_ch_offset; u8 rf_type; u8 rf_path_num; + u8 rf_phy_num; u32 antenna_tx; u32 antenna_rx; u8 bfee_sts_cap; diff --git a/drivers/net/wireless/realtek/rtw88/phy.c b/drivers/net/wireless/realtek/rtw88/phy.c index 8793dd22188f..8489abfdc12e 100644 --- a/drivers/net/wireless/realtek/rtw88/phy.c +++ b/drivers/net/wireless/realtek/rtw88/phy.c @@ -679,7 +679,7 @@ u32 rtw_phy_read_rf(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, const u32 *base_addr = chip->rf_base_addr; u32 val, direct_addr; - if (rf_path >= hal->rf_path_num) { + if (rf_path >= hal->rf_phy_num) { rtw_err(rtwdev, "unsupported rf path (%d)\n", rf_path); return INV_RF_DATA; } @@ -693,6 +693,54 @@ u32 rtw_phy_read_rf(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, return val; } +u32 rtw_phy_read_rf_sipi(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, + u32 addr, u32 mask) +{ + struct rtw_hal *hal = &rtwdev->hal; + struct rtw_chip_info *chip = rtwdev->chip; + const struct rtw_rf_sipi_addr *rf_sipi_addr; + const struct rtw_rf_sipi_addr *rf_sipi_addr_a; + u32 val32; + u32 en_pi; + u32 r_addr; + u32 shift; + + if (rf_path >= hal->rf_phy_num) { + rtw_err(rtwdev, "unsupported rf path (%d)\n", rf_path); + return INV_RF_DATA; + } + + if (!chip->rf_sipi_read_addr) { + rtw_err(rtwdev, "rf_sipi_read_addr isn't defined\n"); + return INV_RF_DATA; + } + + rf_sipi_addr = &chip->rf_sipi_read_addr[rf_path]; + rf_sipi_addr_a = &chip->rf_sipi_read_addr[RF_PATH_A]; + + addr &= 0xff; + + val32 = rtw_read32(rtwdev, rf_sipi_addr->hssi_2); + val32 = (val32 & ~LSSI_READ_ADDR_MASK) | (addr << 23); + rtw_write32(rtwdev, rf_sipi_addr->hssi_2, val32); + + /* toggle read edge of path A */ + val32 = rtw_read32(rtwdev, rf_sipi_addr_a->hssi_2); + rtw_write32(rtwdev, rf_sipi_addr_a->hssi_2, val32 & ~LSSI_READ_EDGE_MASK); + rtw_write32(rtwdev, rf_sipi_addr_a->hssi_2, val32 | LSSI_READ_EDGE_MASK); + + udelay(120); + + en_pi = rtw_read32_mask(rtwdev, rf_sipi_addr->hssi_1, BIT(8)); + r_addr = en_pi ? rf_sipi_addr->lssi_read_pi : rf_sipi_addr->lssi_read; + + val32 = rtw_read32_mask(rtwdev, r_addr, LSSI_READ_DATA_MASK); + + shift = __ffs(mask); + + return (val32 & mask) >> shift; +} + bool rtw_phy_write_rf_reg_sipi(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, u32 addr, u32 mask, u32 data) { @@ -703,7 +751,7 @@ bool rtw_phy_write_rf_reg_sipi(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, u32 old_data = 0; u32 shift; - if (rf_path >= hal->rf_path_num) { + if (rf_path >= hal->rf_phy_num) { rtw_err(rtwdev, "unsupported rf path (%d)\n", rf_path); return false; } @@ -712,7 +760,7 @@ bool rtw_phy_write_rf_reg_sipi(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, mask &= RFREG_MASK; if (mask != RFREG_MASK) { - old_data = rtw_phy_read_rf(rtwdev, rf_path, addr, RFREG_MASK); + old_data = chip->ops->read_rf(rtwdev, rf_path, addr, RFREG_MASK); if (old_data == INV_RF_DATA) { rtw_err(rtwdev, "Write fail, rf is disabled\n"); @@ -740,7 +788,7 @@ bool rtw_phy_write_rf_reg(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, const u32 *base_addr = chip->rf_base_addr; u32 direct_addr; - if (rf_path >= hal->rf_path_num) { + if (rf_path >= hal->rf_phy_num) { rtw_err(rtwdev, "unsupported rf path (%d)\n", rf_path); return false; } diff --git a/drivers/net/wireless/realtek/rtw88/phy.h b/drivers/net/wireless/realtek/rtw88/phy.h index af916d8784cd..413bf7165cc0 100644 --- a/drivers/net/wireless/realtek/rtw88/phy.h +++ b/drivers/net/wireless/realtek/rtw88/phy.h @@ -21,6 +21,8 @@ void rtw_phy_dynamic_mechanism(struct rtw_dev *rtwdev); u8 rtw_phy_rf_power_2_rssi(s8 *rf_power, u8 path_num); u32 rtw_phy_read_rf(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, u32 addr, u32 mask); +u32 rtw_phy_read_rf_sipi(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, + u32 addr, u32 mask); bool rtw_phy_write_rf_reg_sipi(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, u32 addr, u32 mask, u32 data); bool rtw_phy_write_rf_reg(struct rtw_dev *rtwdev, enum rtw_rf_path rf_path, @@ -178,4 +180,8 @@ enum rtw_phy_cck_pd_lv { #define CCK_FA_AVG_RESET 0xffffffff +#define LSSI_READ_ADDR_MASK 0x7f800000 +#define LSSI_READ_EDGE_MASK 0x80000000 +#define LSSI_READ_DATA_MASK 0xfffff + #endif diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 5b97730f1407..679c6c19516c 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -15,6 +15,8 @@ #include "debug.h" static struct rtw_chip_ops rtw8723d_ops = { + .read_rf = rtw_phy_read_rf_sipi, + .write_rf = rtw_phy_write_rf_reg_sipi, .set_antenna = NULL, .config_bfee = NULL, .set_gid_table = NULL, @@ -422,6 +424,13 @@ static const struct rtw_pwr_seq_cmd *card_disable_flow_8723d[] = { NULL }; +static const struct rtw_rf_sipi_addr rtw8723d_rf_sipi_addr[] = { + [RF_PATH_A] = { .hssi_1 = 0x820, .lssi_read = 0x8a0, + .hssi_2 = 0x824, .lssi_read_pi = 0x8b8}, + [RF_PATH_B] = { .hssi_1 = 0x828, .lssi_read = 0x8a4, + .hssi_2 = 0x82c, .lssi_read_pi = 0x8bc}, +}; + struct rtw_chip_info rtw8723d_hw_spec = { .ops = &rtw8723d_ops, .id = RTW_CHIP_TYPE_8723D, @@ -444,6 +453,9 @@ struct rtw_chip_info rtw8723d_hw_spec = { .sys_func_en = 0xFD, .pwr_on_seq = card_enable_flow_8723d, .pwr_off_seq = card_disable_flow_8723d, + .rf_sipi_addr = {0x840, 0x844}, + .rf_sipi_read_addr = rtw8723d_rf_sipi_addr, + .fix_rf_phy_num = 2, }; EXPORT_SYMBOL(rtw8723d_hw_spec); -- cgit v1.2.3-59-g8ed1b From 9874f6851e47f674a23fc12969de31dbdf469f3d Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 20 Apr 2020 13:50:51 +0800 Subject: rtw88: 8723d: Add mac/bb/rf/agc/power_limit tables Add corresponding parameter tables for 8723D devices. Since 8723D devices currently have only one RFE type, there is only one entry in rtw8723d_rfe_defs. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420055054.14592-6-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/rtw8723d.c | 11 + .../net/wireless/realtek/rtw88/rtw8723d_table.c | 1189 ++++++++++++++++++++ .../net/wireless/realtek/rtw88/rtw8723d_table.h | 7 + 3 files changed, 1207 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 679c6c19516c..4fe433549285 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -431,6 +431,11 @@ static const struct rtw_rf_sipi_addr rtw8723d_rf_sipi_addr[] = { .hssi_2 = 0x82c, .lssi_read_pi = 0x8bc}, }; +static const struct rtw_rfe_def rtw8723d_rfe_defs[] = { + [0] = { .phy_pg_tbl = &rtw8723d_bb_pg_tbl, + .txpwr_lmt_tbl = &rtw8723d_txpwr_lmt_tbl,}, +}; + struct rtw_chip_info rtw8723d_hw_spec = { .ops = &rtw8723d_ops, .id = RTW_CHIP_TYPE_8723D, @@ -456,6 +461,12 @@ struct rtw_chip_info rtw8723d_hw_spec = { .rf_sipi_addr = {0x840, 0x844}, .rf_sipi_read_addr = rtw8723d_rf_sipi_addr, .fix_rf_phy_num = 2, + .mac_tbl = &rtw8723d_mac_tbl, + .agc_tbl = &rtw8723d_agc_tbl, + .bb_tbl = &rtw8723d_bb_tbl, + .rf_tbl = {&rtw8723d_rf_a_tbl}, + .rfe_defs = rtw8723d_rfe_defs, + .rfe_defs_size = ARRAY_SIZE(rtw8723d_rfe_defs), }; EXPORT_SYMBOL(rtw8723d_hw_spec); diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d_table.c b/drivers/net/wireless/realtek/rtw88/rtw8723d_table.c index b22b4b0f2fcf..27a22b392df0 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d_table.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d_table.c @@ -5,3 +5,1192 @@ #include "main.h" #include "phy.h" #include "rtw8723d_table.h" + +static const u32 rtw8723d_mac[] = { + 0x020, 0x00000013, + 0x02F, 0x00000010, + 0x077, 0x00000007, + 0x421, 0x0000000F, + 0x428, 0x0000000A, + 0x429, 0x00000010, + 0x430, 0x00000000, + 0x431, 0x00000000, + 0x432, 0x00000000, + 0x433, 0x00000001, + 0x434, 0x00000002, + 0x435, 0x00000003, + 0x436, 0x00000005, + 0x437, 0x00000007, + 0x438, 0x00000000, + 0x439, 0x00000000, + 0x43A, 0x00000000, + 0x43B, 0x00000001, + 0x43C, 0x00000002, + 0x43D, 0x00000003, + 0x43E, 0x00000005, + 0x43F, 0x00000007, + 0x440, 0x0000005D, + 0x441, 0x00000001, + 0x442, 0x00000000, + 0x444, 0x00000010, + 0x445, 0x00000000, + 0x446, 0x00000000, + 0x447, 0x00000000, + 0x448, 0x00000000, + 0x449, 0x000000F0, + 0x44A, 0x0000000F, + 0x44B, 0x0000003E, + 0x44C, 0x00000010, + 0x44D, 0x00000000, + 0x44E, 0x00000000, + 0x44F, 0x00000000, + 0x450, 0x00000000, + 0x451, 0x000000F0, + 0x452, 0x0000000F, + 0x453, 0x00000000, + 0x456, 0x0000005E, + 0x460, 0x00000066, + 0x461, 0x00000066, + 0x4C8, 0x000000FF, + 0x4C9, 0x00000008, + 0x4CC, 0x000000FF, + 0x4CD, 0x000000FF, + 0x4CE, 0x00000001, + 0x500, 0x00000026, + 0x501, 0x000000A2, + 0x502, 0x0000002F, + 0x503, 0x00000000, + 0x504, 0x00000028, + 0x505, 0x000000A3, + 0x506, 0x0000005E, + 0x507, 0x00000000, + 0x508, 0x0000002B, + 0x509, 0x000000A4, + 0x50A, 0x0000005E, + 0x50B, 0x00000000, + 0x50C, 0x0000004F, + 0x50D, 0x000000A4, + 0x50E, 0x00000000, + 0x50F, 0x00000000, + 0x512, 0x0000001C, + 0x514, 0x0000000A, + 0x516, 0x0000000A, + 0x525, 0x0000004F, + 0x550, 0x00000010, + 0x551, 0x00000010, + 0x559, 0x00000002, + 0x55C, 0x00000028, + 0x55D, 0x000000FF, + 0x605, 0x00000030, + 0x608, 0x0000000E, + 0x609, 0x0000002A, + 0x620, 0x000000FF, + 0x621, 0x000000FF, + 0x622, 0x000000FF, + 0x623, 0x000000FF, + 0x624, 0x000000FF, + 0x625, 0x000000FF, + 0x626, 0x000000FF, + 0x627, 0x000000FF, + 0x638, 0x00000028, + 0x63C, 0x0000000A, + 0x63D, 0x0000000A, + 0x63E, 0x0000000C, + 0x63F, 0x0000000C, + 0x640, 0x00000040, + 0x642, 0x00000040, + 0x643, 0x00000000, + 0x652, 0x000000C8, + 0x66A, 0x000000B0, + 0x66E, 0x00000005, + 0x700, 0x00000021, + 0x701, 0x00000043, + 0x702, 0x00000065, + 0x703, 0x00000087, + 0x708, 0x00000021, + 0x709, 0x00000043, + 0x70A, 0x00000065, + 0x70B, 0x00000087, + 0x765, 0x00000018, + 0x76E, 0x00000004, + 0x7C0, 0x00000038, + 0x7C2, 0x0000000F, + 0x7C3, 0x000000C0, + 0x073, 0x00000004, + 0x7C4, 0x00000077, + 0x07C, 0x00000003, + 0x016, 0x000000B3, +}; + +RTW_DECL_TABLE_PHY_COND(rtw8723d_mac, rtw_phy_cfg_mac); + +static const u32 rtw8723d_agc[] = { + 0xC78, 0xFE000101, + 0xC78, 0xFD010101, + 0xC78, 0xFC020101, + 0xC78, 0xFB030101, + 0xC78, 0xFA040101, + 0xC78, 0xF9050101, + 0xC78, 0xF8060101, + 0xC78, 0xF7070101, + 0xC78, 0xF6080101, + 0xC78, 0xF5090101, + 0xC78, 0xF40A0101, + 0xC78, 0xF30B0101, + 0xC78, 0xF20C0101, + 0xC78, 0xF10D0101, + 0xC78, 0xF00E0101, + 0xC78, 0xEF0F0101, + 0xC78, 0xEE100101, + 0xC78, 0xED110101, + 0xC78, 0xEC120101, + 0xC78, 0xEB130101, + 0xC78, 0xEA140101, + 0xC78, 0xE9150101, + 0xC78, 0xE8160101, + 0xC78, 0xE7170101, + 0xC78, 0xE6180101, + 0xC78, 0xE5190101, + 0xC78, 0xE41A0101, + 0xC78, 0xE31B0101, + 0xC78, 0xE21C0101, + 0xC78, 0xE11D0101, + 0xC78, 0xE01E0101, + 0xC78, 0x861F0101, + 0xC78, 0x85200101, + 0xC78, 0x84210101, + 0xC78, 0x83220101, + 0xC78, 0x82230101, + 0xC78, 0x81240101, + 0xC78, 0x80250101, + 0xC78, 0x44260101, + 0xC78, 0x43270101, + 0xC78, 0x42280101, + 0xC78, 0x41290101, + 0xC78, 0x402A0101, + 0xC78, 0x022B0101, + 0xC78, 0x012C0101, + 0xC78, 0x002D0101, + 0xC78, 0xC52E0001, + 0xC78, 0xC42F0001, + 0xC78, 0xC3300001, + 0xC78, 0xC2310001, + 0xC78, 0xC1320001, + 0xC78, 0xC0330001, + 0xC78, 0x04340001, + 0xC78, 0x03350001, + 0xC78, 0x02360001, + 0xC78, 0x01370001, + 0xC78, 0x00380001, + 0xC78, 0x00390001, + 0xC78, 0x003A0001, + 0xC78, 0x003B0001, + 0xC78, 0x003C0001, + 0xC78, 0x003D0001, + 0xC78, 0x003E0001, + 0xC78, 0x003F0001, + 0xC78, 0x6F002001, + 0xC78, 0x6F012001, + 0xC78, 0x6F022001, + 0xC78, 0x6F032001, + 0xC78, 0x6F042001, + 0xC78, 0x6F052001, + 0xC78, 0x6F062001, + 0xC78, 0x6F072001, + 0xC78, 0x6F082001, + 0xC78, 0x6F092001, + 0xC78, 0x6F0A2001, + 0xC78, 0x6F0B2001, + 0xC78, 0x6F0C2001, + 0xC78, 0x6F0D2001, + 0xC78, 0x6F0E2001, + 0xC78, 0x6F0F2001, + 0xC78, 0x6F102001, + 0xC78, 0x6F112001, + 0xC78, 0x6F122001, + 0xC78, 0x6F132001, + 0xC78, 0x6F142001, + 0xC78, 0x6F152001, + 0xC78, 0x6F162001, + 0xC78, 0x6F172001, + 0xC78, 0x6F182001, + 0xC78, 0x6F192001, + 0xC78, 0x6F1A2001, + 0xC78, 0x6F1B2001, + 0xC78, 0x6F1C2001, + 0xC78, 0x6F1D2001, + 0xC78, 0x6F1E2001, + 0xC78, 0x6F1F2001, + 0xC78, 0x6F202001, + 0xC78, 0x6F212001, + 0xC78, 0x6F222001, + 0xC78, 0x6F232001, + 0xC78, 0x6E242001, + 0xC78, 0x6D252001, + 0xC78, 0x6C262001, + 0xC78, 0x6B272001, + 0xC78, 0x6A282001, + 0xC78, 0x69292001, + 0xC78, 0x4B2A2001, + 0xC78, 0x4A2B2001, + 0xC78, 0x492C2001, + 0xC78, 0x482D2001, + 0xC78, 0x472E2001, + 0xC78, 0x462F2001, + 0xC78, 0x45302001, + 0xC78, 0x44312001, + 0xC78, 0x43322001, + 0xC78, 0x42332001, + 0xC78, 0x41342001, + 0xC78, 0x40352001, + 0xC78, 0x02362001, + 0xC78, 0x01372001, + 0xC78, 0x00382001, + 0xC78, 0x00392001, + 0xC78, 0x003A2001, + 0xC78, 0x003B2001, + 0xC78, 0x003C2001, + 0xC78, 0x003D2001, + 0xC78, 0x003E2001, + 0xC78, 0x003F2001, + 0xC78, 0x7F003101, + 0xC78, 0x7F013101, + 0xC78, 0x7F023101, + 0xC78, 0x7F033101, + 0xC78, 0x7F043101, + 0xC78, 0x7F053101, + 0xC78, 0x7F063101, + 0xC78, 0x7F073101, + 0xC78, 0x7E083101, + 0xC78, 0x7D093101, + 0xC78, 0x7C0A3101, + 0xC78, 0x7B0B3101, + 0xC78, 0x7A0C3101, + 0xC78, 0x790D3101, + 0xC78, 0x780E3101, + 0xC78, 0x770F3101, + 0xC78, 0x76103101, + 0xC78, 0x75113101, + 0xC78, 0x74123101, + 0xC78, 0x73133101, + 0xC78, 0x72143101, + 0xC78, 0x71153101, + 0xC78, 0x70163101, + 0xC78, 0x6F173101, + 0xC78, 0x6E183101, + 0xC78, 0x6D193101, + 0xC78, 0x6C1A3101, + 0xC78, 0x6B1B3101, + 0xC78, 0x6A1C3101, + 0xC78, 0x691D3101, + 0xC78, 0x681E3101, + 0xC78, 0x4B1F3101, + 0xC78, 0x4A203101, + 0xC78, 0x49213101, + 0xC78, 0x48223101, + 0xC78, 0x47233101, + 0xC78, 0x46243101, + 0xC78, 0x45253101, + 0xC78, 0x44263101, + 0xC78, 0x43273101, + 0xC78, 0x42283101, + 0xC78, 0x41293101, + 0xC78, 0x402A3101, + 0xC78, 0x022B3101, + 0xC78, 0x012C3101, + 0xC78, 0x002D3101, + 0xC78, 0x002E3101, + 0xC78, 0x002F3101, + 0xC78, 0x00303101, + 0xC78, 0x00313101, + 0xC78, 0x00323101, + 0xC78, 0x00333101, + 0xC78, 0x00343101, + 0xC78, 0x00353101, + 0xC78, 0x00363101, + 0xC78, 0x00373101, + 0xC78, 0x00383101, + 0xC78, 0x00393101, + 0xC78, 0x003A3101, + 0xC78, 0x003B3101, + 0xC78, 0x003C3101, + 0xC78, 0x003D3101, + 0xC78, 0x003E3101, + 0xC78, 0x003F3101, + 0xC78, 0xFE403101, + 0xC78, 0xFD413101, + 0xC78, 0xFC423101, + 0xC78, 0xFB433101, + 0xC78, 0xFA443101, + 0xC78, 0xF9453101, + 0xC78, 0xF8463101, + 0xC78, 0xF7473101, + 0xC78, 0xF6483101, + 0xC78, 0xF5493101, + 0xC78, 0xF44A3101, + 0xC78, 0xF34B3101, + 0xC78, 0xF24C3101, + 0xC78, 0xF14D3101, + 0xC78, 0xF04E3101, + 0xC78, 0xEF4F3101, + 0xC78, 0xEE503101, + 0xC78, 0xED513101, + 0xC78, 0xEC523101, + 0xC78, 0xEB533101, + 0xC78, 0xEA543101, + 0xC78, 0xE9553101, + 0xC78, 0xE8563101, + 0xC78, 0xE7573101, + 0xC78, 0xE6583101, + 0xC78, 0xE5593101, + 0xC78, 0xE45A3101, + 0xC78, 0xE35B3101, + 0xC78, 0xE25C3101, + 0xC78, 0xE15D3101, + 0xC78, 0xE05E3101, + 0xC78, 0x865F3101, + 0xC78, 0x85603101, + 0xC78, 0x84613101, + 0xC78, 0x83623101, + 0xC78, 0x82633101, + 0xC78, 0x81643101, + 0xC78, 0x80653101, + 0xC78, 0x80663101, + 0xC78, 0x80673101, + 0xC78, 0x80683101, + 0xC78, 0x80693101, + 0xC78, 0x806A3101, + 0xC78, 0x806B3101, + 0xC78, 0x806C3101, + 0xC78, 0x806D3101, + 0xC78, 0x806E3101, + 0xC78, 0x806F3101, + 0xC78, 0x80703101, + 0xC78, 0x80713101, + 0xC78, 0x80723101, + 0xC78, 0x80733101, + 0xC78, 0x80743101, + 0xC78, 0x80753101, + 0xC78, 0x80763101, + 0xC78, 0x80773101, + 0xC78, 0x80783101, + 0xC78, 0x80793101, + 0xC78, 0x807A3101, + 0xC78, 0x807B3101, + 0xC78, 0x807C3101, + 0xC78, 0x807D3101, + 0xC78, 0x807E3101, + 0xC78, 0x807F3101, + 0xC78, 0xEF402001, + 0xC78, 0xEF412001, + 0xC78, 0xEF422001, + 0xC78, 0xEF432001, + 0xC78, 0xEF442001, + 0xC78, 0xEF452001, + 0xC78, 0xEF462001, + 0xC78, 0xEF472001, + 0xC78, 0xEF482001, + 0xC78, 0xEF492001, + 0xC78, 0xEF4A2001, + 0xC78, 0xEF4B2001, + 0xC78, 0xEF4C2001, + 0xC78, 0xEF4D2001, + 0xC78, 0xEF4E2001, + 0xC78, 0xEF4F2001, + 0xC78, 0xEF502001, + 0xC78, 0xEF512001, + 0xC78, 0xEF522001, + 0xC78, 0xEF532001, + 0xC78, 0xEF542001, + 0xC78, 0xEF552001, + 0xC78, 0xEF562001, + 0xC78, 0xEF572001, + 0xC78, 0xEF582001, + 0xC78, 0xEF592001, + 0xC78, 0xEF5A2001, + 0xC78, 0xEF5B2001, + 0xC78, 0xEF5C2001, + 0xC78, 0xEF5D2001, + 0xC78, 0xEF5E2001, + 0xC78, 0xEF5F2001, + 0xC78, 0xEF602001, + 0xC78, 0xEE612001, + 0xC78, 0xED622001, + 0xC78, 0xEC632001, + 0xC78, 0xEB642001, + 0xC78, 0xEA652001, + 0xC78, 0xE9662001, + 0xC78, 0xE8672001, + 0xC78, 0xCB682001, + 0xC78, 0xCA692001, + 0xC78, 0xC96A2001, + 0xC78, 0xC86B2001, + 0xC78, 0xC76C2001, + 0xC78, 0xC66D2001, + 0xC78, 0xC56E2001, + 0xC78, 0xC46F2001, + 0xC78, 0xC3702001, + 0xC78, 0xC2712001, + 0xC78, 0xC1722001, + 0xC78, 0xC0732001, + 0xC78, 0x82742001, + 0xC78, 0x81752001, + 0xC78, 0x80762001, + 0xC78, 0x80772001, + 0xC78, 0x80782001, + 0xC78, 0x80792001, + 0xC78, 0x807A2001, + 0xC78, 0x807B2001, + 0xC78, 0x807C2001, + 0xC78, 0x807D2001, + 0xC78, 0x807E2001, + 0xC78, 0x807F2001, + 0xC78, 0xFA001101, + 0xC78, 0xF9011101, + 0xC78, 0xF8021101, + 0xC78, 0xF7031101, + 0xC78, 0xF6041101, + 0xC78, 0xF5051101, + 0xC78, 0xF4061101, + 0xC78, 0xD7071101, + 0xC78, 0xD6081101, + 0xC78, 0xD5091101, + 0xC78, 0xD40A1101, + 0xC78, 0x970B1101, + 0xC78, 0x960C1101, + 0xC78, 0x950D1101, + 0xC78, 0x940E1101, + 0xC78, 0x930F1101, + 0xC78, 0x92101101, + 0xC78, 0x91111101, + 0xC78, 0x90121101, + 0xC78, 0x8F131101, + 0xC78, 0x8E141101, + 0xC78, 0x8D151101, + 0xC78, 0x8C161101, + 0xC78, 0x8B171101, + 0xC78, 0x8A181101, + 0xC78, 0x89191101, + 0xC78, 0x881A1101, + 0xC78, 0x871B1101, + 0xC78, 0x861C1101, + 0xC78, 0x851D1101, + 0xC78, 0x841E1101, + 0xC78, 0x831F1101, + 0xC78, 0x82201101, + 0xC78, 0x81211101, + 0xC78, 0x80221101, + 0xC78, 0x43231101, + 0xC78, 0x42241101, + 0xC78, 0x41251101, + 0xC78, 0x04261101, + 0xC78, 0x03271101, + 0xC78, 0x02281101, + 0xC78, 0x01291101, + 0xC78, 0x002A1101, + 0xC78, 0xC42B1001, + 0xC78, 0xC32C1001, + 0xC78, 0xC22D1001, + 0xC78, 0xC12E1001, + 0xC78, 0xC02F1001, + 0xC78, 0x85301001, + 0xC78, 0x84311001, + 0xC78, 0x83321001, + 0xC78, 0x82331001, + 0xC78, 0x81341001, + 0xC78, 0x80351001, + 0xC78, 0x05361001, + 0xC78, 0x04371001, + 0xC78, 0x03381001, + 0xC78, 0x02391001, + 0xC78, 0x013A1001, + 0xC78, 0x003B1001, + 0xC78, 0x003C1001, + 0xC78, 0x003D1001, + 0xC78, 0x003E1001, + 0xC78, 0x003F1001, + 0xC50, 0x69553422, + 0xC50, 0x69553420, +}; + +RTW_DECL_TABLE_PHY_COND(rtw8723d_agc, rtw_phy_cfg_agc); + +static const u32 rtw8723d_bb[] = { + 0x800, 0x80046C00, + 0x804, 0x00000003, + 0x808, 0x0000FC00, + 0x80C, 0x0000000A, + 0x810, 0x10001331, + 0x814, 0x020C3D10, + 0x818, 0x00200385, + 0x81C, 0x00000000, + 0x820, 0x01000100, + 0x824, 0x00390204, + 0x828, 0x00000000, + 0x82C, 0x00000000, + 0x830, 0x00000000, + 0x834, 0x00000000, + 0x838, 0x00000000, + 0x83C, 0x00000000, + 0x840, 0x00010000, + 0x844, 0x00000000, + 0x848, 0x00000000, + 0x84C, 0x00000000, + 0x850, 0x00000000, + 0x854, 0x00000000, + 0x858, 0x569A11A9, + 0x85C, 0x01000014, + 0x860, 0x66F60110, + 0x864, 0x461F0641, + 0x868, 0x00000000, + 0x86C, 0x27272700, + 0x870, 0x07000460, + 0x874, 0x25004000, + 0x878, 0x00000808, + 0x87C, 0x004F0201, + 0x880, 0xB2002E12, + 0x884, 0x00000007, + 0x888, 0x00000000, + 0x88C, 0xCCC000C0, + 0x890, 0x00000800, + 0x894, 0xFFFFFFFE, + 0x898, 0x40302010, + 0x89C, 0x00706050, + 0x900, 0x00000000, + 0x904, 0x00000023, + 0x908, 0x00000000, + 0x90C, 0x81121111, + 0x910, 0x00000402, + 0x914, 0x00000300, + 0x920, 0x18C6318C, + 0x924, 0x0000018C, + 0x948, 0x99000000, + 0x94C, 0x00000010, + 0x950, 0x00003800, + 0x954, 0x5A380000, + 0x958, 0x4BC6D87A, + 0x95C, 0x04EB9B79, + 0x96C, 0x00000003, + 0x970, 0x00000000, + 0x974, 0x00000000, + 0x978, 0x00000000, + 0x97C, 0x13000000, + 0x980, 0x00000000, + 0xA00, 0x00D046C8, + 0xA04, 0x80FF800C, + 0xA08, 0x8C838300, + 0xA0C, 0x2E20100F, + 0xA10, 0x9500BB78, + 0xA14, 0x1114D028, + 0xA18, 0x00881117, + 0xA1C, 0x89140F00, + 0xA20, 0xE82C0001, + 0xA24, 0x64B80C1C, + 0xA28, 0x00008810, + 0xA2C, 0x00D30000, + 0xA70, 0x101FBF00, + 0xA74, 0x00000007, + 0xA78, 0x00008900, + 0xA7C, 0x225B0606, + 0xA80, 0x2180FA74, + 0xA84, 0x00200000, + 0xA88, 0x040C0000, + 0xA8C, 0x12345678, + 0xA90, 0xABCDEF00, + 0xA94, 0x001B1B89, + 0xA98, 0x00000000, + 0xA9C, 0x00020000, + 0xAA0, 0x00000000, + 0xAA4, 0x0000000C, + 0xAA8, 0xCA100008, + 0xAAC, 0x01235667, + 0xAB0, 0x00000000, + 0xAB4, 0x20201402, + 0xB2C, 0x00000000, + 0xC00, 0x48071D40, + 0xC04, 0x03A05611, + 0xC08, 0x000000E4, + 0xC0C, 0x6C6C6C6C, + 0xC10, 0x28800000, + 0xC14, 0x40000100, + 0xC18, 0x08800000, + 0xC1C, 0x40000100, + 0xC20, 0x00000000, + 0xC24, 0x00000000, + 0xC28, 0x00000000, + 0xC2C, 0x00000000, + 0xC30, 0x69E9AC48, + 0xC34, 0x31000040, + 0xC38, 0x21688080, + 0xC3C, 0x000016D4, + 0xC40, 0x1F78403F, + 0xC44, 0x00010036, + 0xC48, 0xEC020107, + 0xC4C, 0x007F037F, + 0xC50, 0x69553420, + 0xC54, 0x43BC0094, + 0xC58, 0x00015969, + 0xC5C, 0x00310492, + 0xC60, 0x00280A00, + 0xC64, 0x7112848B, + 0xC68, 0x47C074FF, + 0xC6C, 0x00000036, + 0xC70, 0x2C7F000D, + 0xC74, 0x020600DB, + 0xC78, 0x0000001F, + 0xC7C, 0x00B91612, + 0xC80, 0x390000E4, + 0xC84, 0x21F60000, + 0xC88, 0x40000100, + 0xC8C, 0x20200000, + 0xC90, 0x00091521, + 0xC94, 0x00000000, + 0xC98, 0x00121820, + 0xC9C, 0x00007F7F, + 0xCA0, 0x00012000, + 0xCA4, 0x800000A0, + 0xCA8, 0x84E6C606, + 0xCAC, 0x00000060, + 0xCB0, 0x00000000, + 0xCB4, 0x00000000, + 0xCB8, 0x00000000, + 0xCBC, 0x28000000, + 0xCC0, 0x0010A3D0, + 0xCC4, 0x00000F7D, + 0xCC8, 0x000442D6, + 0xCCC, 0x00000000, + 0xCD0, 0x000001C8, + 0xCD4, 0x001C8000, + 0xCD8, 0x00000100, + 0xCDC, 0x40100000, + 0xCE0, 0x00222220, + 0xCE4, 0x20000000, + 0xCE8, 0x37644302, + 0xCEC, 0x2F97D40C, + 0xD00, 0x00030740, + 0xD04, 0x40020401, + 0xD08, 0x0000907F, + 0xD0C, 0x20010201, + 0xD10, 0xA0633333, + 0xD14, 0x3333BC53, + 0xD18, 0x7A8F5B6F, + 0xD2C, 0xCC979975, + 0xD30, 0x00000000, + 0xD34, 0x40608000, + 0xD38, 0x88000000, + 0xD3C, 0xC0127343, + 0xD40, 0x00000000, + 0xD44, 0x00000000, + 0xD48, 0x00000000, + 0xD4C, 0x00000000, + 0xD50, 0x00000038, + 0xD54, 0x00000000, + 0xD58, 0x00000282, + 0xD5C, 0x30032064, + 0xD60, 0x4653DE68, + 0xD64, 0x04518A3C, + 0xD68, 0x00002101, + 0xE00, 0x2D2D2D2D, + 0xE04, 0x2D2D2D2D, + 0xE08, 0x0390272D, + 0xE10, 0x2D2D2D2D, + 0xE14, 0x2D2D2D2D, + 0xE18, 0x2D2D2D2D, + 0xE1C, 0x2D2D2D2D, + 0xE28, 0x00000000, + 0xE30, 0x1000DC1F, + 0xE34, 0x10008C1F, + 0xE38, 0x02140102, + 0xE3C, 0x681604C2, + 0xE40, 0x01007C00, + 0xE44, 0x01004800, + 0xE48, 0xFB000000, + 0xE4C, 0x000028D1, + 0xE50, 0x1000DC1F, + 0xE54, 0x10008C1F, + 0xE58, 0x02140102, + 0xE5C, 0x28160D05, + 0xE60, 0x00000008, + 0xE68, 0x001B25A4, + 0xE6C, 0x01C00014, + 0xE70, 0x01C00016, + 0xE74, 0x02000014, + 0xE78, 0x02000014, + 0xE7C, 0x02000014, + 0xE80, 0x02000014, + 0xE84, 0x01C00014, + 0xE88, 0x02000014, + 0xE8C, 0x01C00014, + 0xED0, 0x01C00014, + 0xED4, 0x01C00014, + 0xED8, 0x01C00014, + 0xEDC, 0x00000014, + 0xEE0, 0x00000014, + 0xEE8, 0x21555448, + 0xEEC, 0x03C00014, + 0xF14, 0x00000003, + 0xF00, 0x00100300, + 0xF08, 0x0000800B, + 0xF0C, 0x0000F007, + 0xF10, 0x0000A487, + 0xF1C, 0x80000064, + 0xF38, 0x00030155, + 0xF3C, 0x0000003A, + 0xF4C, 0x13000000, + 0xF50, 0x00000000, + 0xF18, 0x00000000, +}; + +RTW_DECL_TABLE_PHY_COND(rtw8723d_bb, rtw_phy_cfg_bb); + +static const struct rtw_phy_pg_cfg_pair rtw8723d_bb_pg[] = { + { 0, 0, 0, 0x00000e08, 0x0000ff00, 0x00003200, }, + { 0, 0, 0, 0x0000086c, 0xffffff00, 0x32323200, }, + { 0, 0, 0, 0x00000e00, 0xffffffff, 0x32343434, }, + { 0, 0, 0, 0x00000e04, 0xffffffff, 0x28303032, }, + { 0, 0, 0, 0x00000e10, 0xffffffff, 0x30323234, }, + { 0, 0, 0, 0x00000e14, 0xffffffff, 0x26282830, }, +}; + +RTW_DECL_TABLE_BB_PG(rtw8723d_bb_pg); + +static const u32 rtw8723d_rf_a[] = { + 0x050, 0x0001C000, + 0x049, 0x0004AA00, + 0x000, 0x00010000, + 0x0B1, 0x00054573, + 0x0B4, 0x000508AB, + 0x0B7, 0x00014787, + 0x0B8, 0x000064CB, + 0x01B, 0x00073A40, + 0x051, 0x00038CAF, + 0x052, 0x000FCCA3, + 0x053, 0x00090F38, + 0x054, 0x00011083, + 0x057, 0x000D0000, + 0x08D, 0x00000A1A, + 0x082, 0x00082AAC, + 0x08E, 0x00076940, + 0x08F, 0x00088400, + 0x061, 0x00038CAF, + 0x062, 0x000FCCA3, + 0x063, 0x00090F38, + 0x064, 0x00011083, + 0x067, 0x000D0000, + 0x092, 0x00082AAC, + 0x0EF, 0x00000400, + 0x030, 0x000008CA, + 0x030, 0x000018CA, + 0x030, 0x000028CA, + 0x030, 0x000038CA, + 0x0EF, 0x00000000, + 0x0EE, 0x00000400, + 0x030, 0x000008CA, + 0x030, 0x000018CA, + 0x030, 0x000028CA, + 0x030, 0x000038CA, + 0x0EE, 0x00000000, + 0x0EF, 0x00000100, + 0x033, 0x00000000, + 0x03F, 0x0000CCA3, + 0x033, 0x00000001, + 0x03F, 0x0000CCA3, + 0x033, 0x00000002, + 0x03F, 0x0000CCA3, + 0x033, 0x00000003, + 0x03F, 0x0000CCA3, + 0x033, 0x00000004, + 0x03F, 0x0000CCA3, + 0x033, 0x00000005, + 0x03F, 0x0000CCA3, + 0x033, 0x00000006, + 0x03F, 0x0000CCA3, + 0x033, 0x00000007, + 0x03F, 0x0000CCA3, + 0x0EF, 0x00000000, + 0x0EE, 0x00000100, + 0x033, 0x00000000, + 0x03F, 0x0000CCA3, + 0x033, 0x00000001, + 0x03F, 0x0000CCA3, + 0x033, 0x00000002, + 0x03F, 0x0000CCA3, + 0x033, 0x00000003, + 0x03F, 0x0000CCA3, + 0x033, 0x00000004, + 0x03F, 0x0000CCA3, + 0x033, 0x00000005, + 0x03F, 0x0000CCA3, + 0x033, 0x00000006, + 0x03F, 0x0000CCA3, + 0x033, 0x00000007, + 0x03F, 0x0000CCA3, + 0x0EE, 0x00000000, + 0x0EF, 0x00000800, + 0x030, 0x0000002D, + 0x030, 0x0000122C, + 0x030, 0x0000222F, + 0x030, 0x0000326C, + 0x030, 0x0000466B, + 0x030, 0x0000566E, + 0x030, 0x000066EB, + 0x030, 0x000077EC, + 0x030, 0x000087EF, + 0x030, 0x000097F2, + 0x030, 0x0000A7F5, + 0x0EF, 0x00000000, + 0x0EE, 0x00000800, + 0x030, 0x00000001, + 0x030, 0x00001011, + 0x030, 0x00002011, + 0x030, 0x00003013, + 0x030, 0x00004033, + 0x030, 0x00005033, + 0x030, 0x00006037, + 0x030, 0x0000703F, + 0x030, 0x0000803F, + 0x030, 0x0000903F, + 0x030, 0x0000A03F, + 0x0EE, 0x00000000, + 0x082, 0x00083B8C, + 0x0ED, 0x00000008, + 0x030, 0x000030F6, + 0x030, 0x00002004, + 0x030, 0x000010F6, + 0x030, 0x000000F6, + 0x0ED, 0x00000000, + 0x092, 0x00083B8C, + 0x0EC, 0x00000008, + 0x030, 0x000030F6, + 0x030, 0x00002004, + 0x030, 0x000010F6, + 0x030, 0x000000F6, + 0x0EC, 0x00000000, + 0x0EF, 0x00010000, + 0x030, 0x0001C11C, + 0x030, 0x000181F4, + 0x030, 0x00014108, + 0x030, 0x000101E4, + 0x030, 0x0000C11C, + 0x030, 0x000081F4, + 0x030, 0x00004108, + 0x030, 0x000001E4, + 0x0EF, 0x00000000, + 0x0EE, 0x00010000, + 0x030, 0x0001C11C, + 0x030, 0x000181F4, + 0x030, 0x00014108, + 0x030, 0x000101E4, + 0x030, 0x0000C11C, + 0x030, 0x000081F4, + 0x030, 0x00004108, + 0x030, 0x000001E4, + 0x0EE, 0x00000000, + 0x0EF, 0x00080000, + 0x033, 0x00000007, + 0x03E, 0x0000005F, + 0x03F, 0x000B3FDB, + 0x033, 0x00000004, + 0x03E, 0x0000005D, + 0x03F, 0x000BFFE0, + 0x033, 0x00000005, + 0x03E, 0x0000005D, + 0x03F, 0x000FBFCE, + 0x033, 0x00000006, + 0x03E, 0x0000005F, + 0x03F, 0x000A7FFB, + 0x0EF, 0x00000000, + 0x0EE, 0x00000002, + 0x030, 0x00000001, + 0x030, 0x00002001, + 0x030, 0x00004001, + 0x030, 0x00007001, + 0x030, 0x00006001, + 0x030, 0x00020001, + 0x030, 0x00022001, + 0x030, 0x00024001, + 0x030, 0x00027001, + 0x030, 0x00026001, + 0x030, 0x00034001, + 0x030, 0x00037001, + 0x030, 0x00036001, + 0x030, 0x00008000, + 0x030, 0x0000A000, + 0x030, 0x0000C000, + 0x83000100, 0x00000000, 0x40000000, 0x00000000, + 0x030, 0x0000E024, + 0xA0000000, 0x00000000, + 0x030, 0x0000E000, + 0xB0000000, 0x00000000, + 0x030, 0x0001C000, + 0x030, 0x0001E000, + 0x0EE, 0x00000000, + 0x0EE, 0x00020000, + 0x0EF, 0x00020000, + 0x030, 0x00000F75, + 0x030, 0x00002F55, + 0x030, 0x00003F75, + 0x0EE, 0x00000000, + 0x0EF, 0x00000000, + 0x018, 0x00008401, + 0xFFE, 0x00000000, +}; + +RTW_DECL_TABLE_RF_RADIO(rtw8723d_rf_a, A); + +static const struct rtw_txpwr_lmt_cfg_pair rtw8723d_txpwr_lmt[] = { + {0, 0, 0, 0, 1, 30, }, + {2, 0, 0, 0, 1, 30, }, + {1, 0, 0, 0, 1, 30, }, + {0, 0, 0, 0, 2, 30, }, + {2, 0, 0, 0, 2, 30, }, + {1, 0, 0, 0, 2, 30, }, + {0, 0, 0, 0, 3, 30, }, + {2, 0, 0, 0, 3, 30, }, + {1, 0, 0, 0, 3, 30, }, + {0, 0, 0, 0, 4, 30, }, + {2, 0, 0, 0, 4, 30, }, + {1, 0, 0, 0, 4, 30, }, + {0, 0, 0, 0, 5, 30, }, + {2, 0, 0, 0, 5, 30, }, + {1, 0, 0, 0, 5, 30, }, + {0, 0, 0, 0, 6, 30, }, + {2, 0, 0, 0, 6, 30, }, + {1, 0, 0, 0, 6, 30, }, + {0, 0, 0, 0, 7, 30, }, + {2, 0, 0, 0, 7, 30, }, + {1, 0, 0, 0, 7, 30, }, + {0, 0, 0, 0, 8, 30, }, + {2, 0, 0, 0, 8, 30, }, + {1, 0, 0, 0, 8, 30, }, + {0, 0, 0, 0, 9, 30, }, + {2, 0, 0, 0, 9, 30, }, + {1, 0, 0, 0, 9, 30, }, + {0, 0, 0, 0, 10, 30, }, + {2, 0, 0, 0, 10, 30, }, + {1, 0, 0, 0, 10, 30, }, + {0, 0, 0, 0, 11, 30, }, + {2, 0, 0, 0, 11, 30, }, + {1, 0, 0, 0, 11, 30, }, + {0, 0, 0, 0, 12, 30, }, + {2, 0, 0, 0, 12, 30, }, + {1, 0, 0, 0, 12, 30, }, + {0, 0, 0, 0, 13, 17, }, + {2, 0, 0, 0, 13, 30, }, + {1, 0, 0, 0, 13, 30, }, + {0, 0, 0, 0, 14, 63, }, + {2, 0, 0, 0, 14, 63, }, + {1, 0, 0, 0, 14, 30, }, + {0, 0, 0, 1, 1, 26, }, + {2, 0, 0, 1, 1, 31, }, + {1, 0, 0, 1, 1, 31, }, + {0, 0, 0, 1, 2, 28, }, + {2, 0, 0, 1, 2, 31, }, + {1, 0, 0, 1, 2, 31, }, + {0, 0, 0, 1, 3, 30, }, + {2, 0, 0, 1, 3, 31, }, + {1, 0, 0, 1, 3, 31, }, + {0, 0, 0, 1, 4, 30, }, + {2, 0, 0, 1, 4, 31, }, + {1, 0, 0, 1, 4, 31, }, + {0, 0, 0, 1, 5, 30, }, + {2, 0, 0, 1, 5, 31, }, + {1, 0, 0, 1, 5, 31, }, + {0, 0, 0, 1, 6, 30, }, + {2, 0, 0, 1, 6, 31, }, + {1, 0, 0, 1, 6, 31, }, + {0, 0, 0, 1, 7, 30, }, + {2, 0, 0, 1, 7, 31, }, + {1, 0, 0, 1, 7, 31, }, + {0, 0, 0, 1, 8, 30, }, + {2, 0, 0, 1, 8, 31, }, + {1, 0, 0, 1, 8, 31, }, + {0, 0, 0, 1, 9, 30, }, + {2, 0, 0, 1, 9, 31, }, + {1, 0, 0, 1, 9, 31, }, + {0, 0, 0, 1, 10, 28, }, + {2, 0, 0, 1, 10, 31, }, + {1, 0, 0, 1, 10, 31, }, + {0, 0, 0, 1, 11, 26, }, + {2, 0, 0, 1, 11, 31, }, + {1, 0, 0, 1, 11, 31, }, + {0, 0, 0, 1, 12, 24, }, + {2, 0, 0, 1, 12, 31, }, + {1, 0, 0, 1, 12, 31, }, + {0, 0, 0, 1, 13, 14, }, + {2, 0, 0, 1, 13, 31, }, + {1, 0, 0, 1, 13, 31, }, + {0, 0, 0, 1, 14, 63, }, + {2, 0, 0, 1, 14, 63, }, + {1, 0, 0, 1, 14, 63, }, + {0, 0, 0, 2, 1, 24, }, + {2, 0, 0, 2, 1, 31, }, + {1, 0, 0, 2, 1, 31, }, + {0, 0, 0, 2, 2, 26, }, + {2, 0, 0, 2, 2, 31, }, + {1, 0, 0, 2, 2, 31, }, + {0, 0, 0, 2, 3, 30, }, + {2, 0, 0, 2, 3, 31, }, + {1, 0, 0, 2, 3, 31, }, + {0, 0, 0, 2, 4, 30, }, + {2, 0, 0, 2, 4, 31, }, + {1, 0, 0, 2, 4, 31, }, + {0, 0, 0, 2, 5, 30, }, + {2, 0, 0, 2, 5, 31, }, + {1, 0, 0, 2, 5, 31, }, + {0, 0, 0, 2, 6, 30, }, + {2, 0, 0, 2, 6, 31, }, + {1, 0, 0, 2, 6, 31, }, + {0, 0, 0, 2, 7, 30, }, + {2, 0, 0, 2, 7, 31, }, + {1, 0, 0, 2, 7, 31, }, + {0, 0, 0, 2, 8, 30, }, + {2, 0, 0, 2, 8, 31, }, + {1, 0, 0, 2, 8, 31, }, + {0, 0, 0, 2, 9, 30, }, + {2, 0, 0, 2, 9, 31, }, + {1, 0, 0, 2, 9, 31, }, + {0, 0, 0, 2, 10, 26, }, + {2, 0, 0, 2, 10, 31, }, + {1, 0, 0, 2, 10, 31, }, + {0, 0, 0, 2, 11, 24, }, + {2, 0, 0, 2, 11, 31, }, + {1, 0, 0, 2, 11, 31, }, + {0, 0, 0, 2, 12, 23, }, + {2, 0, 0, 2, 12, 31, }, + {1, 0, 0, 2, 12, 31, }, + {0, 0, 0, 2, 13, 13, }, + {2, 0, 0, 2, 13, 31, }, + {1, 0, 0, 2, 13, 31, }, + {0, 0, 0, 2, 14, 63, }, + {2, 0, 0, 2, 14, 63, }, + {1, 0, 0, 2, 14, 63, }, + {0, 0, 0, 3, 1, 28, }, + {2, 0, 0, 3, 1, 30, }, + {1, 0, 0, 3, 1, 30, }, + {0, 0, 0, 3, 2, 28, }, + {2, 0, 0, 3, 2, 30, }, + {1, 0, 0, 3, 2, 30, }, + {0, 0, 0, 3, 3, 30, }, + {2, 0, 0, 3, 3, 30, }, + {1, 0, 0, 3, 3, 30, }, + {0, 0, 0, 3, 4, 30, }, + {2, 0, 0, 3, 4, 30, }, + {1, 0, 0, 3, 4, 30, }, + {0, 0, 0, 3, 5, 30, }, + {2, 0, 0, 3, 5, 30, }, + {1, 0, 0, 3, 5, 30, }, + {0, 0, 0, 3, 6, 30, }, + {2, 0, 0, 3, 6, 30, }, + {1, 0, 0, 3, 6, 30, }, + {0, 0, 0, 3, 7, 30, }, + {2, 0, 0, 3, 7, 30, }, + {1, 0, 0, 3, 7, 30, }, + {0, 0, 0, 3, 8, 30, }, + {2, 0, 0, 3, 8, 30, }, + {1, 0, 0, 3, 8, 30, }, + {0, 0, 0, 3, 9, 28, }, + {2, 0, 0, 3, 9, 30, }, + {1, 0, 0, 3, 9, 30, }, + {0, 0, 0, 3, 10, 28, }, + {2, 0, 0, 3, 10, 30, }, + {1, 0, 0, 3, 10, 30, }, + {0, 0, 0, 3, 11, 28, }, + {2, 0, 0, 3, 11, 30, }, + {1, 0, 0, 3, 11, 30, }, + {0, 0, 0, 3, 12, 63, }, + {2, 0, 0, 3, 12, 30, }, + {1, 0, 0, 3, 12, 30, }, + {0, 0, 0, 3, 13, 63, }, + {2, 0, 0, 3, 13, 30, }, + {1, 0, 0, 3, 13, 30, }, + {0, 0, 0, 3, 14, 63, }, + {2, 0, 0, 3, 14, 63, }, + {1, 0, 0, 3, 14, 63, }, + {0, 0, 1, 2, 1, 63, }, + {2, 0, 1, 2, 1, 63, }, + {1, 0, 1, 2, 1, 63, }, + {0, 0, 1, 2, 2, 63, }, + {2, 0, 1, 2, 2, 63, }, + {1, 0, 1, 2, 2, 63, }, + {0, 0, 1, 2, 3, 24, }, + {2, 0, 1, 2, 3, 30, }, + {1, 0, 1, 2, 3, 30, }, + {0, 0, 1, 2, 4, 24, }, + {2, 0, 1, 2, 4, 30, }, + {1, 0, 1, 2, 4, 30, }, + {0, 0, 1, 2, 5, 24, }, + {2, 0, 1, 2, 5, 30, }, + {1, 0, 1, 2, 5, 30, }, + {0, 0, 1, 2, 6, 24, }, + {2, 0, 1, 2, 6, 30, }, + {1, 0, 1, 2, 6, 30, }, + {0, 0, 1, 2, 7, 24, }, + {2, 0, 1, 2, 7, 30, }, + {1, 0, 1, 2, 7, 30, }, + {0, 0, 1, 2, 8, 24, }, + {2, 0, 1, 2, 8, 30, }, + {1, 0, 1, 2, 8, 30, }, + {0, 0, 1, 2, 9, 24, }, + {2, 0, 1, 2, 9, 30, }, + {1, 0, 1, 2, 9, 30, }, + {0, 0, 1, 2, 10, 22, }, + {2, 0, 1, 2, 10, 30, }, + {1, 0, 1, 2, 10, 30, }, + {0, 0, 1, 2, 11, 20, }, + {2, 0, 1, 2, 11, 30, }, + {1, 0, 1, 2, 11, 30, }, + {0, 0, 1, 2, 12, 63, }, + {2, 0, 1, 2, 12, 30, }, + {1, 0, 1, 2, 12, 30, }, + {0, 0, 1, 2, 13, 63, }, + {2, 0, 1, 2, 13, 30, }, + {1, 0, 1, 2, 13, 30, }, + {0, 0, 1, 2, 14, 63, }, + {2, 0, 1, 2, 14, 63, }, + {1, 0, 1, 2, 14, 63, }, + {0, 0, 1, 3, 1, 63, }, + {2, 0, 1, 3, 1, 63, }, + {1, 0, 1, 3, 1, 63, }, + {0, 0, 1, 3, 2, 63, }, + {2, 0, 1, 3, 2, 63, }, + {1, 0, 1, 3, 2, 63, }, + {0, 0, 1, 3, 3, 26, }, + {2, 0, 1, 3, 3, 26, }, + {1, 0, 1, 3, 3, 26, }, + {0, 0, 1, 3, 4, 26, }, + {2, 0, 1, 3, 4, 26, }, + {1, 0, 1, 3, 4, 26, }, + {0, 0, 1, 3, 5, 26, }, + {2, 0, 1, 3, 5, 26, }, + {1, 0, 1, 3, 5, 26, }, + {0, 0, 1, 3, 6, 26, }, + {2, 0, 1, 3, 6, 26, }, + {1, 0, 1, 3, 6, 26, }, + {0, 0, 1, 3, 7, 26, }, + {2, 0, 1, 3, 7, 26, }, + {1, 0, 1, 3, 7, 26, }, + {0, 0, 1, 3, 8, 26, }, + {2, 0, 1, 3, 8, 26, }, + {1, 0, 1, 3, 8, 26, }, + {0, 0, 1, 3, 9, 26, }, + {2, 0, 1, 3, 9, 26, }, + {1, 0, 1, 3, 9, 26, }, + {0, 0, 1, 3, 10, 26, }, + {2, 0, 1, 3, 10, 26, }, + {1, 0, 1, 3, 10, 26, }, + {0, 0, 1, 3, 11, 26, }, + {2, 0, 1, 3, 11, 26, }, + {1, 0, 1, 3, 11, 26, }, + {0, 0, 1, 3, 12, 63, }, + {2, 0, 1, 3, 12, 26, }, + {1, 0, 1, 3, 12, 26, }, + {0, 0, 1, 3, 13, 63, }, + {2, 0, 1, 3, 13, 26, }, + {1, 0, 1, 3, 13, 26, }, + {0, 0, 1, 3, 14, 63, }, + {2, 0, 1, 3, 14, 63, }, + {1, 0, 1, 3, 14, 63, }, +}; + +RTW_DECL_TABLE_TXPWR_LMT(rtw8723d_txpwr_lmt); diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d_table.h b/drivers/net/wireless/realtek/rtw88/rtw8723d_table.h index ea5933ffd043..4db996a1d982 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d_table.h +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d_table.h @@ -5,4 +5,11 @@ #ifndef __RTW8723D_TABLE_H__ #define __RTW8723D_TABLE_H__ +extern const struct rtw_table rtw8723d_mac_tbl; +extern const struct rtw_table rtw8723d_agc_tbl; +extern const struct rtw_table rtw8723d_bb_tbl; +extern const struct rtw_table rtw8723d_bb_pg_tbl; +extern const struct rtw_table rtw8723d_rf_a_tbl; +extern const struct rtw_table rtw8723d_txpwr_lmt_tbl; + #endif -- cgit v1.2.3-59-g8ed1b From 1afb5eb7a00dab15551bbfb24c4e8a750da21827 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 20 Apr 2020 13:50:52 +0800 Subject: rtw88: 8723d: Add cfg_ldo25 to control LDO25 Implement rtw_chip_ops::cfg_ldo25 to enable/disable LDO25 with proper voltage. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420055054.14592-7-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/reg.h | 5 +++++ drivers/net/wireless/realtek/rtw88/rtw8723d.c | 15 +++++++++++++++ drivers/net/wireless/realtek/rtw88/rtw8822b.c | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw88/reg.h b/drivers/net/wireless/realtek/rtw88/reg.h index 9d94534c9674..2afd547ebcc9 100644 --- a/drivers/net/wireless/realtek/rtw88/reg.h +++ b/drivers/net/wireless/realtek/rtw88/reg.h @@ -37,6 +37,11 @@ #define REG_LDO_EFUSE_CTRL 0x0034 #define BIT_MASK_EFUSE_BANK_SEL (BIT(8) | BIT(9)) +#define BIT_LDO25_VOLTAGE_V25 0x03 +#define BIT_MASK_LDO25_VOLTAGE GENMASK(6, 4) +#define BIT_SHIFT_LDO25_VOLTAGE 4 +#define BIT_LDO25_EN BIT(7) + #define REG_GPIO_MUXCFG 0x0040 #define BIT_FSPI_EN BIT(19) #define BIT_BT_AOD_GPIO3 BIT(9) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 4fe433549285..04f8d73e4e6c 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -14,10 +14,25 @@ #include "reg.h" #include "debug.h" +static void rtw8723d_cfg_ldo25(struct rtw_dev *rtwdev, bool enable) +{ + u8 ldo_pwr; + + ldo_pwr = rtw_read8(rtwdev, REG_LDO_EFUSE_CTRL + 3); + if (enable) { + ldo_pwr &= ~BIT_MASK_LDO25_VOLTAGE; + ldo_pwr = (BIT_LDO25_VOLTAGE_V25 << 4) | BIT_LDO25_EN; + } else { + ldo_pwr &= ~BIT_LDO25_EN; + } + rtw_write8(rtwdev, REG_LDO_EFUSE_CTRL + 3, ldo_pwr); +} + static struct rtw_chip_ops rtw8723d_ops = { .read_rf = rtw_phy_read_rf_sipi, .write_rf = rtw_phy_write_rf_reg_sipi, .set_antenna = NULL, + .cfg_ldo25 = rtw8723d_cfg_ldo25, .config_bfee = NULL, .set_gid_table = NULL, .cfg_csi_rate = NULL, diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822b.c b/drivers/net/wireless/realtek/rtw88/rtw8822b.c index c02f3a730369..9a2e18e7624f 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822b.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822b.c @@ -1030,7 +1030,7 @@ static void rtw8822b_cfg_ldo25(struct rtw_dev *rtwdev, bool enable) u8 ldo_pwr; ldo_pwr = rtw_read8(rtwdev, REG_LDO_EFUSE_CTRL + 3); - ldo_pwr = enable ? ldo_pwr | BIT(7) : ldo_pwr & ~BIT(7); + ldo_pwr = enable ? ldo_pwr | BIT_LDO25_EN : ldo_pwr & ~BIT_LDO25_EN; rtw_write8(rtwdev, REG_LDO_EFUSE_CTRL + 3, ldo_pwr); } -- cgit v1.2.3-59-g8ed1b From 44baa97ca820dbc81dfde076937d15bc725a3a54 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 20 Apr 2020 13:50:53 +0800 Subject: rtw88: 8723d: Add new chip op efuse_grant() to control efuse access 8723D devices need to grant efuse access before dumping physical efuse map, other chips don't need it, so keep this ops as blank. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420055054.14592-8-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/efuse.c | 4 ++++ drivers/net/wireless/realtek/rtw88/main.h | 13 +++++++++++++ drivers/net/wireless/realtek/rtw88/reg.h | 9 +++++++++ drivers/net/wireless/realtek/rtw88/rtw8723d.c | 13 +++++++++++++ 4 files changed, 39 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/efuse.c b/drivers/net/wireless/realtek/rtw88/efuse.c index 212c8376a8c9..df969d346b41 100644 --- a/drivers/net/wireless/realtek/rtw88/efuse.c +++ b/drivers/net/wireless/realtek/rtw88/efuse.c @@ -90,6 +90,8 @@ static int rtw_dump_physical_efuse_map(struct rtw_dev *rtwdev, u8 *map) u32 addr; u32 cnt; + rtw_chip_efuse_grant_on(rtwdev); + switch_efuse_bank(rtwdev); /* disable 2.5V LDO */ @@ -113,6 +115,8 @@ static int rtw_dump_physical_efuse_map(struct rtw_dev *rtwdev, u8 *map) *(map + addr) = (u8)(efuse_ctl & BIT_MASK_EF_DATA); } + rtw_chip_efuse_grant_off(rtwdev); + return 0; } diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index 8f15fc113af0..74302181da53 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -811,6 +811,7 @@ struct rtw_chip_ops { u32 antenna_tx, u32 antenna_rx); void (*cfg_ldo25)(struct rtw_dev *rtwdev, bool enable); + void (*efuse_grant)(struct rtw_dev *rtwdev, bool enable); void (*false_alarm_statistics)(struct rtw_dev *rtwdev); void (*phy_calibration)(struct rtw_dev *rtwdev); void (*dpk_track)(struct rtw_dev *rtwdev); @@ -1712,6 +1713,18 @@ static inline bool rtw_ssid_equal(struct cfg80211_ssid *a, return true; } +static inline void rtw_chip_efuse_grant_on(struct rtw_dev *rtwdev) +{ + if (rtwdev->chip->ops->efuse_grant) + rtwdev->chip->ops->efuse_grant(rtwdev, true); +} + +static inline void rtw_chip_efuse_grant_off(struct rtw_dev *rtwdev) +{ + if (rtwdev->chip->ops->efuse_grant) + rtwdev->chip->ops->efuse_grant(rtwdev, false); +} + void rtw_get_channel_params(struct cfg80211_chan_def *chandef, struct rtw_channel_params *ch_param); bool check_hw_ready(struct rtw_dev *rtwdev, u32 addr, u32 mask, u32 target); diff --git a/drivers/net/wireless/realtek/rtw88/reg.h b/drivers/net/wireless/realtek/rtw88/reg.h index 2afd547ebcc9..911d8e75db77 100644 --- a/drivers/net/wireless/realtek/rtw88/reg.h +++ b/drivers/net/wireless/realtek/rtw88/reg.h @@ -6,6 +6,7 @@ #define __RTW_REG_DEF_H__ #define REG_SYS_FUNC_EN 0x0002 +#define BIT_FEN_ELDR BIT(12) #define BIT_FEN_CPUEN BIT(2) #define BIT_FEN_BB_GLB_RST BIT(1) #define BIT_FEN_BB_RSTB BIT(0) @@ -15,6 +16,10 @@ #define REG_SYS_CLK_CTRL 0x0008 #define BIT_CPU_CLK_EN BIT(14) +#define REG_SYS_CLKR 0x0008 +#define BIT_ANA8M BIT(1) +#define BIT_LOADER_CLK_EN BIT(5) + #define REG_RSV_CTRL 0x001C #define DISABLE_PI 0x3 #define ENABLE_PI 0x2 @@ -87,6 +92,10 @@ BIT_CHECK_SUM_OK) #define FW_READY_MASK 0xffff +#define REG_EFUSE_ACCESS 0x00CF +#define EFUSE_ACCESS_ON 0x69 +#define EFUSE_ACCESS_OFF 0x00 + #define REG_WLRF1 0x00EC #define REG_WIFI_BT_INFO 0x00AA #define BIT_BT_INT_EN BIT(15) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 04f8d73e4e6c..756454d69fad 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -28,11 +28,24 @@ static void rtw8723d_cfg_ldo25(struct rtw_dev *rtwdev, bool enable) rtw_write8(rtwdev, REG_LDO_EFUSE_CTRL + 3, ldo_pwr); } +static void rtw8723d_efuse_grant(struct rtw_dev *rtwdev, bool on) +{ + if (on) { + rtw_write8(rtwdev, REG_EFUSE_ACCESS, EFUSE_ACCESS_ON); + + rtw_write16_set(rtwdev, REG_SYS_FUNC_EN, BIT_FEN_ELDR); + rtw_write16_set(rtwdev, REG_SYS_CLKR, BIT_LOADER_CLK_EN | BIT_ANA8M); + } else { + rtw_write8(rtwdev, REG_EFUSE_ACCESS, EFUSE_ACCESS_OFF); + } +} + static struct rtw_chip_ops rtw8723d_ops = { .read_rf = rtw_phy_read_rf_sipi, .write_rf = rtw_phy_write_rf_reg_sipi, .set_antenna = NULL, .cfg_ldo25 = rtw8723d_cfg_ldo25, + .efuse_grant = rtw8723d_efuse_grant, .config_bfee = NULL, .set_gid_table = NULL, .cfg_csi_rate = NULL, -- cgit v1.2.3-59-g8ed1b From ab0a031ecf2908c77833caebf0c86bab5e9f12b7 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 20 Apr 2020 13:50:54 +0800 Subject: rtw88: 8723d: Add read_efuse to recognize efuse info from map The logical efuse map is decoded from physical map by parsing the header format of the physical map. And each different type of chips has different logical efuse layout. So add the logical map's layout for parsing the efuse contents. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420055054.14592-9-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/rtw8723d.c | 43 +++++++++++++++++++++++++++ drivers/net/wireless/realtek/rtw88/rtw8723d.h | 39 ++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 756454d69fad..c25cabbab64d 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -14,6 +14,48 @@ #include "reg.h" #include "debug.h" +static void rtw8723de_efuse_parsing(struct rtw_efuse *efuse, + struct rtw8723d_efuse *map) +{ + ether_addr_copy(efuse->addr, map->e.mac_addr); +} + +static int rtw8723d_read_efuse(struct rtw_dev *rtwdev, u8 *log_map) +{ + struct rtw_efuse *efuse = &rtwdev->efuse; + struct rtw8723d_efuse *map; + int i; + + map = (struct rtw8723d_efuse *)log_map; + + efuse->rfe_option = 0; + efuse->rf_board_option = map->rf_board_option; + efuse->crystal_cap = map->xtal_k; + efuse->pa_type_2g = map->pa_type; + efuse->lna_type_2g = map->lna_type_2g[0]; + efuse->channel_plan = map->channel_plan; + efuse->country_code[0] = map->country_code[0]; + efuse->country_code[1] = map->country_code[1]; + efuse->bt_setting = map->rf_bt_setting; + efuse->regd = map->rf_board_option & 0x7; + efuse->thermal_meter[0] = map->thermal_meter; + efuse->thermal_meter_k = map->thermal_meter; + + for (i = 0; i < 4; i++) + efuse->txpwr_idx_table[i] = map->txpwr_idx_table[i]; + + switch (rtw_hci_type(rtwdev)) { + case RTW_HCI_TYPE_PCIE: + rtw8723de_efuse_parsing(efuse, map); + break; + default: + /* unsupported now */ + return -ENOTSUPP; + } + + return 0; +} + static void rtw8723d_cfg_ldo25(struct rtw_dev *rtwdev, bool enable) { u8 ldo_pwr; @@ -41,6 +83,7 @@ static void rtw8723d_efuse_grant(struct rtw_dev *rtwdev, bool on) } static struct rtw_chip_ops rtw8723d_ops = { + .read_efuse = rtw8723d_read_efuse, .read_rf = rtw_phy_read_rf_sipi, .write_rf = rtw_phy_write_rf_reg_sipi, .set_antenna = NULL, diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.h b/drivers/net/wireless/realtek/rtw88/rtw8723d.h index 0b784cfc34c6..1939d9897a26 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.h +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.h @@ -5,4 +5,43 @@ #ifndef __RTW8723D_H__ #define __RTW8723D_H__ +struct rtw8723de_efuse { + u8 mac_addr[ETH_ALEN]; /* 0xd0 */ + u8 vender_id[2]; + u8 device_id[2]; + u8 sub_vender_id[2]; + u8 sub_device_id[2]; +}; + +struct rtw8723d_efuse { + __le16 rtl_id; + u8 rsvd[2]; + u8 afe; + u8 rsvd1[11]; + + /* power index for four RF paths */ + struct rtw_txpwr_idx txpwr_idx_table[4]; + + u8 channel_plan; /* 0xb8 */ + u8 xtal_k; + u8 thermal_meter; + u8 iqk_lck; + u8 pa_type; /* 0xbc */ + u8 lna_type_2g[2]; /* 0xbd */ + u8 lna_type_5g[2]; + u8 rf_board_option; + u8 rf_feature_option; + u8 rf_bt_setting; + u8 eeprom_version; + u8 eeprom_customer_id; + u8 tx_bb_swing_setting_2g; + u8 res_c7; + u8 tx_pwr_calibrate_rate; + u8 rf_antenna_option; /* 0xc9 */ + u8 rfe_option; + u8 country_code[2]; + u8 res[3]; + struct rtw8723de_efuse e; +}; + #endif -- cgit v1.2.3-59-g8ed1b From 5ad4d8957b69f3ebf95ac02212c388bda75aeb30 Mon Sep 17 00:00:00 2001 From: Tzu-En Huang Date: Mon, 20 Apr 2020 18:52:07 +0800 Subject: rtw88: set power trim according to efuse PG values 8822C devices have power trim, thermal and PA bias values programmed in efuse. Driver should configure the RF components according to the values. If the power trim is not configured, then the devices might have distortion on the output tx power. Signed-off-by: Tzu-En Huang Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420105207.31899-1-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/efuse.c | 22 +++++ drivers/net/wireless/realtek/rtw88/efuse.h | 3 + drivers/net/wireless/realtek/rtw88/rtw8822c.c | 113 ++++++++++++++++++++++++++ drivers/net/wireless/realtek/rtw88/rtw8822c.h | 28 +++++++ 4 files changed, 166 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/efuse.c b/drivers/net/wireless/realtek/rtw88/efuse.c index df969d346b41..13d1c58d6de5 100644 --- a/drivers/net/wireless/realtek/rtw88/efuse.c +++ b/drivers/net/wireless/realtek/rtw88/efuse.c @@ -2,6 +2,8 @@ /* Copyright(c) 2018-2019 Realtek Corporation */ +#include + #include "main.h" #include "efuse.h" #include "reg.h" @@ -120,6 +122,26 @@ static int rtw_dump_physical_efuse_map(struct rtw_dev *rtwdev, u8 *map) return 0; } +int rtw_read8_physical_efuse(struct rtw_dev *rtwdev, u16 addr, u8 *data) +{ + u32 efuse_ctl; + int ret; + + rtw_write32_mask(rtwdev, REG_EFUSE_CTRL, 0x3ff00, addr); + rtw_write32_clr(rtwdev, REG_EFUSE_CTRL, BIT_EF_FLAG); + + ret = read_poll_timeout(rtw_read32, efuse_ctl, efuse_ctl & BIT_EF_FLAG, + 1000, 100000, false, rtwdev, REG_EFUSE_CTRL); + if (ret) { + *data = EFUSE_READ_FAIL; + return ret; + } + + *data = rtw_read8(rtwdev, REG_EFUSE_CTRL); + + return 0; +} + int rtw_parse_efuse_map(struct rtw_dev *rtwdev) { struct rtw_chip_info *chip = rtwdev->chip; diff --git a/drivers/net/wireless/realtek/rtw88/efuse.h b/drivers/net/wireless/realtek/rtw88/efuse.h index 115bbe85946a..97a51f0b0e46 100644 --- a/drivers/net/wireless/realtek/rtw88/efuse.h +++ b/drivers/net/wireless/realtek/rtw88/efuse.h @@ -10,6 +10,8 @@ #define EFUSE_HW_CAP_SUPP_BW80 7 #define EFUSE_HW_CAP_SUPP_BW40 6 +#define EFUSE_READ_FAIL 0xff + #define GET_EFUSE_HW_CAP_HCI(hw_cap) \ le32_get_bits(*((__le32 *)(hw_cap) + 0x01), GENMASK(3, 0)) #define GET_EFUSE_HW_CAP_BW(hw_cap) \ @@ -22,5 +24,6 @@ le32_get_bits(*((__le32 *)(hw_cap) + 0x01), GENMASK(27, 26)) int rtw_parse_efuse_map(struct rtw_dev *rtwdev); +int rtw_read8_physical_efuse(struct rtw_dev *rtwdev, u16 addr, u8 *data); #endif diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.c b/drivers/net/wireless/realtek/rtw88/rtw8822c.c index c99b1de54bfc..ee0d39135617 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822c.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.c @@ -15,6 +15,7 @@ #include "debug.h" #include "util.h" #include "bf.h" +#include "efuse.h" static void rtw8822c_config_trx_mode(struct rtw_dev *rtwdev, u8 tx_path, u8 rx_path, bool is_tx2_path); @@ -1000,10 +1001,122 @@ static void rtw8822c_rf_x2_check(struct rtw_dev *rtwdev) } } +static void rtw8822c_set_power_trim(struct rtw_dev *rtwdev, s8 bb_gain[2][8]) +{ +#define RF_SET_POWER_TRIM(_path, _seq, _idx) \ + do { \ + rtw_write_rf(rtwdev, _path, 0x33, RFREG_MASK, _seq); \ + rtw_write_rf(rtwdev, _path, 0x3f, RFREG_MASK, \ + bb_gain[_path][_idx]); \ + } while (0) + u8 path; + + for (path = 0; path < rtwdev->hal.rf_path_num; path++) { + rtw_write_rf(rtwdev, path, 0xee, BIT(19), 1); + RF_SET_POWER_TRIM(path, 0x0, 0); + RF_SET_POWER_TRIM(path, 0x1, 1); + RF_SET_POWER_TRIM(path, 0x2, 2); + RF_SET_POWER_TRIM(path, 0x3, 2); + RF_SET_POWER_TRIM(path, 0x4, 3); + RF_SET_POWER_TRIM(path, 0x5, 4); + RF_SET_POWER_TRIM(path, 0x6, 5); + RF_SET_POWER_TRIM(path, 0x7, 6); + RF_SET_POWER_TRIM(path, 0x8, 7); + RF_SET_POWER_TRIM(path, 0x9, 3); + RF_SET_POWER_TRIM(path, 0xa, 4); + RF_SET_POWER_TRIM(path, 0xb, 5); + RF_SET_POWER_TRIM(path, 0xc, 6); + RF_SET_POWER_TRIM(path, 0xd, 7); + RF_SET_POWER_TRIM(path, 0xe, 7); + rtw_write_rf(rtwdev, path, 0xee, BIT(19), 0); + } +#undef RF_SET_POWER_TRIM +} + +static void rtw8822c_power_trim(struct rtw_dev *rtwdev) +{ + u8 pg_pwr = 0xff, i, path, idx; + s8 bb_gain[2][8] = {0}; + u16 rf_efuse_2g[3] = {PPG_2GL_TXAB, PPG_2GM_TXAB, PPG_2GH_TXAB}; + u16 rf_efuse_5g[2][5] = {{PPG_5GL1_TXA, PPG_5GL2_TXA, PPG_5GM1_TXA, + PPG_5GM2_TXA, PPG_5GH1_TXA}, + {PPG_5GL1_TXB, PPG_5GL2_TXB, PPG_5GM1_TXB, + PPG_5GM2_TXB, PPG_5GH1_TXB} }; + bool set = false; + + for (i = 0; i < ARRAY_SIZE(rf_efuse_2g); i++) { + rtw_read8_physical_efuse(rtwdev, rf_efuse_2g[i], &pg_pwr); + if (pg_pwr == EFUSE_READ_FAIL) + continue; + set = true; + bb_gain[RF_PATH_A][i] = FIELD_GET(PPG_2G_A_MASK, pg_pwr); + bb_gain[RF_PATH_B][i] = FIELD_GET(PPG_2G_B_MASK, pg_pwr); + } + + for (i = 0; i < ARRAY_SIZE(rf_efuse_5g[0]); i++) { + for (path = 0; path < rtwdev->hal.rf_path_num; path++) { + rtw_read8_physical_efuse(rtwdev, rf_efuse_5g[path][i], + &pg_pwr); + if (pg_pwr == EFUSE_READ_FAIL) + continue; + set = true; + idx = i + ARRAY_SIZE(rf_efuse_2g); + bb_gain[path][idx] = FIELD_GET(PPG_5G_MASK, pg_pwr); + } + } + if (set) + rtw8822c_set_power_trim(rtwdev, bb_gain); + + rtw_write32_mask(rtwdev, REG_DIS_DPD, DIS_DPD_MASK, DIS_DPD_RATEALL); +} + +static void rtw8822c_thermal_trim(struct rtw_dev *rtwdev) +{ + u16 rf_efuse[2] = {PPG_THERMAL_A, PPG_THERMAL_B}; + u8 pg_therm = 0xff, thermal[2] = {0}, path; + + for (path = 0; path < rtwdev->hal.rf_path_num; path++) { + rtw_read8_physical_efuse(rtwdev, rf_efuse[path], &pg_therm); + if (pg_therm == EFUSE_READ_FAIL) + return; + /* Efuse value of BIT(0) shall be move to BIT(3), and the value + * of BIT(1) to BIT(3) should be right shifted 1 bit. + */ + thermal[path] = FIELD_GET(GENMASK(3, 1), pg_therm); + thermal[path] |= FIELD_PREP(BIT(3), pg_therm & BIT(0)); + rtw_write_rf(rtwdev, path, 0x43, RF_THEMAL_MASK, thermal[path]); + } +} + +static void rtw8822c_pa_bias(struct rtw_dev *rtwdev) +{ + u16 rf_efuse_2g[2] = {PPG_PABIAS_2GA, PPG_PABIAS_2GB}; + u16 rf_efuse_5g[2] = {PPG_PABIAS_5GA, PPG_PABIAS_5GB}; + u8 pg_pa_bias = 0xff, path; + + for (path = 0; path < rtwdev->hal.rf_path_num; path++) { + rtw_read8_physical_efuse(rtwdev, rf_efuse_2g[path], + &pg_pa_bias); + if (pg_pa_bias == EFUSE_READ_FAIL) + return; + pg_pa_bias = FIELD_GET(PPG_PABIAS_MASK, pg_pa_bias); + rtw_write_rf(rtwdev, path, 0x60, RF_PABIAS_2G_MASK, pg_pa_bias); + } + for (path = 0; path < rtwdev->hal.rf_path_num; path++) { + rtw_read8_physical_efuse(rtwdev, rf_efuse_5g[path], + &pg_pa_bias); + pg_pa_bias = FIELD_GET(PPG_PABIAS_MASK, pg_pa_bias); + rtw_write_rf(rtwdev, path, 0x60, RF_PABIAS_5G_MASK, pg_pa_bias); + } +} + static void rtw8822c_rf_init(struct rtw_dev *rtwdev) { rtw8822c_rf_dac_cal(rtwdev); rtw8822c_rf_x2_check(rtwdev); + rtw8822c_thermal_trim(rtwdev); + rtw8822c_power_trim(rtwdev); + rtw8822c_pa_bias(rtwdev); } static void rtw8822c_pwrtrack_init(struct rtw_dev *rtwdev) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.h b/drivers/net/wireless/realtek/rtw88/rtw8822c.h index dfd8662a0c0e..32b4771e04d0 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822c.h +++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.h @@ -309,4 +309,32 @@ const struct rtw_table name ## _tbl = { \ #define BIT_GS_PWSF GENMASK(27, 0) #define BIT_RPT_DGAIN GENMASK(27, 16) #define BIT_TX_CFIR GENMASK(31, 30) + +#define PPG_THERMAL_A 0x1ef +#define PPG_THERMAL_B 0x1b0 +#define RF_THEMAL_MASK GENMASK(19, 16) +#define PPG_2GL_TXAB 0x1d4 +#define PPG_2GM_TXAB 0x1ee +#define PPG_2GH_TXAB 0x1d2 +#define PPG_2G_A_MASK GENMASK(3, 0) +#define PPG_2G_B_MASK GENMASK(7, 4) +#define PPG_5GL1_TXA 0x1ec +#define PPG_5GL2_TXA 0x1e8 +#define PPG_5GM1_TXA 0x1e4 +#define PPG_5GM2_TXA 0x1e0 +#define PPG_5GH1_TXA 0x1dc +#define PPG_5GL1_TXB 0x1eb +#define PPG_5GL2_TXB 0x1e7 +#define PPG_5GM1_TXB 0x1e3 +#define PPG_5GM2_TXB 0x1df +#define PPG_5GH1_TXB 0x1db +#define PPG_5G_MASK GENMASK(4, 0) +#define PPG_PABIAS_2GA 0x1d6 +#define PPG_PABIAS_2GB 0x1d5 +#define PPG_PABIAS_5GA 0x1d8 +#define PPG_PABIAS_5GB 0x1d7 +#define PPG_PABIAS_MASK GENMASK(3, 0) +#define RF_PABIAS_2G_MASK GENMASK(15, 12) +#define RF_PABIAS_5G_MASK GENMASK(19, 16) + #endif -- cgit v1.2.3-59-g8ed1b From 1c79031f8a75c132bbd42e3ef20267af97b67466 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 19 Apr 2020 17:18:47 +0300 Subject: drivers: Remove inclusion of vermagic header Get rid of linux/vermagic.h includes, so that MODULE_ARCH_VERMAGIC from the arch header arch/x86/include/asm/module.h won't be redefined. In file included from ./include/linux/module.h:30, from drivers/net/ethernet/3com/3c515.c:56: ./arch/x86/include/asm/module.h:73: warning: "MODULE_ARCH_VERMAGIC" redefined 73 | # define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY | In file included from drivers/net/ethernet/3com/3c515.c:25: ./include/linux/vermagic.h:28: note: this is the location of the previous definition 28 | #define MODULE_ARCH_VERMAGIC "" | Fixes: 6bba2e89a88c ("net/3com: Delete driver and module versions from 3com drivers") Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Acked-by: Shannon Nelson # ionic Acked-by: Sebastian Reichel # power Signed-off-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/bonding/bonding_priv.h | 2 +- drivers/net/ethernet/3com/3c509.c | 1 - drivers/net/ethernet/3com/3c515.c | 1 - drivers/net/ethernet/adaptec/starfire.c | 1 - drivers/net/ethernet/pensando/ionic/ionic_main.c | 2 +- drivers/power/supply/test_power.c | 2 +- net/ethtool/ioctl.c | 3 +-- 7 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/net/bonding/bonding_priv.h b/drivers/net/bonding/bonding_priv.h index 45b77bc8c7b3..48cdf3a49a7d 100644 --- a/drivers/net/bonding/bonding_priv.h +++ b/drivers/net/bonding/bonding_priv.h @@ -14,7 +14,7 @@ #ifndef _BONDING_PRIV_H #define _BONDING_PRIV_H -#include +#include #define DRV_NAME "bonding" #define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" diff --git a/drivers/net/ethernet/3com/3c509.c b/drivers/net/ethernet/3com/3c509.c index b762176a1406..139d0120f511 100644 --- a/drivers/net/ethernet/3com/3c509.c +++ b/drivers/net/ethernet/3com/3c509.c @@ -85,7 +85,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/3com/3c515.c b/drivers/net/ethernet/3com/3c515.c index 90312fcd6319..47b4215bb93b 100644 --- a/drivers/net/ethernet/3com/3c515.c +++ b/drivers/net/ethernet/3com/3c515.c @@ -22,7 +22,6 @@ */ -#include #define DRV_NAME "3c515" #define CORKSCREW 1 diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 2db42211329f..a64191fc2af9 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -45,7 +45,6 @@ #include /* Processor type for cache alignment. */ #include #include -#include /* * The current frame processor firmware fails to checksum a fragment diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 588c62e9add7..3ed150512091 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include "ionic.h" #include "ionic_bus.h" diff --git a/drivers/power/supply/test_power.c b/drivers/power/supply/test_power.c index 65c23ef6408d..b3c05ff05783 100644 --- a/drivers/power/supply/test_power.c +++ b/drivers/power/supply/test_power.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include enum test_power_id { TEST_AC, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 593fa665f820..226d5ecdd567 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,7 @@ #include #include #include - +#include #include "common.h" /* -- cgit v1.2.3-59-g8ed1b From cad99e506887e257ce8bce826ec25f7854b7e69a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 19 Apr 2020 17:18:48 +0300 Subject: net/hns: Remove custom driver version in favour of global one Use globally defined kernel version instead of custom driver variant. Reported-by: Borislav Petkov Signed-off-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 3 --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 4 ---- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 4 ---- 3 files changed, 11 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index da98fd7c8eca..ac3a48a24d86 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -44,7 +43,6 @@ static void hns3_clear_all_ring(struct hnae3_handle *h, bool force); static void hns3_remove_hw_addr(struct net_device *netdev); static const char hns3_driver_name[] = "hns3"; -const char hns3_driver_version[] = VERMAGIC_STRING; static const char hns3_driver_string[] = "Hisilicon Ethernet Network Driver for Hip08 Family"; static const char hns3_copyright[] = "Copyright (c) 2017 Huawei Corporation."; @@ -4765,4 +4763,3 @@ MODULE_DESCRIPTION("HNS3: Hisilicon Ethernet Driver"); MODULE_AUTHOR("Huawei Tech. Co., Ltd."); MODULE_LICENSE("GPL"); MODULE_ALIAS("pci:hns-nic"); -MODULE_VERSION(HNS3_MOD_VERSION); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index abefd7a179f7..4b3f0abf0715 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -8,10 +8,6 @@ #include "hnae3.h" -#define HNS3_MOD_VERSION "1.0" - -extern const char hns3_driver_version[]; - enum hns3_nic_state { HNS3_NIC_STATE_TESTING, HNS3_NIC_STATE_RESETTING, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 28b81f24afa1..6a0734be4a1a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -546,10 +546,6 @@ static void hns3_get_drvinfo(struct net_device *netdev, return; } - strncpy(drvinfo->version, hns3_driver_version, - sizeof(drvinfo->version)); - drvinfo->version[sizeof(drvinfo->version) - 1] = '\0'; - strncpy(drvinfo->driver, h->pdev->driver->name, sizeof(drvinfo->driver)); drvinfo->driver[sizeof(drvinfo->driver) - 1] = '\0'; -- cgit v1.2.3-59-g8ed1b From b4f37219813fd126b2cda6d7805d8c61b8cf801a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 19 Apr 2020 17:18:49 +0300 Subject: net/nfp: Update driver to use global kernel version Change nfp driver to use globally defined kernel version. Reported-by: Borislav Petkov Acked-by: Jakub Kicinski Signed-off-by: Leon Romanovsky Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_main.c | 3 --- drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 2 -- 2 files changed, 5 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c index 4d282fc56009..7ff2ccbd43b0 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -31,7 +30,6 @@ #include "nfp_net.h" static const char nfp_driver_name[] = "nfp"; -const char nfp_driver_version[] = VERMAGIC_STRING; static const struct pci_device_id nfp_pci_device_ids[] = { { PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP6000, @@ -920,4 +918,3 @@ MODULE_FIRMWARE("netronome/nic_AMDA0099-0001_1x10_1x25.nffw"); MODULE_AUTHOR("Netronome Systems "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("The Netronome Flow Processor (NFP) driver."); -MODULE_VERSION(UTS_RELEASE); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index 2779f1526d1e..a5aa3219d112 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -203,8 +203,6 @@ nfp_get_drvinfo(struct nfp_app *app, struct pci_dev *pdev, char nsp_version[ETHTOOL_FWVERS_LEN] = {}; strlcpy(drvinfo->driver, pdev->driver->name, sizeof(drvinfo->driver)); - strlcpy(drvinfo->version, nfp_driver_version, sizeof(drvinfo->version)); - nfp_net_get_nspinfo(app, nsp_version); snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%s %s %s %s", vnic_version, nsp_version, -- cgit v1.2.3-59-g8ed1b From 51161bfc66a68d21f13d15a689b3ea7980457790 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 19 Apr 2020 18:55:06 +0300 Subject: kernel/module: Hide vermagic header file from general use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VERMAGIC* definitions are not supposed to be used by the drivers, see this [1] bug report, so introduce special define to guard inclusion of this header file and define it in kernel/modules.h and in internal script that generates *.mod.c files. In-tree module build: ➜ kernel git:(vermagic) ✗ make clean ➜ kernel git:(vermagic) ✗ make M=drivers/infiniband/hw/mlx5 ➜ kernel git:(vermagic) ✗ modinfo drivers/infiniband/hw/mlx5/mlx5_ib.ko filename: /images/leonro/src/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.ko <...> vermagic: 5.6.0+ SMP mod_unload modversions Out-of-tree module build: ➜ mlx5 make -C /images/leonro/src/kernel clean M=/tmp/mlx5 ➜ mlx5 make -C /images/leonro/src/kernel M=/tmp/mlx5 ➜ mlx5 modinfo /tmp/mlx5/mlx5_ib.ko filename: /tmp/mlx5/mlx5_ib.ko <...> vermagic: 5.6.0+ SMP mod_unload modversions [1] https://lore.kernel.org/lkml/20200411155623.GA22175@zn.tnic Reported-by: Borislav Petkov Acked-by: Borislav Petkov Acked-by: Jessica Yu Co-developed-by: Masahiro Yamada Signed-off-by: Masahiro Yamada Signed-off-by: Leon Romanovsky Signed-off-by: David S. Miller --- include/linux/vermagic.h | 5 +++++ kernel/module.c | 3 +++ scripts/mod/modpost.c | 1 + 3 files changed, 9 insertions(+) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 9aced11e9000..7768d20ada39 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef INCLUDE_VERMAGIC +#error "This header can be included from kernel/module.c or *.mod.c only" +#endif + #include /* Simply sanity version stamp for modules. */ diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..8833e848b73c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4,6 +4,9 @@ Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. */ + +#define INCLUDE_VERMAGIC + #include #include #include diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 5c3c50c5ec52..7f7d4ee7b652 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2251,6 +2251,7 @@ static void add_header(struct buffer *b, struct module *mod) * Include build-salt.h after module.h in order to * inherit the definitions. */ + buf_printf(b, "#define INCLUDE_VERMAGIC\n"); buf_printf(b, "#include \n"); buf_printf(b, "#include \n"); buf_printf(b, "#include \n"); -- cgit v1.2.3-59-g8ed1b From 2b49d128b3f8d8fff8972afcbc603802e5e40c6a Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 20 Apr 2020 10:46:45 +0800 Subject: net: mscc: ocelot: move ocelot ptp clock code out of ocelot.c The Ocelot PTP clock driver had been embedded into ocelot.c driver. It had supported basic gettime64/settime64/adjtime/adjfine functions by now which were used by both Ocelot switch and Felix switch. This patch is to move current ptp clock code out of ocelot.c driver maintaining as a single ocelot_ptp.c. For futher new features implementation, the common code could be put in ocelot_ptp.c and the switch specific code should be in specific switch driver. The interrupt implementation in SoC is different between Ocelot and Felix. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 25 ++++ drivers/net/ethernet/mscc/Makefile | 2 +- drivers/net/ethernet/mscc/ocelot.c | 206 ------------------------------- drivers/net/ethernet/mscc/ocelot.h | 3 +- drivers/net/ethernet/mscc/ocelot_board.c | 25 ++++ drivers/net/ethernet/mscc/ocelot_ptp.c | 203 ++++++++++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot_ptp.h | 41 ------ include/soc/mscc/ocelot.h | 1 - include/soc/mscc/ocelot_ptp.h | 52 ++++++++ 9 files changed, 307 insertions(+), 251 deletions(-) create mode 100644 drivers/net/ethernet/mscc/ocelot_ptp.c delete mode 100644 drivers/net/ethernet/mscc/ocelot_ptp.h create mode 100644 include/soc/mscc/ocelot_ptp.h diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index d0a3764ff0cf..44015a24b087 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -494,6 +495,21 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) return 0; } +static struct ptp_clock_info ocelot_ptp_clock_info = { + .owner = THIS_MODULE, + .name = "felix ptp", + .max_adj = 0x7fffffff, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .n_pins = 0, + .pps = 0, + .gettime64 = ocelot_ptp_gettime64, + .settime64 = ocelot_ptp_settime64, + .adjtime = ocelot_ptp_adjtime, + .adjfine = ocelot_ptp_adjfine, +}; + /* Hardware initialization done here so that we can allocate structures with * devm without fear of dsa_register_switch returning -EPROBE_DEFER and causing * us to allocate structures twice (leak memory) and map PCI memory twice @@ -510,6 +526,14 @@ static int felix_setup(struct dsa_switch *ds) return err; ocelot_init(ocelot); + if (ocelot->ptp) { + err = ocelot_init_timestamp(ocelot, &ocelot_ptp_clock_info); + if (err) { + dev_err(ocelot->dev, + "Timestamp initialization failed\n"); + ocelot->ptp = 0; + } + } for (port = 0; port < ds->num_ports; port++) { ocelot_init_port(ocelot, port); @@ -548,6 +572,7 @@ static void felix_teardown(struct dsa_switch *ds) if (felix->info->mdio_bus_free) felix->info->mdio_bus_free(ocelot); + ocelot_deinit_timestamp(ocelot); /* stop workqueue thread */ ocelot_deinit(ocelot); } diff --git a/drivers/net/ethernet/mscc/Makefile b/drivers/net/ethernet/mscc/Makefile index 9a36c26095c8..91b33b55054e 100644 --- a/drivers/net/ethernet/mscc/Makefile +++ b/drivers/net/ethernet/mscc/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: (GPL-2.0 OR MIT) obj-$(CONFIG_MSCC_OCELOT_SWITCH) += mscc_ocelot_common.o mscc_ocelot_common-y := ocelot.o ocelot_io.o -mscc_ocelot_common-y += ocelot_regs.o ocelot_tc.o ocelot_police.o ocelot_ace.o ocelot_flower.o +mscc_ocelot_common-y += ocelot_regs.o ocelot_tc.o ocelot_police.o ocelot_ace.o ocelot_flower.o ocelot_ptp.o obj-$(CONFIG_MSCC_OCELOT_SWITCH_OCELOT) += ocelot_board.o diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index a8c48a4a708f..7c4165af9f66 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -1991,200 +1990,6 @@ struct notifier_block ocelot_switchdev_blocking_nb __read_mostly = { }; EXPORT_SYMBOL(ocelot_switchdev_blocking_nb); -int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts) -{ - struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info); - unsigned long flags; - time64_t s; - u32 val; - s64 ns; - - spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); - - val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); - val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM); - val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_SAVE); - ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); - - s = ocelot_read_rix(ocelot, PTP_PIN_TOD_SEC_MSB, TOD_ACC_PIN) & 0xffff; - s <<= 32; - s += ocelot_read_rix(ocelot, PTP_PIN_TOD_SEC_LSB, TOD_ACC_PIN); - ns = ocelot_read_rix(ocelot, PTP_PIN_TOD_NSEC, TOD_ACC_PIN); - - spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); - - /* Deal with negative values */ - if (ns >= 0x3ffffff0 && ns <= 0x3fffffff) { - s--; - ns &= 0xf; - ns += 999999984; - } - - set_normalized_timespec64(ts, s, ns); - return 0; -} -EXPORT_SYMBOL(ocelot_ptp_gettime64); - -static int ocelot_ptp_settime64(struct ptp_clock_info *ptp, - const struct timespec64 *ts) -{ - struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info); - unsigned long flags; - u32 val; - - spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); - - val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); - val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM); - val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_IDLE); - - ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); - - ocelot_write_rix(ocelot, lower_32_bits(ts->tv_sec), PTP_PIN_TOD_SEC_LSB, - TOD_ACC_PIN); - ocelot_write_rix(ocelot, upper_32_bits(ts->tv_sec), PTP_PIN_TOD_SEC_MSB, - TOD_ACC_PIN); - ocelot_write_rix(ocelot, ts->tv_nsec, PTP_PIN_TOD_NSEC, TOD_ACC_PIN); - - val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); - val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM); - val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_LOAD); - - ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); - - spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); - return 0; -} - -static int ocelot_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) -{ - if (delta > -(NSEC_PER_SEC / 2) && delta < (NSEC_PER_SEC / 2)) { - struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info); - unsigned long flags; - u32 val; - - spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); - - val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); - val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM); - val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_IDLE); - - ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); - - ocelot_write_rix(ocelot, 0, PTP_PIN_TOD_SEC_LSB, TOD_ACC_PIN); - ocelot_write_rix(ocelot, 0, PTP_PIN_TOD_SEC_MSB, TOD_ACC_PIN); - ocelot_write_rix(ocelot, delta, PTP_PIN_TOD_NSEC, TOD_ACC_PIN); - - val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); - val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM); - val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_DELTA); - - ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); - - spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); - } else { - /* Fall back using ocelot_ptp_settime64 which is not exact. */ - struct timespec64 ts; - u64 now; - - ocelot_ptp_gettime64(ptp, &ts); - - now = ktime_to_ns(timespec64_to_ktime(ts)); - ts = ns_to_timespec64(now + delta); - - ocelot_ptp_settime64(ptp, &ts); - } - return 0; -} - -static int ocelot_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) -{ - struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info); - u32 unit = 0, direction = 0; - unsigned long flags; - u64 adj = 0; - - spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); - - if (!scaled_ppm) - goto disable_adj; - - if (scaled_ppm < 0) { - direction = PTP_CFG_CLK_ADJ_CFG_DIR; - scaled_ppm = -scaled_ppm; - } - - adj = PSEC_PER_SEC << 16; - do_div(adj, scaled_ppm); - do_div(adj, 1000); - - /* If the adjustment value is too large, use ns instead */ - if (adj >= (1L << 30)) { - unit = PTP_CFG_CLK_ADJ_FREQ_NS; - do_div(adj, 1000); - } - - /* Still too big */ - if (adj >= (1L << 30)) - goto disable_adj; - - ocelot_write(ocelot, unit | adj, PTP_CLK_CFG_ADJ_FREQ); - ocelot_write(ocelot, PTP_CFG_CLK_ADJ_CFG_ENA | direction, - PTP_CLK_CFG_ADJ_CFG); - - spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); - return 0; - -disable_adj: - ocelot_write(ocelot, 0, PTP_CLK_CFG_ADJ_CFG); - - spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); - return 0; -} - -static struct ptp_clock_info ocelot_ptp_clock_info = { - .owner = THIS_MODULE, - .name = "ocelot ptp", - .max_adj = 0x7fffffff, - .n_alarm = 0, - .n_ext_ts = 0, - .n_per_out = 0, - .n_pins = 0, - .pps = 0, - .gettime64 = ocelot_ptp_gettime64, - .settime64 = ocelot_ptp_settime64, - .adjtime = ocelot_ptp_adjtime, - .adjfine = ocelot_ptp_adjfine, -}; - -static int ocelot_init_timestamp(struct ocelot *ocelot) -{ - struct ptp_clock *ptp_clock; - - ocelot->ptp_info = ocelot_ptp_clock_info; - ptp_clock = ptp_clock_register(&ocelot->ptp_info, ocelot->dev); - if (IS_ERR(ptp_clock)) - return PTR_ERR(ptp_clock); - /* Check if PHC support is missing at the configuration level */ - if (!ptp_clock) - return 0; - - ocelot->ptp_clock = ptp_clock; - - ocelot_write(ocelot, SYS_PTP_CFG_PTP_STAMP_WID(30), SYS_PTP_CFG); - ocelot_write(ocelot, 0xffffffff, ANA_TABLES_PTP_ID_LOW); - ocelot_write(ocelot, 0xffffffff, ANA_TABLES_PTP_ID_HIGH); - - ocelot_write(ocelot, PTP_CFG_MISC_PTP_EN, PTP_CFG_MISC); - - /* There is no device reconfiguration, PTP Rx stamping is always - * enabled. - */ - ocelot->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; - - return 0; -} - /* Configure the maximum SDU (L2 payload) on RX to the value specified in @sdu. * The length of VLAN tags is accounted for automatically via DEV_MAC_TAGS_CFG. * In the special case that it's the NPI port that we're configuring, the @@ -2530,15 +2335,6 @@ int ocelot_init(struct ocelot *ocelot) queue_delayed_work(ocelot->stats_queue, &ocelot->stats_work, OCELOT_STATS_CHECK_DELAY); - if (ocelot->ptp) { - ret = ocelot_init_timestamp(ocelot); - if (ret) { - dev_err(ocelot->dev, - "Timestamp initialization failed\n"); - return ret; - } - } - return 0; } EXPORT_SYMBOL(ocelot_init); @@ -2551,8 +2347,6 @@ void ocelot_deinit(struct ocelot *ocelot) cancel_delayed_work(&ocelot->stats_work); destroy_workqueue(ocelot->stats_queue); mutex_destroy(&ocelot->stats_lock); - if (ocelot->ptp_clock) - ptp_clock_unregister(ocelot->ptp_clock); for (i = 0; i < ocelot->num_phys_ports; i++) { port = ocelot->ports[i]; diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index 641af929497f..f0a15aa187f2 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -15,18 +15,17 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include "ocelot_rew.h" #include "ocelot_qs.h" #include "ocelot_tc.h" -#include "ocelot_ptp.h" #define OCELOT_BUFFER_CELL_SZ 60 diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c index 0ac9fbf77a01..ee016f7ed934 100644 --- a/drivers/net/ethernet/mscc/ocelot_board.c +++ b/drivers/net/ethernet/mscc/ocelot_board.c @@ -366,6 +366,21 @@ static const struct vcap_props vsc7514_vcap_props[] = { }, }; +static struct ptp_clock_info ocelot_ptp_clock_info = { + .owner = THIS_MODULE, + .name = "ocelot ptp", + .max_adj = 0x7fffffff, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .n_pins = 0, + .pps = 0, + .gettime64 = ocelot_ptp_gettime64, + .settime64 = ocelot_ptp_settime64, + .adjtime = ocelot_ptp_adjtime, + .adjfine = ocelot_ptp_adjfine, +}; + static int mscc_ocelot_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; @@ -469,6 +484,15 @@ static int mscc_ocelot_probe(struct platform_device *pdev) ocelot->vcap = vsc7514_vcap_props; ocelot_init(ocelot); + if (ocelot->ptp) { + err = ocelot_init_timestamp(ocelot, &ocelot_ptp_clock_info); + if (err) { + dev_err(ocelot->dev, + "Timestamp initialization failed\n"); + ocelot->ptp = 0; + } + } + /* No NPI port */ ocelot_configure_cpu(ocelot, -1, OCELOT_TAG_PREFIX_NONE, OCELOT_TAG_PREFIX_NONE); @@ -574,6 +598,7 @@ static int mscc_ocelot_remove(struct platform_device *pdev) { struct ocelot *ocelot = platform_get_drvdata(pdev); + ocelot_deinit_timestamp(ocelot); ocelot_deinit(ocelot); unregister_switchdev_blocking_notifier(&ocelot_switchdev_blocking_nb); unregister_switchdev_notifier(&ocelot_switchdev_nb); diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c new file mode 100644 index 000000000000..69d4e5677343 --- /dev/null +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Microsemi Ocelot PTP clock driver + * + * Copyright (c) 2017 Microsemi Corporation + * Copyright 2020 NXP + */ +#include +#include +#include + +int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts) +{ + struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info); + unsigned long flags; + time64_t s; + u32 val; + s64 ns; + + spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); + + val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); + val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM); + val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_SAVE); + ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); + + s = ocelot_read_rix(ocelot, PTP_PIN_TOD_SEC_MSB, TOD_ACC_PIN) & 0xffff; + s <<= 32; + s += ocelot_read_rix(ocelot, PTP_PIN_TOD_SEC_LSB, TOD_ACC_PIN); + ns = ocelot_read_rix(ocelot, PTP_PIN_TOD_NSEC, TOD_ACC_PIN); + + spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); + + /* Deal with negative values */ + if (ns >= 0x3ffffff0 && ns <= 0x3fffffff) { + s--; + ns &= 0xf; + ns += 999999984; + } + + set_normalized_timespec64(ts, s, ns); + return 0; +} +EXPORT_SYMBOL(ocelot_ptp_gettime64); + +int ocelot_ptp_settime64(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info); + unsigned long flags; + u32 val; + + spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); + + val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); + val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM); + val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_IDLE); + + ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); + + ocelot_write_rix(ocelot, lower_32_bits(ts->tv_sec), PTP_PIN_TOD_SEC_LSB, + TOD_ACC_PIN); + ocelot_write_rix(ocelot, upper_32_bits(ts->tv_sec), PTP_PIN_TOD_SEC_MSB, + TOD_ACC_PIN); + ocelot_write_rix(ocelot, ts->tv_nsec, PTP_PIN_TOD_NSEC, TOD_ACC_PIN); + + val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); + val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM); + val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_LOAD); + + ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); + + spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); + return 0; +} +EXPORT_SYMBOL(ocelot_ptp_settime64); + +int ocelot_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + if (delta > -(NSEC_PER_SEC / 2) && delta < (NSEC_PER_SEC / 2)) { + struct ocelot *ocelot = container_of(ptp, struct ocelot, + ptp_info); + unsigned long flags; + u32 val; + + spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); + + val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); + val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | + PTP_PIN_CFG_DOM); + val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_IDLE); + + ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); + + ocelot_write_rix(ocelot, 0, PTP_PIN_TOD_SEC_LSB, TOD_ACC_PIN); + ocelot_write_rix(ocelot, 0, PTP_PIN_TOD_SEC_MSB, TOD_ACC_PIN); + ocelot_write_rix(ocelot, delta, PTP_PIN_TOD_NSEC, TOD_ACC_PIN); + + val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN); + val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | + PTP_PIN_CFG_DOM); + val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_DELTA); + + ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN); + + spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); + } else { + /* Fall back using ocelot_ptp_settime64 which is not exact. */ + struct timespec64 ts; + u64 now; + + ocelot_ptp_gettime64(ptp, &ts); + + now = ktime_to_ns(timespec64_to_ktime(ts)); + ts = ns_to_timespec64(now + delta); + + ocelot_ptp_settime64(ptp, &ts); + } + return 0; +} +EXPORT_SYMBOL(ocelot_ptp_adjtime); + +int ocelot_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) +{ + struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info); + u32 unit = 0, direction = 0; + unsigned long flags; + u64 adj = 0; + + spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); + + if (!scaled_ppm) + goto disable_adj; + + if (scaled_ppm < 0) { + direction = PTP_CFG_CLK_ADJ_CFG_DIR; + scaled_ppm = -scaled_ppm; + } + + adj = PSEC_PER_SEC << 16; + do_div(adj, scaled_ppm); + do_div(adj, 1000); + + /* If the adjustment value is too large, use ns instead */ + if (adj >= (1L << 30)) { + unit = PTP_CFG_CLK_ADJ_FREQ_NS; + do_div(adj, 1000); + } + + /* Still too big */ + if (adj >= (1L << 30)) + goto disable_adj; + + ocelot_write(ocelot, unit | adj, PTP_CLK_CFG_ADJ_FREQ); + ocelot_write(ocelot, PTP_CFG_CLK_ADJ_CFG_ENA | direction, + PTP_CLK_CFG_ADJ_CFG); + + spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); + return 0; + +disable_adj: + ocelot_write(ocelot, 0, PTP_CLK_CFG_ADJ_CFG); + + spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); + return 0; +} +EXPORT_SYMBOL(ocelot_ptp_adjfine); + +int ocelot_init_timestamp(struct ocelot *ocelot, struct ptp_clock_info *info) +{ + struct ptp_clock *ptp_clock; + + ocelot->ptp_info = *info; + ptp_clock = ptp_clock_register(&ocelot->ptp_info, ocelot->dev); + if (IS_ERR(ptp_clock)) + return PTR_ERR(ptp_clock); + /* Check if PHC support is missing at the configuration level */ + if (!ptp_clock) + return 0; + + ocelot->ptp_clock = ptp_clock; + + ocelot_write(ocelot, SYS_PTP_CFG_PTP_STAMP_WID(30), SYS_PTP_CFG); + ocelot_write(ocelot, 0xffffffff, ANA_TABLES_PTP_ID_LOW); + ocelot_write(ocelot, 0xffffffff, ANA_TABLES_PTP_ID_HIGH); + + ocelot_write(ocelot, PTP_CFG_MISC_PTP_EN, PTP_CFG_MISC); + + /* There is no device reconfiguration, PTP Rx stamping is always + * enabled. + */ + ocelot->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; + + return 0; +} +EXPORT_SYMBOL(ocelot_init_timestamp); + +int ocelot_deinit_timestamp(struct ocelot *ocelot) +{ + if (ocelot->ptp_clock) + ptp_clock_unregister(ocelot->ptp_clock); + return 0; +} +EXPORT_SYMBOL(ocelot_deinit_timestamp); diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.h b/drivers/net/ethernet/mscc/ocelot_ptp.h deleted file mode 100644 index 9ede14a12573..000000000000 --- a/drivers/net/ethernet/mscc/ocelot_ptp.h +++ /dev/null @@ -1,41 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ -/* - * Microsemi Ocelot Switch driver - * - * License: Dual MIT/GPL - * Copyright (c) 2017 Microsemi Corporation - */ - -#ifndef _MSCC_OCELOT_PTP_H_ -#define _MSCC_OCELOT_PTP_H_ - -#define PTP_PIN_CFG_RSZ 0x20 -#define PTP_PIN_TOD_SEC_MSB_RSZ PTP_PIN_CFG_RSZ -#define PTP_PIN_TOD_SEC_LSB_RSZ PTP_PIN_CFG_RSZ -#define PTP_PIN_TOD_NSEC_RSZ PTP_PIN_CFG_RSZ - -#define PTP_PIN_CFG_DOM BIT(0) -#define PTP_PIN_CFG_SYNC BIT(2) -#define PTP_PIN_CFG_ACTION(x) ((x) << 3) -#define PTP_PIN_CFG_ACTION_MASK PTP_PIN_CFG_ACTION(0x7) - -enum { - PTP_PIN_ACTION_IDLE = 0, - PTP_PIN_ACTION_LOAD, - PTP_PIN_ACTION_SAVE, - PTP_PIN_ACTION_CLOCK, - PTP_PIN_ACTION_DELTA, - PTP_PIN_ACTION_NOSYNC, - PTP_PIN_ACTION_SYNC, -}; - -#define PTP_CFG_MISC_PTP_EN BIT(2) - -#define PSEC_PER_SEC 1000000000000LL - -#define PTP_CFG_CLK_ADJ_CFG_ENA BIT(0) -#define PTP_CFG_CLK_ADJ_CFG_DIR BIT(1) - -#define PTP_CFG_CLK_ADJ_FREQ_NS BIT(30) - -#endif diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 6d6a3947c8b7..6fd88ee622cf 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -620,7 +620,6 @@ int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); -int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); int ocelot_port_add_txtstamp_skb(struct ocelot_port *ocelot_port, struct sk_buff *skb); void ocelot_get_txtstamp(struct ocelot *ocelot); diff --git a/include/soc/mscc/ocelot_ptp.h b/include/soc/mscc/ocelot_ptp.h new file mode 100644 index 000000000000..f01b0ce4e4cb --- /dev/null +++ b/include/soc/mscc/ocelot_ptp.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Microsemi Ocelot Switch driver + * + * License: Dual MIT/GPL + * Copyright (c) 2017 Microsemi Corporation + * Copyright 2020 NXP + */ + +#ifndef _MSCC_OCELOT_PTP_H_ +#define _MSCC_OCELOT_PTP_H_ + +#include +#include + +#define PTP_PIN_CFG_RSZ 0x20 +#define PTP_PIN_TOD_SEC_MSB_RSZ PTP_PIN_CFG_RSZ +#define PTP_PIN_TOD_SEC_LSB_RSZ PTP_PIN_CFG_RSZ +#define PTP_PIN_TOD_NSEC_RSZ PTP_PIN_CFG_RSZ + +#define PTP_PIN_CFG_DOM BIT(0) +#define PTP_PIN_CFG_SYNC BIT(2) +#define PTP_PIN_CFG_ACTION(x) ((x) << 3) +#define PTP_PIN_CFG_ACTION_MASK PTP_PIN_CFG_ACTION(0x7) + +enum { + PTP_PIN_ACTION_IDLE = 0, + PTP_PIN_ACTION_LOAD, + PTP_PIN_ACTION_SAVE, + PTP_PIN_ACTION_CLOCK, + PTP_PIN_ACTION_DELTA, + PTP_PIN_ACTION_NOSYNC, + PTP_PIN_ACTION_SYNC, +}; + +#define PTP_CFG_MISC_PTP_EN BIT(2) + +#define PSEC_PER_SEC 1000000000000LL + +#define PTP_CFG_CLK_ADJ_CFG_ENA BIT(0) +#define PTP_CFG_CLK_ADJ_CFG_DIR BIT(1) + +#define PTP_CFG_CLK_ADJ_FREQ_NS BIT(30) + +int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); +int ocelot_ptp_settime64(struct ptp_clock_info *ptp, + const struct timespec64 *ts); +int ocelot_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta); +int ocelot_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm); +int ocelot_init_timestamp(struct ocelot *ocelot, struct ptp_clock_info *info); +int ocelot_deinit_timestamp(struct ocelot *ocelot); +#endif -- cgit v1.2.3-59-g8ed1b From d2b09a8e7bcbfa47e7161b20d6387ac968834c21 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 20 Apr 2020 10:46:46 +0800 Subject: net: mscc: ocelot: fix timestamp info if ptp clock does not work The timestamp info should be only software timestamp capabilities if ptp clock does not work. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 7c4165af9f66..a2b9b85612a4 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1349,6 +1349,12 @@ int ocelot_get_ts_info(struct ocelot *ocelot, int port, { info->phc_index = ocelot->ptp_clock ? ptp_clock_index(ocelot->ptp_clock) : -1; + if (info->phc_index == -1) { + info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + return 0; + } info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE | -- cgit v1.2.3-59-g8ed1b From 3007bc7321e3c37de9d7d965cb9fb95aaa00113b Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 20 Apr 2020 10:46:47 +0800 Subject: net: mscc: ocelot: redefine PTP pins There are 5 PTP_PINS register groups on Ocelot switch. Except the one used for TOD operations, there are still 4 register groups for programmable pins. So redefine the 4 programmable pins. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 6fd88ee622cf..7d44d3508869 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -440,10 +440,11 @@ enum ocelot_regfield { REGFIELD_MAX }; -enum ocelot_clk_pins { - ALT_PPS_PIN = 1, - EXT_CLK_PIN, - ALT_LDST_PIN, +enum ocelot_ptp_pins { + PTP_PIN_0, + PTP_PIN_1, + PTP_PIN_2, + PTP_PIN_3, TOD_ACC_PIN }; -- cgit v1.2.3-59-g8ed1b From 94aca0824443d32987b31e656044ff7da425c523 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 20 Apr 2020 10:46:48 +0800 Subject: net: mscc: ocelot: add wave programming registers definitions Add wave programming registers definitions for Ocelot platforms. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix_vsc9959.c | 2 ++ drivers/net/ethernet/mscc/ocelot_regs.c | 2 ++ include/soc/mscc/ocelot.h | 2 ++ include/soc/mscc/ocelot_ptp.h | 2 ++ 4 files changed, 8 insertions(+) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index b4078f3c5c38..4fe707ef54b8 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -313,6 +313,8 @@ static const u32 vsc9959_ptp_regmap[] = { REG(PTP_PIN_TOD_SEC_MSB, 0x000004), REG(PTP_PIN_TOD_SEC_LSB, 0x000008), REG(PTP_PIN_TOD_NSEC, 0x00000c), + REG(PTP_PIN_WF_HIGH_PERIOD, 0x000014), + REG(PTP_PIN_WF_LOW_PERIOD, 0x000018), REG(PTP_CFG_MISC, 0x0000a0), REG(PTP_CLK_CFG_ADJ_CFG, 0x0000a4), REG(PTP_CLK_CFG_ADJ_FREQ, 0x0000a8), diff --git a/drivers/net/ethernet/mscc/ocelot_regs.c b/drivers/net/ethernet/mscc/ocelot_regs.c index b88b5899b227..ed4dd01a41ad 100644 --- a/drivers/net/ethernet/mscc/ocelot_regs.c +++ b/drivers/net/ethernet/mscc/ocelot_regs.c @@ -239,6 +239,8 @@ static const u32 ocelot_ptp_regmap[] = { REG(PTP_PIN_TOD_SEC_MSB, 0x000004), REG(PTP_PIN_TOD_SEC_LSB, 0x000008), REG(PTP_PIN_TOD_NSEC, 0x00000c), + REG(PTP_PIN_WF_HIGH_PERIOD, 0x000014), + REG(PTP_PIN_WF_LOW_PERIOD, 0x000018), REG(PTP_CFG_MISC, 0x0000a0), REG(PTP_CLK_CFG_ADJ_CFG, 0x0000a4), REG(PTP_CLK_CFG_ADJ_FREQ, 0x0000a8), diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 7d44d3508869..31193ad3a545 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -385,6 +385,8 @@ enum ocelot_reg { PTP_PIN_TOD_SEC_MSB, PTP_PIN_TOD_SEC_LSB, PTP_PIN_TOD_NSEC, + PTP_PIN_WF_HIGH_PERIOD, + PTP_PIN_WF_LOW_PERIOD, PTP_CFG_MISC, PTP_CLK_CFG_ADJ_CFG, PTP_CLK_CFG_ADJ_FREQ, diff --git a/include/soc/mscc/ocelot_ptp.h b/include/soc/mscc/ocelot_ptp.h index f01b0ce4e4cb..aae1570eecb1 100644 --- a/include/soc/mscc/ocelot_ptp.h +++ b/include/soc/mscc/ocelot_ptp.h @@ -17,6 +17,8 @@ #define PTP_PIN_TOD_SEC_MSB_RSZ PTP_PIN_CFG_RSZ #define PTP_PIN_TOD_SEC_LSB_RSZ PTP_PIN_CFG_RSZ #define PTP_PIN_TOD_NSEC_RSZ PTP_PIN_CFG_RSZ +#define PTP_PIN_WF_HIGH_PERIOD_RSZ PTP_PIN_CFG_RSZ +#define PTP_PIN_WF_LOW_PERIOD_RSZ PTP_PIN_CFG_RSZ #define PTP_PIN_CFG_DOM BIT(0) #define PTP_PIN_CFG_SYNC BIT(2) -- cgit v1.2.3-59-g8ed1b From cc2d87bb83407c7dfb0900d63b3fcfbf6a59202f Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 20 Apr 2020 10:46:49 +0800 Subject: net: mscc: ocelot: support 4 PTP programmable pins Support 4 PTP programmable pins with only PTP_PF_PEROUT function for now. The PTP_PF_EXTTS function will be supported in the future, and it should be implemented separately for Felix and Ocelot, because of different hardware interrupt implementation in them. Since the hardware is not able to support absolute start time, the periodic clock request only allows start time 0 0. But nsec could be accepted for PPS case for phase adjustment. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_ptp.c | 121 +++++++++++++++++++++++++++++++++ include/soc/mscc/ocelot.h | 3 + include/soc/mscc/ocelot_ptp.h | 4 ++ 3 files changed, 128 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c index 69d4e5677343..a3088a1676ed 100644 --- a/drivers/net/ethernet/mscc/ocelot_ptp.c +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -165,11 +165,132 @@ disable_adj: } EXPORT_SYMBOL(ocelot_ptp_adjfine); +int ocelot_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin, + enum ptp_pin_function func, unsigned int chan) +{ + switch (func) { + case PTP_PF_NONE: + case PTP_PF_PEROUT: + break; + case PTP_PF_EXTTS: + case PTP_PF_PHYSYNC: + return -1; + } + return 0; +} +EXPORT_SYMBOL(ocelot_ptp_verify); + +int ocelot_ptp_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, int on) +{ + struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info); + struct timespec64 ts_start, ts_period; + enum ocelot_ptp_pins ptp_pin; + unsigned long flags; + bool pps = false; + int pin = -1; + u32 val; + s64 ns; + + switch (rq->type) { + case PTP_CLK_REQ_PEROUT: + /* Reject requests with unsupported flags */ + if (rq->perout.flags) + return -EOPNOTSUPP; + + pin = ptp_find_pin(ocelot->ptp_clock, PTP_PF_PEROUT, + rq->perout.index); + if (pin == 0) + ptp_pin = PTP_PIN_0; + else if (pin == 1) + ptp_pin = PTP_PIN_1; + else if (pin == 2) + ptp_pin = PTP_PIN_2; + else if (pin == 3) + ptp_pin = PTP_PIN_3; + else + return -EBUSY; + + ts_start.tv_sec = rq->perout.start.sec; + ts_start.tv_nsec = rq->perout.start.nsec; + ts_period.tv_sec = rq->perout.period.sec; + ts_period.tv_nsec = rq->perout.period.nsec; + + if (ts_period.tv_sec == 1 && ts_period.tv_nsec == 0) + pps = true; + + if (ts_start.tv_sec || (ts_start.tv_nsec && !pps)) { + dev_warn(ocelot->dev, + "Absolute start time not supported!\n"); + dev_warn(ocelot->dev, + "Accept nsec for PPS phase adjustment, otherwise start time should be 0 0.\n"); + return -EINVAL; + } + + /* Handle turning off */ + if (!on) { + spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); + val = PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_IDLE); + ocelot_write_rix(ocelot, val, PTP_PIN_CFG, ptp_pin); + spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); + break; + } + + /* Handle PPS request */ + if (pps) { + spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); + /* Pulse generated perout.start.nsec after TOD has + * increased seconds. + * Pulse width is set to 1us. + */ + ocelot_write_rix(ocelot, ts_start.tv_nsec, + PTP_PIN_WF_LOW_PERIOD, ptp_pin); + ocelot_write_rix(ocelot, 1000, + PTP_PIN_WF_HIGH_PERIOD, ptp_pin); + val = PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_CLOCK); + val |= PTP_PIN_CFG_SYNC; + ocelot_write_rix(ocelot, val, PTP_PIN_CFG, ptp_pin); + spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); + break; + } + + /* Handle periodic clock */ + ns = timespec64_to_ns(&ts_period); + ns = ns >> 1; + if (ns > 0x3fffffff || ns <= 0x6) + return -EINVAL; + + spin_lock_irqsave(&ocelot->ptp_clock_lock, flags); + ocelot_write_rix(ocelot, ns, PTP_PIN_WF_LOW_PERIOD, ptp_pin); + ocelot_write_rix(ocelot, ns, PTP_PIN_WF_HIGH_PERIOD, ptp_pin); + val = PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_CLOCK); + ocelot_write_rix(ocelot, val, PTP_PIN_CFG, ptp_pin); + spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); + break; + default: + return -EOPNOTSUPP; + } + return 0; +} +EXPORT_SYMBOL(ocelot_ptp_enable); + int ocelot_init_timestamp(struct ocelot *ocelot, struct ptp_clock_info *info) { struct ptp_clock *ptp_clock; + int i; ocelot->ptp_info = *info; + + for (i = 0; i < OCELOT_PTP_PINS_NUM; i++) { + struct ptp_pin_desc *p = &ocelot->ptp_pins[i]; + + snprintf(p->name, sizeof(p->name), "switch_1588_dat%d", i); + p->index = i; + p->func = PTP_PF_NONE; + } + + ocelot->ptp_info.pin_config = &ocelot->ptp_pins[0]; + ptp_clock = ptp_clock_register(&ocelot->ptp_info, ocelot->dev); if (IS_ERR(ptp_clock)) return PTR_ERR(ptp_clock); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 31193ad3a545..a025fb798164 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -92,6 +92,8 @@ #define OCELOT_SPEED_100 2 #define OCELOT_SPEED_10 3 +#define OCELOT_PTP_PINS_NUM 4 + #define TARGET_OFFSET 24 #define REG_MASK GENMASK(TARGET_OFFSET - 1, 0) #define REG(reg, offset) [reg & REG_MASK] = offset @@ -552,6 +554,7 @@ struct ocelot { struct mutex ptp_lock; /* Protects the PTP clock */ spinlock_t ptp_clock_lock; + struct ptp_pin_desc ptp_pins[OCELOT_PTP_PINS_NUM]; }; struct ocelot_policer { diff --git a/include/soc/mscc/ocelot_ptp.h b/include/soc/mscc/ocelot_ptp.h index aae1570eecb1..4a6b2f71b6b2 100644 --- a/include/soc/mscc/ocelot_ptp.h +++ b/include/soc/mscc/ocelot_ptp.h @@ -49,6 +49,10 @@ int ocelot_ptp_settime64(struct ptp_clock_info *ptp, const struct timespec64 *ts); int ocelot_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta); int ocelot_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm); +int ocelot_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin, + enum ptp_pin_function func, unsigned int chan); +int ocelot_ptp_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, int on); int ocelot_init_timestamp(struct ocelot *ocelot, struct ptp_clock_info *info); int ocelot_deinit_timestamp(struct ocelot *ocelot); #endif -- cgit v1.2.3-59-g8ed1b From aabb2bb07c963c67b8072aafcca3677e2b235be0 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 20 Apr 2020 10:46:50 +0800 Subject: net: mscc: ocelot: enable PTP programmable pin Enable PTP programmable pin. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_board.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c index ee016f7ed934..67a8d61c926a 100644 --- a/drivers/net/ethernet/mscc/ocelot_board.c +++ b/drivers/net/ethernet/mscc/ocelot_board.c @@ -372,13 +372,15 @@ static struct ptp_clock_info ocelot_ptp_clock_info = { .max_adj = 0x7fffffff, .n_alarm = 0, .n_ext_ts = 0, - .n_per_out = 0, - .n_pins = 0, + .n_per_out = OCELOT_PTP_PINS_NUM, + .n_pins = OCELOT_PTP_PINS_NUM, .pps = 0, .gettime64 = ocelot_ptp_gettime64, .settime64 = ocelot_ptp_settime64, .adjtime = ocelot_ptp_adjtime, .adjfine = ocelot_ptp_adjfine, + .verify = ocelot_ptp_verify, + .enable = ocelot_ptp_enable, }; static int mscc_ocelot_probe(struct platform_device *pdev) -- cgit v1.2.3-59-g8ed1b From 5287be405ca2263a9c524a6b0a7869c59760f4e6 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 20 Apr 2020 10:46:51 +0800 Subject: net: dsa: felix: enable PTP programmable pin Enable PTP programmable pin. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 44015a24b087..9173b95551d1 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -501,13 +501,15 @@ static struct ptp_clock_info ocelot_ptp_clock_info = { .max_adj = 0x7fffffff, .n_alarm = 0, .n_ext_ts = 0, - .n_per_out = 0, - .n_pins = 0, + .n_per_out = OCELOT_PTP_PINS_NUM, + .n_pins = OCELOT_PTP_PINS_NUM, .pps = 0, .gettime64 = ocelot_ptp_gettime64, .settime64 = ocelot_ptp_settime64, .adjtime = ocelot_ptp_adjtime, .adjfine = ocelot_ptp_adjfine, + .verify = ocelot_ptp_verify, + .enable = ocelot_ptp_enable, }; /* Hardware initialization done here so that we can allocate structures with -- cgit v1.2.3-59-g8ed1b From 8af40902f839f5431b89ce2a035ee01e51b31a1d Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 20 Apr 2020 20:37:18 +0800 Subject: ath11k: remove conversion to bool in ath11k_dp_rxdesc_mpdu_valid() The '==' expression itself is bool, no need to convert it to bool again. This fixes the following coccicheck warning: drivers/net/wireless/ath/ath11k/dp_rx.c:255:46-51: WARNING: conversion to bool not needed here Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420123718.3384-1-yanaijie@huawei.com --- drivers/net/wireless/ath/ath11k/dp_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c index 203fd44ff352..bbd7da48518f 100644 --- a/drivers/net/wireless/ath/ath11k/dp_rx.c +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c @@ -252,7 +252,7 @@ static bool ath11k_dp_rxdesc_mpdu_valid(struct hal_rx_desc *rx_desc) tlv_tag = FIELD_GET(HAL_TLV_HDR_TAG, __le32_to_cpu(rx_desc->mpdu_start_tag)); - return tlv_tag == HAL_RX_MPDU_START ? true : false; + return tlv_tag == HAL_RX_MPDU_START; } static u32 ath11k_dp_rxdesc_get_ppduid(struct hal_rx_desc *rx_desc) -- cgit v1.2.3-59-g8ed1b From d81709346ceac7b9131b4a091197b9d7fc5a409f Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 20 Apr 2020 20:37:45 +0800 Subject: ath11k: remove conversion to bool in ath11k_debug_fw_stats_process() The '==' expression itself is bool, no need to convert it to bool again. This fixes the following coccicheck warning: drivers/net/wireless/ath/ath11k/debug.c:198:57-62: WARNING: conversion to bool not needed here drivers/net/wireless/ath/ath11k/debug.c:218:58-63: WARNING: conversion to bool not needed here Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200420123745.4159-1-yanaijie@huawei.com --- drivers/net/wireless/ath/ath11k/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/debug.c b/drivers/net/wireless/ath/ath11k/debug.c index a2e3dfeae3d5..3fd6b5af073b 100644 --- a/drivers/net/wireless/ath/ath11k/debug.c +++ b/drivers/net/wireless/ath/ath11k/debug.c @@ -195,7 +195,7 @@ void ath11k_debug_fw_stats_process(struct ath11k_base *ab, struct sk_buff *skb) total_vdevs_started += ar->num_started_vdevs; } - is_end = ((++num_vdev) == total_vdevs_started ? true : false); + is_end = ((++num_vdev) == total_vdevs_started); list_splice_tail_init(&stats.vdevs, &ar->debug.fw_stats.vdevs); @@ -215,7 +215,7 @@ void ath11k_debug_fw_stats_process(struct ath11k_base *ab, struct sk_buff *skb) /* Mark end until we reached the count of all started VDEVs * within the PDEV */ - is_end = ((++num_bcn) == ar->num_started_vdevs ? true : false); + is_end = ((++num_bcn) == ar->num_started_vdevs); list_splice_tail_init(&stats.bcn, &ar->debug.fw_stats.bcn); -- cgit v1.2.3-59-g8ed1b From c8334512f3dd1b94844baca629f9bedca4271593 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Tue, 21 Apr 2020 15:09:35 +0300 Subject: ath10k: add htt TX bundle for sdio The transmission utilization ratio for sdio bus for small packet is slow, because the space and time cost for sdio bus is same for large length packet and small length packet. So the speed of data for large length packet is higher than small length. Test result of different length of data: data packet(byte) cost time(us) calculated rate(Mbps) 256 28 73 512 33 124 1024 35 234 1792 45 318 14336 168 682 28672 333 688 57344 660 695 This patch change the TX packet from single packet to a large length bundle packet, max size is 32, it results in significant performance improvement on TX path. Also there's a fourth thread "ath10k_tx_complete_wq" added to ath10k as it improves TCP RX throughput (values in Mbps): TCP-RX TCP-TX UDP-RX UDP-TX use workqueue_tx_complete 423 357 448 412 change it to ar->workqueue 410 360 449 414 change it to ar->workqueue_aux 405 339 446 401 This patch only effect sdio chip, it will not effect PCI, SNOC etc. It only enable bundle for sdio chip. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00017-QCARMSWP-1. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200410061400.14231-2-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/core.c | 14 +- drivers/net/wireless/ath/ath10k/core.h | 4 +- drivers/net/wireless/ath/ath10k/htc.c | 372 ++++++++++++++++++++++++++++--- drivers/net/wireless/ath/ath10k/htc.h | 24 +- drivers/net/wireless/ath/ath10k/htt.c | 8 + drivers/net/wireless/ath/ath10k/htt.h | 4 + drivers/net/wireless/ath/ath10k/htt_rx.c | 1 + drivers/net/wireless/ath/ath10k/htt_tx.c | 8 +- 8 files changed, 398 insertions(+), 37 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 5926281c7e05..8689c330fdd9 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -3288,6 +3288,11 @@ struct ath10k *ath10k_core_create(size_t priv_size, struct device *dev, if (!ar->workqueue_aux) goto err_free_wq; + ar->workqueue_tx_complete = + create_singlethread_workqueue("ath10k_tx_complete_wq"); + if (!ar->workqueue_tx_complete) + goto err_free_aux_wq; + mutex_init(&ar->conf_mutex); mutex_init(&ar->dump_mutex); spin_lock_init(&ar->data_lock); @@ -3315,7 +3320,7 @@ struct ath10k *ath10k_core_create(size_t priv_size, struct device *dev, ret = ath10k_coredump_create(ar); if (ret) - goto err_free_aux_wq; + goto err_free_tx_complete; ret = ath10k_debug_create(ar); if (ret) @@ -3325,12 +3330,12 @@ struct ath10k *ath10k_core_create(size_t priv_size, struct device *dev, err_free_coredump: ath10k_coredump_destroy(ar); - +err_free_tx_complete: + destroy_workqueue(ar->workqueue_tx_complete); err_free_aux_wq: destroy_workqueue(ar->workqueue_aux); err_free_wq: destroy_workqueue(ar->workqueue); - err_free_mac: ath10k_mac_destroy(ar); @@ -3346,6 +3351,9 @@ void ath10k_core_destroy(struct ath10k *ar) flush_workqueue(ar->workqueue_aux); destroy_workqueue(ar->workqueue_aux); + flush_workqueue(ar->workqueue_tx_complete); + destroy_workqueue(ar->workqueue_tx_complete); + ath10k_debug_destroy(ar); ath10k_coredump_destroy(ar); ath10k_htt_tx_destroy(&ar->htt); diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index bd8ef576c590..d6adcbaf9616 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -1091,7 +1091,7 @@ struct ath10k { struct workqueue_struct *workqueue; /* Auxiliary workqueue */ struct workqueue_struct *workqueue_aux; - + struct workqueue_struct *workqueue_tx_complete; /* prevents concurrent FW reconfiguration */ struct mutex conf_mutex; @@ -1132,6 +1132,8 @@ struct ath10k { struct work_struct register_work; struct work_struct restart_work; + struct work_struct bundle_tx_work; + struct work_struct tx_complete_work; /* cycle count is reported twice for each visited channel during scan. * access protected by data_lock diff --git a/drivers/net/wireless/ath/ath10k/htc.c b/drivers/net/wireless/ath/ath10k/htc.c index 61ee413d902a..ed4e0add997e 100644 --- a/drivers/net/wireless/ath/ath10k/htc.c +++ b/drivers/net/wireless/ath/ath10k/htc.c @@ -51,10 +51,12 @@ void ath10k_htc_notify_tx_completion(struct ath10k_htc_ep *ep, struct sk_buff *skb) { struct ath10k *ar = ep->htc->ar; + struct ath10k_htc_hdr *hdr; ath10k_dbg(ar, ATH10K_DBG_HTC, "%s: ep %d skb %pK\n", __func__, ep->eid, skb); + hdr = (struct ath10k_htc_hdr *)skb->data; ath10k_htc_restore_tx_skb(ep->htc, skb); if (!ep->ep_ops.ep_tx_complete) { @@ -63,6 +65,11 @@ void ath10k_htc_notify_tx_completion(struct ath10k_htc_ep *ep, return; } + if (hdr->flags & ATH10K_HTC_FLAG_SEND_BUNDLE) { + dev_kfree_skb_any(skb); + return; + } + ep->ep_ops.ep_tx_complete(ep->htc->ar, skb); } EXPORT_SYMBOL(ath10k_htc_notify_tx_completion); @@ -78,7 +85,7 @@ static void ath10k_htc_prepare_tx_skb(struct ath10k_htc_ep *ep, hdr->eid = ep->eid; hdr->len = __cpu_to_le16(skb->len - sizeof(*hdr)); hdr->flags = 0; - if (ep->tx_credit_flow_enabled) + if (ep->tx_credit_flow_enabled && !ep->bundle_tx) hdr->flags |= ATH10K_HTC_FLAG_NEED_CREDIT_UPDATE; spin_lock_bh(&ep->htc->tx_lock); @@ -86,6 +93,63 @@ static void ath10k_htc_prepare_tx_skb(struct ath10k_htc_ep *ep, spin_unlock_bh(&ep->htc->tx_lock); } +static int ath10k_htc_consume_credit(struct ath10k_htc_ep *ep, + unsigned int len, + bool consume) +{ + struct ath10k_htc *htc = ep->htc; + struct ath10k *ar = htc->ar; + enum ath10k_htc_ep_id eid = ep->eid; + int credits, ret = 0; + + if (!ep->tx_credit_flow_enabled) + return 0; + + credits = DIV_ROUND_UP(len, ep->tx_credit_size); + spin_lock_bh(&htc->tx_lock); + + if (ep->tx_credits < credits) { + ath10k_dbg(ar, ATH10K_DBG_HTC, + "htc insufficient credits ep %d required %d available %d consume %d\n", + eid, credits, ep->tx_credits, consume); + ret = -EAGAIN; + goto unlock; + } + + if (consume) { + ep->tx_credits -= credits; + ath10k_dbg(ar, ATH10K_DBG_HTC, + "htc ep %d consumed %d credits total %d\n", + eid, credits, ep->tx_credits); + } + +unlock: + spin_unlock_bh(&htc->tx_lock); + return ret; +} + +static void ath10k_htc_release_credit(struct ath10k_htc_ep *ep, unsigned int len) +{ + struct ath10k_htc *htc = ep->htc; + struct ath10k *ar = htc->ar; + enum ath10k_htc_ep_id eid = ep->eid; + int credits; + + if (!ep->tx_credit_flow_enabled) + return; + + credits = DIV_ROUND_UP(len, ep->tx_credit_size); + spin_lock_bh(&htc->tx_lock); + ep->tx_credits += credits; + ath10k_dbg(ar, ATH10K_DBG_HTC, + "htc ep %d reverted %d credits back total %d\n", + eid, credits, ep->tx_credits); + spin_unlock_bh(&htc->tx_lock); + + if (ep->ep_ops.ep_tx_credits) + ep->ep_ops.ep_tx_credits(htc->ar); +} + int ath10k_htc_send(struct ath10k_htc *htc, enum ath10k_htc_ep_id eid, struct sk_buff *skb) @@ -95,8 +159,8 @@ int ath10k_htc_send(struct ath10k_htc *htc, struct ath10k_skb_cb *skb_cb = ATH10K_SKB_CB(skb); struct ath10k_hif_sg_item sg_item; struct device *dev = htc->ar->dev; - int credits = 0; int ret; + unsigned int skb_len; if (htc->ar->state == ATH10K_STATE_WEDGED) return -ECOMM; @@ -108,23 +172,10 @@ int ath10k_htc_send(struct ath10k_htc *htc, skb_push(skb, sizeof(struct ath10k_htc_hdr)); - if (ep->tx_credit_flow_enabled) { - credits = DIV_ROUND_UP(skb->len, htc->target_credit_size); - spin_lock_bh(&htc->tx_lock); - if (ep->tx_credits < credits) { - ath10k_dbg(ar, ATH10K_DBG_HTC, - "htc insufficient credits ep %d required %d available %d\n", - eid, credits, ep->tx_credits); - spin_unlock_bh(&htc->tx_lock); - ret = -EAGAIN; - goto err_pull; - } - ep->tx_credits -= credits; - ath10k_dbg(ar, ATH10K_DBG_HTC, - "htc ep %d consumed %d credits (total %d)\n", - eid, credits, ep->tx_credits); - spin_unlock_bh(&htc->tx_lock); - } + skb_len = skb->len; + ret = ath10k_htc_consume_credit(ep, skb_len, true); + if (ret) + goto err_pull; ath10k_htc_prepare_tx_skb(ep, skb); @@ -155,17 +206,7 @@ err_unmap: if (ar->bus_param.dev_type != ATH10K_DEV_TYPE_HL) dma_unmap_single(dev, skb_cb->paddr, skb->len, DMA_TO_DEVICE); err_credits: - if (ep->tx_credit_flow_enabled) { - spin_lock_bh(&htc->tx_lock); - ep->tx_credits += credits; - ath10k_dbg(ar, ATH10K_DBG_HTC, - "htc ep %d reverted %d credits back (total %d)\n", - eid, credits, ep->tx_credits); - spin_unlock_bh(&htc->tx_lock); - - if (ep->ep_ops.ep_tx_credits) - ep->ep_ops.ep_tx_credits(htc->ar); - } + ath10k_htc_release_credit(ep, skb_len); err_pull: skb_pull(skb, sizeof(struct ath10k_htc_hdr)); return ret; @@ -581,6 +622,273 @@ static u8 ath10k_htc_get_credit_allocation(struct ath10k_htc *htc, return allocation; } +static int ath10k_htc_send_bundle(struct ath10k_htc_ep *ep, + struct sk_buff *bundle_skb, + struct sk_buff_head *tx_save_head) +{ + struct ath10k_hif_sg_item sg_item; + struct ath10k_htc *htc = ep->htc; + struct ath10k *ar = htc->ar; + struct sk_buff *skb; + int ret, cn = 0; + unsigned int skb_len; + + ath10k_dbg(ar, ATH10K_DBG_HTC, "bundle skb len %d\n", bundle_skb->len); + skb_len = bundle_skb->len; + ret = ath10k_htc_consume_credit(ep, skb_len, true); + + if (!ret) { + sg_item.transfer_id = ep->eid; + sg_item.transfer_context = bundle_skb; + sg_item.vaddr = bundle_skb->data; + sg_item.len = bundle_skb->len; + + ret = ath10k_hif_tx_sg(htc->ar, ep->ul_pipe_id, &sg_item, 1); + if (ret) + ath10k_htc_release_credit(ep, skb_len); + } + + if (ret) + dev_kfree_skb_any(bundle_skb); + + for (cn = 0; (skb = skb_dequeue_tail(tx_save_head)); cn++) { + if (ret) { + skb_pull(skb, sizeof(struct ath10k_htc_hdr)); + skb_queue_head(&ep->tx_req_head, skb); + } else { + skb_queue_tail(&ep->tx_complete_head, skb); + } + } + + if (!ret) + queue_work(ar->workqueue_tx_complete, &ar->tx_complete_work); + + ath10k_dbg(ar, ATH10K_DBG_HTC, + "bundle tx status %d eid %d req count %d count %d len %d\n", + ret, ep->eid, skb_queue_len(&ep->tx_req_head), cn, bundle_skb->len); + return ret; +} + +static void ath10k_htc_send_one_skb(struct ath10k_htc_ep *ep, struct sk_buff *skb) +{ + struct ath10k_htc *htc = ep->htc; + struct ath10k *ar = htc->ar; + int ret; + + ret = ath10k_htc_send(htc, ep->eid, skb); + + if (ret) + skb_queue_head(&ep->tx_req_head, skb); + + ath10k_dbg(ar, ATH10K_DBG_HTC, "tx one status %d eid %d len %d pending count %d\n", + ret, ep->eid, skb->len, skb_queue_len(&ep->tx_req_head)); +} + +static int ath10k_htc_send_bundle_skbs(struct ath10k_htc_ep *ep) +{ + struct ath10k_htc *htc = ep->htc; + struct sk_buff *bundle_skb, *skb; + struct sk_buff_head tx_save_head; + struct ath10k_htc_hdr *hdr; + u8 *bundle_buf; + int ret = 0, credit_pad, credit_remainder, trans_len, bundles_left = 0; + + if (htc->ar->state == ATH10K_STATE_WEDGED) + return -ECOMM; + + if (ep->tx_credit_flow_enabled && + ep->tx_credits < ATH10K_MIN_CREDIT_PER_HTC_TX_BUNDLE) + return 0; + + bundles_left = ATH10K_MAX_MSG_PER_HTC_TX_BUNDLE * ep->tx_credit_size; + bundle_skb = dev_alloc_skb(bundles_left); + + if (!bundle_skb) + return -ENOMEM; + + bundle_buf = bundle_skb->data; + skb_queue_head_init(&tx_save_head); + + while (true) { + skb = skb_dequeue(&ep->tx_req_head); + if (!skb) + break; + + credit_pad = 0; + trans_len = skb->len + sizeof(*hdr); + credit_remainder = trans_len % ep->tx_credit_size; + + if (credit_remainder != 0) { + credit_pad = ep->tx_credit_size - credit_remainder; + trans_len += credit_pad; + } + + ret = ath10k_htc_consume_credit(ep, + bundle_buf + trans_len - bundle_skb->data, + false); + if (ret) { + skb_queue_head(&ep->tx_req_head, skb); + break; + } + + if (bundles_left < trans_len) { + bundle_skb->len = bundle_buf - bundle_skb->data; + ret = ath10k_htc_send_bundle(ep, bundle_skb, &tx_save_head); + + if (ret) { + skb_queue_head(&ep->tx_req_head, skb); + return ret; + } + + if (skb_queue_len(&ep->tx_req_head) == 0) { + ath10k_htc_send_one_skb(ep, skb); + return ret; + } + + if (ep->tx_credit_flow_enabled && + ep->tx_credits < ATH10K_MIN_CREDIT_PER_HTC_TX_BUNDLE) { + skb_queue_head(&ep->tx_req_head, skb); + return 0; + } + + bundles_left = + ATH10K_MAX_MSG_PER_HTC_TX_BUNDLE * ep->tx_credit_size; + bundle_skb = dev_alloc_skb(bundles_left); + + if (!bundle_skb) { + skb_queue_head(&ep->tx_req_head, skb); + return -ENOMEM; + } + bundle_buf = bundle_skb->data; + skb_queue_head_init(&tx_save_head); + } + + skb_push(skb, sizeof(struct ath10k_htc_hdr)); + ath10k_htc_prepare_tx_skb(ep, skb); + + memcpy(bundle_buf, skb->data, skb->len); + hdr = (struct ath10k_htc_hdr *)bundle_buf; + hdr->flags |= ATH10K_HTC_FLAG_SEND_BUNDLE; + hdr->pad_len = __cpu_to_le16(credit_pad); + bundle_buf += trans_len; + bundles_left -= trans_len; + skb_queue_tail(&tx_save_head, skb); + } + + if (bundle_buf != bundle_skb->data) { + bundle_skb->len = bundle_buf - bundle_skb->data; + ret = ath10k_htc_send_bundle(ep, bundle_skb, &tx_save_head); + } else { + dev_kfree_skb_any(bundle_skb); + } + + return ret; +} + +static void ath10k_htc_bundle_tx_work(struct work_struct *work) +{ + struct ath10k *ar = container_of(work, struct ath10k, bundle_tx_work); + struct ath10k_htc_ep *ep; + struct sk_buff *skb; + int i; + + for (i = 0; i < ARRAY_SIZE(ar->htc.endpoint); i++) { + ep = &ar->htc.endpoint[i]; + + if (!ep->bundle_tx) + continue; + + ath10k_dbg(ar, ATH10K_DBG_HTC, "bundle tx work eid %d count %d\n", + ep->eid, skb_queue_len(&ep->tx_req_head)); + + if (skb_queue_len(&ep->tx_req_head) >= + ATH10K_MIN_MSG_PER_HTC_TX_BUNDLE) { + ath10k_htc_send_bundle_skbs(ep); + } else { + skb = skb_dequeue(&ep->tx_req_head); + + if (!skb) + continue; + ath10k_htc_send_one_skb(ep, skb); + } + } +} + +static void ath10k_htc_tx_complete_work(struct work_struct *work) +{ + struct ath10k *ar = container_of(work, struct ath10k, tx_complete_work); + struct ath10k_htc_ep *ep; + enum ath10k_htc_ep_id eid; + struct sk_buff *skb; + int i; + + for (i = 0; i < ARRAY_SIZE(ar->htc.endpoint); i++) { + ep = &ar->htc.endpoint[i]; + eid = ep->eid; + if (ep->bundle_tx && eid == ar->htt.eid) { + ath10k_dbg(ar, ATH10K_DBG_HTC, "bundle tx complete eid %d pending complete count%d\n", + ep->eid, skb_queue_len(&ep->tx_complete_head)); + + while (true) { + skb = skb_dequeue(&ep->tx_complete_head); + if (!skb) + break; + ath10k_htc_notify_tx_completion(ep, skb); + } + } + } +} + +int ath10k_htc_send_hl(struct ath10k_htc *htc, + enum ath10k_htc_ep_id eid, + struct sk_buff *skb) +{ + struct ath10k_htc_ep *ep = &htc->endpoint[eid]; + struct ath10k *ar = htc->ar; + + ath10k_dbg(ar, ATH10K_DBG_HTC, "htc send hl eid %d bundle %d tx count %d len %d\n", + eid, ep->bundle_tx, skb_queue_len(&ep->tx_req_head), skb->len); + + if (ep->bundle_tx) { + skb_queue_tail(&ep->tx_req_head, skb); + queue_work(ar->workqueue, &ar->bundle_tx_work); + return 0; + } else { + return ath10k_htc_send(htc, eid, skb); + } +} + +void ath10k_htc_setup_tx_req(struct ath10k_htc_ep *ep) +{ + if (ep->htc->max_msgs_per_htc_bundle >= ATH10K_MIN_MSG_PER_HTC_TX_BUNDLE && + !ep->bundle_tx) { + ep->bundle_tx = true; + skb_queue_head_init(&ep->tx_req_head); + skb_queue_head_init(&ep->tx_complete_head); + } +} + +void ath10k_htc_stop_hl(struct ath10k *ar) +{ + struct ath10k_htc_ep *ep; + int i; + + cancel_work_sync(&ar->bundle_tx_work); + cancel_work_sync(&ar->tx_complete_work); + + for (i = 0; i < ARRAY_SIZE(ar->htc.endpoint); i++) { + ep = &ar->htc.endpoint[i]; + + if (!ep->bundle_tx) + continue; + + ath10k_dbg(ar, ATH10K_DBG_HTC, "stop tx work eid %d count %d\n", + ep->eid, skb_queue_len(&ep->tx_req_head)); + + skb_queue_purge(&ep->tx_req_head); + } +} + int ath10k_htc_wait_target(struct ath10k_htc *htc) { struct ath10k *ar = htc->ar; @@ -657,6 +965,9 @@ int ath10k_htc_wait_target(struct ath10k_htc *htc) htc->max_msgs_per_htc_bundle); } + INIT_WORK(&ar->bundle_tx_work, ath10k_htc_bundle_tx_work); + INIT_WORK(&ar->tx_complete_work, ath10k_htc_tx_complete_work); + return 0; } @@ -801,6 +1112,7 @@ setup: ep->max_tx_queue_depth = conn_req->max_send_queue_depth; ep->max_ep_message_len = __le16_to_cpu(resp_msg->max_msg_size); ep->tx_credits = tx_alloc; + ep->tx_credit_size = htc->target_credit_size; /* copy all the callbacks */ ep->ep_ops = conn_req->ep_ops; diff --git a/drivers/net/wireless/ath/ath10k/htc.h b/drivers/net/wireless/ath/ath10k/htc.h index 14e5c3f712c1..d045dbc42158 100644 --- a/drivers/net/wireless/ath/ath10k/htc.h +++ b/drivers/net/wireless/ath/ath10k/htc.h @@ -83,8 +83,14 @@ struct ath10k_htc_hdr { u8 seq_no; /* for tx */ u8 control_byte1; } __packed; - u8 pad0; - u8 pad1; + union { + __le16 pad_len; + struct { + u8 pad0; + u8 pad1; + } __packed; + } __packed; + } __packed __aligned(4); enum ath10k_ath10k_htc_msg_id { @@ -121,6 +127,10 @@ enum ath10k_htc_conn_svc_status { ATH10K_HTC_CONN_SVC_STATUS_NO_MORE_EP = 4 }; +#define ATH10K_MAX_MSG_PER_HTC_TX_BUNDLE 32 +#define ATH10K_MIN_MSG_PER_HTC_TX_BUNDLE 2 +#define ATH10K_MIN_CREDIT_PER_HTC_TX_BUNDLE 2 + enum ath10k_htc_setup_complete_flags { ATH10K_HTC_SETUP_COMPLETE_FLAGS_RX_BNDL_EN = 1 }; @@ -353,7 +363,12 @@ struct ath10k_htc_ep { u8 seq_no; /* for debugging */ int tx_credits; + int tx_credit_size; bool tx_credit_flow_enabled; + bool bundle_tx; + struct sk_buff_head tx_req_head; + struct sk_buff_head tx_complete_head; + }; struct ath10k_htc_svc_tx_credits { @@ -382,6 +397,7 @@ struct ath10k_htc { int ath10k_htc_init(struct ath10k *ar); int ath10k_htc_wait_target(struct ath10k_htc *htc); +void ath10k_htc_setup_tx_req(struct ath10k_htc_ep *ep); int ath10k_htc_start(struct ath10k_htc *htc); int ath10k_htc_connect_service(struct ath10k_htc *htc, struct ath10k_htc_svc_conn_req *conn_req, @@ -391,6 +407,10 @@ void ath10k_htc_change_tx_credit_flow(struct ath10k_htc *htc, bool enable); int ath10k_htc_send(struct ath10k_htc *htc, enum ath10k_htc_ep_id eid, struct sk_buff *packet); +void ath10k_htc_stop_hl(struct ath10k *ar); + +int ath10k_htc_send_hl(struct ath10k_htc *htc, enum ath10k_htc_ep_id eid, + struct sk_buff *packet); struct sk_buff *ath10k_htc_alloc_skb(struct ath10k *ar, int size); void ath10k_htc_tx_completion_handler(struct ath10k *ar, struct sk_buff *skb); void ath10k_htc_rx_completion_handler(struct ath10k *ar, struct sk_buff *skb); diff --git a/drivers/net/wireless/ath/ath10k/htt.c b/drivers/net/wireless/ath/ath10k/htt.c index 4354bf285ff1..127b4e4980ef 100644 --- a/drivers/net/wireless/ath/ath10k/htt.c +++ b/drivers/net/wireless/ath/ath10k/htt.c @@ -135,6 +135,8 @@ int ath10k_htt_connect(struct ath10k_htt *htt) { struct ath10k_htc_svc_conn_req conn_req; struct ath10k_htc_svc_conn_resp conn_resp; + struct ath10k *ar = htt->ar; + struct ath10k_htc_ep *ep; int status; memset(&conn_req, 0, sizeof(conn_req)); @@ -142,6 +144,7 @@ int ath10k_htt_connect(struct ath10k_htt *htt) conn_req.ep_ops.ep_tx_complete = ath10k_htt_htc_tx_complete; conn_req.ep_ops.ep_rx_complete = ath10k_htt_htc_t2h_msg_handler; + conn_req.ep_ops.ep_tx_credits = ath10k_htt_op_ep_tx_credits; /* connect to control service */ conn_req.service_id = ATH10K_HTC_SVC_ID_HTT_DATA_MSG; @@ -154,6 +157,11 @@ int ath10k_htt_connect(struct ath10k_htt *htt) htt->eid = conn_resp.eid; + if (ar->bus_param.dev_type == ATH10K_DEV_TYPE_HL) { + ep = &ar->htc.endpoint[htt->eid]; + ath10k_htc_setup_tx_req(ep); + } + htt->disable_tx_comp = ath10k_hif_get_htt_tx_complete(htt->ar); if (htt->disable_tx_comp) ath10k_htc_change_tx_credit_flow(&htt->ar->htc, htt->eid, true); diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h index b88c2f3787d8..462a25b3056d 100644 --- a/drivers/net/wireless/ath/ath10k/htt.h +++ b/drivers/net/wireless/ath/ath10k/htt.h @@ -2032,6 +2032,9 @@ struct ath10k_htt { const struct ath10k_htt_tx_ops *tx_ops; const struct ath10k_htt_rx_ops *rx_ops; bool disable_tx_comp; + bool bundle_tx; + struct sk_buff_head tx_req_head; + struct sk_buff_head tx_complete_head; }; struct ath10k_htt_tx_ops { @@ -2278,6 +2281,7 @@ int ath10k_htt_tx_fetch_resp(struct ath10k *ar, __le16 fetch_seq_num, struct htt_tx_fetch_record *records, size_t num_records); +void ath10k_htt_op_ep_tx_credits(struct ath10k *ar); void ath10k_htt_tx_txq_update(struct ieee80211_hw *hw, struct ieee80211_txq *txq); diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c index 64e45bfa5d05..816af1a8ad69 100644 --- a/drivers/net/wireless/ath/ath10k/htt_rx.c +++ b/drivers/net/wireless/ath/ath10k/htt_rx.c @@ -3919,6 +3919,7 @@ bool ath10k_htt_t2h_msg_handler(struct ath10k *ar, struct sk_buff *skb) ath10k_dbg(ar, ATH10K_DBG_HTT, "htt credit total %d\n", ep->tx_credits); + ep->ep_ops.ep_tx_credits(htc->ar); } break; } diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c index bcecf05fe2fd..ff044426c337 100644 --- a/drivers/net/wireless/ath/ath10k/htt_tx.c +++ b/drivers/net/wireless/ath/ath10k/htt_tx.c @@ -531,6 +531,7 @@ void ath10k_htt_tx_destroy(struct ath10k_htt *htt) void ath10k_htt_tx_stop(struct ath10k_htt *htt) { + ath10k_htc_stop_hl(htt->ar); idr_for_each(&htt->pending_tx, ath10k_htt_tx_clean_up_pending, htt->ar); idr_destroy(&htt->pending_tx); } @@ -541,6 +542,11 @@ void ath10k_htt_tx_free(struct ath10k_htt *htt) ath10k_htt_tx_destroy(htt); } +void ath10k_htt_op_ep_tx_credits(struct ath10k *ar) +{ + queue_work(ar->workqueue, &ar->bundle_tx_work); +} + void ath10k_htt_htc_tx_complete(struct ath10k *ar, struct sk_buff *skb) { struct ath10k_htt *htt = &ar->htt; @@ -1379,7 +1385,7 @@ static int ath10k_htt_tx_hl(struct ath10k_htt *htt, enum ath10k_hw_txrx_mode txm */ tx_desc->peerid = __cpu_to_le32(HTT_INVALID_PEERID); - res = ath10k_htc_send(&htt->ar->htc, htt->eid, msdu); + res = ath10k_htc_send_hl(&htt->ar->htc, htt->eid, msdu); out: return res; -- cgit v1.2.3-59-g8ed1b From 2f918ea98606100f3a6d47db7ff7c200838ec4f3 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Tue, 21 Apr 2020 15:09:35 +0300 Subject: ath10k: enable alt data of TX path for sdio The default credit size is 1792 bytes, but the IP mtu is 1500 bytes, then it has about 290 bytes's waste for each data packet on sdio transfer path for TX bundle, it will reduce the transmission utilization ratio for data packet. This patch enable the small credit size in firmware, firmware will use the new credit size 1556 bytes, it will increase the transmission utilization ratio for data packet on TX patch. It results in significant performance improvement on TX path. This patch only effect sdio chip, it will not effect PCI, SNOC etc. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00017-QCARMSWP-1. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200410061400.14231-3-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/core.c | 8 ++++---- drivers/net/wireless/ath/ath10k/htc.c | 12 ++++++++++-- drivers/net/wireless/ath/ath10k/htc.h | 13 +++++++++++-- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 8689c330fdd9..d96d178b4980 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -725,10 +725,10 @@ static int ath10k_init_sdio(struct ath10k *ar, enum ath10k_firmware_mode mode) param |= HI_ACS_FLAGS_SDIO_REDUCE_TX_COMPL_SET; - /* Alternate credit size of 1544 as used by SDIO firmware is - * not big enough for mac80211 / native wifi frames. disable it - */ - param &= ~HI_ACS_FLAGS_ALT_DATA_CREDIT_SIZE; + if (mode == ATH10K_FIRMWARE_MODE_NORMAL) + param |= HI_ACS_FLAGS_ALT_DATA_CREDIT_SIZE; + else + param &= ~HI_ACS_FLAGS_ALT_DATA_CREDIT_SIZE; if (mode == ATH10K_FIRMWARE_MODE_UTF) param &= ~HI_ACS_FLAGS_SDIO_SWAP_MAILBOX_SET; diff --git a/drivers/net/wireless/ath/ath10k/htc.c b/drivers/net/wireless/ath/ath10k/htc.c index ed4e0add997e..58ceba75d20a 100644 --- a/drivers/net/wireless/ath/ath10k/htc.c +++ b/drivers/net/wireless/ath/ath10k/htc.c @@ -957,12 +957,16 @@ int ath10k_htc_wait_target(struct ath10k_htc *htc) */ if (htc->control_resp_len >= sizeof(msg->hdr) + sizeof(msg->ready_ext)) { + htc->alt_data_credit_size = + __le16_to_cpu(msg->ready_ext.reserved) & + ATH10K_HTC_MSG_READY_EXT_ALT_DATA_MASK; htc->max_msgs_per_htc_bundle = min_t(u8, msg->ready_ext.max_msgs_per_htc_bundle, HTC_HOST_MAX_MSG_PER_RX_BUNDLE); ath10k_dbg(ar, ATH10K_DBG_HTC, - "Extended ready message. RX bundle size: %d\n", - htc->max_msgs_per_htc_bundle); + "Extended ready message RX bundle size %d alt size %d\n", + htc->max_msgs_per_htc_bundle, + htc->alt_data_credit_size); } INIT_WORK(&ar->bundle_tx_work, ath10k_htc_bundle_tx_work); @@ -1114,6 +1118,10 @@ setup: ep->tx_credits = tx_alloc; ep->tx_credit_size = htc->target_credit_size; + if (conn_req->service_id == ATH10K_HTC_SVC_ID_HTT_DATA_MSG && + htc->alt_data_credit_size != 0) + ep->tx_credit_size = htc->alt_data_credit_size; + /* copy all the callbacks */ ep->ep_ops = conn_req->ep_ops; diff --git a/drivers/net/wireless/ath/ath10k/htc.h b/drivers/net/wireless/ath/ath10k/htc.h index d045dbc42158..0d180faf3b77 100644 --- a/drivers/net/wireless/ath/ath10k/htc.h +++ b/drivers/net/wireless/ath/ath10k/htc.h @@ -119,6 +119,8 @@ enum ath10k_htc_conn_flags { #define ATH10K_HTC_CONN_FLAGS_RECV_ALLOC_LSB 8 }; +#define ATH10K_HTC_MSG_READY_EXT_ALT_DATA_MASK 0xFFF + enum ath10k_htc_conn_svc_status { ATH10K_HTC_CONN_SVC_STATUS_SUCCESS = 0, ATH10K_HTC_CONN_SVC_STATUS_NOT_FOUND = 1, @@ -155,8 +157,14 @@ struct ath10k_htc_ready_extended { struct ath10k_htc_ready base; u8 htc_version; /* @enum ath10k_htc_version */ u8 max_msgs_per_htc_bundle; - u8 pad0; - u8 pad1; + union { + __le16 reserved; + struct { + u8 pad0; + u8 pad1; + } __packed; + } __packed; + } __packed; struct ath10k_htc_conn_svc { @@ -393,6 +401,7 @@ struct ath10k_htc { int total_transmit_credits; int target_credit_size; u8 max_msgs_per_htc_bundle; + int alt_data_credit_size; }; int ath10k_htc_init(struct ath10k *ar); -- cgit v1.2.3-59-g8ed1b From dd7fc5545bbafdbd6c1efdc996b61883b285bdc5 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Tue, 21 Apr 2020 15:09:35 +0300 Subject: ath10k: add flush tx packets for SDIO chip When station connected to AP, and run TX traffic such as TCP/UDP, and system enter suspend state, then mac80211 call ath10k_flush with set drop flag, recently it only send wmi peer flush to firmware and firmware will flush all pending TX packets, for PCIe, firmware will indicate the TX packets status to ath10k, and then ath10k indicate to mac80211 TX complete with the status, then all the packets has been flushed at this moment. For SDIO chip, it is different, its TX complete indication is disabled by default, and it has a tx queue in ath10k, and its tx credit control is enabled, total tx credit is 96, when its credit is not sufficient, then the packets will buffered in the tx queue of ath10k, max packets is TARGET_TLV_NUM_MSDU_DESC_HL which is 1024, for SDIO, when mac80211 call ath10k_flush with set drop flag, maybe it have pending packets in tx queue of ath10k, and if it does not have sufficient tx credit, the packets will stay in queue untill tx credit report from firmware, if it is a noisy environment, tx speed is low and the tx credit report from firmware will delay more time, then the num_pending_tx will remain > 0 untill all packets send to firmware. After the 1st ath10k_flush, mac80211 will call the 2nd ath10k_flush without set drop flag immediately, then it will call to ath10k_mac_wait_tx_complete, and it wait untill num_pending_tx become to 0, in noisy environment, it is esay to wait about near 5 seconds, then it cause the suspend take long time. 1st and 2nd callstack of ath10k_flush [ 303.740427] ath10k_sdio mmc1:0001:1: ath10k_flush drop:1, pending:0-0 [ 303.740495] ------------[ cut here ]------------ [ 303.740739] WARNING: CPU: 1 PID: 3921 at /mnt/host/source/src/third_party/kernel/v4.19/drivers/net/wireless/ath/ath10k/mac.c:7025 ath10k_flush+0x54/0x104 [ath10k_core] [ 303.740757] Modules linked in: bridge stp llc ath10k_sdio ath10k_core rfcomm uinput cros_ec_rpmsg mtk_seninf mtk_cam_isp mtk_vcodec_enc mtk_fd mtk_vcodec_dec mtk_vcodec_common mtk_dip mtk_mdp3 videobuf2_dma_contig videobuf2_memops v4l2_mem2mem videobuf2_v4l2 videobuf2_common hid_google_hammer hci_uart btqca bluetooth dw9768 ov8856 ecdh_generic ov02a10 v4l2_fwnode mtk_scp mtk_rpmsg rpmsg_core mtk_scp_ipi ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring cros_ec_sensors_sync cros_ec_light_prox cros_ec_sensors industrialio_triggered_buffer [ 303.740914] kfifo_buf cros_ec_activity cros_ec_sensors_core lzo_rle lzo_compress ath mac80211 zram cfg80211 joydev [last unloaded: ath10k_core] [ 303.741009] CPU: 1 PID: 3921 Comm: kworker/u16:10 Tainted: G W 4.19.95 #2 [ 303.741027] Hardware name: MediaTek krane sku176 board (DT) [ 303.741061] Workqueue: events_unbound async_run_entry_fn [ 303.741086] pstate: 60000005 (nZCv daif -PAN -UAO) [ 303.741166] pc : ath10k_flush+0x54/0x104 [ath10k_core] [ 303.741244] lr : ath10k_flush+0x54/0x104 [ath10k_core] [ 303.741260] sp : ffffffdf080e77a0 [ 303.741276] x29: ffffffdf080e77a0 x28: ffffffdef3730040 [ 303.741300] x27: ffffff907c2240a0 x26: ffffffde6ff39afc [ 303.741321] x25: ffffffdef3730040 x24: ffffff907bf61018 [ 303.741343] x23: ffffff907c2240a0 x22: ffffffde6ff39a50 [ 303.741364] x21: 0000000000000001 x20: ffffffde6ff39a50 [ 303.741385] x19: ffffffde6bac2420 x18: 0000000000017200 [ 303.741407] x17: ffffff907c24a000 x16: 0000000000000037 [ 303.741428] x15: ffffff907b49a568 x14: ffffff907cf332c1 [ 303.741476] x13: 00000000000922e4 x12: 0000000000000000 [ 303.741497] x11: 0000000000000001 x10: 0000000000000007 [ 303.741518] x9 : f2256b8c1de4bc00 x8 : f2256b8c1de4bc00 [ 303.741539] x7 : ffffff907ab5e764 x6 : 0000000000000000 [ 303.741560] x5 : 0000000000000080 x4 : 0000000000000001 [ 303.741582] x3 : ffffffdf080e74a8 x2 : ffffff907aa91244 [ 303.741603] x1 : ffffffdf080e74a8 x0 : 0000000000000024 [ 303.741624] Call trace: [ 303.741701] ath10k_flush+0x54/0x104 [ath10k_core] [ 303.741941] __ieee80211_flush_queues+0x1dc/0x358 [mac80211] [ 303.742098] ieee80211_flush_queues+0x34/0x44 [mac80211] [ 303.742253] ieee80211_set_disassoc+0xc0/0x5ec [mac80211] [ 303.742399] ieee80211_mgd_deauth+0x720/0x7d4 [mac80211] [ 303.742535] ieee80211_deauth+0x24/0x30 [mac80211] [ 303.742720] cfg80211_mlme_deauth+0x250/0x3bc [cfg80211] [ 303.742849] cfg80211_mlme_down+0x90/0xd0 [cfg80211] [ 303.742971] cfg80211_disconnect+0x340/0x3a0 [cfg80211] [ 303.743087] __cfg80211_leave+0xe4/0x17c [cfg80211] [ 303.743203] cfg80211_leave+0x38/0x50 [cfg80211] [ 303.743319] wiphy_suspend+0x84/0x5bc [cfg80211] [ 303.743335] dpm_run_callback+0x170/0x304 [ 303.743346] __device_suspend+0x2dc/0x3e8 [ 303.743356] async_suspend+0x2c/0xb0 [ 303.743370] async_run_entry_fn+0x48/0xf8 [ 303.743383] process_one_work+0x304/0x604 [ 303.743394] worker_thread+0x248/0x3f4 [ 303.743403] kthread+0x120/0x130 [ 303.743416] ret_from_fork+0x10/0x18 [ 303.743812] ath10k_sdio mmc1:0001:1: ath10k_flush drop:0, pending:0-0 [ 303.743858] ------------[ cut here ]------------ [ 303.744057] WARNING: CPU: 1 PID: 3921 at /mnt/host/source/src/third_party/kernel/v4.19/drivers/net/wireless/ath/ath10k/mac.c:7025 ath10k_flush+0x54/0x104 [ath10k_core] [ 303.744075] Modules linked in: bridge stp llc ath10k_sdio ath10k_core rfcomm uinput cros_ec_rpmsg mtk_seninf mtk_cam_isp mtk_vcodec_enc mtk_fd mtk_vcodec_dec mtk_vcodec_common mtk_dip mtk_mdp3 videobuf2_dma_contig videobuf2_memops v4l2_mem2mem videobuf2_v4l2 videobuf2_common hid_google_hammer hci_uart btqca bluetooth dw9768 ov8856 ecdh_generic ov02a10 v4l2_fwnode mtk_scp mtk_rpmsg rpmsg_core mtk_scp_ipi ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring cros_ec_sensors_sync cros_ec_light_prox cros_ec_sensors industrialio_triggered_buffer kfifo_buf cros_ec_activity cros_ec_sensors_core lzo_rle lzo_compress ath mac80211 zram cfg80211 joydev [last unloaded: ath10k_core] [ 303.744256] CPU: 1 PID: 3921 Comm: kworker/u16:10 Tainted: G W 4.19.95 #2 [ 303.744273] Hardware name: MediaTek krane sku176 board (DT) [ 303.744301] Workqueue: events_unbound async_run_entry_fn [ 303.744325] pstate: 60000005 (nZCv daif -PAN -UAO) [ 303.744403] pc : ath10k_flush+0x54/0x104 [ath10k_core] [ 303.744480] lr : ath10k_flush+0x54/0x104 [ath10k_core] [ 303.744496] sp : ffffffdf080e77a0 [ 303.744512] x29: ffffffdf080e77a0 x28: ffffffdef3730040 [ 303.744534] x27: ffffff907c2240a0 x26: ffffffde6ff39afc [ 303.744556] x25: ffffffdef3730040 x24: ffffff907bf61018 [ 303.744577] x23: ffffff907c2240a0 x22: ffffffde6ff39a50 [ 303.744598] x21: 0000000000000000 x20: ffffffde6ff39a50 [ 303.744620] x19: ffffffde6bac2420 x18: 000000000001831c [ 303.744641] x17: ffffff907c24a000 x16: 0000000000000037 [ 303.744662] x15: ffffff907b49a568 x14: ffffff907cf332c1 [ 303.744683] x13: 00000000000922ea x12: 0000000000000000 [ 303.744704] x11: 0000000000000001 x10: 0000000000000007 [ 303.744747] x9 : f2256b8c1de4bc00 x8 : f2256b8c1de4bc00 [ 303.744768] x7 : ffffff907ab5e764 x6 : 0000000000000000 [ 303.744789] x5 : 0000000000000080 x4 : 0000000000000001 [ 303.744810] x3 : ffffffdf080e74a8 x2 : ffffff907aa91244 [ 303.744831] x1 : ffffffdf080e74a8 x0 : 0000000000000024 [ 303.744853] Call trace: [ 303.744929] ath10k_flush+0x54/0x104 [ath10k_core] [ 303.745098] __ieee80211_flush_queues+0x1dc/0x358 [mac80211] [ 303.745277] ieee80211_flush_queues+0x34/0x44 [mac80211] [ 303.745424] ieee80211_set_disassoc+0x108/0x5ec [mac80211] [ 303.745569] ieee80211_mgd_deauth+0x720/0x7d4 [mac80211] [ 303.745706] ieee80211_deauth+0x24/0x30 [mac80211] [ 303.745853] cfg80211_mlme_deauth+0x250/0x3bc [cfg80211] [ 303.745979] cfg80211_mlme_down+0x90/0xd0 [cfg80211] [ 303.746103] cfg80211_disconnect+0x340/0x3a0 [cfg80211] [ 303.746219] __cfg80211_leave+0xe4/0x17c [cfg80211] [ 303.746335] cfg80211_leave+0x38/0x50 [cfg80211] [ 303.746452] wiphy_suspend+0x84/0x5bc [cfg80211] [ 303.746467] dpm_run_callback+0x170/0x304 [ 303.746477] __device_suspend+0x2dc/0x3e8 [ 303.746487] async_suspend+0x2c/0xb0 [ 303.746498] async_run_entry_fn+0x48/0xf8 [ 303.746510] process_one_work+0x304/0x604 [ 303.746521] worker_thread+0x248/0x3f4 [ 303.746530] kthread+0x120/0x130 [ 303.746542] ret_from_fork+0x10/0x18 one sample's debugging log: it wait 3190 ms(5000 - 1810). 1st ath10k_flush, it has 120 packets in tx queue of ath10k: <...>-1513 [000] .... 25374.786005: ath10k_log_err: ath10k_sdio mmc1:0001:1 ath10k_flush drop:1, pending:120-0 <...>-1513 [000] ...1 25374.788375: ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_htt_tx_mgmt_inc_pending htt->num_pending_mgmt_tx:0 <...>-1500 [001] .... 25374.790143: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx work, eid:1, count:121 2st ath10k_flush, it has 121 packets in tx queue of ath10k: <...>-1513 [000] .... 25374.790571: ath10k_log_err: ath10k_sdio mmc1:0001:1 ath10k_flush drop:0, pending:121-0 <...>-1513 [000] .... 25374.791990: ath10k_log_err: ath10k_sdio mmc1:0001:1 ath10k_mac_wait_tx_complete state:1 pending:121-0 <...>-1508 [001] .... 25374.792696: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit update: delta:46 <...>-1508 [001] .... 25374.792700: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit total:46 <...>-1508 [001] .... 25374.792729: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx work, eid:1, count:121 <...>-1508 [001] .... 25374.792937: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx status:0, eid:1, req count:88, count:32, len:49792 <...>-1508 [001] .... 25374.793031: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx status:0, eid:1, req count:75, count:14, len:21784 kworker/u16:0-25773 [003] .... 25374.793701: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx complete, eid:1, pending complete count:46 <...>-1881 [000] .... 25375.073178: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit update: delta:24 <...>-1881 [000] .... 25375.073182: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit total:24 <...>-1881 [000] .... 25375.073429: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx work, eid:1, count:75 <...>-1879 [001] .... 25375.074090: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx complete, eid:1, pending complete count:24 <...>-1881 [000] .... 25375.074123: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx status:0, eid:1, req count:51, count:24, len:37344 <...>-1879 [001] .... 25375.270126: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit update: delta:26 <...>-1879 [001] .... 25375.270130: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit total:26 <...>-1488 [000] .... 25375.270174: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx work, eid:1, count:51 <...>-1488 [000] .... 25375.270529: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx status:0, eid:1, req count:25, count:26, len:40456 <...>-1879 [001] .... 25375.270693: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx complete, eid:1, pending complete count:26 <...>-1488 [001] .... 25377.775885: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit update: delta:12 <...>-1488 [001] .... 25377.775890: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit total:12 <...>-1488 [001] .... 25377.775933: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx work, eid:1, count:25 <...>-1488 [001] .... 25377.776059: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx status:0, eid:1, req count:13, count:12, len:18672 <...>-1879 [001] .... 25377.776100: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx complete, eid:1, pending complete count:12 <...>-1488 [001] .... 25377.878079: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit update: delta:15 <...>-1488 [001] .... 25377.878087: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit total:15 <...>-1879 [000] .... 25377.878323: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx work, eid:1, count:13 <...>-1879 [000] .... 25377.878487: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx status:0, eid:1, req count:0, count:13, len:20228 <...>-1879 [000] .... 25377.878497: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx complete, eid:1, pending complete count:13 <...>-1488 [001] .... 25377.919927: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit update: delta:11 <...>-1488 [001] .... 25377.919932: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 credit total:13 <...>-1488 [001] .... 25377.919976: ath10k_log_dbg: ath10k_sdio mmc1:0001:1 bundle tx work, eid:1, count:0 <...>-1881 [000] .... 25377.982645: ath10k_log_warn: ath10k_sdio mmc1:0001:1 HTT_T2H_MSG_TYPE_MGMT_TX_COMPLETION status:0 <...>-1513 [001] .... 25377.982973: ath10k_log_err: ath10k_sdio mmc1:0001:1 ath10k_mac_wait_tx_complete time_left:1810, pending:0-0 Flush all pending TX packets for the 1st ath10k_flush reduced the wait time of the 2nd ath10k_flush and then suspend take short time. This Patch only effect SDIO chips. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00042. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200415233730.10581-1-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/htt.h | 7 +++++++ drivers/net/wireless/ath/ath10k/htt_tx.c | 8 +++++++- drivers/net/wireless/ath/ath10k/mac.c | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h index 462a25b3056d..8f3710cf28f4 100644 --- a/drivers/net/wireless/ath/ath10k/htt.h +++ b/drivers/net/wireless/ath/ath10k/htt.h @@ -2049,6 +2049,7 @@ struct ath10k_htt_tx_ops { int (*htt_h2t_aggr_cfg_msg)(struct ath10k_htt *htt, u8 max_subfrms_ampdu, u8 max_subfrms_amsdu); + void (*htt_flush_tx)(struct ath10k_htt *htt); }; static inline int ath10k_htt_send_rx_ring_cfg(struct ath10k_htt *htt) @@ -2088,6 +2089,12 @@ static inline int ath10k_htt_tx(struct ath10k_htt *htt, return htt->tx_ops->htt_tx(htt, txmode, msdu); } +static inline void ath10k_htt_flush_tx(struct ath10k_htt *htt) +{ + if (htt->tx_ops->htt_flush_tx) + htt->tx_ops->htt_flush_tx(htt); +} + static inline int ath10k_htt_alloc_txbuff(struct ath10k_htt *htt) { if (!htt->tx_ops->htt_alloc_txbuff) diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c index ff044426c337..4fd10ac3a941 100644 --- a/drivers/net/wireless/ath/ath10k/htt_tx.c +++ b/drivers/net/wireless/ath/ath10k/htt_tx.c @@ -529,10 +529,15 @@ void ath10k_htt_tx_destroy(struct ath10k_htt *htt) htt->tx_mem_allocated = false; } -void ath10k_htt_tx_stop(struct ath10k_htt *htt) +static void ath10k_htt_flush_tx_queue(struct ath10k_htt *htt) { ath10k_htc_stop_hl(htt->ar); idr_for_each(&htt->pending_tx, ath10k_htt_tx_clean_up_pending, htt->ar); +} + +void ath10k_htt_tx_stop(struct ath10k_htt *htt) +{ + ath10k_htt_flush_tx_queue(htt); idr_destroy(&htt->pending_tx); } @@ -1825,6 +1830,7 @@ static const struct ath10k_htt_tx_ops htt_tx_ops_hl = { .htt_send_frag_desc_bank_cfg = ath10k_htt_send_frag_desc_bank_cfg_32, .htt_tx = ath10k_htt_tx_hl, .htt_h2t_aggr_cfg_msg = ath10k_htt_h2t_aggr_cfg_msg_32, + .htt_flush_tx = ath10k_htt_flush_tx_queue, }; void ath10k_htt_set_tx_ops(struct ath10k_htt *htt) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index a59a7a5631a8..6791c0035be0 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -7224,6 +7224,7 @@ static void ath10k_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, ath10k_wmi_peer_flush(ar, arvif->vdev_id, arvif->bssid, bitmap); } + ath10k_htt_flush_tx(&ar->htt); } return; } -- cgit v1.2.3-59-g8ed1b From 3fef10ec321ced8e75cd0a28616402401cbbcaf4 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Apr 2020 19:15:25 +0200 Subject: Bluetooth: btbcm: Drop upper nibble version check from btbcm_initialize() btbcm_initialize() must either return an error; or fill the passed in fw_name, otherwise we end up passing uninitialized stack memory to request_firmware(). Since we have a fallback hw_name of "BCM" not having a known version in the subver field does not matter, drop the check so that we always fill the passed in fw_name. Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 1f498f358f60..b9e1fe052148 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -440,10 +440,6 @@ int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len, return err; } - /* Upper nibble of rev should be between 0 and 3? */ - if (((rev & 0xf000) >> 12) > 3) - return 0; - bcm_subver_table = (hdev->bus == HCI_USB) ? bcm_usb_subver_table : bcm_uart_subver_table; -- cgit v1.2.3-59-g8ed1b From f8c51d28e9d13f20c33f4f2f46f8e7d0b8476b9c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Apr 2020 19:15:26 +0200 Subject: Bluetooth: btbcm: Move setting of USE_BDADDR_PROPERTY quirk to hci_bcm.c btbcm_finalize() is currently only used by UART attached BCM devices. Move the setting of the USE_BDADDR_PROPERTY quirk, which we only want for UART attached devices to hci_bcm in preparation for using btbcm_finalize() for USB attached devices too. Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 6 ------ drivers/bluetooth/hci_bcm.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index b9e1fe052148..8052a0e8dbfb 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -488,12 +488,6 @@ int btbcm_finalize(struct hci_dev *hdev) set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks); - /* Some devices ship with the controller default address. - * Allow the bootloader to set a valid address through the - * device tree. - */ - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); - return 0; } EXPORT_SYMBOL_GPL(btbcm_finalize); diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 19e4587f366c..c42bf791a61b 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -623,6 +623,12 @@ finalize: if (err) return err; + /* Some devices ship with the controller default address. + * Allow the bootloader to set a valid address through the + * device tree. + */ + set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hu->hdev->quirks); + if (!bcm_request_irq(bcm)) err = bcm_setup_sleep(hu); -- cgit v1.2.3-59-g8ed1b From 0287c5d84f5c0cde6c39362d56c7002dc4acedb3 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Apr 2020 19:15:27 +0200 Subject: Bluetooth: btbcm: Fold Patch loading + applying into btbcm_initialize() Instead of having btbcm_initialize() fill a passed in fw_name buffer and then have its callers use that to request the firmware + load it into the HCI, make btbcm_initialize() do this itself the first time it is called (its get called a second time to reset the HCI after the firmware has been loaded). This removes some code duplication and makes it easier for further patches in this series to try more then 1 firmware filename. Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 50 +++++++++++++++++++++++++++------------------ drivers/bluetooth/btbcm.h | 6 ++---- drivers/bluetooth/hci_bcm.c | 19 +++-------------- 3 files changed, 35 insertions(+), 40 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 8052a0e8dbfb..c22e90a5e288 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -27,6 +27,8 @@ #define BDADDR_BCM4345C5 (&(bdaddr_t) {{0xac, 0x1f, 0x00, 0xc5, 0x45, 0x43}}) #define BDADDR_BCM43341B (&(bdaddr_t) {{0xac, 0x1f, 0x00, 0x1b, 0x34, 0x43}}) +#define BCM_FW_NAME_LEN 64 + int btbcm_check_bdaddr(struct hci_dev *hdev) { struct hci_rp_read_bd_addr *bda; @@ -408,14 +410,15 @@ static const struct bcm_subver_table bcm_usb_subver_table[] = { { } }; -int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len, - bool reinit) +int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) { u16 subver, rev, pid, vid; const char *hw_name = "BCM"; struct sk_buff *skb; struct hci_rp_read_local_version *ver; const struct bcm_subver_table *bcm_subver_table; + char fw_name[BCM_FW_NAME_LEN]; + const struct firmware *fw; int i, err; /* Reset */ @@ -434,7 +437,7 @@ int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len, kfree_skb(skb); /* Read controller information */ - if (!reinit) { + if (!(*fw_load_done)) { err = btbcm_read_info(hdev); if (err) return err; @@ -460,27 +463,42 @@ int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len, pid = get_unaligned_le16(skb->data + 3); kfree_skb(skb); - snprintf(fw_name, len, "brcm/%s-%4.4x-%4.4x.hcd", + snprintf(fw_name, BCM_FW_NAME_LEN, "brcm/%s-%4.4x-%4.4x.hcd", hw_name, vid, pid); } else { - snprintf(fw_name, len, "brcm/%s.hcd", hw_name); + snprintf(fw_name, BCM_FW_NAME_LEN, "brcm/%s.hcd", hw_name); } bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u", hw_name, (subver & 0xe000) >> 13, (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); + if (*fw_load_done) + return 0; + + err = request_firmware(&fw, fw_name, &hdev->dev); + if (err) { + bt_dev_info(hdev, "BCM: Patch %s not found", fw_name); + return 0; + } + + err = btbcm_patchram(hdev, fw); + if (err) + bt_dev_info(hdev, "BCM: Patch failed (%d)", err); + + release_firmware(fw); + *fw_load_done = true; return 0; } EXPORT_SYMBOL_GPL(btbcm_initialize); int btbcm_finalize(struct hci_dev *hdev) { - char fw_name[64]; + bool fw_load_done = true; int err; /* Re-initialize */ - err = btbcm_initialize(hdev, fw_name, sizeof(fw_name), true); + err = btbcm_initialize(hdev, &fw_load_done); if (err) return err; @@ -494,28 +512,20 @@ EXPORT_SYMBOL_GPL(btbcm_finalize); int btbcm_setup_patchram(struct hci_dev *hdev) { - char fw_name[64]; - const struct firmware *fw; + bool fw_load_done = false; struct sk_buff *skb; int err; /* Initialize */ - err = btbcm_initialize(hdev, fw_name, sizeof(fw_name), false); + err = btbcm_initialize(hdev, &fw_load_done); if (err) return err; - err = request_firmware(&fw, fw_name, &hdev->dev); - if (err < 0) { - bt_dev_info(hdev, "BCM: Patch %s not found", fw_name); + if (!fw_load_done) goto done; - } - btbcm_patchram(hdev, fw); - - release_firmware(fw); - - /* Re-initialize */ - err = btbcm_initialize(hdev, fw_name, sizeof(fw_name), true); + /* Re-initialize after loading Patch */ + err = btbcm_initialize(hdev, &fw_load_done); if (err) return err; diff --git a/drivers/bluetooth/btbcm.h b/drivers/bluetooth/btbcm.h index 014ef847a486..8437caba421d 100644 --- a/drivers/bluetooth/btbcm.h +++ b/drivers/bluetooth/btbcm.h @@ -62,8 +62,7 @@ int btbcm_write_pcm_int_params(struct hci_dev *hdev, int btbcm_setup_patchram(struct hci_dev *hdev); int btbcm_setup_apple(struct hci_dev *hdev); -int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len, - bool reinit); +int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done); int btbcm_finalize(struct hci_dev *hdev); #else @@ -105,8 +104,7 @@ static inline int btbcm_setup_apple(struct hci_dev *hdev) return 0; } -static inline int btbcm_initialize(struct hci_dev *hdev, char *fw_name, - size_t len, bool reinit) +static inline int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) { return 0; } diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index c42bf791a61b..61731cb451cb 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -553,8 +553,7 @@ static int bcm_flush(struct hci_uart *hu) static int bcm_setup(struct hci_uart *hu) { struct bcm_data *bcm = hu->priv; - char fw_name[64]; - const struct firmware *fw; + bool fw_load_done = false; unsigned int speed; int err; @@ -563,21 +562,12 @@ static int bcm_setup(struct hci_uart *hu) hu->hdev->set_diag = bcm_set_diag; hu->hdev->set_bdaddr = btbcm_set_bdaddr; - err = btbcm_initialize(hu->hdev, fw_name, sizeof(fw_name), false); + err = btbcm_initialize(hu->hdev, &fw_load_done); if (err) return err; - err = request_firmware(&fw, fw_name, &hu->hdev->dev); - if (err < 0) { - bt_dev_info(hu->hdev, "BCM: Patch %s not found", fw_name); + if (!fw_load_done) return 0; - } - - err = btbcm_patchram(hu->hdev, fw); - if (err) { - bt_dev_info(hu->hdev, "BCM: Patch failed (%d)", err); - goto finalize; - } /* Init speed if any */ if (hu->init_speed) @@ -616,9 +606,6 @@ static int bcm_setup(struct hci_uart *hu) btbcm_write_pcm_int_params(hu->hdev, ¶ms); } -finalize: - release_firmware(fw); - err = btbcm_finalize(hu->hdev); if (err) return err; -- cgit v1.2.3-59-g8ed1b From 2fcdd562b91bdc29dddd406f7278102e4d90b1fa Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Apr 2020 19:15:28 +0200 Subject: Bluetooth: btbcm: Make btbcm_initialize() print local-name on re-init too Make btbcm_initialize() get and print the device's local-name on re-init too, this will make us also print the local-name after loading the Patch on UART attached devices making things more consistent. This also removes some code duplication from btbcm_setup_patchram() and allows more code duplication removal there in a follow-up patch. Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index c22e90a5e288..3404021b10bd 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -360,6 +360,13 @@ static int btbcm_read_info(struct hci_dev *hdev) bt_dev_info(hdev, "BCM: features 0x%2.2x", skb->data[1]); kfree_skb(skb); + return 0; +} + +static int btbcm_print_local_name(struct hci_dev *hdev) +{ + struct sk_buff *skb; + /* Read Local Name */ skb = btbcm_read_local_name(hdev); if (IS_ERR(skb)) @@ -442,6 +449,9 @@ int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) if (err) return err; } + err = btbcm_print_local_name(hdev); + if (err) + return err; bcm_subver_table = (hdev->bus == HCI_USB) ? bcm_usb_subver_table : bcm_uart_subver_table; @@ -513,7 +523,6 @@ EXPORT_SYMBOL_GPL(btbcm_finalize); int btbcm_setup_patchram(struct hci_dev *hdev) { bool fw_load_done = false; - struct sk_buff *skb; int err; /* Initialize */ @@ -529,14 +538,6 @@ int btbcm_setup_patchram(struct hci_dev *hdev) if (err) return err; - /* Read Local Name */ - skb = btbcm_read_local_name(hdev); - if (IS_ERR(skb)) - return PTR_ERR(skb); - - bt_dev_info(hdev, "%s", (char *)(skb->data + 1)); - kfree_skb(skb); - done: btbcm_check_bdaddr(hdev); -- cgit v1.2.3-59-g8ed1b From 0383f16a87c4dec6840cdbb80c2a30ecfdc2ffb0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Apr 2020 19:15:29 +0200 Subject: Bluetooth: btbcm: Make btbcm_setup_patchram use btbcm_finalize On UART attached devices we do: 1. btbcm_initialize() 2. Setup UART baudrate, etc. 3. btbcm_finalize() After our previous changes we can now also use btbcm_finalize() from the btbcm_setup_patchram() function used on USB devices without any functional changes. This completes unifying the USB and UART paths as much as possible. Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 27 ++++++++------------------- drivers/bluetooth/btbcm.h | 4 ++-- drivers/bluetooth/hci_bcm.c | 2 +- 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 3404021b10bd..cc3628cace35 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -502,15 +502,16 @@ int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) } EXPORT_SYMBOL_GPL(btbcm_initialize); -int btbcm_finalize(struct hci_dev *hdev) +int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done) { - bool fw_load_done = true; int err; - /* Re-initialize */ - err = btbcm_initialize(hdev, &fw_load_done); - if (err) - return err; + /* Re-initialize if necessary */ + if (*fw_load_done) { + err = btbcm_initialize(hdev, fw_load_done); + if (err) + return err; + } btbcm_check_bdaddr(hdev); @@ -530,20 +531,8 @@ int btbcm_setup_patchram(struct hci_dev *hdev) if (err) return err; - if (!fw_load_done) - goto done; - /* Re-initialize after loading Patch */ - err = btbcm_initialize(hdev, &fw_load_done); - if (err) - return err; - -done: - btbcm_check_bdaddr(hdev); - - set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks); - - return 0; + return btbcm_finalize(hdev, &fw_load_done); } EXPORT_SYMBOL_GPL(btbcm_setup_patchram); diff --git a/drivers/bluetooth/btbcm.h b/drivers/bluetooth/btbcm.h index 8437caba421d..8bf01565fdfc 100644 --- a/drivers/bluetooth/btbcm.h +++ b/drivers/bluetooth/btbcm.h @@ -63,7 +63,7 @@ int btbcm_setup_patchram(struct hci_dev *hdev); int btbcm_setup_apple(struct hci_dev *hdev); int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done); -int btbcm_finalize(struct hci_dev *hdev); +int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done); #else @@ -109,7 +109,7 @@ static inline int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) return 0; } -static inline int btbcm_finalize(struct hci_dev *hdev) +static inline int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done) { return 0; } diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 61731cb451cb..8ea5ca8d71d6 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -606,7 +606,7 @@ static int bcm_setup(struct hci_uart *hu) btbcm_write_pcm_int_params(hu->hdev, ¶ms); } - err = btbcm_finalize(hu->hdev); + err = btbcm_finalize(hu->hdev, &fw_load_done); if (err) return err; -- cgit v1.2.3-59-g8ed1b From f53b975cf113fa0dca9c7bba067c3d749682cc82 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Apr 2020 19:15:30 +0200 Subject: Bluetooth: btbcm: Bail sooner from btbcm_initialize() when not loading fw If we have already loaded the firmware/patchram and btbcm_initialize() is called to re-init the HCI after this then there is no need to get the USB device-ids and build a firmware-filename out of these. Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index cc3628cace35..9fa153b35825 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -463,6 +463,13 @@ int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) } } + bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u", + hw_name, (subver & 0xe000) >> 13, + (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); + + if (*fw_load_done) + return 0; + if (hdev->bus == HCI_USB) { /* Read USB Product Info */ skb = btbcm_read_usb_product(hdev); @@ -479,13 +486,6 @@ int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) snprintf(fw_name, BCM_FW_NAME_LEN, "brcm/%s.hcd", hw_name); } - bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u", - hw_name, (subver & 0xe000) >> 13, - (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); - - if (*fw_load_done) - return 0; - err = request_firmware(&fw, fw_name, &hdev->dev); if (err) { bt_dev_info(hdev, "BCM: Patch %s not found", fw_name); -- cgit v1.2.3-59-g8ed1b From 74530a639adfa2b2162df6a688c6367ecae6a3ca Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Apr 2020 19:15:31 +0200 Subject: Bluetooth: btbcm: Try multiple Patch filenames when loading the Patch firmware Currently the bcm_uart_subver_ and bcm_usb_subver_table-s lack entries for various newer chipsets. This makes the code use just "BCM" as prefix for the filename to pass to request-firmware, making it harder for users to figure out which firmware they need. This especially a problem with UART attached devices where this leads to the filename being "BCM.hcd". If we add new entries to the subver-tables now, then this will change what firmware file the kernel looks for, e.g. currently linux-firmware contains a brcm/BCM-0bb4-0306.hcd file. If we add the info for the BCM20703A1 to the subver table, then this will change to brcm/BCM20703A1-0bb4-0306.hcd. This will cause the file to no longer get loaded breaking Bluetooth for existing users, going against the no regressions policy. To avoid this regression make the btbcm code try multiple filenames, first try the fullname, e.g. BCM20703A1-0bb4-0306.hcd and if that is not found, then fallback to the name with just BCM as prefix. This commit also adds an info message which filename was used, this makes the output look like this for example: [ 57.387867] Bluetooth: hci0: BCM20703A1 [ 57.387870] Bluetooth: hci0: BCM20703A1 (001.001.005) build 0000 [ 57.389438] Bluetooth: hci0: BCM20703A1 'brcm/BCM20703A1-0a5c-6410.hcd' Patch [ 58.681769] Bluetooth: hci0: BCM20703A1 Generic USB 20Mhz fcbga_BU [ 58.681772] Bluetooth: hci0: BCM20703A1 (001.001.005) build 0481 Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 59 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 9fa153b35825..739ba1200f5d 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -28,6 +28,9 @@ #define BDADDR_BCM43341B (&(bdaddr_t) {{0xac, 0x1f, 0x00, 0x1b, 0x34, 0x43}}) #define BCM_FW_NAME_LEN 64 +#define BCM_FW_NAME_COUNT_MAX 2 +/* For kmalloc-ing the fw-name array instead of putting it on the stack */ +typedef char bcm_fw_name[BCM_FW_NAME_LEN]; int btbcm_check_bdaddr(struct hci_dev *hdev) { @@ -420,11 +423,13 @@ static const struct bcm_subver_table bcm_usb_subver_table[] = { int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) { u16 subver, rev, pid, vid; - const char *hw_name = "BCM"; struct sk_buff *skb; struct hci_rp_read_local_version *ver; const struct bcm_subver_table *bcm_subver_table; - char fw_name[BCM_FW_NAME_LEN]; + const char *hw_name = NULL; + char postfix[16] = ""; + int fw_name_count = 0; + bcm_fw_name *fw_name; const struct firmware *fw; int i, err; @@ -464,7 +469,7 @@ int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) } bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u", - hw_name, (subver & 0xe000) >> 13, + hw_name ? hw_name : "BCM", (subver & 0xe000) >> 13, (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff); if (*fw_load_done) @@ -480,24 +485,46 @@ int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) pid = get_unaligned_le16(skb->data + 3); kfree_skb(skb); - snprintf(fw_name, BCM_FW_NAME_LEN, "brcm/%s-%4.4x-%4.4x.hcd", - hw_name, vid, pid); - } else { - snprintf(fw_name, BCM_FW_NAME_LEN, "brcm/%s.hcd", hw_name); + snprintf(postfix, sizeof(postfix), "-%4.4x-%4.4x", vid, pid); } - err = request_firmware(&fw, fw_name, &hdev->dev); - if (err) { - bt_dev_info(hdev, "BCM: Patch %s not found", fw_name); - return 0; + fw_name = kmalloc(BCM_FW_NAME_COUNT_MAX * BCM_FW_NAME_LEN, GFP_KERNEL); + if (!fw_name) + return -ENOMEM; + + if (hw_name) { + snprintf(fw_name[fw_name_count], BCM_FW_NAME_LEN, + "brcm/%s%s.hcd", hw_name, postfix); + fw_name_count++; } - err = btbcm_patchram(hdev, fw); - if (err) - bt_dev_info(hdev, "BCM: Patch failed (%d)", err); + snprintf(fw_name[fw_name_count], BCM_FW_NAME_LEN, + "brcm/BCM%s.hcd", postfix); + fw_name_count++; + + for (i = 0; i < fw_name_count; i++) { + err = firmware_request_nowarn(&fw, fw_name[i], &hdev->dev); + if (err == 0) { + bt_dev_info(hdev, "%s '%s' Patch", + hw_name ? hw_name : "BCM", fw_name[i]); + *fw_load_done = true; + break; + } + } + + if (*fw_load_done) { + err = btbcm_patchram(hdev, fw); + if (err) + bt_dev_info(hdev, "BCM: Patch failed (%d)", err); + + release_firmware(fw); + } else { + bt_dev_err(hdev, "BCM: firmware Patch file not found, tried:"); + for (i = 0; i < fw_name_count; i++) + bt_dev_err(hdev, "BCM: '%s'", fw_name[i]); + } - release_firmware(fw); - *fw_load_done = true; + kfree(fw_name); return 0; } EXPORT_SYMBOL_GPL(btbcm_initialize); -- cgit v1.2.3-59-g8ed1b From c03ee9af4e07112bd3fc688daca9e654f41eca93 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Apr 2020 19:15:32 +0200 Subject: Bluetooth: btbcm: Add 2 missing models to subver tables Currently the bcm_uart_subver_ and bcm_usb_subver_table-s lack entries for the BCM4324B5 and BCM20703A1 chipsets. This makes the code use just "BCM" as prefix for the filename to pass to request-firmware, making it harder for users to figure out which firmware they need. This especially is problematic with the UART attached BCM4324B5 where this leads to the filename being just "BCM.hcd". Add the 2 missing devices to subver tables. This has been tested on: 1. A Dell XPS15 9550 where this makes btbcm.c try to load "BCM20703A1-0a5c-6410.hcd" before it tries to load "BCM-0a5c-6410.hcd". 2. A Thinkpad 8 where this makes btbcm.c try to load "BCM4324B5.hcd" before it tries to load "BCM.hcd" Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 739ba1200f5d..df7a8a22e53c 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -392,6 +392,7 @@ static const struct bcm_subver_table bcm_uart_subver_table[] = { { 0x410e, "BCM43341B0" }, /* 002.001.014 */ { 0x4204, "BCM2076B1" }, /* 002.002.004 */ { 0x4406, "BCM4324B3" }, /* 002.004.006 */ + { 0x4606, "BCM4324B5" }, /* 002.006.006 */ { 0x6109, "BCM4335C0" }, /* 003.001.009 */ { 0x610c, "BCM4354" }, /* 003.001.012 */ { 0x2122, "BCM4343A0" }, /* 001.001.034 */ @@ -407,6 +408,7 @@ static const struct bcm_subver_table bcm_uart_subver_table[] = { }; static const struct bcm_subver_table bcm_usb_subver_table[] = { + { 0x2105, "BCM20703A1" }, /* 001.001.005 */ { 0x210b, "BCM43142A0" }, /* 001.001.011 */ { 0x2112, "BCM4314A0" }, /* 001.001.018 */ { 0x2118, "BCM20702A0" }, /* 001.001.024 */ -- cgit v1.2.3-59-g8ed1b From 86b956de119c09818d0aabaf668280d8e4bd0d4b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 20 Apr 2020 19:27:41 +0300 Subject: net: mscc: ocelot: support matching on EtherType Currently, the filter's protocol is ignored except for a few special cases (IPv4 and IPv6). The EtherType can be matched inside VCAP IS2 by using a MAC_ETYPE key. So there are 2 cases in which EtherType matches are supported: - As part of a larger MAC_ETYPE rule, such as: tc filter add dev swp0 ingress protocol ip \ flower skip_sw src_mac 42:be:24:9b:76:20 action drop - Standalone (matching on protocol only): tc filter add dev swp0 ingress protocol arp \ flower skip_sw action drop As before, if the protocol is not specified, is it implicitly "all" and the EtherType mask in the MAC_ETYPE half key is set to zero. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_flower.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 954cb67eeaa2..67f0f5455ff0 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -51,6 +51,8 @@ static int ocelot_flower_parse(struct flow_cls_offload *f, { struct flow_rule *rule = flow_cls_offload_flow_rule(f); struct flow_dissector *dissector = rule->match.dissector; + u16 proto = ntohs(f->common.protocol); + bool match_protocol = true; if (dissector->used_keys & ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | @@ -71,7 +73,6 @@ static int ocelot_flower_parse(struct flow_cls_offload *f, if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct flow_match_eth_addrs match; - u16 proto = ntohs(f->common.protocol); /* The hw support mac matches only for MAC_ETYPE key, * therefore if other matches(port, tcp flags, etc) are added @@ -114,6 +115,7 @@ static int ocelot_flower_parse(struct flow_cls_offload *f, match.key->ip_proto; ace->frame.ipv4.proto.mask[0] = match.mask->ip_proto; + match_protocol = false; } if (ntohs(match.key->n_proto) == ETH_P_IPV6) { ace->type = OCELOT_ACE_TYPE_IPV6; @@ -121,11 +123,12 @@ static int ocelot_flower_parse(struct flow_cls_offload *f, match.key->ip_proto; ace->frame.ipv6.proto.mask[0] = match.mask->ip_proto; + match_protocol = false; } } if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS) && - ntohs(f->common.protocol) == ETH_P_IP) { + proto == ETH_P_IP) { struct flow_match_ipv4_addrs match; u8 *tmp; @@ -141,10 +144,11 @@ static int ocelot_flower_parse(struct flow_cls_offload *f, tmp = &ace->frame.ipv4.dip.mask.addr[0]; memcpy(tmp, &match.mask->dst, 4); + match_protocol = false; } if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS) && - ntohs(f->common.protocol) == ETH_P_IPV6) { + proto == ETH_P_IPV6) { return -EOPNOTSUPP; } @@ -156,6 +160,7 @@ static int ocelot_flower_parse(struct flow_cls_offload *f, ace->frame.ipv4.sport.mask = ntohs(match.mask->src); ace->frame.ipv4.dport.value = ntohs(match.key->dst); ace->frame.ipv4.dport.mask = ntohs(match.mask->dst); + match_protocol = false; } if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_VLAN)) { @@ -167,9 +172,20 @@ static int ocelot_flower_parse(struct flow_cls_offload *f, ace->vlan.vid.mask = match.mask->vlan_id; ace->vlan.pcp.value[0] = match.key->vlan_priority; ace->vlan.pcp.mask[0] = match.mask->vlan_priority; + match_protocol = false; } finished_key_parsing: + if (match_protocol && proto != ETH_P_ALL) { + /* TODO: support SNAP, LLC etc */ + if (proto < ETH_P_802_3_MIN) + return -EOPNOTSUPP; + ace->type = OCELOT_ACE_TYPE_ETYPE; + *(u16 *)ace->frame.etype.etype.value = htons(proto); + *(u16 *)ace->frame.etype.etype.mask = 0xffff; + } + /* else, a rule of type OCELOT_ACE_TYPE_ANY is implicitly added */ + ace->prio = f->common.prio; ace->id = f->cookie; return ocelot_flower_parse_action(f, ace); -- cgit v1.2.3-59-g8ed1b From 7dec902f4fc0cf1162e18030f2598440e311a2d2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 20 Apr 2020 19:27:42 +0300 Subject: net: mscc: ocelot: refine the ocelot_ace_is_problematic_mac_etype function The commit mentioned below was a bit too harsh, and while it restricted the invalid key combinations which are known to not work, such as: tc filter add dev swp0 ingress proto ip \ flower src_ip 192.0.2.1 action drop tc filter add dev swp0 ingress proto all \ flower src_mac 00:11:22:33:44:55 action drop it also restricted some which still should work, such as: tc filter add dev swp0 ingress proto ip \ flower src_ip 192.0.2.1 action drop tc filter add dev swp0 ingress proto 0x22f0 \ flower src_mac 00:11:22:33:44:55 action drop What actually does not match "sanely" is a MAC_ETYPE rule on frames having an EtherType of ARP, IPv4, IPv6, in addition to SNAP and OAM frames (which the ocelot tc-flower implementation does not parse yet, so the function might need to be revisited again in the future). So just make the function recognize the problematic MAC_ETYPE rules by EtherType - thus the VCAP IS2 can be forced to match even on those packets. This patch makes it possible for IP rules to live on a port together with MAC_ETYPE rules that are non-all, non-arp, non-ip and non-ipv6. Fixes: d4d0cb741d7b ("net: mscc: ocelot: deal with problematic MAC_ETYPE VCAP IS2 rules") Reported-by: Allan W. Nielsen Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_ace.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_ace.c b/drivers/net/ethernet/mscc/ocelot_ace.c index 8a2f7d13ef6d..dfd82a3baab2 100644 --- a/drivers/net/ethernet/mscc/ocelot_ace.c +++ b/drivers/net/ethernet/mscc/ocelot_ace.c @@ -739,14 +739,24 @@ static void ocelot_match_all_as_mac_etype(struct ocelot *ocelot, int port, static bool ocelot_ace_is_problematic_mac_etype(struct ocelot_ace_rule *ace) { + u16 proto, mask; + if (ace->type != OCELOT_ACE_TYPE_ETYPE) return false; - if (ether_addr_to_u64(ace->frame.etype.dmac.value) & - ether_addr_to_u64(ace->frame.etype.dmac.mask)) + + proto = ntohs(*(u16 *)ace->frame.etype.etype.value); + mask = ntohs(*(u16 *)ace->frame.etype.etype.mask); + + /* ETH_P_ALL match, so all protocols below are included */ + if (mask == 0) return true; - if (ether_addr_to_u64(ace->frame.etype.smac.value) & - ether_addr_to_u64(ace->frame.etype.smac.mask)) + if (proto == ETH_P_ARP) return true; + if (proto == ETH_P_IP) + return true; + if (proto == ETH_P_IPV6) + return true; + return false; } -- cgit v1.2.3-59-g8ed1b From 4faa2e06433fbba16a13a21e1380ee4d246b95fc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 20 Apr 2020 19:27:43 +0300 Subject: net: mscc: ocelot: lift protocol restriction for flow_match_eth_addrs keys An attempt was made in commit fe3490e6107e ("net: mscc: ocelot: Hardware ofload for tc flower filter") to avoid clashes between MAC_ETYPE rules and IP rules. Because the protocol blacklist should have included ETH_P_ALL too, it created some confusion, but now the situation should be dealt with a bit better by the patch immediately previous to this one ("net: mscc: ocelot: refine the ocelot_ace_is_problematic_mac_etype function"). So now we can remove that check. MAC_ETYPE rules with a protocol of ETH_P_IP, ETH_P_IPV6, ETH_P_ARP and ETH_P_ALL _are_ supported, with some restrictions regarding per-port exclusivity which are enforced now. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_flower.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 67f0f5455ff0..5ce172e22b43 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -87,11 +87,6 @@ static int ocelot_flower_parse(struct flow_cls_offload *f, BIT(FLOW_DISSECTOR_KEY_CONTROL))) return -EOPNOTSUPP; - if (proto == ETH_P_IP || - proto == ETH_P_IPV6 || - proto == ETH_P_ARP) - return -EOPNOTSUPP; - flow_rule_match_eth_addrs(rule, &match); ace->type = OCELOT_ACE_TYPE_ETYPE; ether_addr_copy(ace->frame.etype.dmac.value, -- cgit v1.2.3-59-g8ed1b From f42ceca226cadf2c27709499af8643acd4281cd7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 20 Apr 2020 11:07:21 -0700 Subject: dt-bindings: net: Correct description of 'broken-turn-around' The turn around bytes (2) are placed between the control phase of the MDIO transaction and the data phase, correct the wording to be more exact. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ethernet-phy.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml index 5aa141ccc113..9b1f1147ca36 100644 --- a/Documentation/devicetree/bindings/net/ethernet-phy.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml @@ -81,7 +81,8 @@ properties: $ref: /schemas/types.yaml#definitions/flag description: If set, indicates the PHY device does not correctly release - the turn around line low at the end of a MDIO transaction. + the turn around line low at end of the control phase of the + MDIO transaction. enet-phy-lane-swap: $ref: /schemas/types.yaml#definitions/flag -- cgit v1.2.3-59-g8ed1b From b92d905f2c9c5e66868fae26ba9ed32352df0f5a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 20 Apr 2020 11:07:22 -0700 Subject: dt-bindings: net: mdio: Document common properties Some of the properties pertaining to the broken turn around or resets were only documented in ethernet-phy.yaml while they are applicable across all MDIO devices and not Ethernet PHYs specifically which are a superset. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/mdio.yaml | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Documentation/devicetree/bindings/net/mdio.yaml b/Documentation/devicetree/bindings/net/mdio.yaml index cd6c6ae6dabb..37b9579b5651 100644 --- a/Documentation/devicetree/bindings/net/mdio.yaml +++ b/Documentation/devicetree/bindings/net/mdio.yaml @@ -62,6 +62,34 @@ patternProperties: description: The ID number for the PHY. + broken-turn-around: + $ref: /schemas/types.yaml#definitions/flag + description: + If set, indicates the MDIO device does not correctly release + the turn around line low at end of the control phase of the + MDIO transaction. + + resets: + maxItems: 1 + + reset-names: + const: phy + + reset-gpios: + maxItems: 1 + description: + The GPIO phandle and specifier for the MDIO reset signal. + + reset-assert-us: + description: + Delay after the reset was asserted in microseconds. If this + property is missing the delay will be skipped. + + reset-deassert-us: + description: + Delay after the reset was deasserted in microseconds. If + this property is missing the delay will be skipped. + required: - reg -- cgit v1.2.3-59-g8ed1b From 630c3ff8c3d554054229b8c1c3d3a2b9465ffa64 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 20 Apr 2020 11:07:23 -0700 Subject: dt-bindings: net: mdio: Make descriptions more general A number of descriptions assume a PHY device, but since this binding describes a MDIO bus which can have different kinds of MDIO devices attached to it, rephrase some descriptions to be more general in that regard. Reviewed-by: Andrew Lunn Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/mdio.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/net/mdio.yaml b/Documentation/devicetree/bindings/net/mdio.yaml index 37b9579b5651..d6a3bf8550eb 100644 --- a/Documentation/devicetree/bindings/net/mdio.yaml +++ b/Documentation/devicetree/bindings/net/mdio.yaml @@ -31,13 +31,13 @@ properties: maxItems: 1 description: The phandle and specifier for the GPIO that controls the RESET - lines of all PHYs on that MDIO bus. + lines of all devices on that MDIO bus. reset-delay-us: description: - RESET pulse width in microseconds. It applies to all PHY devices - and must therefore be appropriately determined based on all PHY - requirements (maximum value of all per-PHY RESET pulse widths). + RESET pulse width in microseconds. It applies to all MDIO devices + and must therefore be appropriately determined based on all devices + requirements (maximum value of all per-device RESET pulse widths). clock-frequency: description: @@ -60,7 +60,7 @@ patternProperties: minimum: 0 maximum: 31 description: - The ID number for the PHY. + The ID number for the device. broken-turn-around: $ref: /schemas/types.yaml#definitions/flag -- cgit v1.2.3-59-g8ed1b From 0a32f1ff2a2e41404deaba5fb32f8a0d640c0974 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 20 Apr 2020 20:21:11 +0200 Subject: net: phy: broadcom: add helper to write/read RDB registers RDB (Register Data Base) registers are used on newer Broadcom PHYs. Add helper to read, write and modify these registers. Signed-off-by: Michael Walle Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm-phy-lib.c | 80 +++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/bcm-phy-lib.h | 9 +++++ include/linux/brcmphy.h | 3 ++ 3 files changed, 92 insertions(+) diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c index e77b274a09fd..d5f9a2701989 100644 --- a/drivers/net/phy/bcm-phy-lib.c +++ b/drivers/net/phy/bcm-phy-lib.c @@ -155,6 +155,86 @@ int bcm_phy_write_shadow(struct phy_device *phydev, u16 shadow, } EXPORT_SYMBOL_GPL(bcm_phy_write_shadow); +int __bcm_phy_read_rdb(struct phy_device *phydev, u16 rdb) +{ + int val; + + val = __phy_write(phydev, MII_BCM54XX_RDB_ADDR, rdb); + if (val < 0) + return val; + + return __phy_read(phydev, MII_BCM54XX_RDB_DATA); +} +EXPORT_SYMBOL_GPL(__bcm_phy_read_rdb); + +int bcm_phy_read_rdb(struct phy_device *phydev, u16 rdb) +{ + int ret; + + phy_lock_mdio_bus(phydev); + ret = __bcm_phy_read_rdb(phydev, rdb); + phy_unlock_mdio_bus(phydev); + + return ret; +} +EXPORT_SYMBOL_GPL(bcm_phy_read_rdb); + +int __bcm_phy_write_rdb(struct phy_device *phydev, u16 rdb, u16 val) +{ + int ret; + + ret = __phy_write(phydev, MII_BCM54XX_RDB_ADDR, rdb); + if (ret < 0) + return ret; + + return __phy_write(phydev, MII_BCM54XX_RDB_DATA, val); +} +EXPORT_SYMBOL_GPL(__bcm_phy_write_rdb); + +int bcm_phy_write_rdb(struct phy_device *phydev, u16 rdb, u16 val) +{ + int ret; + + phy_lock_mdio_bus(phydev); + ret = __bcm_phy_write_rdb(phydev, rdb, val); + phy_unlock_mdio_bus(phydev); + + return ret; +} +EXPORT_SYMBOL_GPL(bcm_phy_write_rdb); + +int __bcm_phy_modify_rdb(struct phy_device *phydev, u16 rdb, u16 mask, u16 set) +{ + int new, ret; + + ret = __phy_write(phydev, MII_BCM54XX_RDB_ADDR, rdb); + if (ret < 0) + return ret; + + ret = __phy_read(phydev, MII_BCM54XX_RDB_DATA); + if (ret < 0) + return ret; + + new = (ret & ~mask) | set; + if (new == ret) + return 0; + + return __phy_write(phydev, MII_BCM54XX_RDB_DATA, new); +} +EXPORT_SYMBOL_GPL(__bcm_phy_modify_rdb); + +int bcm_phy_modify_rdb(struct phy_device *phydev, u16 rdb, u16 mask, u16 set) +{ + int ret; + + phy_lock_mdio_bus(phydev); + ret = __bcm_phy_modify_rdb(phydev, rdb, mask, set); + phy_unlock_mdio_bus(phydev); + + return ret; +} +EXPORT_SYMBOL_GPL(bcm_phy_modify_rdb); + int bcm_phy_enable_apd(struct phy_device *phydev, bool dll_pwr_down) { int val; diff --git a/drivers/net/phy/bcm-phy-lib.h b/drivers/net/phy/bcm-phy-lib.h index 129df819be8c..4d3de91cda6c 100644 --- a/drivers/net/phy/bcm-phy-lib.h +++ b/drivers/net/phy/bcm-phy-lib.h @@ -48,6 +48,15 @@ int bcm_phy_write_shadow(struct phy_device *phydev, u16 shadow, u16 val); int bcm_phy_read_shadow(struct phy_device *phydev, u16 shadow); +int __bcm_phy_write_rdb(struct phy_device *phydev, u16 rdb, u16 val); +int bcm_phy_write_rdb(struct phy_device *phydev, u16 rdb, u16 val); +int __bcm_phy_read_rdb(struct phy_device *phydev, u16 rdb); +int bcm_phy_read_rdb(struct phy_device *phydev, u16 rdb); +int __bcm_phy_modify_rdb(struct phy_device *phydev, u16 rdb, u16 mask, + u16 set); +int bcm_phy_modify_rdb(struct phy_device *phydev, u16 rdb, u16 mask, + u16 set); + int bcm_phy_ack_intr(struct phy_device *phydev); int bcm_phy_config_intr(struct phy_device *phydev); diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 7e1d857c8468..897b69309964 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -115,6 +115,9 @@ #define MII_BCM54XX_SHD_VAL(x) ((x & 0x1f) << 10) #define MII_BCM54XX_SHD_DATA(x) ((x & 0x3ff) << 0) +#define MII_BCM54XX_RDB_ADDR 0x1e +#define MII_BCM54XX_RDB_DATA 0x1f + /* * AUXILIARY CONTROL SHADOW ACCESS REGISTERS. (PHY REG 0x18) */ -- cgit v1.2.3-59-g8ed1b From 6937602ed3f9ebd46ed6a6b5e609c0ae4ed99008 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 20 Apr 2020 20:21:12 +0200 Subject: net: phy: add Broadcom BCM54140 support The Broadcom BCM54140 is a Quad SGMII/QSGMII Copper/Fiber Gigabit Ethernet transceiver. This also adds support for tunables to set and get downshift and energy detect auto power-down. The PHY has four ports and each port has its own PHY address. There are per-port registers as well as global registers. Unfortunately, the global registers can only be accessed by reading and writing from/to the PHY address of the first port. Further, there is no way to find out what port you actually are by just reading the per-port registers. We therefore, have to scan the bus on the PHY probe to determine the port and thus what address we need to access the global registers. Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 10 + drivers/net/phy/Makefile | 1 + drivers/net/phy/bcm54140.c | 481 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/brcmphy.h | 1 + 4 files changed, 493 insertions(+) create mode 100644 drivers/net/phy/bcm54140.c diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 3fa33d27eeba..cb7936b577de 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -346,6 +346,16 @@ config BROADCOM_PHY Currently supports the BCM5411, BCM5421, BCM5461, BCM54616S, BCM5464, BCM5481, BCM54810 and BCM5482 PHYs. +config BCM54140_PHY + tristate "Broadcom BCM54140 PHY" + depends on PHYLIB + select BCM_NET_PHYLIB + help + Support the Broadcom BCM54140 Quad SGMII/QSGMII PHY. + + This driver also supports the hardware monitoring of this PHY and + exposes voltage and temperature sensors. + config BCM84881_PHY tristate "Broadcom BCM84881 PHY" depends on PHYLIB diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 2f5c7093a65b..cd345b75d127 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_BCM87XX_PHY) += bcm87xx.o obj-$(CONFIG_BCM_CYGNUS_PHY) += bcm-cygnus.o obj-$(CONFIG_BCM_NET_PHYLIB) += bcm-phy-lib.o obj-$(CONFIG_BROADCOM_PHY) += broadcom.o +obj-$(CONFIG_BCM54140_PHY) += bcm54140.o obj-$(CONFIG_BCM84881_PHY) += bcm84881.o obj-$(CONFIG_CICADA_PHY) += cicada.o obj-$(CONFIG_CORTINA_PHY) += cortina.o diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c new file mode 100644 index 000000000000..0eeb60de67f8 --- /dev/null +++ b/drivers/net/phy/bcm54140.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* Broadcom BCM54140 Quad SGMII/QSGMII Copper/Fiber Gigabit PHY + * + * Copyright (c) 2020 Michael Walle + */ + +#include +#include +#include +#include + +#include "bcm-phy-lib.h" + +/* RDB per-port registers + */ +#define BCM54140_RDB_ISR 0x00a /* interrupt status */ +#define BCM54140_RDB_IMR 0x00b /* interrupt mask */ +#define BCM54140_RDB_INT_LINK BIT(1) /* link status changed */ +#define BCM54140_RDB_INT_SPEED BIT(2) /* link speed change */ +#define BCM54140_RDB_INT_DUPLEX BIT(3) /* duplex mode changed */ +#define BCM54140_RDB_SPARE1 0x012 /* spare control 1 */ +#define BCM54140_RDB_SPARE1_LSLM BIT(2) /* link speed LED mode */ +#define BCM54140_RDB_SPARE2 0x014 /* spare control 2 */ +#define BCM54140_RDB_SPARE2_WS_RTRY_DIS BIT(8) /* wirespeed retry disable */ +#define BCM54140_RDB_SPARE2_WS_RTRY_LIMIT GENMASK(4, 2) /* retry limit */ +#define BCM54140_RDB_SPARE3 0x015 /* spare control 3 */ +#define BCM54140_RDB_SPARE3_BIT0 BIT(0) +#define BCM54140_RDB_LED_CTRL 0x019 /* LED control */ +#define BCM54140_RDB_LED_CTRL_ACTLINK0 BIT(4) +#define BCM54140_RDB_LED_CTRL_ACTLINK1 BIT(8) +#define BCM54140_RDB_C_APWR 0x01a /* auto power down control */ +#define BCM54140_RDB_C_APWR_SINGLE_PULSE BIT(8) /* single pulse */ +#define BCM54140_RDB_C_APWR_APD_MODE_DIS 0 /* ADP disable */ +#define BCM54140_RDB_C_APWR_APD_MODE_EN 1 /* ADP enable */ +#define BCM54140_RDB_C_APWR_APD_MODE_DIS2 2 /* ADP disable */ +#define BCM54140_RDB_C_APWR_APD_MODE_EN_ANEG 3 /* ADP enable w/ aneg */ +#define BCM54140_RDB_C_APWR_APD_MODE_MASK GENMASK(6, 5) +#define BCM54140_RDB_C_APWR_SLP_TIM_MASK BIT(4)/* sleep timer */ +#define BCM54140_RDB_C_APWR_SLP_TIM_2_7 0 /* 2.7s */ +#define BCM54140_RDB_C_APWR_SLP_TIM_5_4 1 /* 5.4s */ +#define BCM54140_RDB_C_PWR 0x02a /* copper power control */ +#define BCM54140_RDB_C_PWR_ISOLATE BIT(5) /* super isolate mode */ +#define BCM54140_RDB_C_MISC_CTRL 0x02f /* misc copper control */ +#define BCM54140_RDB_C_MISC_CTRL_WS_EN BIT(4) /* wirespeed enable */ + +/* RDB global registers + */ +#define BCM54140_RDB_TOP_IMR 0x82d /* interrupt mask */ +#define BCM54140_RDB_TOP_IMR_PORT0 BIT(4) +#define BCM54140_RDB_TOP_IMR_PORT1 BIT(5) +#define BCM54140_RDB_TOP_IMR_PORT2 BIT(6) +#define BCM54140_RDB_TOP_IMR_PORT3 BIT(7) + +#define BCM54140_DEFAULT_DOWNSHIFT 5 +#define BCM54140_MAX_DOWNSHIFT 9 + +struct bcm54140_priv { + int port; + int base_addr; +}; + +static int bcm54140_base_read_rdb(struct phy_device *phydev, u16 rdb) +{ + struct bcm54140_priv *priv = phydev->priv; + struct mii_bus *bus = phydev->mdio.bus; + int ret; + + mutex_lock(&bus->mdio_lock); + ret = __mdiobus_write(bus, priv->base_addr, MII_BCM54XX_RDB_ADDR, rdb); + if (ret < 0) + goto out; + + ret = __mdiobus_read(bus, priv->base_addr, MII_BCM54XX_RDB_DATA); + +out: + mutex_unlock(&bus->mdio_lock); + return ret; +} + +static int bcm54140_base_write_rdb(struct phy_device *phydev, + u16 rdb, u16 val) +{ + struct bcm54140_priv *priv = phydev->priv; + struct mii_bus *bus = phydev->mdio.bus; + int ret; + + mutex_lock(&bus->mdio_lock); + ret = __mdiobus_write(bus, priv->base_addr, MII_BCM54XX_RDB_ADDR, rdb); + if (ret < 0) + goto out; + + ret = __mdiobus_write(bus, priv->base_addr, MII_BCM54XX_RDB_DATA, val); + +out: + mutex_unlock(&bus->mdio_lock); + return ret; +} + +/* Under some circumstances a core PLL may not lock, this will then prevent + * a successful link establishment. Restart the PLL after the voltages are + * stable to workaround this issue. + */ +static int bcm54140_b0_workaround(struct phy_device *phydev) +{ + int spare3; + int ret; + + spare3 = bcm_phy_read_rdb(phydev, BCM54140_RDB_SPARE3); + if (spare3 < 0) + return spare3; + + spare3 &= ~BCM54140_RDB_SPARE3_BIT0; + + ret = bcm_phy_write_rdb(phydev, BCM54140_RDB_SPARE3, spare3); + if (ret) + return ret; + + ret = phy_modify(phydev, MII_BMCR, 0, BMCR_PDOWN); + if (ret) + return ret; + + ret = phy_modify(phydev, MII_BMCR, BMCR_PDOWN, 0); + if (ret) + return ret; + + spare3 |= BCM54140_RDB_SPARE3_BIT0; + + return bcm_phy_write_rdb(phydev, BCM54140_RDB_SPARE3, spare3); +} + +/* The BCM54140 is a quad PHY where only the first port has access to the + * global register. Thus we need to find out its PHY address. + * + */ +static int bcm54140_get_base_addr_and_port(struct phy_device *phydev) +{ + struct bcm54140_priv *priv = phydev->priv; + struct mii_bus *bus = phydev->mdio.bus; + int addr, min_addr, max_addr; + int step = 1; + u32 phy_id; + int tmp; + + min_addr = phydev->mdio.addr; + max_addr = phydev->mdio.addr; + addr = phydev->mdio.addr; + + /* We scan forward and backwards and look for PHYs which have the + * same phy_id like we do. Step 1 will scan forward, step 2 + * backwards. Once we are finished, we have a min_addr and + * max_addr which resembles the range of PHY addresses of the same + * type of PHY. There is one caveat; there may be many PHYs of + * the same type, but we know that each PHY takes exactly 4 + * consecutive addresses. Therefore we can deduce our offset + * to the base address of this quad PHY. + */ + + while (1) { + if (step == 3) { + break; + } else if (step == 1) { + max_addr = addr; + addr++; + } else { + min_addr = addr; + addr--; + } + + if (addr < 0 || addr >= PHY_MAX_ADDR) { + addr = phydev->mdio.addr; + step++; + continue; + } + + /* read the PHY id */ + tmp = mdiobus_read(bus, addr, MII_PHYSID1); + if (tmp < 0) + return tmp; + phy_id = tmp << 16; + tmp = mdiobus_read(bus, addr, MII_PHYSID2); + if (tmp < 0) + return tmp; + phy_id |= tmp; + + /* see if it is still the same PHY */ + if ((phy_id & phydev->drv->phy_id_mask) != + (phydev->drv->phy_id & phydev->drv->phy_id_mask)) { + addr = phydev->mdio.addr; + step++; + } + } + + /* The range we get should be a multiple of four. Please note that both + * the min_addr and max_addr are inclusive. So we have to add one if we + * subtract them. + */ + if ((max_addr - min_addr + 1) % 4) { + dev_err(&phydev->mdio.dev, + "Detected Quad PHY IDs %d..%d doesn't make sense.\n", + min_addr, max_addr); + return -EINVAL; + } + + priv->port = (phydev->mdio.addr - min_addr) % 4; + priv->base_addr = phydev->mdio.addr - priv->port; + + return 0; +} + +static int bcm54140_probe(struct phy_device *phydev) +{ + struct bcm54140_priv *priv; + int ret; + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + + ret = bcm54140_get_base_addr_and_port(phydev); + if (ret) + return ret; + + phydev_dbg(phydev, "probed (port %d, base PHY address %d)\n", + priv->port, priv->base_addr); + + return 0; +} + +static int bcm54140_config_init(struct phy_device *phydev) +{ + u16 reg = 0xffff; + int ret; + + /* Apply hardware errata */ + ret = bcm54140_b0_workaround(phydev); + if (ret) + return ret; + + /* Unmask events we are interested in. */ + reg &= ~(BCM54140_RDB_INT_DUPLEX | + BCM54140_RDB_INT_SPEED | + BCM54140_RDB_INT_LINK); + ret = bcm_phy_write_rdb(phydev, BCM54140_RDB_IMR, reg); + if (ret) + return ret; + + /* LED1=LINKSPD[1], LED2=LINKSPD[2], LED3=LINK/ACTIVITY */ + ret = bcm_phy_modify_rdb(phydev, BCM54140_RDB_SPARE1, + 0, BCM54140_RDB_SPARE1_LSLM); + if (ret) + return ret; + + ret = bcm_phy_modify_rdb(phydev, BCM54140_RDB_LED_CTRL, + 0, BCM54140_RDB_LED_CTRL_ACTLINK0); + if (ret) + return ret; + + /* disable super isolate mode */ + return bcm_phy_modify_rdb(phydev, BCM54140_RDB_C_PWR, + BCM54140_RDB_C_PWR_ISOLATE, 0); +} + +int bcm54140_did_interrupt(struct phy_device *phydev) +{ + int ret; + + ret = bcm_phy_read_rdb(phydev, BCM54140_RDB_ISR); + + return (ret < 0) ? 0 : ret; +} + +int bcm54140_ack_intr(struct phy_device *phydev) +{ + int reg; + + /* clear pending interrupts */ + reg = bcm_phy_read_rdb(phydev, BCM54140_RDB_ISR); + if (reg < 0) + return reg; + + return 0; +} + +int bcm54140_config_intr(struct phy_device *phydev) +{ + struct bcm54140_priv *priv = phydev->priv; + static const u16 port_to_imr_bit[] = { + BCM54140_RDB_TOP_IMR_PORT0, BCM54140_RDB_TOP_IMR_PORT1, + BCM54140_RDB_TOP_IMR_PORT2, BCM54140_RDB_TOP_IMR_PORT3, + }; + int reg; + + if (priv->port >= ARRAY_SIZE(port_to_imr_bit)) + return -EINVAL; + + reg = bcm54140_base_read_rdb(phydev, BCM54140_RDB_TOP_IMR); + if (reg < 0) + return reg; + + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) + reg &= ~port_to_imr_bit[priv->port]; + else + reg |= port_to_imr_bit[priv->port]; + + return bcm54140_base_write_rdb(phydev, BCM54140_RDB_TOP_IMR, reg); +} + +static int bcm54140_get_downshift(struct phy_device *phydev, u8 *data) +{ + int val; + + val = bcm_phy_read_rdb(phydev, BCM54140_RDB_C_MISC_CTRL); + if (val < 0) + return val; + + if (!(val & BCM54140_RDB_C_MISC_CTRL_WS_EN)) { + *data = DOWNSHIFT_DEV_DISABLE; + return 0; + } + + val = bcm_phy_read_rdb(phydev, BCM54140_RDB_SPARE2); + if (val < 0) + return val; + + if (val & BCM54140_RDB_SPARE2_WS_RTRY_DIS) + *data = 1; + else + *data = FIELD_GET(BCM54140_RDB_SPARE2_WS_RTRY_LIMIT, val) + 2; + + return 0; +} + +static int bcm54140_set_downshift(struct phy_device *phydev, u8 cnt) +{ + u16 mask, set; + int ret; + + if (cnt > BCM54140_MAX_DOWNSHIFT && cnt != DOWNSHIFT_DEV_DEFAULT_COUNT) + return -EINVAL; + + if (!cnt) + return bcm_phy_modify_rdb(phydev, BCM54140_RDB_C_MISC_CTRL, + BCM54140_RDB_C_MISC_CTRL_WS_EN, 0); + + if (cnt == DOWNSHIFT_DEV_DEFAULT_COUNT) + cnt = BCM54140_DEFAULT_DOWNSHIFT; + + if (cnt == 1) { + mask = 0; + set = BCM54140_RDB_SPARE2_WS_RTRY_DIS; + } else { + mask = BCM54140_RDB_SPARE2_WS_RTRY_DIS; + mask |= BCM54140_RDB_SPARE2_WS_RTRY_LIMIT; + set = FIELD_PREP(BCM54140_RDB_SPARE2_WS_RTRY_LIMIT, cnt - 2); + } + ret = bcm_phy_modify_rdb(phydev, BCM54140_RDB_SPARE2, + mask, set); + if (ret) + return ret; + + return bcm_phy_modify_rdb(phydev, BCM54140_RDB_C_MISC_CTRL, + 0, BCM54140_RDB_C_MISC_CTRL_WS_EN); +} + +static int bcm54140_get_edpd(struct phy_device *phydev, u16 *tx_interval) +{ + int val; + + val = bcm_phy_read_rdb(phydev, BCM54140_RDB_C_APWR); + if (val < 0) + return val; + + switch (FIELD_GET(BCM54140_RDB_C_APWR_APD_MODE_MASK, val)) { + case BCM54140_RDB_C_APWR_APD_MODE_DIS: + case BCM54140_RDB_C_APWR_APD_MODE_DIS2: + *tx_interval = ETHTOOL_PHY_EDPD_DISABLE; + break; + case BCM54140_RDB_C_APWR_APD_MODE_EN: + case BCM54140_RDB_C_APWR_APD_MODE_EN_ANEG: + switch (FIELD_GET(BCM54140_RDB_C_APWR_SLP_TIM_MASK, val)) { + case BCM54140_RDB_C_APWR_SLP_TIM_2_7: + *tx_interval = 2700; + break; + case BCM54140_RDB_C_APWR_SLP_TIM_5_4: + *tx_interval = 5400; + break; + } + } + + return 0; +} + +static int bcm54140_set_edpd(struct phy_device *phydev, u16 tx_interval) +{ + u16 mask, set; + + mask = BCM54140_RDB_C_APWR_APD_MODE_MASK; + if (tx_interval == ETHTOOL_PHY_EDPD_DISABLE) + set = FIELD_PREP(BCM54140_RDB_C_APWR_APD_MODE_MASK, + BCM54140_RDB_C_APWR_APD_MODE_DIS); + else + set = FIELD_PREP(BCM54140_RDB_C_APWR_APD_MODE_MASK, + BCM54140_RDB_C_APWR_APD_MODE_EN_ANEG); + + /* enable single pulse mode */ + set |= BCM54140_RDB_C_APWR_SINGLE_PULSE; + + /* set sleep timer */ + mask |= BCM54140_RDB_C_APWR_SLP_TIM_MASK; + switch (tx_interval) { + case ETHTOOL_PHY_EDPD_DFLT_TX_MSECS: + case ETHTOOL_PHY_EDPD_DISABLE: + case 2700: + set |= BCM54140_RDB_C_APWR_SLP_TIM_2_7; + break; + case 5400: + set |= BCM54140_RDB_C_APWR_SLP_TIM_5_4; + break; + default: + return -EINVAL; + } + + return bcm_phy_modify_rdb(phydev, BCM54140_RDB_C_APWR, mask, set); +} + +static int bcm54140_get_tunable(struct phy_device *phydev, + struct ethtool_tunable *tuna, void *data) +{ + switch (tuna->id) { + case ETHTOOL_PHY_DOWNSHIFT: + return bcm54140_get_downshift(phydev, data); + case ETHTOOL_PHY_EDPD: + return bcm54140_get_edpd(phydev, data); + default: + return -EOPNOTSUPP; + } +} + +static int bcm54140_set_tunable(struct phy_device *phydev, + struct ethtool_tunable *tuna, const void *data) +{ + switch (tuna->id) { + case ETHTOOL_PHY_DOWNSHIFT: + return bcm54140_set_downshift(phydev, *(const u8 *)data); + case ETHTOOL_PHY_EDPD: + return bcm54140_set_edpd(phydev, *(const u16 *)data); + default: + return -EOPNOTSUPP; + } +} + +static struct phy_driver bcm54140_drivers[] = { + { + .phy_id = PHY_ID_BCM54140, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM54140", + .features = PHY_GBIT_FEATURES, + .config_init = bcm54140_config_init, + .did_interrupt = bcm54140_did_interrupt, + .ack_interrupt = bcm54140_ack_intr, + .config_intr = bcm54140_config_intr, + .probe = bcm54140_probe, + .suspend = genphy_suspend, + .resume = genphy_resume, + .get_tunable = bcm54140_get_tunable, + .set_tunable = bcm54140_set_tunable, + }, +}; +module_phy_driver(bcm54140_drivers); + +static struct mdio_device_id __maybe_unused bcm54140_tbl[] = { + { PHY_ID_BCM54140, 0xfffffff0 }, + { } +}; + +MODULE_AUTHOR("Michael Walle"); +MODULE_DESCRIPTION("Broadcom BCM54140 PHY driver"); +MODULE_DEVICE_TABLE(mdio, bcm54140_tbl); +MODULE_LICENSE("GPL"); diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 897b69309964..8be150e69c7c 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -25,6 +25,7 @@ #define PHY_ID_BCM5461 0x002060c0 #define PHY_ID_BCM54612E 0x03625e60 #define PHY_ID_BCM54616S 0x03625d10 +#define PHY_ID_BCM54140 0xae025019 #define PHY_ID_BCM57780 0x03625d90 #define PHY_ID_BCM89610 0x03625cd0 -- cgit v1.2.3-59-g8ed1b From 4406d36dfdf1fbd954400e16ffeb915c1907d58a Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 20 Apr 2020 20:21:13 +0200 Subject: net: phy: bcm54140: add hwmon support The PHY supports monitoring its die temperature as well as two analog voltages. Add support for it. Signed-off-by: Michael Walle Acked-by: Guenter Roeck Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/hwmon/bcm54140.rst | 45 +++++ Documentation/hwmon/index.rst | 1 + drivers/net/phy/Kconfig | 1 + drivers/net/phy/bcm54140.c | 396 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 443 insertions(+) create mode 100644 Documentation/hwmon/bcm54140.rst diff --git a/Documentation/hwmon/bcm54140.rst b/Documentation/hwmon/bcm54140.rst new file mode 100644 index 000000000000..bc6ea4b45966 --- /dev/null +++ b/Documentation/hwmon/bcm54140.rst @@ -0,0 +1,45 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +Broadcom BCM54140 Quad SGMII/QSGMII PHY +======================================= + +Supported chips: + + * Broadcom BCM54140 + + Datasheet: not public + +Author: Michael Walle + +Description +----------- + +The Broadcom BCM54140 is a Quad SGMII/QSGMII PHY which supports monitoring +its die temperature as well as two analog voltages. + +The AVDDL is a 1.0V analogue voltage, the AVDDH is a 3.3V analogue voltage. +Both voltages and the temperature are measured in a round-robin fashion. + +Sysfs entries +------------- + +The following attributes are supported. + +======================= ======================================================== +in0_label "AVDDL" +in0_input Measured AVDDL voltage. +in0_min Minimum AVDDL voltage. +in0_max Maximum AVDDL voltage. +in0_alarm AVDDL voltage alarm. + +in1_label "AVDDH" +in1_input Measured AVDDH voltage. +in1_min Minimum AVDDH voltage. +in1_max Maximum AVDDH voltage. +in1_alarm AVDDH voltage alarm. + +temp1_input Die temperature. +temp1_min Minimum die temperature. +temp1_max Maximum die temperature. +temp1_alarm Die temperature alarm. +======================= ======================================================== diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst index 8ef62fd39787..1f0affb3b6e0 100644 --- a/Documentation/hwmon/index.rst +++ b/Documentation/hwmon/index.rst @@ -42,6 +42,7 @@ Hardware Monitoring Kernel Drivers asb100 asc7621 aspeed-pwm-tacho + bcm54140 bel-pfe coretemp da9052 diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index cb7936b577de..bacfee41b564 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -349,6 +349,7 @@ config BROADCOM_PHY config BCM54140_PHY tristate "Broadcom BCM54140 PHY" depends on PHYLIB + depends on HWMON || HWMON=n select BCM_NET_PHYLIB help Support the Broadcom BCM54140 Quad SGMII/QSGMII PHY. diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c index 0eeb60de67f8..aa854477e06a 100644 --- a/drivers/net/phy/bcm54140.c +++ b/drivers/net/phy/bcm54140.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -50,6 +51,69 @@ #define BCM54140_RDB_TOP_IMR_PORT1 BIT(5) #define BCM54140_RDB_TOP_IMR_PORT2 BIT(6) #define BCM54140_RDB_TOP_IMR_PORT3 BIT(7) +#define BCM54140_RDB_MON_CTRL 0x831 /* monitor control */ +#define BCM54140_RDB_MON_CTRL_V_MODE BIT(3) /* voltage mode */ +#define BCM54140_RDB_MON_CTRL_SEL_MASK GENMASK(2, 1) +#define BCM54140_RDB_MON_CTRL_SEL_TEMP 0 /* meassure temperature */ +#define BCM54140_RDB_MON_CTRL_SEL_1V0 1 /* meassure AVDDL 1.0V */ +#define BCM54140_RDB_MON_CTRL_SEL_3V3 2 /* meassure AVDDH 3.3V */ +#define BCM54140_RDB_MON_CTRL_SEL_RR 3 /* meassure all round-robin */ +#define BCM54140_RDB_MON_CTRL_PWR_DOWN BIT(0) /* power-down monitor */ +#define BCM54140_RDB_MON_TEMP_VAL 0x832 /* temperature value */ +#define BCM54140_RDB_MON_TEMP_MAX 0x833 /* temperature high thresh */ +#define BCM54140_RDB_MON_TEMP_MIN 0x834 /* temperature low thresh */ +#define BCM54140_RDB_MON_TEMP_DATA_MASK GENMASK(9, 0) +#define BCM54140_RDB_MON_1V0_VAL 0x835 /* AVDDL 1.0V value */ +#define BCM54140_RDB_MON_1V0_MAX 0x836 /* AVDDL 1.0V high thresh */ +#define BCM54140_RDB_MON_1V0_MIN 0x837 /* AVDDL 1.0V low thresh */ +#define BCM54140_RDB_MON_1V0_DATA_MASK GENMASK(10, 0) +#define BCM54140_RDB_MON_3V3_VAL 0x838 /* AVDDH 3.3V value */ +#define BCM54140_RDB_MON_3V3_MAX 0x839 /* AVDDH 3.3V high thresh */ +#define BCM54140_RDB_MON_3V3_MIN 0x83a /* AVDDH 3.3V low thresh */ +#define BCM54140_RDB_MON_3V3_DATA_MASK GENMASK(11, 0) +#define BCM54140_RDB_MON_ISR 0x83b /* interrupt status */ +#define BCM54140_RDB_MON_ISR_3V3 BIT(2) /* AVDDH 3.3V alarm */ +#define BCM54140_RDB_MON_ISR_1V0 BIT(1) /* AVDDL 1.0V alarm */ +#define BCM54140_RDB_MON_ISR_TEMP BIT(0) /* temperature alarm */ + +/* According to the datasheet the formula is: + * T = 413.35 - (0.49055 * bits[9:0]) + */ +#define BCM54140_HWMON_TO_TEMP(v) (413350L - (v) * 491) +#define BCM54140_HWMON_FROM_TEMP(v) DIV_ROUND_CLOSEST_ULL(413350L - (v), 491) + +/* According to the datasheet the formula is: + * U = bits[11:0] / 1024 * 220 / 0.2 + * + * Normalized: + * U = bits[11:0] / 4096 * 2514 + */ +#define BCM54140_HWMON_TO_IN_1V0(v) ((v) * 2514 >> 11) +#define BCM54140_HWMON_FROM_IN_1V0(v) DIV_ROUND_CLOSEST_ULL(((v) << 11), 2514) + +/* According to the datasheet the formula is: + * U = bits[10:0] / 1024 * 880 / 0.7 + * + * Normalized: + * U = bits[10:0] / 2048 * 4400 + */ +#define BCM54140_HWMON_TO_IN_3V3(v) ((v) * 4400 >> 12) +#define BCM54140_HWMON_FROM_IN_3V3(v) DIV_ROUND_CLOSEST_ULL(((v) << 12), 4400) + +#define BCM54140_HWMON_TO_IN(ch, v) ((ch) ? BCM54140_HWMON_TO_IN_3V3(v) \ + : BCM54140_HWMON_TO_IN_1V0(v)) +#define BCM54140_HWMON_FROM_IN(ch, v) ((ch) ? BCM54140_HWMON_FROM_IN_3V3(v) \ + : BCM54140_HWMON_FROM_IN_1V0(v)) +#define BCM54140_HWMON_IN_MASK(ch) ((ch) ? BCM54140_RDB_MON_3V3_DATA_MASK \ + : BCM54140_RDB_MON_1V0_DATA_MASK) +#define BCM54140_HWMON_IN_VAL_REG(ch) ((ch) ? BCM54140_RDB_MON_3V3_VAL \ + : BCM54140_RDB_MON_1V0_VAL) +#define BCM54140_HWMON_IN_MIN_REG(ch) ((ch) ? BCM54140_RDB_MON_3V3_MIN \ + : BCM54140_RDB_MON_1V0_MIN) +#define BCM54140_HWMON_IN_MAX_REG(ch) ((ch) ? BCM54140_RDB_MON_3V3_MAX \ + : BCM54140_RDB_MON_1V0_MAX) +#define BCM54140_HWMON_IN_ALARM_BIT(ch) ((ch) ? BCM54140_RDB_MON_ISR_3V3 \ + : BCM54140_RDB_MON_ISR_1V0) #define BCM54140_DEFAULT_DOWNSHIFT 5 #define BCM54140_MAX_DOWNSHIFT 9 @@ -57,8 +121,328 @@ struct bcm54140_priv { int port; int base_addr; +#if IS_ENABLED(CONFIG_HWMON) + bool pkg_init; + /* protect the alarm bits */ + struct mutex alarm_lock; + u16 alarm; +#endif }; +#if IS_ENABLED(CONFIG_HWMON) +static umode_t bcm54140_hwmon_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_min: + case hwmon_in_max: + return 0644; + case hwmon_in_label: + case hwmon_in_input: + case hwmon_in_alarm: + return 0444; + default: + return 0; + } + case hwmon_temp: + switch (attr) { + case hwmon_temp_min: + case hwmon_temp_max: + return 0644; + case hwmon_temp_input: + case hwmon_temp_alarm: + return 0444; + default: + return 0; + } + default: + return 0; + } +} + +static int bcm54140_hwmon_read_alarm(struct device *dev, unsigned int bit, + long *val) +{ + struct phy_device *phydev = dev_get_drvdata(dev); + struct bcm54140_priv *priv = phydev->priv; + int tmp, ret = 0; + + mutex_lock(&priv->alarm_lock); + + /* latch any alarm bits */ + tmp = bcm_phy_read_rdb(phydev, BCM54140_RDB_MON_ISR); + if (tmp < 0) { + ret = tmp; + goto out; + } + priv->alarm |= tmp; + + *val = !!(priv->alarm & bit); + priv->alarm &= ~bit; + +out: + mutex_unlock(&priv->alarm_lock); + return ret; +} + +static int bcm54140_hwmon_read_temp(struct device *dev, u32 attr, long *val) +{ + struct phy_device *phydev = dev_get_drvdata(dev); + u16 reg, tmp; + + switch (attr) { + case hwmon_temp_input: + reg = BCM54140_RDB_MON_TEMP_VAL; + break; + case hwmon_temp_min: + reg = BCM54140_RDB_MON_TEMP_MIN; + break; + case hwmon_temp_max: + reg = BCM54140_RDB_MON_TEMP_MAX; + break; + case hwmon_temp_alarm: + return bcm54140_hwmon_read_alarm(dev, + BCM54140_RDB_MON_ISR_TEMP, + val); + default: + return -EOPNOTSUPP; + } + + tmp = bcm_phy_read_rdb(phydev, reg); + if (tmp < 0) + return tmp; + + *val = BCM54140_HWMON_TO_TEMP(tmp & BCM54140_RDB_MON_TEMP_DATA_MASK); + + return 0; +} + +static int bcm54140_hwmon_read_in(struct device *dev, u32 attr, + int channel, long *val) +{ + struct phy_device *phydev = dev_get_drvdata(dev); + u16 bit, reg, tmp; + + switch (attr) { + case hwmon_in_input: + reg = BCM54140_HWMON_IN_VAL_REG(channel); + break; + case hwmon_in_min: + reg = BCM54140_HWMON_IN_MIN_REG(channel); + break; + case hwmon_in_max: + reg = BCM54140_HWMON_IN_MAX_REG(channel); + break; + case hwmon_in_alarm: + bit = BCM54140_HWMON_IN_ALARM_BIT(channel); + return bcm54140_hwmon_read_alarm(dev, bit, val); + default: + return -EOPNOTSUPP; + } + + tmp = bcm_phy_read_rdb(phydev, reg); + if (tmp < 0) + return tmp; + + tmp &= BCM54140_HWMON_IN_MASK(channel); + *val = BCM54140_HWMON_TO_IN(channel, tmp); + + return 0; +} + +static int bcm54140_hwmon_read(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, long *val) +{ + switch (type) { + case hwmon_temp: + return bcm54140_hwmon_read_temp(dev, attr, val); + case hwmon_in: + return bcm54140_hwmon_read_in(dev, attr, channel, val); + default: + return -EOPNOTSUPP; + } +} + +static const char *const bcm54140_hwmon_in_labels[] = { + "AVDDL", + "AVDDH", +}; + +static int bcm54140_hwmon_read_string(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, const char **str) +{ + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_label: + *str = bcm54140_hwmon_in_labels[channel]; + return 0; + default: + return -EOPNOTSUPP; + } + default: + return -EOPNOTSUPP; + } +} + +static int bcm54140_hwmon_write_temp(struct device *dev, u32 attr, + int channel, long val) +{ + struct phy_device *phydev = dev_get_drvdata(dev); + u16 mask = BCM54140_RDB_MON_TEMP_DATA_MASK; + u16 reg; + + val = clamp_val(val, BCM54140_HWMON_TO_TEMP(mask), + BCM54140_HWMON_TO_TEMP(0)); + + switch (attr) { + case hwmon_temp_min: + reg = BCM54140_RDB_MON_TEMP_MIN; + break; + case hwmon_temp_max: + reg = BCM54140_RDB_MON_TEMP_MAX; + break; + default: + return -EOPNOTSUPP; + } + + return bcm_phy_modify_rdb(phydev, reg, mask, + BCM54140_HWMON_FROM_TEMP(val)); +} + +static int bcm54140_hwmon_write_in(struct device *dev, u32 attr, + int channel, long val) +{ + struct phy_device *phydev = dev_get_drvdata(dev); + u16 mask = BCM54140_HWMON_IN_MASK(channel); + u16 reg; + + val = clamp_val(val, 0, BCM54140_HWMON_TO_IN(channel, mask)); + + switch (attr) { + case hwmon_in_min: + reg = BCM54140_HWMON_IN_MIN_REG(channel); + break; + case hwmon_in_max: + reg = BCM54140_HWMON_IN_MAX_REG(channel); + break; + default: + return -EOPNOTSUPP; + } + + return bcm_phy_modify_rdb(phydev, reg, mask, + BCM54140_HWMON_FROM_IN(channel, val)); +} + +static int bcm54140_hwmon_write(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, long val) +{ + switch (type) { + case hwmon_temp: + return bcm54140_hwmon_write_temp(dev, attr, channel, val); + case hwmon_in: + return bcm54140_hwmon_write_in(dev, attr, channel, val); + default: + return -EOPNOTSUPP; + } +} + +static const struct hwmon_channel_info *bcm54140_hwmon_info[] = { + HWMON_CHANNEL_INFO(temp, + HWMON_T_INPUT | HWMON_T_MIN | HWMON_T_MAX | + HWMON_T_ALARM), + HWMON_CHANNEL_INFO(in, + HWMON_I_INPUT | HWMON_I_MIN | HWMON_I_MAX | + HWMON_I_ALARM | HWMON_I_LABEL, + HWMON_I_INPUT | HWMON_I_MIN | HWMON_I_MAX | + HWMON_I_ALARM | HWMON_I_LABEL), + NULL +}; + +static const struct hwmon_ops bcm54140_hwmon_ops = { + .is_visible = bcm54140_hwmon_is_visible, + .read = bcm54140_hwmon_read, + .read_string = bcm54140_hwmon_read_string, + .write = bcm54140_hwmon_write, +}; + +static const struct hwmon_chip_info bcm54140_chip_info = { + .ops = &bcm54140_hwmon_ops, + .info = bcm54140_hwmon_info, +}; + +static int bcm54140_enable_monitoring(struct phy_device *phydev) +{ + u16 mask, set; + + /* 3.3V voltage mode */ + set = BCM54140_RDB_MON_CTRL_V_MODE; + + /* select round-robin */ + mask = BCM54140_RDB_MON_CTRL_SEL_MASK; + set |= FIELD_PREP(BCM54140_RDB_MON_CTRL_SEL_MASK, + BCM54140_RDB_MON_CTRL_SEL_RR); + + /* remove power-down bit */ + mask |= BCM54140_RDB_MON_CTRL_PWR_DOWN; + + return bcm_phy_modify_rdb(phydev, BCM54140_RDB_MON_CTRL, mask, set); +} + +/* Check if one PHY has already done the init of the parts common to all PHYs + * in the Quad PHY package. + */ +static bool bcm54140_is_pkg_init(struct phy_device *phydev) +{ + struct bcm54140_priv *priv = phydev->priv; + struct mii_bus *bus = phydev->mdio.bus; + int base_addr = priv->base_addr; + struct phy_device *phy; + int i; + + /* Quad PHY */ + for (i = 0; i < 4; i++) { + phy = mdiobus_get_phy(bus, base_addr + i); + if (!phy) + continue; + + if ((phy->phy_id & phydev->drv->phy_id_mask) != + (phydev->drv->phy_id & phydev->drv->phy_id_mask)) + continue; + + priv = phy->priv; + + if (priv && priv->pkg_init) + return true; + } + + return false; +} + +static int bcm54140_probe_once(struct phy_device *phydev) +{ + struct device *hwmon; + int ret; + + /* enable hardware monitoring */ + ret = bcm54140_enable_monitoring(phydev); + if (ret) + return ret; + + hwmon = devm_hwmon_device_register_with_info(&phydev->mdio.dev, + "BCM54140", phydev, + &bcm54140_chip_info, + NULL); + return PTR_ERR_OR_ZERO(hwmon); +} +#endif + static int bcm54140_base_read_rdb(struct phy_device *phydev, u16 rdb) { struct bcm54140_priv *priv = phydev->priv; @@ -222,6 +606,18 @@ static int bcm54140_probe(struct phy_device *phydev) if (ret) return ret; +#if IS_ENABLED(CONFIG_HWMON) + mutex_init(&priv->alarm_lock); + + if (!bcm54140_is_pkg_init(phydev)) { + ret = bcm54140_probe_once(phydev); + if (ret) + return ret; + } + + priv->pkg_init = true; +#endif + phydev_dbg(phydev, "probed (port %d, base PHY address %d)\n", priv->port, priv->base_addr); -- cgit v1.2.3-59-g8ed1b From 38f961e744840db9044af68f4773ae5feae60a89 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 20 Apr 2020 23:29:05 +0200 Subject: net: phy: add device-managed devm_mdiobus_register If there's no special ordering requirement for mdiobus_unregister(), then driver code can be simplified by using a device-managed version of mdiobus_register(). Prerequisite is that bus allocation has been done device-managed too. Else mdiobus_free() may be called whilst bus is still registered, resulting in a BUG_ON(). Therefore let devm_mdiobus_register() return -EPERM if bus was allocated non-managed. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 8 +++++++- include/linux/phy.h | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 346e88435d29..26b00af94573 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -170,7 +170,12 @@ EXPORT_SYMBOL(mdiobus_alloc_size); static void _devm_mdiobus_free(struct device *dev, void *res) { - mdiobus_free(*(struct mii_bus **)res); + struct mii_bus *bus = *(struct mii_bus **)res; + + if (bus->is_managed_registered && bus->state == MDIOBUS_REGISTERED) + mdiobus_unregister(bus); + + mdiobus_free(bus); } static int devm_mdiobus_match(struct device *dev, void *res, void *data) @@ -210,6 +215,7 @@ struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv) if (bus) { *ptr = bus; devres_add(dev, ptr); + bus->is_managed = 1; } else { devres_free(ptr); } diff --git a/include/linux/phy.h b/include/linux/phy.h index 2432ca463ddc..3941a6bcba10 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -241,6 +241,9 @@ struct mii_bus { int (*reset)(struct mii_bus *bus); struct mdio_bus_stats stats[PHY_MAX_ADDR]; + unsigned int is_managed:1; /* is device-managed */ + unsigned int is_managed_registered:1; + /* * A lock to ensure that only one thing can read/write * the MDIO bus at a time @@ -286,6 +289,20 @@ static inline struct mii_bus *mdiobus_alloc(void) int __mdiobus_register(struct mii_bus *bus, struct module *owner); #define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) +static inline int devm_mdiobus_register(struct mii_bus *bus) +{ + int ret; + + if (!bus->is_managed) + return -EPERM; + + ret = mdiobus_register(bus); + if (!ret) + bus->is_managed_registered = 1; + + return ret; +} + void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); -- cgit v1.2.3-59-g8ed1b From 0785dad48003408da27579845db2f83c024636df Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 20 Apr 2020 23:29:55 +0200 Subject: r8169: use devm_mdiobus_register Use new function devm_mdiobus_register() to simplify the driver. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index b7a2853e7396..4c616701856a 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5185,20 +5185,18 @@ static int r8169_mdio_register(struct rtl8169_private *tp) new_bus->read = r8169_mdio_read_reg; new_bus->write = r8169_mdio_write_reg; - ret = mdiobus_register(new_bus); + ret = devm_mdiobus_register(new_bus); if (ret) return ret; tp->phydev = mdiobus_get_phy(new_bus, 0); if (!tp->phydev) { - mdiobus_unregister(new_bus); return -ENODEV; } else if (!tp->phydev->drv) { /* Most chip versions fail with the genphy driver. * Therefore ensure that the dedicated PHY driver is loaded. */ dev_err(&pdev->dev, "realtek.ko not loaded, maybe it needs to be added to initramfs?\n"); - mdiobus_unregister(new_bus); return -EUNATCH; } @@ -5523,7 +5521,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) rc = register_netdev(dev); if (rc) - goto err_mdio_unregister; + return rc; netif_info(tp, probe, dev, "%s, %pM, XID %03x, IRQ %d\n", rtl_chip_infos[chipset].name, dev->dev_addr, xid, @@ -5542,10 +5540,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) pm_runtime_put_sync(&pdev->dev); return 0; - -err_mdio_unregister: - mdiobus_unregister(tp->phydev->mdio.bus); - return rc; } static struct pci_driver rtl8169_pci_driver = { -- cgit v1.2.3-59-g8ed1b From beb97d3a3192c00575580af9073921c6283cf93d Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 21 Apr 2020 07:55:43 +0800 Subject: net/sched: act_ct: update nf_conn_acct for act_ct SW offload in flowtable When the act_ct SW offload in flowtable, The counter of the conntrack entry will never update. So update the nf_conn_acct conuter in act_ct flowtable software offload. Signed-off-by: wenxu Reviewed-by: Roi Dayan Signed-off-by: David S. Miller --- net/sched/act_ct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 1a766393be62..9adff83b523b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -536,6 +537,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); nf_ct_set(skb, ct, ctinfo); + nf_ct_acct_update(ct, dir, skb->len); return true; } -- cgit v1.2.3-59-g8ed1b From 540bde5c2c3da005b87b3edb394d6ca4f890777d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 21 Apr 2020 11:09:12 +0800 Subject: ila: remove unused macro 'ILA_HASH_TABLE_SIZE' net/ipv6/ila/ila_xlat.c:604:0: warning: macro "ILA_HASH_TABLE_SIZE" is not used [-Wunused-macros] Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv6/ila/ila_xlat.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 5fc1f4e0c0cf..a1ac0e3d8c60 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -601,8 +601,6 @@ out_ret: return ret; } -#define ILA_HASH_TABLE_SIZE 1024 - int ila_xlat_init_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); -- cgit v1.2.3-59-g8ed1b From dfddb54043f0a377f642bd0e6a28aa40769e2e65 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 21 Apr 2020 13:10:54 +0530 Subject: net: qrtr: Add tracepoint support Add tracepoint support for QRTR with NS as the first candidate. Later on this can be extended to core QRTR and transport drivers. The trace_printk() used in NS has been replaced by tracepoints. Signed-off-by: Manivannan Sadhasivam Signed-off-by: David S. Miller --- include/trace/events/qrtr.h | 115 ++++++++++++++++++++++++++++++++++++++++++++ net/qrtr/ns.c | 20 ++++---- 2 files changed, 126 insertions(+), 9 deletions(-) create mode 100644 include/trace/events/qrtr.h diff --git a/include/trace/events/qrtr.h b/include/trace/events/qrtr.h new file mode 100644 index 000000000000..b1de14c3bb93 --- /dev/null +++ b/include/trace/events/qrtr.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM qrtr + +#if !defined(_TRACE_QRTR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_QRTR_H + +#include +#include + +TRACE_EVENT(qrtr_ns_service_announce_new, + + TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port), + + TP_ARGS(service, instance, node, port), + + TP_STRUCT__entry( + __field(__le32, service) + __field(__le32, instance) + __field(__le32, node) + __field(__le32, port) + ), + + TP_fast_assign( + __entry->service = service; + __entry->instance = instance; + __entry->node = node; + __entry->port = port; + ), + + TP_printk("advertising new server [%d:%x]@[%d:%d]", + __entry->service, __entry->instance, __entry->node, + __entry->port + ) +); + +TRACE_EVENT(qrtr_ns_service_announce_del, + + TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port), + + TP_ARGS(service, instance, node, port), + + TP_STRUCT__entry( + __field(__le32, service) + __field(__le32, instance) + __field(__le32, node) + __field(__le32, port) + ), + + TP_fast_assign( + __entry->service = service; + __entry->instance = instance; + __entry->node = node; + __entry->port = port; + ), + + TP_printk("advertising removal of server [%d:%x]@[%d:%d]", + __entry->service, __entry->instance, __entry->node, + __entry->port + ) +); + +TRACE_EVENT(qrtr_ns_server_add, + + TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port), + + TP_ARGS(service, instance, node, port), + + TP_STRUCT__entry( + __field(__le32, service) + __field(__le32, instance) + __field(__le32, node) + __field(__le32, port) + ), + + TP_fast_assign( + __entry->service = service; + __entry->instance = instance; + __entry->node = node; + __entry->port = port; + ), + + TP_printk("add server [%d:%x]@[%d:%d]", + __entry->service, __entry->instance, __entry->node, + __entry->port + ) +); + +TRACE_EVENT(qrtr_ns_message, + + TP_PROTO(const char * const ctrl_pkt_str, __u32 sq_node, __u32 sq_port), + + TP_ARGS(ctrl_pkt_str, sq_node, sq_port), + + TP_STRUCT__entry( + __string(ctrl_pkt_str, ctrl_pkt_str) + __field(__u32, sq_node) + __field(__u32, sq_port) + ), + + TP_fast_assign( + __assign_str(ctrl_pkt_str, ctrl_pkt_str); + __entry->sq_node = sq_node; + __entry->sq_port = sq_port; + ), + + TP_printk("%s from %d:%d", + __get_str(ctrl_pkt_str), __entry->sq_node, __entry->sq_port + ) +); + +#endif /* _TRACE_QRTR_H */ + +/* This part must be outside protection */ +#include diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index e7d0fe3f4330..3ca196fc7f9b 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -12,6 +12,9 @@ #include "qrtr.h" +#define CREATE_TRACE_POINTS +#include + static RADIX_TREE(nodes, GFP_KERNEL); static struct { @@ -105,8 +108,8 @@ static int service_announce_new(struct sockaddr_qrtr *dest, struct msghdr msg = { }; struct kvec iv; - trace_printk("advertising new server [%d:%x]@[%d:%d]\n", - srv->service, srv->instance, srv->node, srv->port); + trace_qrtr_ns_service_announce_new(srv->service, srv->instance, + srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); @@ -132,8 +135,8 @@ static int service_announce_del(struct sockaddr_qrtr *dest, struct kvec iv; int ret; - trace_printk("advertising removal of server [%d:%x]@[%d:%d]\n", - srv->service, srv->instance, srv->node, srv->port); + trace_qrtr_ns_service_announce_del(srv->service, srv->instance, + srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); @@ -244,8 +247,8 @@ static struct qrtr_server *server_add(unsigned int service, radix_tree_insert(&node->servers, port, srv); - trace_printk("add server [%d:%x]@[%d:%d]\n", srv->service, - srv->instance, srv->node, srv->port); + trace_qrtr_ns_server_add(srv->service, srv->instance, + srv->node, srv->port); return srv; @@ -633,9 +636,8 @@ static void qrtr_ns_worker(struct work_struct *work) cmd = le32_to_cpu(pkt->cmd); if (cmd < ARRAY_SIZE(qrtr_ctrl_pkt_strings) && qrtr_ctrl_pkt_strings[cmd]) - trace_printk("%s from %d:%d\n", - qrtr_ctrl_pkt_strings[cmd], sq.sq_node, - sq.sq_port); + trace_qrtr_ns_message(qrtr_ctrl_pkt_strings[cmd], + sq.sq_node, sq.sq_port); ret = 0; switch (cmd) { -- cgit v1.2.3-59-g8ed1b From 3c7b51bd39b2078870baeeb98ad8190f447c2ed2 Mon Sep 17 00:00:00 2001 From: Xiaoliang Yang Date: Tue, 21 Apr 2020 21:13:47 +0300 Subject: net: dsa: felix: allow flooding for all traffic classes Right now it can be seen that the VSC9959 (Felix) switch will not flood frames if they have a VLAN tag with a PCP of 1-7 (nonzero). It turns out that Felix is quite different from its cousin, Ocelot, in that frame flooding can be allowed/denied per traffic class. Where Ocelot has 1 instance of the ANA_FLOODING register, Felix has 8. The approach that this driver is going to take is "thanks, but no thanks". We have no use case of limiting the flooding domain based on traffic class, so we just want to allow packets to be flooded, no matter what traffic class they have. So we copy the line of code from ocelot.c which does the one-shot initialization of the flooding PGIDs, and we add it to felix.c as well - except replicated 8 times. Signed-off-by: Xiaoliang Yang Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 7 +++++++ drivers/net/dsa/ocelot/felix.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 9173b95551d1..8a633ddce6c5 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -522,6 +522,7 @@ static int felix_setup(struct dsa_switch *ds) struct ocelot *ocelot = ds->priv; struct felix *felix = ocelot_to_felix(ocelot); int port, err; + int tc; err = felix_init_structs(felix, ds->num_ports); if (err) @@ -555,6 +556,12 @@ static int felix_setup(struct dsa_switch *ds) ocelot_write_rix(ocelot, ANA_PGID_PGID_PGID(GENMASK(ocelot->num_phys_ports, 0)), ANA_PGID_PGID, PGID_UC); + /* Setup the per-traffic class flooding PGIDs */ + for (tc = 0; tc < FELIX_NUM_TC; tc++) + ocelot_write_rix(ocelot, ANA_FLOODING_FLD_MULTICAST(PGID_MC) | + ANA_FLOODING_FLD_BROADCAST(PGID_MC) | + ANA_FLOODING_FLD_UNICAST(PGID_UC), + ANA_FLOODING, tc); ds->mtu_enforcement_ingress = true; /* It looks like the MAC/PCS interrupt register - PM0_IEVENT (0x8040) diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 82d46f260041..2ad793c0e1df 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -5,6 +5,7 @@ #define _MSCC_FELIX_H #define ocelot_to_felix(o) container_of((o), struct felix, ocelot) +#define FELIX_NUM_TC 8 /* Platform-specific information */ struct felix_info { -- cgit v1.2.3-59-g8ed1b From 3f251d741150265cfa7c84d30d105612449601ab Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 21 Apr 2020 18:40:22 -0600 Subject: selftests: Add tests for vrf and xfrms Add tests for vrf and xfrms with a second round after adding a qdisc. There are a few known problems documented with the test cases that fail. The fix is non-trivial; will come back to it when time allows. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/vrf-xfrm-tests.sh | 436 ++++++++++++++++++++++++++ 2 files changed, 437 insertions(+) create mode 100755 tools/testing/selftests/net/vrf-xfrm-tests.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 3f386eb9e7d7..895ec992b2f1 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -16,6 +16,7 @@ TEST_PROGS += altnames.sh icmp_redirect.sh ip6_gre_headroom.sh TEST_PROGS += route_localnet.sh TEST_PROGS += reuseaddr_ports_exhausted.sh TEST_PROGS += txtimestamp.sh +TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/vrf-xfrm-tests.sh b/tools/testing/selftests/net/vrf-xfrm-tests.sh new file mode 100755 index 000000000000..184da81f554f --- /dev/null +++ b/tools/testing/selftests/net/vrf-xfrm-tests.sh @@ -0,0 +1,436 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Various combinations of VRF with xfrms and qdisc. + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +PAUSE_ON_FAIL=no +VERBOSE=0 +ret=0 + +HOST1_4=192.168.1.1 +HOST2_4=192.168.1.2 +HOST1_6=2001:db8:1::1 +HOST2_6=2001:db8:1::2 + +XFRM1_4=10.0.1.1 +XFRM2_4=10.0.1.2 +XFRM1_6=fc00:1000::1 +XFRM2_6=fc00:1000::2 +IF_ID=123 + +VRF=red +TABLE=300 + +AUTH_1=0xd94fcfea65fddf21dc6e0d24a0253508 +AUTH_2=0xdc6e0d24a0253508d94fcfea65fddf21 +ENC_1=0xfc46c20f8048be9725930ff3fb07ac2a91f0347dffeacf62 +ENC_2=0x3fb07ac2a91f0347dffeacf62fc46c20f8048be9725930ff +SPI_1=0x02122b77 +SPI_2=0x2b770212 + +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + +################################################################################ +# +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +run_cmd_host1() +{ + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + printf " COMMAND: $cmd\n" + fi + + out=$(eval ip netns exec host1 $cmd 2>&1) + rc=$? + if [ "$VERBOSE" = "1" ]; then + if [ -n "$out" ]; then + echo + echo " $out" + fi + echo + fi + + return $rc +} + +################################################################################ +# create namespaces for hosts and sws + +create_vrf() +{ + local ns=$1 + local vrf=$2 + local table=$3 + + if [ -n "${ns}" ]; then + ns="-netns ${ns}" + fi + + ip ${ns} link add ${vrf} type vrf table ${table} + ip ${ns} link set ${vrf} up + ip ${ns} route add vrf ${vrf} unreachable default metric 8192 + ip ${ns} -6 route add vrf ${vrf} unreachable default metric 8192 + + ip ${ns} addr add 127.0.0.1/8 dev ${vrf} + ip ${ns} -6 addr add ::1 dev ${vrf} nodad + + ip ${ns} ru del pref 0 + ip ${ns} ru add pref 32765 from all lookup local + ip ${ns} -6 ru del pref 0 + ip ${ns} -6 ru add pref 32765 from all lookup local +} + +create_ns() +{ + local ns=$1 + local addr=$2 + local addr6=$3 + + [ -z "${addr}" ] && addr="-" + [ -z "${addr6}" ] && addr6="-" + + ip netns add ${ns} + + ip -netns ${ns} link set lo up + if [ "${addr}" != "-" ]; then + ip -netns ${ns} addr add dev lo ${addr} + fi + if [ "${addr6}" != "-" ]; then + ip -netns ${ns} -6 addr add dev lo ${addr6} + fi + + ip -netns ${ns} ro add unreachable default metric 8192 + ip -netns ${ns} -6 ro add unreachable default metric 8192 + + ip netns exec ${ns} sysctl -qw net.ipv4.ip_forward=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.forwarding=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.forwarding=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.accept_dad=0 +} + +# create veth pair to connect namespaces and apply addresses. +connect_ns() +{ + local ns1=$1 + local ns1_dev=$2 + local ns1_addr=$3 + local ns1_addr6=$4 + local ns2=$5 + local ns2_dev=$6 + local ns2_addr=$7 + local ns2_addr6=$8 + local ns1arg + local ns2arg + + if [ -n "${ns1}" ]; then + ns1arg="-netns ${ns1}" + fi + if [ -n "${ns2}" ]; then + ns2arg="-netns ${ns2}" + fi + + ip ${ns1arg} li add ${ns1_dev} type veth peer name tmp + ip ${ns1arg} li set ${ns1_dev} up + ip ${ns1arg} li set tmp netns ${ns2} name ${ns2_dev} + ip ${ns2arg} li set ${ns2_dev} up + + if [ "${ns1_addr}" != "-" ]; then + ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr} + ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr} + fi + + if [ "${ns1_addr6}" != "-" ]; then + ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr6} nodad + ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr6} nodad + fi +} + +################################################################################ + +cleanup() +{ + ip netns del host1 + ip netns del host2 +} + +setup() +{ + create_ns "host1" + create_ns "host2" + + connect_ns "host1" eth0 ${HOST1_4}/24 ${HOST1_6}/64 \ + "host2" eth0 ${HOST2_4}/24 ${HOST2_6}/64 + + create_vrf "host1" ${VRF} ${TABLE} + ip -netns host1 link set dev eth0 master ${VRF} +} + +cleanup_xfrm() +{ + for ns in host1 host2 + do + for x in state policy + do + ip -netns ${ns} xfrm ${x} flush + ip -6 -netns ${ns} xfrm ${x} flush + done + done +} + +setup_xfrm() +{ + local h1_4=$1 + local h2_4=$2 + local h1_6=$3 + local h2_6=$4 + local devarg="$5" + + # + # policy + # + + # host1 - IPv4 out + ip -netns host1 xfrm policy add \ + src ${h1_4} dst ${h2_4} ${devarg} dir out \ + tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel + + # host2 - IPv4 in + ip -netns host2 xfrm policy add \ + src ${h1_4} dst ${h2_4} dir in \ + tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel + + # host1 - IPv4 in + ip -netns host1 xfrm policy add \ + src ${h2_4} dst ${h1_4} ${devarg} dir in \ + tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel + + # host2 - IPv4 out + ip -netns host2 xfrm policy add \ + src ${h2_4} dst ${h1_4} dir out \ + tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel + + + # host1 - IPv6 out + ip -6 -netns host1 xfrm policy add \ + src ${h1_6} dst ${h2_6} ${devarg} dir out \ + tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel + + # host2 - IPv6 in + ip -6 -netns host2 xfrm policy add \ + src ${h1_6} dst ${h2_6} dir in \ + tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel + + # host1 - IPv6 in + ip -6 -netns host1 xfrm policy add \ + src ${h2_6} dst ${h1_6} ${devarg} dir in \ + tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel + + # host2 - IPv6 out + ip -6 -netns host2 xfrm policy add \ + src ${h2_6} dst ${h1_6} dir out \ + tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel + + # + # state + # + ip -netns host1 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_4} dst ${h2_4} ${devarg} + + ip -netns host2 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_4} dst ${h2_4} + + + ip -netns host1 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_4} dst ${h1_4} ${devarg} + + ip -netns host2 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_4} dst ${h1_4} + + + ip -6 -netns host1 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_6} dst ${h2_6} ${devarg} + + ip -6 -netns host2 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_6} dst ${h2_6} + + + ip -6 -netns host1 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_6} dst ${h1_6} ${devarg} + + ip -6 -netns host2 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_6} dst ${h1_6} +} + +cleanup_xfrm_dev() +{ + ip -netns host1 li del xfrm0 + ip -netns host2 addr del ${XFRM2_4}/24 dev eth0 + ip -netns host2 addr del ${XFRM2_6}/64 dev eth0 +} + +setup_xfrm_dev() +{ + local vrfarg="vrf ${VRF}" + + ip -netns host1 li add type xfrm dev eth0 if_id ${IF_ID} + ip -netns host1 li set xfrm0 ${vrfarg} up + ip -netns host1 addr add ${XFRM1_4}/24 dev xfrm0 + ip -netns host1 addr add ${XFRM1_6}/64 dev xfrm0 + + ip -netns host2 addr add ${XFRM2_4}/24 dev eth0 + ip -netns host2 addr add ${XFRM2_6}/64 dev eth0 + + setup_xfrm ${XFRM1_4} ${XFRM2_4} ${XFRM1_6} ${XFRM2_6} "if_id ${IF_ID}" +} + +run_tests() +{ + cleanup_xfrm + + # no IPsec + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + log_test $? 0 "IPv4 no xfrm policy" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 no xfrm policy" + + # xfrm without VRF in sel + setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + log_test $? 0 "IPv4 xfrm policy based on address" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 xfrm policy based on address" + cleanup_xfrm + + # xfrm with VRF in sel + # Known failure: ipv4 resets the flow oif after the lookup. Fix is + # not straightforward. + # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev ${VRF}" + # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + # log_test $? 0 "IPv4 xfrm policy with VRF in selector" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 xfrm policy with VRF in selector" + cleanup_xfrm + + # xfrm with enslaved device in sel + # Known failures: combined with the above, __xfrm{4,6}_selector_match + # needs to consider both l3mdev and enslaved device index. + # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev eth0" + # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + # log_test $? 0 "IPv4 xfrm policy with enslaved device in selector" + # run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + # log_test $? 0 "IPv6 xfrm policy with enslaved device in selector" + # cleanup_xfrm + + # xfrm device + setup_xfrm_dev + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${XFRM2_4} + log_test $? 0 "IPv4 xfrm policy with xfrm device" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${XFRM2_6} + log_test $? 0 "IPv6 xfrm policy with xfrm device" + cleanup_xfrm_dev +} + +################################################################################ +# usage + +usage() +{ + cat </dev/null +setup + +echo +echo "No qdisc on VRF device" +run_tests + +run_cmd_host1 tc qdisc add dev ${VRF} root netem delay 100ms +echo +echo "netem qdisc on VRF device" +run_tests + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret -- cgit v1.2.3-59-g8ed1b From 58e64a312c8468f3e8adead24b71ebb3039b381e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 22 Apr 2020 10:11:35 +0800 Subject: macvlan: silence RCU list debugging warning macvlan_hash_lookup() uses list_for_each_entry_rcu() for traversing should either under RCU in fast path or the protection of rtnl_mutex. In the case of holding RTNL, we should add the corresponding lockdep expression to silence the following false-positive warning: ============================= WARNING: suspicious RCU usage 5.7.0-rc1-next-20200416-00003-ga3b8d28bc #1 Not tainted ----------------------------- drivers/net/macvlan.c:126 RCU-list traversed in non-reader section!! Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index e7289d67268f..654c1fa11826 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -123,7 +123,8 @@ static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, struct macvlan_dev *vlan; u32 idx = macvlan_eth_hash(addr); - hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist) { + hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist, + lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } -- cgit v1.2.3-59-g8ed1b From c89f44ff10fd4cdcfbebf4854aa1282fdad8de5d Mon Sep 17 00:00:00 2001 From: "Chuah, Kim Tatt" Date: Wed, 22 Apr 2020 11:31:06 +0800 Subject: net: stmmac: Add support for VLAN promiscuous mode For dwmac4, enable VLAN promiscuity when MAC controller is requested to enter promiscuous mode. Signed-off-by: Chuah, Kim Tatt Signed-off-by: Ong Boon Leong Signed-off-by: Tan, Tee Min Signed-off-by: Wong Vee Khee Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 1 + drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 1 + drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 67 +++++++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 6208a68a331d..127f75862962 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -473,6 +473,7 @@ struct mac_device_info { unsigned int xlgmac; unsigned int num_vlan; u32 vlan_filter[32]; + unsigned int promisc; }; struct stmmac_rx_routing { diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 28cac28253b8..61f3249bd724 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -90,6 +90,7 @@ #define GMAC_VLAN_CSVL BIT(19) #define GMAC_VLAN_VLC GENMASK(17, 16) #define GMAC_VLAN_VLC_SHIFT 16 +#define GMAC_VLAN_VLHT GENMASK(15, 0) /* MAC VLAN Tag */ #define GMAC_VLAN_TAG_VID GENMASK(15, 0) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 39692d15d80c..ecd834e0e121 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -450,6 +450,12 @@ static int dwmac4_add_hw_vlan_rx_fltr(struct net_device *dev, if (vid > 4095) return -EINVAL; + if (hw->promisc) { + netdev_err(dev, + "Adding VLAN in promisc mode not supported\n"); + return -EPERM; + } + /* Single Rx VLAN Filter */ if (hw->num_vlan == 1) { /* For single VLAN filter, VID 0 means VLAN promiscuous */ @@ -499,6 +505,12 @@ static int dwmac4_del_hw_vlan_rx_fltr(struct net_device *dev, { int i, ret = 0; + if (hw->promisc) { + netdev_err(dev, + "Deleting VLAN in promisc mode not supported\n"); + return -EPERM; + } + /* Single Rx VLAN Filter */ if (hw->num_vlan == 1) { if ((hw->vlan_filter[0] & GMAC_VLAN_TAG_VID) == vid) { @@ -523,9 +535,45 @@ static int dwmac4_del_hw_vlan_rx_fltr(struct net_device *dev, return ret; } +static void dwmac4_vlan_promisc_enable(struct net_device *dev, + struct mac_device_info *hw) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + u32 hash; + u32 val; + int i; + + /* Single Rx VLAN Filter */ + if (hw->num_vlan == 1) { + dwmac4_write_single_vlan(dev, 0); + return; + } + + /* Extended Rx VLAN Filter Enable */ + for (i = 0; i < hw->num_vlan; i++) { + if (hw->vlan_filter[i] & GMAC_VLAN_TAG_DATA_VEN) { + val = hw->vlan_filter[i] & ~GMAC_VLAN_TAG_DATA_VEN; + dwmac4_write_vlan_filter(dev, hw, i, val); + } + } + + hash = readl(ioaddr + GMAC_VLAN_HASH_TABLE); + if (hash & GMAC_VLAN_VLHT) { + value = readl(ioaddr + GMAC_VLAN_TAG); + if (value & GMAC_VLAN_VTHM) { + value &= ~GMAC_VLAN_VTHM; + writel(value, ioaddr + GMAC_VLAN_TAG); + } + } +} + static void dwmac4_restore_hw_vlan_rx_fltr(struct net_device *dev, struct mac_device_info *hw) { + void __iomem *ioaddr = hw->pcsr; + u32 value; + u32 hash; u32 val; int i; @@ -542,6 +590,13 @@ static void dwmac4_restore_hw_vlan_rx_fltr(struct net_device *dev, dwmac4_write_vlan_filter(dev, hw, i, val); } } + + hash = readl(ioaddr + GMAC_VLAN_HASH_TABLE); + if (hash & GMAC_VLAN_VLHT) { + value = readl(ioaddr + GMAC_VLAN_TAG); + value |= GMAC_VLAN_VTHM; + writel(value, ioaddr + GMAC_VLAN_TAG); + } } static void dwmac4_set_filter(struct mac_device_info *hw, @@ -624,6 +679,18 @@ static void dwmac4_set_filter(struct mac_device_info *hw, value |= GMAC_PACKET_FILTER_VTFE; writel(value, ioaddr + GMAC_PACKET_FILTER); + + if (dev->flags & IFF_PROMISC) { + if (!hw->promisc) { + hw->promisc = 1; + dwmac4_vlan_promisc_enable(dev, hw); + } + } else { + if (hw->promisc) { + hw->promisc = 0; + dwmac4_restore_hw_vlan_rx_fltr(dev, hw); + } + } } static void dwmac4_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, -- cgit v1.2.3-59-g8ed1b From 8518307dc2b2a97b235c74920b45020f9bebe33e Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 22 Apr 2020 15:16:36 +0800 Subject: net: caif: use true,false for bool variables Fix the following coccicheck warning: net/caif/caif_dev.c:410:2-13: WARNING: Assignment of 0/1 to bool variable net/caif/caif_dev.c:445:2-13: WARNING: Assignment of 0/1 to bool variable net/caif/caif_dev.c:145:1-12: WARNING: Assignment of 0/1 to bool variable net/caif/caif_dev.c:223:1-12: WARNING: Assignment of 0/1 to bool variable Signed-off-by: Jason Yan Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 195d2d67be8a..c10e5a55758d 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -142,7 +142,7 @@ static void caif_flow_cb(struct sk_buff *skb) spin_lock_bh(&caifd->flow_lock); send_xoff = caifd->xoff; - caifd->xoff = 0; + caifd->xoff = false; dtor = caifd->xoff_skb_dtor; if (WARN_ON(caifd->xoff_skb != skb)) @@ -220,7 +220,7 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt) pr_debug("queue has stopped(%d) or is full (%d > %d)\n", netif_queue_stopped(caifd->netdev), qlen, high); - caifd->xoff = 1; + caifd->xoff = true; caifd->xoff_skb = skb; caifd->xoff_skb_dtor = skb->destructor; skb->destructor = caif_flow_cb; @@ -407,7 +407,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, break; } - caifd->xoff = 0; + caifd->xoff = false; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); @@ -442,7 +442,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; - caifd->xoff = 0; + caifd->xoff = false; caifd->xoff_skb_dtor = NULL; caifd->xoff_skb = NULL; -- cgit v1.2.3-59-g8ed1b From bcf3440c6dd78bfe5836ec0990fe36d7b4bb7d20 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Apr 2020 09:21:37 +0200 Subject: net: phy: micrel: add phy-mode support for the KSZ9031 PHY Add support for following phy-modes: rgmii, rgmii-id, rgmii-txid, rgmii-rxid. This PHY has an internal RX delay of 1.2ns and no delay for TX. The pad skew registers allow to set the total TX delay to max 1.38ns and the total RX delay to max of 2.58ns (configurable 1.38ns + build in 1.2ns) and a minimal delay of 0ns. According to the RGMII v1.3 specification the delay provided by PCB traces should be between 1.5ns and 2.0ns. The RGMII v2.0 allows to provide this delay by MAC or PHY. So, we configure this PHY to the best values we can get by this HW: TX delay to 1.38ns (max supported value) and RX delay to 1.80ns (best calculated delay) The phy-modes can still be fine tuned/overwritten by *-skew-ps device tree properties described in: Documentation/devicetree/bindings/net/micrel-ksz90x1.txt Signed-off-by: Oleksij Rempel Reviewed-by: Philippe Schenker Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 128 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 123 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 3a4d83fa52dc..3fe552675dd2 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -19,6 +19,7 @@ * ksz9477 */ +#include #include #include #include @@ -490,9 +491,50 @@ static int ksz9021_config_init(struct phy_device *phydev) /* MMD Address 0x2 */ #define MII_KSZ9031RN_CONTROL_PAD_SKEW 4 +#define MII_KSZ9031RN_RX_CTL_M GENMASK(7, 4) +#define MII_KSZ9031RN_TX_CTL_M GENMASK(3, 0) + #define MII_KSZ9031RN_RX_DATA_PAD_SKEW 5 +#define MII_KSZ9031RN_RXD3 GENMASK(15, 12) +#define MII_KSZ9031RN_RXD2 GENMASK(11, 8) +#define MII_KSZ9031RN_RXD1 GENMASK(7, 4) +#define MII_KSZ9031RN_RXD0 GENMASK(3, 0) + #define MII_KSZ9031RN_TX_DATA_PAD_SKEW 6 +#define MII_KSZ9031RN_TXD3 GENMASK(15, 12) +#define MII_KSZ9031RN_TXD2 GENMASK(11, 8) +#define MII_KSZ9031RN_TXD1 GENMASK(7, 4) +#define MII_KSZ9031RN_TXD0 GENMASK(3, 0) + #define MII_KSZ9031RN_CLK_PAD_SKEW 8 +#define MII_KSZ9031RN_GTX_CLK GENMASK(9, 5) +#define MII_KSZ9031RN_RX_CLK GENMASK(4, 0) + +/* KSZ9031 has internal RGMII_IDRX = 1.2ns and RGMII_IDTX = 0ns. To + * provide different RGMII options we need to configure delay offset + * for each pad relative to build in delay. + */ +/* keep rx as "No delay adjustment" and set rx_clk to +0.60ns to get delays of + * 1.80ns + */ +#define RX_ID 0x7 +#define RX_CLK_ID 0x19 + +/* set rx to +0.30ns and rx_clk to -0.90ns to compensate the + * internal 1.2ns delay. + */ +#define RX_ND 0xc +#define RX_CLK_ND 0x0 + +/* set tx to -0.42ns and tx_clk to +0.96ns to get 1.38ns delay */ +#define TX_ID 0x0 +#define TX_CLK_ID 0x1f + +/* set tx and tx_clk to "No delay adjustment" to keep 0ns + * dealy + */ +#define TX_ND 0x7 +#define TX_CLK_ND 0xf /* MMD Address 0x1C */ #define MII_KSZ9031RN_EDPD 0x23 @@ -501,7 +543,8 @@ static int ksz9021_config_init(struct phy_device *phydev) static int ksz9031_of_load_skew_values(struct phy_device *phydev, const struct device_node *of_node, u16 reg, size_t field_sz, - const char *field[], u8 numfields) + const char *field[], u8 numfields, + bool *update) { int val[4] = {-1, -2, -3, -4}; int matches = 0; @@ -517,6 +560,8 @@ static int ksz9031_of_load_skew_values(struct phy_device *phydev, if (!matches) return 0; + *update |= true; + if (matches < numfields) newval = phy_read_mmd(phydev, 2, reg); else @@ -565,6 +610,67 @@ static int ksz9031_enable_edpd(struct phy_device *phydev) reg | MII_KSZ9031RN_EDPD_ENABLE); } +static int ksz9031_config_rgmii_delay(struct phy_device *phydev) +{ + u16 rx, tx, rx_clk, tx_clk; + int ret; + + switch (phydev->interface) { + case PHY_INTERFACE_MODE_RGMII: + tx = TX_ND; + tx_clk = TX_CLK_ND; + rx = RX_ND; + rx_clk = RX_CLK_ND; + break; + case PHY_INTERFACE_MODE_RGMII_ID: + tx = TX_ID; + tx_clk = TX_CLK_ID; + rx = RX_ID; + rx_clk = RX_CLK_ID; + break; + case PHY_INTERFACE_MODE_RGMII_RXID: + tx = TX_ND; + tx_clk = TX_CLK_ND; + rx = RX_ID; + rx_clk = RX_CLK_ID; + break; + case PHY_INTERFACE_MODE_RGMII_TXID: + tx = TX_ID; + tx_clk = TX_CLK_ID; + rx = RX_ND; + rx_clk = RX_CLK_ND; + break; + default: + return 0; + } + + ret = phy_write_mmd(phydev, 2, MII_KSZ9031RN_CONTROL_PAD_SKEW, + FIELD_PREP(MII_KSZ9031RN_RX_CTL_M, rx) | + FIELD_PREP(MII_KSZ9031RN_TX_CTL_M, tx)); + if (ret < 0) + return ret; + + ret = phy_write_mmd(phydev, 2, MII_KSZ9031RN_RX_DATA_PAD_SKEW, + FIELD_PREP(MII_KSZ9031RN_RXD3, rx) | + FIELD_PREP(MII_KSZ9031RN_RXD2, rx) | + FIELD_PREP(MII_KSZ9031RN_RXD1, rx) | + FIELD_PREP(MII_KSZ9031RN_RXD0, rx)); + if (ret < 0) + return ret; + + ret = phy_write_mmd(phydev, 2, MII_KSZ9031RN_TX_DATA_PAD_SKEW, + FIELD_PREP(MII_KSZ9031RN_TXD3, tx) | + FIELD_PREP(MII_KSZ9031RN_TXD2, tx) | + FIELD_PREP(MII_KSZ9031RN_TXD1, tx) | + FIELD_PREP(MII_KSZ9031RN_TXD0, tx)); + if (ret < 0) + return ret; + + return phy_write_mmd(phydev, 2, MII_KSZ9031RN_CLK_PAD_SKEW, + FIELD_PREP(MII_KSZ9031RN_GTX_CLK, tx_clk) | + FIELD_PREP(MII_KSZ9031RN_RX_CLK, rx_clk)); +} + static int ksz9031_config_init(struct phy_device *phydev) { const struct device *dev = &phydev->mdio.dev; @@ -597,21 +703,33 @@ static int ksz9031_config_init(struct phy_device *phydev) } while (!of_node && dev_walker); if (of_node) { + bool update = false; + + if (phy_interface_is_rgmii(phydev)) { + result = ksz9031_config_rgmii_delay(phydev); + if (result < 0) + return result; + } + ksz9031_of_load_skew_values(phydev, of_node, MII_KSZ9031RN_CLK_PAD_SKEW, 5, - clk_skews, 2); + clk_skews, 2, &update); ksz9031_of_load_skew_values(phydev, of_node, MII_KSZ9031RN_CONTROL_PAD_SKEW, 4, - control_skews, 2); + control_skews, 2, &update); ksz9031_of_load_skew_values(phydev, of_node, MII_KSZ9031RN_RX_DATA_PAD_SKEW, 4, - rx_data_skews, 4); + rx_data_skews, 4, &update); ksz9031_of_load_skew_values(phydev, of_node, MII_KSZ9031RN_TX_DATA_PAD_SKEW, 4, - tx_data_skews, 4); + tx_data_skews, 4, &update); + + if (update && phydev->interface != PHY_INTERFACE_MODE_RGMII) + phydev_warn(phydev, + "*-skew-ps values should be used only with phy-mode = \"rgmii\"\n"); /* Silicon Errata Sheet (DS80000691D or DS80000692D): * When the device links in the 1000BASE-T slave mode only, -- cgit v1.2.3-59-g8ed1b From d0f0c55e7c4ca7c66877064d7c1f4795025e88f8 Mon Sep 17 00:00:00 2001 From: Tang Bin Date: Wed, 22 Apr 2020 16:15:42 +0800 Subject: net: phy: Use IS_ERR() to check and simplify code Use IS_ERR() and PTR_ERR() instead of PTR_ZRR_OR_ZERO() to simplify code, avoid redundant paramenter definitions and judgements. Signed-off-by: Zhang Shengju Signed-off-by: Tang Bin Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 26b00af94573..3e79b96fa344 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -42,14 +42,11 @@ static int mdiobus_register_gpiod(struct mdio_device *mdiodev) { - int error; - /* Deassert the optional reset signal */ mdiodev->reset_gpio = gpiod_get_optional(&mdiodev->dev, "reset", GPIOD_OUT_LOW); - error = PTR_ERR_OR_ZERO(mdiodev->reset_gpio); - if (error) - return error; + if (IS_ERR(mdiodev->reset_gpio)) + return PTR_ERR(mdiodev->reset_gpio); if (mdiodev->reset_gpio) gpiod_set_consumer_name(mdiodev->reset_gpio, "PHY reset"); -- cgit v1.2.3-59-g8ed1b From d9cc193cf0bf471630e203c369e60ab3735dd621 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Apr 2020 11:24:53 +0200 Subject: dt-bindings: net: phy: Add support for NXP TJA11xx Document the NXP TJA11xx PHY bindings. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- .../devicetree/bindings/net/nxp,tja11xx.yaml | 61 ++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/nxp,tja11xx.yaml diff --git a/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml b/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml new file mode 100644 index 000000000000..42be0255512b --- /dev/null +++ b/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: GPL-2.0+ +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/nxp,tja11xx.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP TJA11xx PHY + +maintainers: + - Andrew Lunn + - Florian Fainelli + - Heiner Kallweit + +description: + Bindings for NXP TJA11xx automotive PHYs + +allOf: + - $ref: ethernet-phy.yaml# + +patternProperties: + "^ethernet-phy@[0-9a-f]+$": + type: object + description: | + Some packages have multiple PHYs. Secondary PHY should be defines as + subnode of the first (parent) PHY. + + properties: + reg: + minimum: 0 + maximum: 31 + description: + The ID number for the child PHY. Should be +1 of parent PHY. + + required: + - reg + +examples: + - | + mdio { + #address-cells = <1>; + #size-cells = <0>; + + tja1101_phy0: ethernet-phy@4 { + reg = <0x4>; + }; + }; + - | + mdio { + #address-cells = <1>; + #size-cells = <0>; + + tja1102_phy0: ethernet-phy@4 { + reg = <0x4>; + #address-cells = <1>; + #size-cells = <0>; + + tja1102_phy1: ethernet-phy@5 { + reg = <0x5>; + }; + }; + }; -- cgit v1.2.3-59-g8ed1b From 8f469506de2ad5528dedbab4d9ba34838991d5d1 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Apr 2020 11:24:54 +0200 Subject: net: phy: tja11xx: add initial TJA1102 support TJA1102 is an dual T1 PHY chip. Both PHYs are separately addressable. Both PHYs are similar but have different amount of functionality. For example PHY 1 has no PHY ID and no health monitor. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/nxp-tja11xx.c | 91 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/drivers/net/phy/nxp-tja11xx.c b/drivers/net/phy/nxp-tja11xx.c index 47caae770ffc..971286f5e5b0 100644 --- a/drivers/net/phy/nxp-tja11xx.c +++ b/drivers/net/phy/nxp-tja11xx.c @@ -15,6 +15,7 @@ #define PHY_ID_MASK 0xfffffff0 #define PHY_ID_TJA1100 0x0180dc40 #define PHY_ID_TJA1101 0x0180dd00 +#define PHY_ID_TJA1102 0x0180dc80 #define MII_ECTRL 17 #define MII_ECTRL_LINK_CONTROL BIT(15) @@ -40,6 +41,10 @@ #define MII_INTSRC_TEMP_ERR BIT(1) #define MII_INTSRC_UV_ERR BIT(3) +#define MII_INTEN 22 +#define MII_INTEN_LINK_FAIL BIT(10) +#define MII_INTEN_LINK_UP BIT(9) + #define MII_COMMSTAT 23 #define MII_COMMSTAT_LINK_UP BIT(15) @@ -180,6 +185,7 @@ static int tja11xx_config_init(struct phy_device *phydev) return ret; break; case PHY_ID_TJA1101: + case PHY_ID_TJA1102: ret = phy_set_bits(phydev, MII_COMMCFG, MII_COMMCFG_AUTO_OP); if (ret) return ret; @@ -344,6 +350,55 @@ static int tja11xx_probe(struct phy_device *phydev) return PTR_ERR_OR_ZERO(priv->hwmon_dev); } +static int tja1102_match_phy_device(struct phy_device *phydev, bool port0) +{ + int ret; + + if ((phydev->phy_id & PHY_ID_MASK) != PHY_ID_TJA1102) + return 0; + + ret = phy_read(phydev, MII_PHYSID2); + if (ret < 0) + return ret; + + /* TJA1102 Port 1 has phyid 0 and doesn't support temperature + * and undervoltage alarms. + */ + if (port0) + return ret ? 1 : 0; + + return !ret; +} + +static int tja1102_p0_match_phy_device(struct phy_device *phydev) +{ + return tja1102_match_phy_device(phydev, true); +} + +static int tja1102_p1_match_phy_device(struct phy_device *phydev) +{ + return tja1102_match_phy_device(phydev, false); +} + +static int tja11xx_ack_interrupt(struct phy_device *phydev) +{ + int ret; + + ret = phy_read(phydev, MII_INTSRC); + + return (ret < 0) ? ret : 0; +} + +static int tja11xx_config_intr(struct phy_device *phydev) +{ + int value = 0; + + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) + value = MII_INTEN_LINK_FAIL | MII_INTEN_LINK_UP; + + return phy_write(phydev, MII_INTEN, value); +} + static struct phy_driver tja11xx_driver[] = { { PHY_ID_MATCH_MODEL(PHY_ID_TJA1100), @@ -375,6 +430,41 @@ static struct phy_driver tja11xx_driver[] = { .get_sset_count = tja11xx_get_sset_count, .get_strings = tja11xx_get_strings, .get_stats = tja11xx_get_stats, + }, { + .name = "NXP TJA1102 Port 0", + .features = PHY_BASIC_T1_FEATURES, + .probe = tja11xx_probe, + .soft_reset = tja11xx_soft_reset, + .config_init = tja11xx_config_init, + .read_status = tja11xx_read_status, + .match_phy_device = tja1102_p0_match_phy_device, + .suspend = genphy_suspend, + .resume = genphy_resume, + .set_loopback = genphy_loopback, + /* Statistics */ + .get_sset_count = tja11xx_get_sset_count, + .get_strings = tja11xx_get_strings, + .get_stats = tja11xx_get_stats, + .ack_interrupt = tja11xx_ack_interrupt, + .config_intr = tja11xx_config_intr, + + }, { + .name = "NXP TJA1102 Port 1", + .features = PHY_BASIC_T1_FEATURES, + /* currently no probe for Port 1 is need */ + .soft_reset = tja11xx_soft_reset, + .config_init = tja11xx_config_init, + .read_status = tja11xx_read_status, + .match_phy_device = tja1102_p1_match_phy_device, + .suspend = genphy_suspend, + .resume = genphy_resume, + .set_loopback = genphy_loopback, + /* Statistics */ + .get_sset_count = tja11xx_get_sset_count, + .get_strings = tja11xx_get_strings, + .get_stats = tja11xx_get_stats, + .ack_interrupt = tja11xx_ack_interrupt, + .config_intr = tja11xx_config_intr, } }; @@ -383,6 +473,7 @@ module_phy_driver(tja11xx_driver); static struct mdio_device_id __maybe_unused tja11xx_tbl[] = { { PHY_ID_MATCH_MODEL(PHY_ID_TJA1100) }, { PHY_ID_MATCH_MODEL(PHY_ID_TJA1101) }, + { PHY_ID_MATCH_MODEL(PHY_ID_TJA1102) }, { } }; -- cgit v1.2.3-59-g8ed1b From 5972157c2dde11698d7bcfc55621107d97121c87 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Apr 2020 11:24:55 +0200 Subject: net: mdio: of: export part of of_mdiobus_register_phy() This function will be needed in tja11xx driver for secondary PHY support. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 73 ++++++++++++++++++++++++++++--------------------- include/linux/of_mdio.h | 11 +++++++- 2 files changed, 52 insertions(+), 32 deletions(-) diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 9f982c0627a0..a04afe79529c 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -60,39 +60,15 @@ static struct mii_timestamper *of_find_mii_timestamper(struct device_node *node) return register_mii_timestamper(arg.np, arg.args[0]); } -static int of_mdiobus_register_phy(struct mii_bus *mdio, - struct device_node *child, u32 addr) +int of_mdiobus_phy_device_register(struct mii_bus *mdio, struct phy_device *phy, + struct device_node *child, u32 addr) { - struct mii_timestamper *mii_ts; - struct phy_device *phy; - bool is_c45; int rc; - u32 phy_id; - - mii_ts = of_find_mii_timestamper(child); - if (IS_ERR(mii_ts)) - return PTR_ERR(mii_ts); - - is_c45 = of_device_is_compatible(child, - "ethernet-phy-ieee802.3-c45"); - - if (!is_c45 && !of_get_phy_id(child, &phy_id)) - phy = phy_device_create(mdio, addr, phy_id, 0, NULL); - else - phy = get_phy_device(mdio, addr, is_c45); - if (IS_ERR(phy)) { - if (mii_ts) - unregister_mii_timestamper(mii_ts); - return PTR_ERR(phy); - } rc = of_irq_get(child, 0); - if (rc == -EPROBE_DEFER) { - if (mii_ts) - unregister_mii_timestamper(mii_ts); - phy_device_free(phy); + if (rc == -EPROBE_DEFER) return rc; - } + if (rc > 0) { phy->irq = rc; mdio->irq[addr] = rc; @@ -117,11 +93,48 @@ static int of_mdiobus_register_phy(struct mii_bus *mdio, /* All data is now stored in the phy struct; * register it */ rc = phy_device_register(phy); + if (rc) { + of_node_put(child); + return rc; + } + + dev_dbg(&mdio->dev, "registered phy %pOFn at address %i\n", + child, addr); + return 0; +} +EXPORT_SYMBOL(of_mdiobus_phy_device_register); + +static int of_mdiobus_register_phy(struct mii_bus *mdio, + struct device_node *child, u32 addr) +{ + struct mii_timestamper *mii_ts; + struct phy_device *phy; + bool is_c45; + int rc; + u32 phy_id; + + mii_ts = of_find_mii_timestamper(child); + if (IS_ERR(mii_ts)) + return PTR_ERR(mii_ts); + + is_c45 = of_device_is_compatible(child, + "ethernet-phy-ieee802.3-c45"); + + if (!is_c45 && !of_get_phy_id(child, &phy_id)) + phy = phy_device_create(mdio, addr, phy_id, 0, NULL); + else + phy = get_phy_device(mdio, addr, is_c45); + if (IS_ERR(phy)) { + if (mii_ts) + unregister_mii_timestamper(mii_ts); + return PTR_ERR(phy); + } + + rc = of_mdiobus_phy_device_register(mdio, phy, child, addr); if (rc) { if (mii_ts) unregister_mii_timestamper(mii_ts); phy_device_free(phy); - of_node_put(child); return rc; } @@ -132,8 +145,6 @@ static int of_mdiobus_register_phy(struct mii_bus *mdio, if (mii_ts) phy->mii_ts = mii_ts; - dev_dbg(&mdio->dev, "registered phy %pOFn at address %i\n", - child, addr); return 0; } diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 491a2b7e77c1..0f61a4ac6bcf 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -30,7 +30,9 @@ extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np); extern int of_phy_register_fixed_link(struct device_node *np); extern void of_phy_deregister_fixed_link(struct device_node *np); extern bool of_phy_is_fixed_link(struct device_node *np); - +extern int of_mdiobus_phy_device_register(struct mii_bus *mdio, + struct phy_device *phy, + struct device_node *child, u32 addr); static inline int of_mdio_parse_addr(struct device *dev, const struct device_node *np) @@ -118,6 +120,13 @@ static inline bool of_phy_is_fixed_link(struct device_node *np) { return false; } + +static inline int of_mdiobus_phy_device_register(struct mii_bus *mdio, + struct phy_device *phy, + struct device_node *child, u32 addr) +{ + return -ENOSYS; +} #endif -- cgit v1.2.3-59-g8ed1b From 6a64d3cdc5ef89b2ac629701de5ffb3df1fb7937 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 22 Apr 2020 11:24:56 +0200 Subject: net: phy: tja11xx: add delayed registration of TJA1102 PHY1 TJA1102 is a dual PHY package with PHY0 having proper PHYID and PHY1 having no ID. On one hand it is possible to for PHY detection by compatible, on other hand we should be able to reset complete chip before PHY1 configured it, and we need to define dependencies for proper power management. We can solve it by defining PHY1 as child of PHY0: tja1102_phy0: ethernet-phy@4 { reg = <0x4>; interrupts-extended = <&gpio5 8 IRQ_TYPE_LEVEL_LOW>; reset-gpios = <&gpio5 9 GPIO_ACTIVE_LOW>; reset-assert-us = <20>; reset-deassert-us = <2000>; tja1102_phy1: ethernet-phy@5 { reg = <0x5>; interrupts-extended = <&gpio5 8 IRQ_TYPE_LEVEL_LOW>; }; }; The PHY1 should be a subnode of PHY0 and registered only after PHY0 was completely reset and initialized. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/nxp-tja11xx.c | 112 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 105 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/nxp-tja11xx.c b/drivers/net/phy/nxp-tja11xx.c index 971286f5e5b0..cc766b2d4136 100644 --- a/drivers/net/phy/nxp-tja11xx.c +++ b/drivers/net/phy/nxp-tja11xx.c @@ -6,11 +6,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #define PHY_ID_MASK 0xfffffff0 #define PHY_ID_TJA1100 0x0180dc40 @@ -57,6 +60,8 @@ struct tja11xx_priv { char *hwmon_name; struct device *hwmon_dev; + struct phy_device *phydev; + struct work_struct phy_register_work; }; struct tja11xx_phy_stats { @@ -323,16 +328,12 @@ static const struct hwmon_chip_info tja11xx_hwmon_chip_info = { .info = tja11xx_hwmon_info, }; -static int tja11xx_probe(struct phy_device *phydev) +static int tja11xx_hwmon_register(struct phy_device *phydev, + struct tja11xx_priv *priv) { struct device *dev = &phydev->mdio.dev; - struct tja11xx_priv *priv; int i; - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; - priv->hwmon_name = devm_kstrdup(dev, dev_name(dev), GFP_KERNEL); if (!priv->hwmon_name) return -ENOMEM; @@ -350,6 +351,103 @@ static int tja11xx_probe(struct phy_device *phydev) return PTR_ERR_OR_ZERO(priv->hwmon_dev); } +static int tja11xx_probe(struct phy_device *phydev) +{ + struct device *dev = &phydev->mdio.dev; + struct tja11xx_priv *priv; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->phydev = phydev; + + return tja11xx_hwmon_register(phydev, priv); +} + +static void tja1102_p1_register(struct work_struct *work) +{ + struct tja11xx_priv *priv = container_of(work, struct tja11xx_priv, + phy_register_work); + struct phy_device *phydev_phy0 = priv->phydev; + struct mii_bus *bus = phydev_phy0->mdio.bus; + struct device *dev = &phydev_phy0->mdio.dev; + struct device_node *np = dev->of_node; + struct device_node *child; + int ret; + + for_each_available_child_of_node(np, child) { + struct phy_device *phy; + int addr; + + addr = of_mdio_parse_addr(dev, child); + if (addr < 0) { + dev_err(dev, "Can't parse addr\n"); + continue; + } else if (addr != phydev_phy0->mdio.addr + 1) { + /* Currently we care only about double PHY chip TJA1102. + * If some day NXP will decide to bring chips with more + * PHYs, this logic should be reworked. + */ + dev_err(dev, "Unexpected address. Should be: %i\n", + phydev_phy0->mdio.addr + 1); + continue; + } + + if (mdiobus_is_registered_device(bus, addr)) { + dev_err(dev, "device is already registered\n"); + continue; + } + + /* Real PHY ID of Port 1 is 0 */ + phy = phy_device_create(bus, addr, PHY_ID_TJA1102, false, NULL); + if (IS_ERR(phy)) { + dev_err(dev, "Can't create PHY device for Port 1: %i\n", + addr); + continue; + } + + /* Overwrite parent device. phy_device_create() set parent to + * the mii_bus->dev, which is not correct in case. + */ + phy->mdio.dev.parent = dev; + + ret = of_mdiobus_phy_device_register(bus, phy, child, addr); + if (ret) { + /* All resources needed for Port 1 should be already + * available for Port 0. Both ports use the same + * interrupt line, so -EPROBE_DEFER would make no sense + * here. + */ + dev_err(dev, "Can't register Port 1. Unexpected error: %i\n", + ret); + phy_device_free(phy); + } + } +} + +static int tja1102_p0_probe(struct phy_device *phydev) +{ + struct device *dev = &phydev->mdio.dev; + struct tja11xx_priv *priv; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->phydev = phydev; + INIT_WORK(&priv->phy_register_work, tja1102_p1_register); + + ret = tja11xx_hwmon_register(phydev, priv); + if (ret) + return ret; + + schedule_work(&priv->phy_register_work); + + return 0; +} + static int tja1102_match_phy_device(struct phy_device *phydev, bool port0) { int ret; @@ -433,7 +531,7 @@ static struct phy_driver tja11xx_driver[] = { }, { .name = "NXP TJA1102 Port 0", .features = PHY_BASIC_T1_FEATURES, - .probe = tja11xx_probe, + .probe = tja1102_p0_probe, .soft_reset = tja11xx_soft_reset, .config_init = tja11xx_config_init, .read_status = tja11xx_read_status, -- cgit v1.2.3-59-g8ed1b From 93e106da6a7514445c1e27fdbb6b9810f3df8452 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 22 Apr 2020 19:48:29 +0300 Subject: selftests: forwarding: pedit_dsfield: Add pedit munge ip6 dsfield Extend the pedit_dsfield forwarding selftest with coverage of "pedit ex munge ip6 dsfield set". Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../selftests/net/forwarding/pedit_dsfield.sh | 66 ++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh index b50081855913..1181d647f6a7 100755 --- a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -20,10 +20,14 @@ ALL_TESTS=" ping_ipv4 + ping_ipv6 test_ip_dsfield test_ip_dscp test_ip_ecn test_ip_dscp_ecn + test_ip6_dsfield + test_ip6_dscp + test_ip6_ecn " NUM_NETIFS=4 @@ -107,6 +111,11 @@ ping_ipv4() ping_test $h1 192.0.2.2 } +ping_ipv6() +{ + ping6_test $h1 2001:db8:1::2 +} + do_test_pedit_dsfield_common() { local pedit_locus=$1; shift @@ -228,6 +237,63 @@ test_ip_dscp_ecn() do_test_ip_dscp_ecn "dev $swp2 egress" } +do_test_ip6_dsfield() +{ + local locus=$1; shift + local dsfield + + for dsfield in 0 1 2 3 128 252 253 254 255; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $dsfield" \ + ipv6 "ip_tos $dsfield" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_dsfield() +{ + do_test_ip6_dsfield "dev $swp1 ingress" + do_test_ip6_dsfield "dev $swp2 egress" +} + +do_test_ip6_dscp() +{ + local locus=$1; shift + local dscp + + for dscp in 0 1 2 3 32 61 62 63; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $((dscp << 2)) retain 0xfc" \ + ipv6 "ip_tos $(((dscp << 2) | 1))" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_dscp() +{ + do_test_ip6_dscp "dev $swp1 ingress" + do_test_ip6_dscp "dev $swp2 egress" +} + +do_test_ip6_ecn() +{ + local locus=$1; shift + local ecn + + for ecn in 0 1 2 3; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $ecn retain 0x3" \ + ipv6 "ip_tos $((124 | $ecn))" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_ecn() +{ + do_test_ip6_ecn "dev $swp1 ingress" + do_test_ip6_ecn "dev $swp2 egress" +} + trap cleanup EXIT setup_prepare -- cgit v1.2.3-59-g8ed1b From f132ccc56e35875655226915588cab63a16237ef Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 22 Apr 2020 19:48:30 +0300 Subject: selftests: tc-testing: Add a TDC test for pedit munge ip6 dsfield Add a self-test for the IPv6 dsfield munge that iproute2 will support. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/actions/pedit.json | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json index f8ea6f5fa8e9..72cdc3c800a5 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json @@ -1471,6 +1471,31 @@ "$TC actions flush action pedit" ] }, + { + "id": "94bb", + "name": "Add pedit action with LAYERED_OP ip6 traffic_class", + "category": [ + "actions", + "pedit", + "layered_op" + ], + "setup": [ + [ + "$TC actions flush action pedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action pedit ex munge ip6 traffic_class set 0x40 continue", + "expExitCode": "0", + "verifyCmd": "$TC actions list action pedit", + "matchPattern": "ipv6\\+0: val 04000000 mask f00fffff", + "matchCount": "1", + "teardown": [ + "$TC actions flush action pedit" + ] + }, { "id": "6f5e", "name": "Add pedit action with LAYERED_OP ip6 flow_lbl", -- cgit v1.2.3-59-g8ed1b From 493f3cc7ee020a4c5da02f6502743d9ae7be50d6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 Apr 2020 17:08:22 -0600 Subject: selftests: A few improvements to fib_nexthops.sh Add nodad when adding IPv6 addresses and remove the sleep. A recent change to iproute2 moved the 'pref medium' to the prefix (where it belongs). Change the expected route check to strip 'pref medium' to be compatible with old and new iproute2. Add IPv4 runtime test with an IPv6 address as the gateway in the default route. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 796670ebc65b..5890ba6d7ef6 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -150,31 +150,31 @@ setup() $IP li add veth1 type veth peer name veth2 $IP li set veth1 up $IP addr add 172.16.1.1/24 dev veth1 - $IP -6 addr add 2001:db8:91::1/64 dev veth1 + $IP -6 addr add 2001:db8:91::1/64 dev veth1 nodad $IP li add veth3 type veth peer name veth4 $IP li set veth3 up $IP addr add 172.16.2.1/24 dev veth3 - $IP -6 addr add 2001:db8:92::1/64 dev veth3 + $IP -6 addr add 2001:db8:92::1/64 dev veth3 nodad $IP li set veth2 netns peer up ip -netns peer addr add 172.16.1.2/24 dev veth2 - ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 + ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 nodad $IP li set veth4 netns peer up ip -netns peer addr add 172.16.2.2/24 dev veth4 - ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 + ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 nodad ip -netns remote li add veth5 type veth peer name veth6 ip -netns remote li set veth5 up ip -netns remote addr add dev veth5 172.16.101.1/24 - ip -netns remote addr add dev veth5 2001:db8:101::1/64 + ip -netns remote -6 addr add dev veth5 2001:db8:101::1/64 nodad ip -netns remote ro add 172.16.0.0/22 via 172.16.101.2 ip -netns remote -6 ro add 2001:db8:90::/40 via 2001:db8:101::2 ip -netns remote li set veth6 netns peer up ip -netns peer addr add dev veth6 172.16.101.2/24 - ip -netns peer addr add dev veth6 2001:db8:101::2/64 + ip -netns peer -6 addr add dev veth6 2001:db8:101::2/64 nodad set +e } @@ -248,7 +248,7 @@ check_route6() local expected="$2" local out - out=$($IP -6 route ls match ${pfx} 2>/dev/null) + out=$($IP -6 route ls match ${pfx} 2>/dev/null | sed -e 's/pref medium//') check_output "${out}" "${expected}" } @@ -423,8 +423,6 @@ ipv6_fcnal_runtime() echo "IPv6 functional runtime" echo "-----------------------" - sleep 5 - # # IPv6 - the basics # @@ -481,12 +479,12 @@ ipv6_fcnal_runtime() run_cmd "$IP -6 nexthop add id 85 dev veth1" run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 85" log_test $? 0 "IPv6 route with device only nexthop" - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024 pref medium" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024" run_cmd "$IP nexthop add id 123 group 81/85" run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 123" log_test $? 0 "IPv6 multipath route with nexthop mix - dev only + gw" - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1 pref medium" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1" # # IPv6 route with v4 nexthop - not allowed @@ -843,6 +841,11 @@ ipv4_fcnal_runtime() $IP neigh sh | grep 'dev veth1' fi + run_cmd "$IP ro del 172.16.101.1/32 via inet6 ${lladdr} dev veth1" + run_cmd "$IP -4 ro add default via inet6 ${lladdr} dev veth1" + run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1" + log_test $? 0 "IPv4 default route with IPv6 gateway" + # # MPLS as an example of LWT encap # -- cgit v1.2.3-59-g8ed1b From 788f87ac608c518b74f338acb95f197cf6e3d0c4 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 22 Apr 2020 15:05:09 +0300 Subject: xdp: export the DEV_MAP_BULK_SIZE macro Export the DEV_MAP_BULK_SIZE macro to the header file so that drivers can directly use it as the maximum number of xdp_frames received in the .ndo_xdp_xmit() callback. Signed-off-by: Ioana Ciornei Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 2 ++ kernel/bpf/devmap.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 40c6d3398458..3cc6d5d84aa4 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -181,4 +181,6 @@ bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); +#define DEV_MAP_BULK_SIZE 16 + #endif /* __LINUX_NET_XDP_H__ */ diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 58bdca5d978a..a51d9fb7a359 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -52,7 +52,6 @@ #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -#define DEV_MAP_BULK_SIZE 16 struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; -- cgit v1.2.3-59-g8ed1b From 48c0481e5ad1d1eec6ccfaee6bb8a030fbbd07f7 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 22 Apr 2020 15:05:10 +0300 Subject: dpaa2-eth: return num_enqueued frames from enqueue callback The enqueue dpaa2-eth callback now returns the number of successfully enqueued frames. This is a preliminary patch necessary for adding support for bulk ring mode enqueue. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 34 ++++++++++++++++-------- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 5 ++-- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index b6c46639aa4c..7b41ece8f160 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) /* Copyright 2014-2016 Freescale Semiconductor Inc. - * Copyright 2016-2019 NXP + * Copyright 2016-2020 NXP */ #include #include @@ -268,7 +268,7 @@ static int xdp_enqueue(struct dpaa2_eth_priv *priv, struct dpaa2_fd *fd, fq = &priv->fq[queue_id]; for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { - err = priv->enqueue(priv, fq, fd, 0); + err = priv->enqueue(priv, fq, fd, 0, NULL); if (err != -EBUSY) break; } @@ -847,7 +847,7 @@ static netdev_tx_t dpaa2_eth_tx(struct sk_buff *skb, struct net_device *net_dev) * the Tx confirmation callback for this frame */ for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { - err = priv->enqueue(priv, fq, &fd, prio); + err = priv->enqueue(priv, fq, &fd, prio, NULL); if (err != -EBUSY) break; } @@ -1937,7 +1937,7 @@ static int dpaa2_eth_xdp_xmit_frame(struct net_device *net_dev, fq = &priv->fq[smp_processor_id() % dpaa2_eth_queue_count(priv)]; for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { - err = priv->enqueue(priv, fq, &fd, 0); + err = priv->enqueue(priv, fq, &fd, 0, NULL); if (err != -EBUSY) break; } @@ -2523,19 +2523,31 @@ static int set_buffer_layout(struct dpaa2_eth_priv *priv) static inline int dpaa2_eth_enqueue_qd(struct dpaa2_eth_priv *priv, struct dpaa2_eth_fq *fq, - struct dpaa2_fd *fd, u8 prio) + struct dpaa2_fd *fd, u8 prio, + int *frames_enqueued) { - return dpaa2_io_service_enqueue_qd(fq->channel->dpio, - priv->tx_qdid, prio, - fq->tx_qdbin, fd); + int err; + + err = dpaa2_io_service_enqueue_qd(fq->channel->dpio, + priv->tx_qdid, prio, + fq->tx_qdbin, fd); + if (!err && frames_enqueued) + *frames_enqueued = 1; + return err; } static inline int dpaa2_eth_enqueue_fq(struct dpaa2_eth_priv *priv, struct dpaa2_eth_fq *fq, - struct dpaa2_fd *fd, u8 prio) + struct dpaa2_fd *fd, u8 prio, + int *frames_enqueued) { - return dpaa2_io_service_enqueue_fq(fq->channel->dpio, - fq->tx_fqid[prio], fd); + int err; + + err = dpaa2_io_service_enqueue_fq(fq->channel->dpio, + fq->tx_fqid[prio], fd); + if (!err && frames_enqueued) + *frames_enqueued = 1; + return err; } static void set_enqueue_mode(struct dpaa2_eth_priv *priv) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 7635db3ef903..085ff750e4b5 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ /* Copyright 2014-2016 Freescale Semiconductor Inc. - * Copyright 2016 NXP + * Copyright 2016-2020 NXP */ #ifndef __DPAA2_ETH_H @@ -371,7 +371,8 @@ struct dpaa2_eth_priv { struct dpaa2_eth_fq fq[DPAA2_ETH_MAX_QUEUES]; int (*enqueue)(struct dpaa2_eth_priv *priv, struct dpaa2_eth_fq *fq, - struct dpaa2_fd *fd, u8 prio); + struct dpaa2_fd *fd, u8 prio, + int *frames_enqueued); u8 num_channels; struct dpaa2_eth_channel *channel[DPAA2_ETH_MAX_DPCONS]; -- cgit v1.2.3-59-g8ed1b From 6ff8044751bdc40fd3199813bfe9b93d056fc15d Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 22 Apr 2020 15:05:11 +0300 Subject: dpaa2-eth: use the bulk ring mode enqueue interface Update the dpaa2-eth driver to use the bulk enqueue function introduced with the change to QBMAN ring mode. At the moment, no functional changes are made but rather the driver just transitions to the new interface while still enqueuing just one frame at a time. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 35 ++++++++++++++---------- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 1 + 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 7b41ece8f160..26c2868435d5 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -268,7 +268,7 @@ static int xdp_enqueue(struct dpaa2_eth_priv *priv, struct dpaa2_fd *fd, fq = &priv->fq[queue_id]; for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { - err = priv->enqueue(priv, fq, fd, 0, NULL); + err = priv->enqueue(priv, fq, fd, 0, 1, NULL); if (err != -EBUSY) break; } @@ -847,7 +847,7 @@ static netdev_tx_t dpaa2_eth_tx(struct sk_buff *skb, struct net_device *net_dev) * the Tx confirmation callback for this frame */ for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { - err = priv->enqueue(priv, fq, &fd, prio, NULL); + err = priv->enqueue(priv, fq, &fd, prio, 1, NULL); if (err != -EBUSY) break; } @@ -1937,7 +1937,7 @@ static int dpaa2_eth_xdp_xmit_frame(struct net_device *net_dev, fq = &priv->fq[smp_processor_id() % dpaa2_eth_queue_count(priv)]; for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { - err = priv->enqueue(priv, fq, &fd, 0, NULL); + err = priv->enqueue(priv, fq, &fd, 0, 1, NULL); if (err != -EBUSY) break; } @@ -2524,6 +2524,7 @@ static int set_buffer_layout(struct dpaa2_eth_priv *priv) static inline int dpaa2_eth_enqueue_qd(struct dpaa2_eth_priv *priv, struct dpaa2_eth_fq *fq, struct dpaa2_fd *fd, u8 prio, + u32 num_frames __always_unused, int *frames_enqueued) { int err; @@ -2536,18 +2537,24 @@ static inline int dpaa2_eth_enqueue_qd(struct dpaa2_eth_priv *priv, return err; } -static inline int dpaa2_eth_enqueue_fq(struct dpaa2_eth_priv *priv, - struct dpaa2_eth_fq *fq, - struct dpaa2_fd *fd, u8 prio, - int *frames_enqueued) +static inline int dpaa2_eth_enqueue_fq_multiple(struct dpaa2_eth_priv *priv, + struct dpaa2_eth_fq *fq, + struct dpaa2_fd *fd, + u8 prio, u32 num_frames, + int *frames_enqueued) { int err; - err = dpaa2_io_service_enqueue_fq(fq->channel->dpio, - fq->tx_fqid[prio], fd); - if (!err && frames_enqueued) - *frames_enqueued = 1; - return err; + err = dpaa2_io_service_enqueue_multiple_fq(fq->channel->dpio, + fq->tx_fqid[prio], + fd, num_frames); + + if (err == 0) + return -EBUSY; + + if (frames_enqueued) + *frames_enqueued = err; + return 0; } static void set_enqueue_mode(struct dpaa2_eth_priv *priv) @@ -2556,7 +2563,7 @@ static void set_enqueue_mode(struct dpaa2_eth_priv *priv) DPNI_ENQUEUE_FQID_VER_MINOR) < 0) priv->enqueue = dpaa2_eth_enqueue_qd; else - priv->enqueue = dpaa2_eth_enqueue_fq; + priv->enqueue = dpaa2_eth_enqueue_fq_multiple; } static int set_pause(struct dpaa2_eth_priv *priv) @@ -2617,7 +2624,7 @@ static void update_tx_fqids(struct dpaa2_eth_priv *priv) } } - priv->enqueue = dpaa2_eth_enqueue_fq; + priv->enqueue = dpaa2_eth_enqueue_fq_multiple; return; diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 085ff750e4b5..2440ba6b21ef 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -372,6 +372,7 @@ struct dpaa2_eth_priv { int (*enqueue)(struct dpaa2_eth_priv *priv, struct dpaa2_eth_fq *fq, struct dpaa2_fd *fd, u8 prio, + u32 num_frames, int *frames_enqueued); u8 num_channels; -- cgit v1.2.3-59-g8ed1b From 6aa40b9e5b1ee732e07e406ffa6e17d152b3a216 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 22 Apr 2020 15:05:12 +0300 Subject: dpaa2-eth: split the .ndo_xdp_xmit callback into two stages Instead of having a function that both creates a frame descriptor from an xdp_frame and enqueues it, split this into two stages. Add the dpaa2_eth_xdp_create_fd that just transforms an xdp_frame into a FD while the actual enqueue callback is called directly from the ndo for each frame. This is particulary useful in conjunction with bulk enqueue. Signed-off-by: Ioana Ciornei Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 76 +++++++++++++----------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 26c2868435d5..9a0432cd893c 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1880,20 +1880,16 @@ static int dpaa2_eth_xdp(struct net_device *dev, struct netdev_bpf *xdp) return 0; } -static int dpaa2_eth_xdp_xmit_frame(struct net_device *net_dev, - struct xdp_frame *xdpf) +static int dpaa2_eth_xdp_create_fd(struct net_device *net_dev, + struct xdp_frame *xdpf, + struct dpaa2_fd *fd) { struct dpaa2_eth_priv *priv = netdev_priv(net_dev); struct device *dev = net_dev->dev.parent; - struct rtnl_link_stats64 *percpu_stats; - struct dpaa2_eth_drv_stats *percpu_extras; unsigned int needed_headroom; struct dpaa2_eth_swa *swa; - struct dpaa2_eth_fq *fq; - struct dpaa2_fd fd; void *buffer_start, *aligned_start; dma_addr_t addr; - int err, i; /* We require a minimum headroom to be able to transmit the frame. * Otherwise return an error and let the original net_device handle it @@ -1902,11 +1898,8 @@ static int dpaa2_eth_xdp_xmit_frame(struct net_device *net_dev, if (xdpf->headroom < needed_headroom) return -EINVAL; - percpu_stats = this_cpu_ptr(priv->percpu_stats); - percpu_extras = this_cpu_ptr(priv->percpu_extras); - /* Setup the FD fields */ - memset(&fd, 0, sizeof(fd)); + memset(fd, 0, sizeof(*fd)); /* Align FD address, if possible */ buffer_start = xdpf->data - needed_headroom; @@ -1924,32 +1917,14 @@ static int dpaa2_eth_xdp_xmit_frame(struct net_device *net_dev, addr = dma_map_single(dev, buffer_start, swa->xdp.dma_size, DMA_BIDIRECTIONAL); - if (unlikely(dma_mapping_error(dev, addr))) { - percpu_stats->tx_dropped++; + if (unlikely(dma_mapping_error(dev, addr))) return -ENOMEM; - } - - dpaa2_fd_set_addr(&fd, addr); - dpaa2_fd_set_offset(&fd, xdpf->data - buffer_start); - dpaa2_fd_set_len(&fd, xdpf->len); - dpaa2_fd_set_format(&fd, dpaa2_fd_single); - dpaa2_fd_set_ctrl(&fd, FD_CTRL_PTA); - - fq = &priv->fq[smp_processor_id() % dpaa2_eth_queue_count(priv)]; - for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { - err = priv->enqueue(priv, fq, &fd, 0, 1, NULL); - if (err != -EBUSY) - break; - } - percpu_extras->tx_portal_busy += i; - if (unlikely(err < 0)) { - percpu_stats->tx_errors++; - /* let the Rx device handle the cleanup */ - return err; - } - percpu_stats->tx_packets++; - percpu_stats->tx_bytes += dpaa2_fd_get_len(&fd); + dpaa2_fd_set_addr(fd, addr); + dpaa2_fd_set_offset(fd, xdpf->data - buffer_start); + dpaa2_fd_set_len(fd, xdpf->len); + dpaa2_fd_set_format(fd, dpaa2_fd_single); + dpaa2_fd_set_ctrl(fd, FD_CTRL_PTA); return 0; } @@ -1957,6 +1932,11 @@ static int dpaa2_eth_xdp_xmit_frame(struct net_device *net_dev, static int dpaa2_eth_xdp_xmit(struct net_device *net_dev, int n, struct xdp_frame **frames, u32 flags) { + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + struct dpaa2_eth_drv_stats *percpu_extras; + struct rtnl_link_stats64 *percpu_stats; + struct dpaa2_eth_fq *fq; + struct dpaa2_fd fd; int drops = 0; int i, err; @@ -1966,14 +1946,38 @@ static int dpaa2_eth_xdp_xmit(struct net_device *net_dev, int n, if (!netif_running(net_dev)) return -ENETDOWN; + percpu_stats = this_cpu_ptr(priv->percpu_stats); + percpu_extras = this_cpu_ptr(priv->percpu_extras); + for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; - err = dpaa2_eth_xdp_xmit_frame(net_dev, xdpf); + /* create the FD from the xdp_frame */ + err = dpaa2_eth_xdp_create_fd(net_dev, xdpf, &fd); if (err) { + percpu_stats->tx_dropped++; xdp_return_frame_rx_napi(xdpf); drops++; + continue; + } + + /* enqueue the newly created FD */ + fq = &priv->fq[smp_processor_id() % dpaa2_eth_queue_count(priv)]; + for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { + err = priv->enqueue(priv, fq, &fd, 0, 1); + if (err != -EBUSY) + break; } + + percpu_extras->tx_portal_busy += i; + if (unlikely(err < 0)) { + percpu_stats->tx_errors++; + xdp_return_frame_rx_napi(xdpf); + continue; + } + + percpu_stats->tx_packets++; + percpu_stats->tx_bytes += dpaa2_fd_get_len(&fd); } return n - drops; -- cgit v1.2.3-59-g8ed1b From 8665d9780e6efafa3cd9865ae3a77826326fe8c6 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 22 Apr 2020 15:05:13 +0300 Subject: dpaa2-eth: use bulk enqueue in .ndo_xdp_xmit Take advantage of the bulk enqueue feature in .ndo_xdp_xmit. We cannot use the XDP_XMIT_FLUSH since the architecture is not capable to store all the frames dequeued in a NAPI cycle so we instead are enqueueing all the frames received in a ndo_xdp_xmit call right away. After setting up all FDs for the xdp_frames received, enqueue multiple frames at a time until all are sent or the maximum number of retries is hit. Signed-off-by: Ioana Ciornei Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 57 ++++++++++++------------ drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 2 + 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 9a0432cd893c..9d4061bba0b8 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1933,12 +1933,12 @@ static int dpaa2_eth_xdp_xmit(struct net_device *net_dev, int n, struct xdp_frame **frames, u32 flags) { struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + int total_enqueued = 0, retries = 0, enqueued; struct dpaa2_eth_drv_stats *percpu_extras; struct rtnl_link_stats64 *percpu_stats; + int num_fds, i, err, max_retries; struct dpaa2_eth_fq *fq; - struct dpaa2_fd fd; - int drops = 0; - int i, err; + struct dpaa2_fd *fds; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; @@ -1946,41 +1946,40 @@ static int dpaa2_eth_xdp_xmit(struct net_device *net_dev, int n, if (!netif_running(net_dev)) return -ENETDOWN; + fq = &priv->fq[smp_processor_id()]; + fds = fq->xdp_fds; + percpu_stats = this_cpu_ptr(priv->percpu_stats); percpu_extras = this_cpu_ptr(priv->percpu_extras); + /* create a FD for each xdp_frame in the list received */ for (i = 0; i < n; i++) { - struct xdp_frame *xdpf = frames[i]; - - /* create the FD from the xdp_frame */ - err = dpaa2_eth_xdp_create_fd(net_dev, xdpf, &fd); - if (err) { - percpu_stats->tx_dropped++; - xdp_return_frame_rx_napi(xdpf); - drops++; - continue; - } - - /* enqueue the newly created FD */ - fq = &priv->fq[smp_processor_id() % dpaa2_eth_queue_count(priv)]; - for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { - err = priv->enqueue(priv, fq, &fd, 0, 1); - if (err != -EBUSY) - break; - } + err = dpaa2_eth_xdp_create_fd(net_dev, frames[i], &fds[i]); + if (err) + break; + } + num_fds = i; - percpu_extras->tx_portal_busy += i; - if (unlikely(err < 0)) { - percpu_stats->tx_errors++; - xdp_return_frame_rx_napi(xdpf); + /* try to enqueue all the FDs until the max number of retries is hit */ + max_retries = num_fds * DPAA2_ETH_ENQUEUE_RETRIES; + while (total_enqueued < num_fds && retries < max_retries) { + err = priv->enqueue(priv, fq, &fds[total_enqueued], + 0, num_fds - total_enqueued, &enqueued); + if (err == -EBUSY) { + percpu_extras->tx_portal_busy += ++retries; continue; } - - percpu_stats->tx_packets++; - percpu_stats->tx_bytes += dpaa2_fd_get_len(&fd); + total_enqueued += enqueued; } - return n - drops; + /* update statistics */ + percpu_stats->tx_packets += total_enqueued; + for (i = 0; i < total_enqueued; i++) + percpu_stats->tx_bytes += dpaa2_fd_get_len(&fds[i]); + for (i = total_enqueued; i < n; i++) + xdp_return_frame_rx_napi(frames[i]); + + return total_enqueued; } static int update_xps(struct dpaa2_eth_priv *priv) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 2440ba6b21ef..289053099974 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -325,6 +325,8 @@ struct dpaa2_eth_fq { const struct dpaa2_fd *fd, struct dpaa2_eth_fq *fq); struct dpaa2_eth_fq_stats stats; + + struct dpaa2_fd xdp_fds[DEV_MAP_BULK_SIZE]; }; struct dpaa2_eth_ch_xdp { -- cgit v1.2.3-59-g8ed1b From 2b7aadd3b9e17e8b81eeb8d9cc46756ae4658265 Mon Sep 17 00:00:00 2001 From: Raz Bouganim Date: Tue, 21 Apr 2020 15:28:05 +0300 Subject: wlcore: Adding suppoprt for IGTK key in wlcore driver This patch adding support for new cipher suite - AES-CMAC in wlcore driver. This patch is required for support PMF/WPA3 connection to install IGTK key. Signed-off-by: Raz Bouganim Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1587472085-755-1-git-send-email-r-bouganim@ti.com --- drivers/net/wireless/ti/wlcore/cmd.h | 1 + drivers/net/wireless/ti/wlcore/main.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/net/wireless/ti/wlcore/cmd.h b/drivers/net/wireless/ti/wlcore/cmd.h index f2609d5b6bf7..9acd8a41ea61 100644 --- a/drivers/net/wireless/ti/wlcore/cmd.h +++ b/drivers/net/wireless/ti/wlcore/cmd.h @@ -458,6 +458,7 @@ enum wl1271_cmd_key_type { KEY_TKIP = 2, KEY_AES = 3, KEY_GEM = 4, + KEY_IGTK = 5, }; struct wl1271_cmd_set_keys { diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index f140f7d7f553..4421fc656b1c 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -3547,6 +3547,9 @@ int wlcore_set_key(struct wl1271 *wl, enum set_key_cmd cmd, case WL1271_CIPHER_SUITE_GEM: key_type = KEY_GEM; break; + case WLAN_CIPHER_SUITE_AES_CMAC: + key_type = KEY_IGTK; + break; default: wl1271_error("Unknown key algo 0x%x", key_conf->cipher); @@ -6214,6 +6217,7 @@ static int wl1271_init_ieee80211(struct wl1271 *wl) WLAN_CIPHER_SUITE_TKIP, WLAN_CIPHER_SUITE_CCMP, WL1271_CIPHER_SUITE_GEM, + WLAN_CIPHER_SUITE_AES_CMAC, }; /* The tx descriptor buffer */ -- cgit v1.2.3-59-g8ed1b From 15d2fcc6b2dea46986e55cd3808c0dbb480a6c8d Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 22 Apr 2020 11:46:00 +0800 Subject: rtw88: add legacy firmware download for 8723D devices The WLAN CPU of 8723D device is different from others, add legacy firmware download function for it. A new variable wlan_cpu is used to decide which firmware download function we should use. Legacy firmware file contains 32 bytes header including version and subversion. When downloading to wlan cpu, header is excluded. Firmware is downloaded via beacon queue to reserved page that is a part of TX buffer. Since 11N WLAN CPU uses different control registers, this patch introduces related control registers. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200422034607.28747-2-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/fw.c | 21 +++- drivers/net/wireless/realtek/rtw88/fw.h | 25 +++++ drivers/net/wireless/realtek/rtw88/mac.c | 146 +++++++++++++++++++++++++- drivers/net/wireless/realtek/rtw88/main.c | 41 ++++++-- drivers/net/wireless/realtek/rtw88/main.h | 16 +++ drivers/net/wireless/realtek/rtw88/reg.h | 11 ++ drivers/net/wireless/realtek/rtw88/rtw8723d.c | 1 + drivers/net/wireless/realtek/rtw88/rtw8822b.c | 1 + drivers/net/wireless/realtek/rtw88/rtw8822c.c | 1 + 9 files changed, 252 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/fw.c b/drivers/net/wireless/realtek/rtw88/fw.c index 245da96dfddc..209853fdcb42 100644 --- a/drivers/net/wireless/realtek/rtw88/fw.c +++ b/drivers/net/wireless/realtek/rtw88/fw.c @@ -1079,6 +1079,8 @@ int rtw_fw_write_data_rsvd_page(struct rtw_dev *rtwdev, u16 pg_addr, u8 bckp[2]; u8 val; u16 rsvd_pg_head; + u32 bcn_valid_addr; + u32 bcn_valid_mask; int ret; lockdep_assert_held(&rtwdev->mutex); @@ -1086,8 +1088,13 @@ int rtw_fw_write_data_rsvd_page(struct rtw_dev *rtwdev, u16 pg_addr, if (!size) return -EINVAL; - pg_addr &= BIT_MASK_BCN_HEAD_1_V1; - rtw_write16(rtwdev, REG_FIFOPAGE_CTRL_2, pg_addr | BIT_BCN_VALID_V1); + if (rtw_chip_wcpu_11n(rtwdev)) { + rtw_write32_set(rtwdev, REG_DWBCN0_CTRL, BIT_BCN_VALID); + } else { + pg_addr &= BIT_MASK_BCN_HEAD_1_V1; + pg_addr |= BIT_BCN_VALID_V1; + rtw_write16(rtwdev, REG_FIFOPAGE_CTRL_2, pg_addr); + } val = rtw_read8(rtwdev, REG_CR + 1); bckp[0] = val; @@ -1105,7 +1112,15 @@ int rtw_fw_write_data_rsvd_page(struct rtw_dev *rtwdev, u16 pg_addr, goto restore; } - if (!check_hw_ready(rtwdev, REG_FIFOPAGE_CTRL_2, BIT_BCN_VALID_V1, 1)) { + if (rtw_chip_wcpu_11n(rtwdev)) { + bcn_valid_addr = REG_DWBCN0_CTRL; + bcn_valid_mask = BIT_BCN_VALID; + } else { + bcn_valid_addr = REG_FIFOPAGE_CTRL_2; + bcn_valid_mask = BIT_BCN_VALID_V1; + } + + if (!check_hw_ready(rtwdev, bcn_valid_addr, bcn_valid_mask, 1)) { rtw_err(rtwdev, "error beacon valid\n"); ret = -EBUSY; } diff --git a/drivers/net/wireless/realtek/rtw88/fw.h b/drivers/net/wireless/realtek/rtw88/fw.h index cdd244857048..2933ef741e53 100644 --- a/drivers/net/wireless/realtek/rtw88/fw.h +++ b/drivers/net/wireless/realtek/rtw88/fw.h @@ -19,6 +19,12 @@ #define RSVD_PAGE_START_ADDR 0x780 #define FIFO_DUMP_ADDR 0x8000 +#define DLFW_PAGE_SIZE_SHIFT_LEGACY 12 +#define DLFW_PAGE_SIZE_LEGACY 0x1000 +#define DLFW_BLK_SIZE_SHIFT_LEGACY 2 +#define DLFW_BLK_SIZE_LEGACY 4 +#define FW_START_ADDR_LEGACY 0x1000 + enum rtw_c2h_cmd_id { C2H_BT_INFO = 0x09, C2H_BT_MP_INFO = 0x0b, @@ -192,6 +198,25 @@ struct rtw_fw_hdr { __le32 imem_addr; } __packed; +struct rtw_fw_hdr_legacy { + __le16 signature; + u8 category; + u8 function; + __le16 version; /* 0x04 */ + u8 subversion1; + u8 subversion2; + u8 month; /* 0x08 */ + u8 day; + u8 hour; + u8 minute; + __le16 size; + __le16 rsvd2; + __le32 idx; /* 0x10 */ + __le32 rsvd3; + __le32 rsvd4; /* 0x18 */ + __le32 rsvd5; +} __packed; + /* C2H */ #define GET_CCX_REPORT_SEQNUM(c2h_payload) (c2h_payload[8] & 0xfc) #define GET_CCX_REPORT_STATUS(c2h_payload) (c2h_payload[9] & 0xc0) diff --git a/drivers/net/wireless/realtek/rtw88/mac.c b/drivers/net/wireless/realtek/rtw88/mac.c index 7b245779ff90..6092604abfb9 100644 --- a/drivers/net/wireless/realtek/rtw88/mac.c +++ b/drivers/net/wireless/realtek/rtw88/mac.c @@ -650,7 +650,7 @@ static void download_firmware_end_flow(struct rtw_dev *rtwdev) rtw_write16(rtwdev, REG_MCUFW_CTRL, fw_ctrl); } -int rtw_download_firmware(struct rtw_dev *rtwdev, struct rtw_fw_state *fw) +int __rtw_download_firmware(struct rtw_dev *rtwdev, struct rtw_fw_state *fw) { struct rtw_backup_info bckp[DLFW_RESTORE_REG_NUM]; const u8 *data = fw->firmware->data; @@ -704,6 +704,150 @@ dlfw_fail: return ret; } +static void en_download_firmware_legacy(struct rtw_dev *rtwdev, bool en) +{ + int try; + + if (en) { + wlan_cpu_enable(rtwdev, false); + wlan_cpu_enable(rtwdev, true); + + rtw_write8_set(rtwdev, REG_MCUFW_CTRL, BIT_MCUFWDL_EN); + + for (try = 0; try < 10; try++) { + if (rtw_read8(rtwdev, REG_MCUFW_CTRL) & BIT_MCUFWDL_EN) + goto fwdl_ready; + rtw_write8_set(rtwdev, REG_MCUFW_CTRL, BIT_MCUFWDL_EN); + msleep(20); + } + rtw_err(rtwdev, "failed to check fw download ready\n"); +fwdl_ready: + rtw_write32_clr(rtwdev, REG_MCUFW_CTRL, BIT_ROM_DLEN); + } else { + rtw_write8_clr(rtwdev, REG_MCUFW_CTRL, BIT_MCUFWDL_EN); + } +} + +static void +write_firmware_page(struct rtw_dev *rtwdev, u32 page, const u8 *data, u32 size) +{ + u32 val32; + u32 block_nr; + u32 remain_size; + u32 write_addr = FW_START_ADDR_LEGACY; + const __le32 *ptr = (const __le32 *)data; + u32 block; + __le32 remain_data = 0; + + block_nr = size >> DLFW_BLK_SIZE_SHIFT_LEGACY; + remain_size = size & (DLFW_BLK_SIZE_LEGACY - 1); + + val32 = rtw_read32(rtwdev, REG_MCUFW_CTRL); + val32 &= ~BIT_ROM_PGE; + val32 |= (page << BIT_SHIFT_ROM_PGE) & BIT_ROM_PGE; + rtw_write32(rtwdev, REG_MCUFW_CTRL, val32); + + for (block = 0; block < block_nr; block++) { + rtw_write32(rtwdev, write_addr, le32_to_cpu(*ptr)); + + write_addr += DLFW_BLK_SIZE_LEGACY; + ptr++; + } + + if (remain_size) { + memcpy(&remain_data, ptr, remain_size); + rtw_write32(rtwdev, write_addr, le32_to_cpu(remain_data)); + } +} + +static int +download_firmware_legacy(struct rtw_dev *rtwdev, const u8 *data, u32 size) +{ + u32 page; + u32 total_page; + u32 last_page_size; + + data += sizeof(struct rtw_fw_hdr_legacy); + size -= sizeof(struct rtw_fw_hdr_legacy); + + total_page = size >> DLFW_PAGE_SIZE_SHIFT_LEGACY; + last_page_size = size & (DLFW_PAGE_SIZE_LEGACY - 1); + + rtw_write8_set(rtwdev, REG_MCUFW_CTRL, BIT_FWDL_CHK_RPT); + + for (page = 0; page < total_page; page++) { + write_firmware_page(rtwdev, page, data, DLFW_PAGE_SIZE_LEGACY); + data += DLFW_PAGE_SIZE_LEGACY; + } + if (last_page_size) + write_firmware_page(rtwdev, page, data, last_page_size); + + if (!check_hw_ready(rtwdev, REG_MCUFW_CTRL, BIT_FWDL_CHK_RPT, 1)) { + rtw_err(rtwdev, "failed to check download fimrware report\n"); + return -EINVAL; + } + + return 0; +} + +static int download_firmware_validate_legacy(struct rtw_dev *rtwdev) +{ + u32 val32; + int try; + + val32 = rtw_read32(rtwdev, REG_MCUFW_CTRL); + val32 |= BIT_MCUFWDL_RDY; + val32 &= ~BIT_WINTINI_RDY; + rtw_write32(rtwdev, REG_MCUFW_CTRL, val32); + + wlan_cpu_enable(rtwdev, false); + wlan_cpu_enable(rtwdev, true); + + for (try = 0; try < 10; try++) { + val32 = rtw_read32(rtwdev, REG_MCUFW_CTRL); + if ((val32 & FW_READY_LEGACY) == FW_READY_LEGACY) + return 0; + msleep(20); + } + + rtw_err(rtwdev, "failed to validate fimrware\n"); + return -EINVAL; +} + +int __rtw_download_firmware_legacy(struct rtw_dev *rtwdev, struct rtw_fw_state *fw) +{ + int ret = 0; + + en_download_firmware_legacy(rtwdev, true); + ret = download_firmware_legacy(rtwdev, fw->firmware->data, fw->firmware->size); + en_download_firmware_legacy(rtwdev, false); + if (ret) + goto out; + + ret = download_firmware_validate_legacy(rtwdev); + if (ret) + goto out; + + /* reset desc and index */ + rtw_hci_setup(rtwdev); + + rtwdev->h2c.last_box_num = 0; + rtwdev->h2c.seq = 0; + + set_bit(RTW_FLAG_FW_RUNNING, rtwdev->flags); + +out: + return ret; +} + +int rtw_download_firmware(struct rtw_dev *rtwdev, struct rtw_fw_state *fw) +{ + if (rtw_chip_wcpu_11n(rtwdev)) + return __rtw_download_firmware_legacy(rtwdev, fw); + + return __rtw_download_firmware(rtwdev, fw); +} + static u32 get_priority_queues(struct rtw_dev *rtwdev, u32 queues) { const struct rtw_rqpn *rqpn = rtwdev->fifo.rqpn; diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index c851830132d0..b0dadff0dc7b 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -1042,11 +1042,43 @@ static void rtw_unset_supported_band(struct ieee80211_hw *hw, kfree(hw->wiphy->bands[NL80211_BAND_5GHZ]); } +static void __update_firmware_info(struct rtw_dev *rtwdev, + struct rtw_fw_state *fw) +{ + const struct rtw_fw_hdr *fw_hdr = + (const struct rtw_fw_hdr *)fw->firmware->data; + + fw->h2c_version = le16_to_cpu(fw_hdr->h2c_fmt_ver); + fw->version = le16_to_cpu(fw_hdr->version); + fw->sub_version = fw_hdr->subversion; + fw->sub_index = fw_hdr->subindex; +} + +static void __update_firmware_info_legacy(struct rtw_dev *rtwdev, + struct rtw_fw_state *fw) +{ + struct rtw_fw_hdr_legacy *legacy = + (struct rtw_fw_hdr_legacy *)fw->firmware->data; + + fw->h2c_version = 0; + fw->version = le16_to_cpu(legacy->version); + fw->sub_version = legacy->subversion1; + fw->sub_index = legacy->subversion2; +} + +static void update_firmware_info(struct rtw_dev *rtwdev, + struct rtw_fw_state *fw) +{ + if (rtw_chip_wcpu_11n(rtwdev)) + __update_firmware_info_legacy(rtwdev, fw); + else + __update_firmware_info(rtwdev, fw); +} + static void rtw_load_firmware_cb(const struct firmware *firmware, void *context) { struct rtw_fw_state *fw = context; struct rtw_dev *rtwdev = fw->rtwdev; - const struct rtw_fw_hdr *fw_hdr; if (!firmware || !firmware->data) { rtw_err(rtwdev, "failed to request firmware\n"); @@ -1054,13 +1086,8 @@ static void rtw_load_firmware_cb(const struct firmware *firmware, void *context) return; } - fw_hdr = (const struct rtw_fw_hdr *)firmware->data; - fw->h2c_version = le16_to_cpu(fw_hdr->h2c_fmt_ver); - fw->version = le16_to_cpu(fw_hdr->version); - fw->sub_version = fw_hdr->subversion; - fw->sub_index = fw_hdr->subindex; - fw->firmware = firmware; + update_firmware_info(rtwdev, fw); complete_all(&fw->completion); rtw_info(rtwdev, "Firmware version %u.%u.%u, H2C version %u\n", diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index 74302181da53..380a670eeeee 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -1056,12 +1056,18 @@ struct rtw_pwr_track_tbl { const u8 *pwrtrk_2g_ccka_p; }; +enum rtw_wlan_cpu { + RTW_WCPU_11AC, + RTW_WCPU_11N, +}; + /* hardware configuration for each IC */ struct rtw_chip_info { struct rtw_chip_ops *ops; u8 id; const char *fw_name; + enum rtw_wlan_cpu wlan_cpu; u8 tx_pkt_desc_sz; u8 tx_buf_desc_sz; u8 rx_pkt_desc_sz; @@ -1725,6 +1731,16 @@ static inline void rtw_chip_efuse_grant_off(struct rtw_dev *rtwdev) rtwdev->chip->ops->efuse_grant(rtwdev, false); } +static inline bool rtw_chip_wcpu_11n(struct rtw_dev *rtwdev) +{ + return rtwdev->chip->wlan_cpu == RTW_WCPU_11N; +} + +static inline bool rtw_chip_wcpu_11ac(struct rtw_dev *rtwdev) +{ + return rtwdev->chip->wlan_cpu == RTW_WCPU_11AC; +} + void rtw_get_channel_params(struct cfg80211_chan_def *chandef, struct rtw_channel_params *ch_param); bool check_hw_ready(struct rtw_dev *rtwdev, u32 addr, u32 mask, u32 target); diff --git a/drivers/net/wireless/realtek/rtw88/reg.h b/drivers/net/wireless/realtek/rtw88/reg.h index 911d8e75db77..89868ac0748f 100644 --- a/drivers/net/wireless/realtek/rtw88/reg.h +++ b/drivers/net/wireless/realtek/rtw88/reg.h @@ -77,19 +77,28 @@ #define BIT_ANA_PORT_EN BIT(22) #define BIT_MAC_PORT_EN BIT(21) #define BIT_BOOT_FSPI_EN BIT(20) +#define BIT_ROM_DLEN BIT(19) +#define BIT_ROM_PGE GENMASK(18, 16) /* legacy only */ +#define BIT_SHIFT_ROM_PGE 16 #define BIT_FW_INIT_RDY BIT(15) #define BIT_FW_DW_RDY BIT(14) #define BIT_RPWM_TOGGLE BIT(7) +#define BIT_RAM_DL_SEL BIT(7) /* legacy only */ #define BIT_DMEM_CHKSUM_OK BIT(6) +#define BIT_WINTINI_RDY BIT(6) /* legacy only */ #define BIT_DMEM_DW_OK BIT(5) #define BIT_IMEM_CHKSUM_OK BIT(4) #define BIT_IMEM_DW_OK BIT(3) #define BIT_IMEM_BOOT_LOAD_CHECKSUM_OK BIT(2) +#define BIT_FWDL_CHK_RPT BIT(2) /* legacy only */ +#define BIT_MCUFWDL_RDY BIT(1) /* legacy only */ #define BIT_MCUFWDL_EN BIT(0) #define BIT_CHECK_SUM_OK (BIT(4) | BIT(6)) #define FW_READY (BIT_FW_INIT_RDY | BIT_FW_DW_RDY | \ BIT_IMEM_DW_OK | BIT_DMEM_DW_OK | \ BIT_CHECK_SUM_OK) +#define FW_READY_LEGACY (BIT_MCUFWDL_RDY | BIT_FWDL_CHK_RPT | \ + BIT_WINTINI_RDY | BIT_RAM_DL_SEL) #define FW_READY_MASK 0xffff #define REG_EFUSE_ACCESS 0x00CF @@ -197,6 +206,8 @@ #define BIT_MASK_BCN_HEAD_1_V1 0xfff #define REG_AUTO_LLT_V1 0x0208 #define BIT_AUTO_INIT_LLT_V1 BIT(0) +#define REG_DWBCN0_CTRL 0x0208 +#define BIT_BCN_VALID BIT(16) #define REG_TXDMA_OFFSET_CHK 0x020C #define REG_TXDMA_STATUS 0x0210 #define BTI_PAGE_OVF BIT(2) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index c25cabbab64d..5e8e0dd6456e 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -511,6 +511,7 @@ struct rtw_chip_info rtw8723d_hw_spec = { .ops = &rtw8723d_ops, .id = RTW_CHIP_TYPE_8723D, .fw_name = "rtw88/rtw8723d_fw.bin", + .wlan_cpu = RTW_WCPU_11N, .tx_pkt_desc_sz = 40, .tx_buf_desc_sz = 16, .rx_pkt_desc_sz = 24, diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822b.c b/drivers/net/wireless/realtek/rtw88/rtw8822b.c index 9a2e18e7624f..ffee8111d145 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822b.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822b.c @@ -2408,6 +2408,7 @@ struct rtw_chip_info rtw8822b_hw_spec = { .ops = &rtw8822b_ops, .id = RTW_CHIP_TYPE_8822B, .fw_name = "rtw88/rtw8822b_fw.bin", + .wlan_cpu = RTW_WCPU_11AC, .tx_pkt_desc_sz = 48, .tx_buf_desc_sz = 16, .rx_pkt_desc_sz = 24, diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.c b/drivers/net/wireless/realtek/rtw88/rtw8822c.c index ee0d39135617..8dd92136145d 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822c.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.c @@ -4269,6 +4269,7 @@ struct rtw_chip_info rtw8822c_hw_spec = { .ops = &rtw8822c_ops, .id = RTW_CHIP_TYPE_8822C, .fw_name = "rtw88/rtw8822c_fw.bin", + .wlan_cpu = RTW_WCPU_11AC, .tx_pkt_desc_sz = 48, .tx_buf_desc_sz = 16, .rx_pkt_desc_sz = 24, -- cgit v1.2.3-59-g8ed1b From e5f57ad06adec1dcbfe69f45792a8b9dd4798664 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 22 Apr 2020 11:46:01 +0800 Subject: rtw88: no need to send additional information to legacy firmware The firmware of 11AC devices need more information to support more offload functions, such as IQK. And 11N devices such as 8723D does not support offload these function in firmware, there is no need to send these additional information to firmware when it comes to 11N devices. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200422034607.28747-3-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/fw.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/fw.c b/drivers/net/wireless/realtek/rtw88/fw.c index 209853fdcb42..dde7823143ea 100644 --- a/drivers/net/wireless/realtek/rtw88/fw.c +++ b/drivers/net/wireless/realtek/rtw88/fw.c @@ -271,6 +271,9 @@ rtw_fw_send_general_info(struct rtw_dev *rtwdev) u8 h2c_pkt[H2C_PKT_SIZE] = {0}; u16 total_size = H2C_PKT_HDR_SIZE + 4; + if (rtw_chip_wcpu_11n(rtwdev)) + return; + rtw_h2c_pkt_set_header(h2c_pkt, H2C_PKT_GENERAL_INFO); SET_PKT_H2C_TOTAL_LEN(h2c_pkt, total_size); @@ -291,6 +294,9 @@ rtw_fw_send_phydm_info(struct rtw_dev *rtwdev) u16 total_size = H2C_PKT_HDR_SIZE + 8; u8 fw_rf_type = 0; + if (rtw_chip_wcpu_11n(rtwdev)) + return; + if (hal->rf_type == RF_1T1R) fw_rf_type = FW_RF_1T1R; else if (hal->rf_type == RF_2T2R) -- cgit v1.2.3-59-g8ed1b From 4e223a5f5342fab01ccebf87714401f559dcc791 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 22 Apr 2020 11:46:02 +0800 Subject: rtw88: 8723d: Add mac power-on/-off function The mac power-on flow consists of three steps: 1. pre_sys_cfg (Before switching power state) 2. power_switch (Switching power state) 3. init_sys_cfg (Settings after swtiching power state) When switching power state, driver will load and parse the power sequence tables. For 8723D devices, the logics for parsing are most same except for the polling function. 8723D devices need to toggle BIT_PFM_WOWL twice. The settings after power state is switched for 8723D devices are quite different with other devices, extract a legacy function for them. For power-off flow, 8723D devices have the same logic with existing chips. But warning printed if we run power-off sequence in power-off state: rtw_pci 0000:03:00.0: failed to poll offset=0x5f8 mask=0xff value=0x0 The scenario is user do 'ifconfig up' that will run power-on sequence to bring up and then run power-off sequence to enter idle (IEEE80211_CONF_IDLE). Then, user do 'ifconfig down' that will run power-off sequence again, and the warning is shown. Original code check power-on state to avoid to run power-on sequence twice, and this commit extends to check both power-on and power-off states. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200422034607.28747-4-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/mac.c | 60 +++++++++++++++++++++++++++----- drivers/net/wireless/realtek/rtw88/reg.h | 10 ++++++ 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/mac.c b/drivers/net/wireless/realtek/rtw88/mac.c index 6092604abfb9..21b5c7173f0f 100644 --- a/drivers/net/wireless/realtek/rtw88/mac.c +++ b/drivers/net/wireless/realtek/rtw88/mac.c @@ -61,6 +61,14 @@ static int rtw_mac_pre_system_cfg(struct rtw_dev *rtwdev) rtw_write8(rtwdev, REG_RSV_CTRL, 0); + if (rtw_chip_wcpu_11n(rtwdev)) { + if (rtw_read32(rtwdev, REG_SYS_CFG1) & BIT_LDO) + rtw_write8(rtwdev, REG_LDO_SWR_CTRL, LDO_SEL); + else + rtw_write8(rtwdev, REG_LDO_SWR_CTRL, SPS_SEL); + return 0; + } + switch (rtw_hci_type(rtwdev)) { case RTW_HCI_TYPE_PCIE: rtw_write32_set(rtwdev, REG_HCI_OPT_CTRL, BIT_BT_DIG_CLK_EN); @@ -123,10 +131,19 @@ static int rtw_pwr_cmd_polling(struct rtw_dev *rtwdev, if (rtw_hci_type(rtwdev) == RTW_HCI_TYPE_PCIE && flag == 0) { value = rtw_read8(rtwdev, REG_SYS_PW_CTRL); - value |= BIT(3); + if (rtwdev->chip->id == RTW_CHIP_TYPE_8723D) { + value &= ~BIT_PFM_WOWL; + rtw_write8(rtwdev, REG_SYS_PW_CTRL, value); + } + value |= BIT_PFM_WOWL; rtw_write8(rtwdev, REG_SYS_PW_CTRL, value); - value &= ~BIT(3); + value &= ~BIT_PFM_WOWL; rtw_write8(rtwdev, REG_SYS_PW_CTRL, value); + if (rtwdev->chip->id == RTW_CHIP_TYPE_8723D) { + value |= BIT_PFM_WOWL; + rtw_write8(rtwdev, REG_SYS_PW_CTRL, value); + } + cnt = RTW_PWR_POLLING_CNT; flag = 1; } else { @@ -228,12 +245,14 @@ static int rtw_mac_power_switch(struct rtw_dev *rtwdev, bool pwr_on) u8 rpwm; bool cur_pwr; - rpwm = rtw_read8(rtwdev, rtwdev->hci.rpwm_addr); + if (rtw_chip_wcpu_11ac(rtwdev)) { + rpwm = rtw_read8(rtwdev, rtwdev->hci.rpwm_addr); - /* Check FW still exist or not */ - if (rtw_read16(rtwdev, REG_MCUFW_CTRL) == 0xC078) { - rpwm = (rpwm ^ BIT_RPWM_TOGGLE) & BIT_RPWM_TOGGLE; - rtw_write8(rtwdev, rtwdev->hci.rpwm_addr, rpwm); + /* Check FW still exist or not */ + if (rtw_read16(rtwdev, REG_MCUFW_CTRL) == 0xC078) { + rpwm = (rpwm ^ BIT_RPWM_TOGGLE) & BIT_RPWM_TOGGLE; + rtw_write8(rtwdev, rtwdev->hci.rpwm_addr, rpwm); + } } if (rtw_read8(rtwdev, REG_CR) == 0xea) @@ -244,7 +263,7 @@ static int rtw_mac_power_switch(struct rtw_dev *rtwdev, bool pwr_on) else cur_pwr = true; - if (pwr_on && cur_pwr) + if (pwr_on == cur_pwr) return -EALREADY; pwr_seq = pwr_on ? chip->pwr_on_seq : chip->pwr_off_seq; @@ -254,7 +273,7 @@ static int rtw_mac_power_switch(struct rtw_dev *rtwdev, bool pwr_on) return 0; } -static int rtw_mac_init_system_cfg(struct rtw_dev *rtwdev) +static int __rtw_mac_init_system_cfg(struct rtw_dev *rtwdev) { u8 sys_func_en = rtwdev->chip->sys_func_en; u8 value8; @@ -279,6 +298,29 @@ static int rtw_mac_init_system_cfg(struct rtw_dev *rtwdev) return 0; } +static int __rtw_mac_init_system_cfg_legacy(struct rtw_dev *rtwdev) +{ + rtw_write8(rtwdev, REG_CR, 0xff); + mdelay(2); + rtw_write8(rtwdev, REG_HWSEQ_CTRL, 0x7f); + mdelay(2); + + rtw_write8_set(rtwdev, REG_SYS_CLKR, BIT_WAKEPAD_EN); + rtw_write16_clr(rtwdev, REG_GPIO_MUXCFG, BIT_EN_SIC); + + rtw_write16(rtwdev, REG_CR, 0x2ff); + + return 0; +} + +static int rtw_mac_init_system_cfg(struct rtw_dev *rtwdev) +{ + if (rtw_chip_wcpu_11n(rtwdev)) + return __rtw_mac_init_system_cfg_legacy(rtwdev); + + return __rtw_mac_init_system_cfg(rtwdev); +} + int rtw_mac_power_on(struct rtw_dev *rtwdev) { int ret = 0; diff --git a/drivers/net/wireless/realtek/rtw88/reg.h b/drivers/net/wireless/realtek/rtw88/reg.h index 89868ac0748f..c1e66d656307 100644 --- a/drivers/net/wireless/realtek/rtw88/reg.h +++ b/drivers/net/wireless/realtek/rtw88/reg.h @@ -13,11 +13,13 @@ #define BIT_R_DIS_PRST BIT(6) #define BIT_WLOCK_1C_B6 BIT(5) #define REG_SYS_PW_CTRL 0x0004 +#define BIT_PFM_WOWL BIT(3) #define REG_SYS_CLK_CTRL 0x0008 #define BIT_CPU_CLK_EN BIT(14) #define REG_SYS_CLKR 0x0008 #define BIT_ANA8M BIT(1) +#define BIT_WAKEPAD_EN BIT(3) #define BIT_LOADER_CLK_EN BIT(5) #define REG_RSV_CTRL 0x001C @@ -49,6 +51,7 @@ #define REG_GPIO_MUXCFG 0x0040 #define BIT_FSPI_EN BIT(19) +#define BIT_EN_SIC BIT(12) #define BIT_BT_AOD_GPIO3 BIT(9) #define BIT_BT_PTA_EN BIT(5) #define BIT_WLRFE_4_5_EN BIT(2) @@ -73,6 +76,10 @@ #define BIT_LTE_MUX_CTRL_PATH BIT(26) #define REG_HCI_OPT_CTRL 0x0074 +#define REG_LDO_SWR_CTRL 0x007C +#define LDO_SEL 0xC3 +#define SPS_SEL 0x83 + #define REG_MCUFW_CTRL 0x0080 #define BIT_ANA_PORT_EN BIT(22) #define BIT_MAC_PORT_EN BIT(21) @@ -110,6 +117,7 @@ #define BIT_BT_INT_EN BIT(15) #define REG_SYS_CFG1 0x00F0 #define BIT_RTL_ID BIT(23) +#define BIT_LDO BIT(24) #define BIT_RF_TYPE_ID BIT(27) #define BIT_SHIFT_VENDOR_ID 16 #define BIT_MASK_VENDOR_ID 0xf @@ -238,6 +246,8 @@ #define REG_FWHW_TXQ_CTRL 0x0420 #define BIT_EN_BCNQ_DL BIT(22) #define BIT_EN_WR_FREE_TAIL BIT(20) +#define REG_HWSEQ_CTRL 0x0423 + #define REG_BCNQ_BDNY_V1 0x0424 #define REG_LIFETIME_EN 0x0426 #define BIT_BA_PARSER_EN BIT(5) -- cgit v1.2.3-59-g8ed1b From fd9ead385102652b43f628ca700810d343c52437 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 22 Apr 2020 11:46:03 +0800 Subject: rtw88: decompose while(1) loop of power sequence polling command The power polling command is one kind of power sequence commands. It's used to check hardware situation, and subsequent comamnds will be executed if hardware is ready. A special case is PCIE must toggle BIT_PFM_WOWL and try again if first try is failed. In order to reduce indentation to understand the code easier, move polling part to a separate function. Then, the 'while (1)...loop' is replaced by two statements to do first try and retry. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200422034607.28747-5-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/mac.c | 72 +++++++++++++++++--------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/mac.c b/drivers/net/wireless/realtek/rtw88/mac.c index 21b5c7173f0f..ac5d35153c8a 100644 --- a/drivers/net/wireless/realtek/rtw88/mac.c +++ b/drivers/net/wireless/realtek/rtw88/mac.c @@ -108,51 +108,55 @@ static int rtw_mac_pre_system_cfg(struct rtw_dev *rtwdev) return 0; } +static bool do_pwr_poll_cmd(struct rtw_dev *rtwdev, u32 addr, u32 mask, u32 target) +{ + u32 cnt; + + target &= mask; + + for (cnt = 0; cnt < RTW_PWR_POLLING_CNT; cnt++) { + if ((rtw_read8(rtwdev, addr) & mask) == target) + return true; + + udelay(50); + } + + return false; +} + static int rtw_pwr_cmd_polling(struct rtw_dev *rtwdev, const struct rtw_pwr_seq_cmd *cmd) { u8 value; - u8 flag = 0; u32 offset; - u32 cnt = RTW_PWR_POLLING_CNT; if (cmd->base == RTW_PWR_ADDR_SDIO) offset = cmd->offset | SDIO_LOCAL_OFFSET; else offset = cmd->offset; - do { - cnt--; - value = rtw_read8(rtwdev, offset); - value &= cmd->mask; - if (value == (cmd->value & cmd->mask)) - return 0; - if (cnt == 0) { - if (rtw_hci_type(rtwdev) == RTW_HCI_TYPE_PCIE && - flag == 0) { - value = rtw_read8(rtwdev, REG_SYS_PW_CTRL); - if (rtwdev->chip->id == RTW_CHIP_TYPE_8723D) { - value &= ~BIT_PFM_WOWL; - rtw_write8(rtwdev, REG_SYS_PW_CTRL, value); - } - value |= BIT_PFM_WOWL; - rtw_write8(rtwdev, REG_SYS_PW_CTRL, value); - value &= ~BIT_PFM_WOWL; - rtw_write8(rtwdev, REG_SYS_PW_CTRL, value); - if (rtwdev->chip->id == RTW_CHIP_TYPE_8723D) { - value |= BIT_PFM_WOWL; - rtw_write8(rtwdev, REG_SYS_PW_CTRL, value); - } - - cnt = RTW_PWR_POLLING_CNT; - flag = 1; - } else { - return -EBUSY; - } - } else { - udelay(50); - } - } while (1); + if (do_pwr_poll_cmd(rtwdev, offset, cmd->mask, cmd->value)) + return 0; + + if (rtw_hci_type(rtwdev) != RTW_HCI_TYPE_PCIE) + goto err; + + /* if PCIE, toggle BIT_PFM_WOWL and try again */ + value = rtw_read8(rtwdev, REG_SYS_PW_CTRL); + if (rtwdev->chip->id == RTW_CHIP_TYPE_8723D) + rtw_write8(rtwdev, REG_SYS_PW_CTRL, value & ~BIT_PFM_WOWL); + rtw_write8(rtwdev, REG_SYS_PW_CTRL, value | BIT_PFM_WOWL); + rtw_write8(rtwdev, REG_SYS_PW_CTRL, value & ~BIT_PFM_WOWL); + if (rtwdev->chip->id == RTW_CHIP_TYPE_8723D) + rtw_write8(rtwdev, REG_SYS_PW_CTRL, value | BIT_PFM_WOWL); + + if (do_pwr_poll_cmd(rtwdev, offset, cmd->mask, cmd->value)) + return 0; + +err: + rtw_err(rtwdev, "failed to poll offset=0x%x mask=0x%x value=0x%x\n", + offset, cmd->mask, cmd->value); + return -EBUSY; } static int rtw_sub_pwr_seq_parser(struct rtw_dev *rtwdev, u8 intf_mask, -- cgit v1.2.3-59-g8ed1b From 7907b52de08aeb27cea05bd3a2c825658f91f051 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 22 Apr 2020 11:46:04 +0800 Subject: rtw88: 8723d: 11N chips don't support H2C queue H2C queue is used to send command to firmware. Since 8723D doesn't support this queue, this commit check wlan_cpu flag to avoid to set H2C related registers. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200422034607.28747-6-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/mac.c | 6 +++++- drivers/net/wireless/realtek/rtw88/pci.c | 35 +++++++++++++++++++++----------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/mac.c b/drivers/net/wireless/realtek/rtw88/mac.c index ac5d35153c8a..f4a504b350cf 100644 --- a/drivers/net/wireless/realtek/rtw88/mac.c +++ b/drivers/net/wireless/realtek/rtw88/mac.c @@ -1016,7 +1016,8 @@ static int txdma_queue_mapping(struct rtw_dev *rtwdev) rtw_write8(rtwdev, REG_CR, 0); rtw_write8(rtwdev, REG_CR, MAC_TRX_ENABLE); - rtw_write32(rtwdev, REG_H2CQ_CSR, BIT_H2CQ_FULL); + if (rtw_chip_wcpu_11ac(rtwdev)) + rtw_write32(rtwdev, REG_H2CQ_CSR, BIT_H2CQ_FULL); return 0; } @@ -1135,6 +1136,9 @@ static int init_h2c(struct rtw_dev *rtwdev) u32 h2cq_free; u32 wp, rp; + if (rtw_chip_wcpu_11n(rtwdev)) + return 0; + h2cq_addr = fifo->rsvd_h2cq_addr << TX_PAGE_SIZE_SHIFT; h2cq_size = RSVD_PG_H2CQ_NUM << TX_PAGE_SIZE_SHIFT; diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c index b3e76b579af9..8a8d746d3349 100644 --- a/drivers/net/wireless/realtek/rtw88/pci.c +++ b/drivers/net/wireless/realtek/rtw88/pci.c @@ -411,12 +411,14 @@ static void rtw_pci_reset_buf_desc(struct rtw_dev *rtwdev) dma = rtwpci->tx_rings[RTW_TX_QUEUE_BCN].r.dma; rtw_write32(rtwdev, RTK_PCI_TXBD_DESA_BCNQ, dma); - len = rtwpci->tx_rings[RTW_TX_QUEUE_H2C].r.len; - dma = rtwpci->tx_rings[RTW_TX_QUEUE_H2C].r.dma; - rtwpci->tx_rings[RTW_TX_QUEUE_H2C].r.rp = 0; - rtwpci->tx_rings[RTW_TX_QUEUE_H2C].r.wp = 0; - rtw_write16(rtwdev, RTK_PCI_TXBD_NUM_H2CQ, len & TRX_BD_IDX_MASK); - rtw_write32(rtwdev, RTK_PCI_TXBD_DESA_H2CQ, dma); + if (!rtw_chip_wcpu_11n(rtwdev)) { + len = rtwpci->tx_rings[RTW_TX_QUEUE_H2C].r.len; + dma = rtwpci->tx_rings[RTW_TX_QUEUE_H2C].r.dma; + rtwpci->tx_rings[RTW_TX_QUEUE_H2C].r.rp = 0; + rtwpci->tx_rings[RTW_TX_QUEUE_H2C].r.wp = 0; + rtw_write16(rtwdev, RTK_PCI_TXBD_NUM_H2CQ, len & TRX_BD_IDX_MASK); + rtw_write32(rtwdev, RTK_PCI_TXBD_DESA_H2CQ, dma); + } len = rtwpci->tx_rings[RTW_TX_QUEUE_BK].r.len; dma = rtwpci->tx_rings[RTW_TX_QUEUE_BK].r.dma; @@ -471,8 +473,9 @@ static void rtw_pci_reset_buf_desc(struct rtw_dev *rtwdev) rtw_write32(rtwdev, RTK_PCI_TXBD_RWPTR_CLR, 0xffffffff); /* reset H2C Queue index in a single write */ - rtw_write32_set(rtwdev, RTK_PCI_TXBD_H2CQ_CSR, - BIT_CLR_H2CQ_HOST_IDX | BIT_CLR_H2CQ_HW_IDX); + if (rtw_chip_wcpu_11ac(rtwdev)) + rtw_write32_set(rtwdev, RTK_PCI_TXBD_H2CQ_CSR, + BIT_CLR_H2CQ_HOST_IDX | BIT_CLR_H2CQ_HW_IDX); } static void rtw_pci_reset_trx_ring(struct rtw_dev *rtwdev) @@ -489,7 +492,9 @@ static void rtw_pci_enable_interrupt(struct rtw_dev *rtwdev, rtw_write32(rtwdev, RTK_PCI_HIMR0, rtwpci->irq_mask[0]); rtw_write32(rtwdev, RTK_PCI_HIMR1, rtwpci->irq_mask[1]); - rtw_write32(rtwdev, RTK_PCI_HIMR3, rtwpci->irq_mask[3]); + if (rtw_chip_wcpu_11ac(rtwdev)) + rtw_write32(rtwdev, RTK_PCI_HIMR3, rtwpci->irq_mask[3]); + rtwpci->irq_enabled = true; spin_unlock_irqrestore(&rtwpci->hwirq_lock, flags); @@ -507,7 +512,9 @@ static void rtw_pci_disable_interrupt(struct rtw_dev *rtwdev, rtw_write32(rtwdev, RTK_PCI_HIMR0, 0); rtw_write32(rtwdev, RTK_PCI_HIMR1, 0); - rtw_write32(rtwdev, RTK_PCI_HIMR3, 0); + if (rtw_chip_wcpu_11ac(rtwdev)) + rtw_write32(rtwdev, RTK_PCI_HIMR3, 0); + rtwpci->irq_enabled = false; out: @@ -1012,13 +1019,17 @@ static void rtw_pci_irq_recognized(struct rtw_dev *rtwdev, irq_status[0] = rtw_read32(rtwdev, RTK_PCI_HISR0); irq_status[1] = rtw_read32(rtwdev, RTK_PCI_HISR1); - irq_status[3] = rtw_read32(rtwdev, RTK_PCI_HISR3); + if (rtw_chip_wcpu_11ac(rtwdev)) + irq_status[3] = rtw_read32(rtwdev, RTK_PCI_HISR3); + else + irq_status[3] = 0; irq_status[0] &= rtwpci->irq_mask[0]; irq_status[1] &= rtwpci->irq_mask[1]; irq_status[3] &= rtwpci->irq_mask[3]; rtw_write32(rtwdev, RTK_PCI_HISR0, irq_status[0]); rtw_write32(rtwdev, RTK_PCI_HISR1, irq_status[1]); - rtw_write32(rtwdev, RTK_PCI_HISR3, irq_status[3]); + if (rtw_chip_wcpu_11ac(rtwdev)) + rtw_write32(rtwdev, RTK_PCI_HISR3, irq_status[3]); spin_unlock_irqrestore(&rtwpci->hwirq_lock, flags); } -- cgit v1.2.3-59-g8ed1b From ba9f0d1b8d9debf2e2d83db01d3b8f63fb75d9d5 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 22 Apr 2020 11:46:05 +0800 Subject: rtw88: 8723d: implement set_tx_power_index ops The txagc table is used to map rate_id and txagc register address and mask, and ops set_tx_power_index uses this table to write TX power to corresponding registers. Since 8723D is a 1x1 2.4G 11n chip, only CCK, OFDM and HT_MCS 0-7 are listed in the table. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200422034607.28747-7-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/rtw8723d.c | 61 +++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 5e8e0dd6456e..f2d21272b237 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -14,6 +14,29 @@ #include "reg.h" #include "debug.h" +static const struct rtw_hw_reg rtw8723d_txagc[] = { + [DESC_RATE1M] = { .addr = 0xe08, .mask = 0x0000ff00 }, + [DESC_RATE2M] = { .addr = 0x86c, .mask = 0x0000ff00 }, + [DESC_RATE5_5M] = { .addr = 0x86c, .mask = 0x00ff0000 }, + [DESC_RATE11M] = { .addr = 0x86c, .mask = 0xff000000 }, + [DESC_RATE6M] = { .addr = 0xe00, .mask = 0x000000ff }, + [DESC_RATE9M] = { .addr = 0xe00, .mask = 0x0000ff00 }, + [DESC_RATE12M] = { .addr = 0xe00, .mask = 0x00ff0000 }, + [DESC_RATE18M] = { .addr = 0xe00, .mask = 0xff000000 }, + [DESC_RATE24M] = { .addr = 0xe04, .mask = 0x000000ff }, + [DESC_RATE36M] = { .addr = 0xe04, .mask = 0x0000ff00 }, + [DESC_RATE48M] = { .addr = 0xe04, .mask = 0x00ff0000 }, + [DESC_RATE54M] = { .addr = 0xe04, .mask = 0xff000000 }, + [DESC_RATEMCS0] = { .addr = 0xe10, .mask = 0x000000ff }, + [DESC_RATEMCS1] = { .addr = 0xe10, .mask = 0x0000ff00 }, + [DESC_RATEMCS2] = { .addr = 0xe10, .mask = 0x00ff0000 }, + [DESC_RATEMCS3] = { .addr = 0xe10, .mask = 0xff000000 }, + [DESC_RATEMCS4] = { .addr = 0xe14, .mask = 0x000000ff }, + [DESC_RATEMCS5] = { .addr = 0xe14, .mask = 0x0000ff00 }, + [DESC_RATEMCS6] = { .addr = 0xe14, .mask = 0x00ff0000 }, + [DESC_RATEMCS7] = { .addr = 0xe14, .mask = 0xff000000 }, +}; + static void rtw8723de_efuse_parsing(struct rtw_efuse *efuse, struct rtw8723d_efuse *map) { @@ -70,6 +93,43 @@ static void rtw8723d_cfg_ldo25(struct rtw_dev *rtwdev, bool enable) rtw_write8(rtwdev, REG_LDO_EFUSE_CTRL + 3, ldo_pwr); } +static void +rtw8723d_set_tx_power_index_by_rate(struct rtw_dev *rtwdev, u8 path, u8 rs) +{ + struct rtw_hal *hal = &rtwdev->hal; + const struct rtw_hw_reg *txagc; + u8 rate, pwr_index; + int j; + + for (j = 0; j < rtw_rate_size[rs]; j++) { + rate = rtw_rate_section[rs][j]; + pwr_index = hal->tx_pwr_tbl[path][rate]; + + if (rate >= ARRAY_SIZE(rtw8723d_txagc)) { + rtw_warn(rtwdev, "rate 0x%x isn't supported\n", rate); + continue; + } + txagc = &rtw8723d_txagc[rate]; + if (!txagc->addr) { + rtw_warn(rtwdev, "rate 0x%x isn't defined\n", rate); + continue; + } + + rtw_write32_mask(rtwdev, txagc->addr, txagc->mask, pwr_index); + } +} + +static void rtw8723d_set_tx_power_index(struct rtw_dev *rtwdev) +{ + struct rtw_hal *hal = &rtwdev->hal; + int rs, path; + + for (path = 0; path < hal->rf_path_num; path++) { + for (rs = 0; rs <= RTW_RATE_SECTION_HT_1S; rs++) + rtw8723d_set_tx_power_index_by_rate(rtwdev, path, rs); + } +} + static void rtw8723d_efuse_grant(struct rtw_dev *rtwdev, bool on) { if (on) { @@ -86,6 +146,7 @@ static struct rtw_chip_ops rtw8723d_ops = { .read_efuse = rtw8723d_read_efuse, .read_rf = rtw_phy_read_rf_sipi, .write_rf = rtw_phy_write_rf_reg_sipi, + .set_tx_power_index = rtw8723d_set_tx_power_index, .set_antenna = NULL, .cfg_ldo25 = rtw8723d_cfg_ldo25, .efuse_grant = rtw8723d_efuse_grant, -- cgit v1.2.3-59-g8ed1b From d91277de23310c497212c2a2d2313e126cc3f2b8 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 22 Apr 2020 11:46:06 +0800 Subject: rtw88: 8723d: Organize chip TX/RX FIFO TX FIFO size is 32k and it was divided into 256 pages with 128 bytes. A boundary is used to split pages into two parts, head part is used to store TX packets coming from host, and tail part is reserved for special purposes, such as beacon packet, null data packet and so on. The TX packets coming from host have many categories, such as VO, VI, BE, BK, MG and etc. When going into head part of TX FIFO, they are classified to four priority queue named low, normal, high and extra priority queues. Each priority queue occupies predefined number of page, if a certain priority queue is full, TX packet will store into PUB priority queue. Similarly, RX FIFO is 16k and split into two parts, head part is used to store RX packets, and tail part is 128 bytes and used to store report. Thus, we fill this boundary to register as well. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200422034607.28747-8-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/mac.c | 140 +++++++++++++++++--------- drivers/net/wireless/realtek/rtw88/mac.h | 1 + drivers/net/wireless/realtek/rtw88/reg.h | 28 ++++++ drivers/net/wireless/realtek/rtw88/rtw8723d.c | 31 ++++++ 4 files changed, 154 insertions(+), 46 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/mac.c b/drivers/net/wireless/realtek/rtw88/mac.c index f4a504b350cf..645207a01525 100644 --- a/drivers/net/wireless/realtek/rtw88/mac.c +++ b/drivers/net/wireless/realtek/rtw88/mac.c @@ -1032,13 +1032,16 @@ static int set_trx_fifo_info(struct rtw_dev *rtwdev) /* config rsvd page num */ fifo->rsvd_drv_pg_num = 8; fifo->txff_pg_num = chip->txff_size >> 7; - fifo->rsvd_pg_num = fifo->rsvd_drv_pg_num + - RSVD_PG_H2C_EXTRAINFO_NUM + - RSVD_PG_H2C_STATICINFO_NUM + - RSVD_PG_H2CQ_NUM + - RSVD_PG_CPU_INSTRUCTION_NUM + - RSVD_PG_FW_TXBUF_NUM + - csi_buf_pg_num; + if (rtw_chip_wcpu_11n(rtwdev)) + fifo->rsvd_pg_num = fifo->rsvd_drv_pg_num; + else + fifo->rsvd_pg_num = fifo->rsvd_drv_pg_num + + RSVD_PG_H2C_EXTRAINFO_NUM + + RSVD_PG_H2C_STATICINFO_NUM + + RSVD_PG_H2CQ_NUM + + RSVD_PG_CPU_INSTRUCTION_NUM + + RSVD_PG_FW_TXBUF_NUM + + csi_buf_pg_num; if (fifo->rsvd_pg_num > fifo->txff_pg_num) return -ENOMEM; @@ -1047,18 +1050,20 @@ static int set_trx_fifo_info(struct rtw_dev *rtwdev) fifo->rsvd_boundary = fifo->txff_pg_num - fifo->rsvd_pg_num; cur_pg_addr = fifo->txff_pg_num; - cur_pg_addr -= csi_buf_pg_num; - fifo->rsvd_csibuf_addr = cur_pg_addr; - cur_pg_addr -= RSVD_PG_FW_TXBUF_NUM; - fifo->rsvd_fw_txbuf_addr = cur_pg_addr; - cur_pg_addr -= RSVD_PG_CPU_INSTRUCTION_NUM; - fifo->rsvd_cpu_instr_addr = cur_pg_addr; - cur_pg_addr -= RSVD_PG_H2CQ_NUM; - fifo->rsvd_h2cq_addr = cur_pg_addr; - cur_pg_addr -= RSVD_PG_H2C_STATICINFO_NUM; - fifo->rsvd_h2c_sta_info_addr = cur_pg_addr; - cur_pg_addr -= RSVD_PG_H2C_EXTRAINFO_NUM; - fifo->rsvd_h2c_info_addr = cur_pg_addr; + if (rtw_chip_wcpu_11ac(rtwdev)) { + cur_pg_addr -= csi_buf_pg_num; + fifo->rsvd_csibuf_addr = cur_pg_addr; + cur_pg_addr -= RSVD_PG_FW_TXBUF_NUM; + fifo->rsvd_fw_txbuf_addr = cur_pg_addr; + cur_pg_addr -= RSVD_PG_CPU_INSTRUCTION_NUM; + fifo->rsvd_cpu_instr_addr = cur_pg_addr; + cur_pg_addr -= RSVD_PG_H2CQ_NUM; + fifo->rsvd_h2cq_addr = cur_pg_addr; + cur_pg_addr -= RSVD_PG_H2C_STATICINFO_NUM; + fifo->rsvd_h2c_sta_info_addr = cur_pg_addr; + cur_pg_addr -= RSVD_PG_H2C_EXTRAINFO_NUM; + fifo->rsvd_h2c_info_addr = cur_pg_addr; + } cur_pg_addr -= fifo->rsvd_drv_pg_num; fifo->rsvd_drv_addr = cur_pg_addr; @@ -1070,6 +1075,65 @@ static int set_trx_fifo_info(struct rtw_dev *rtwdev) return 0; } +static int __priority_queue_cfg(struct rtw_dev *rtwdev, + const struct rtw_page_table *pg_tbl, + u16 pubq_num) +{ + struct rtw_fifo_conf *fifo = &rtwdev->fifo; + struct rtw_chip_info *chip = rtwdev->chip; + + rtw_write16(rtwdev, REG_FIFOPAGE_INFO_1, pg_tbl->hq_num); + rtw_write16(rtwdev, REG_FIFOPAGE_INFO_2, pg_tbl->lq_num); + rtw_write16(rtwdev, REG_FIFOPAGE_INFO_3, pg_tbl->nq_num); + rtw_write16(rtwdev, REG_FIFOPAGE_INFO_4, pg_tbl->exq_num); + rtw_write16(rtwdev, REG_FIFOPAGE_INFO_5, pubq_num); + rtw_write32_set(rtwdev, REG_RQPN_CTRL_2, BIT_LD_RQPN); + + rtw_write16(rtwdev, REG_FIFOPAGE_CTRL_2, fifo->rsvd_boundary); + rtw_write8_set(rtwdev, REG_FWHW_TXQ_CTRL + 2, BIT_EN_WR_FREE_TAIL >> 16); + + rtw_write16(rtwdev, REG_BCNQ_BDNY_V1, fifo->rsvd_boundary); + rtw_write16(rtwdev, REG_FIFOPAGE_CTRL_2 + 2, fifo->rsvd_boundary); + rtw_write16(rtwdev, REG_BCNQ1_BDNY_V1, fifo->rsvd_boundary); + rtw_write32(rtwdev, REG_RXFF_BNDY, chip->rxff_size - C2H_PKT_BUF - 1); + rtw_write8_set(rtwdev, REG_AUTO_LLT_V1, BIT_AUTO_INIT_LLT_V1); + + if (!check_hw_ready(rtwdev, REG_AUTO_LLT_V1, BIT_AUTO_INIT_LLT_V1, 0)) + return -EBUSY; + + rtw_write8(rtwdev, REG_CR + 3, 0); + + return 0; +} + +static int __priority_queue_cfg_legacy(struct rtw_dev *rtwdev, + const struct rtw_page_table *pg_tbl, + u16 pubq_num) +{ + struct rtw_fifo_conf *fifo = &rtwdev->fifo; + struct rtw_chip_info *chip = rtwdev->chip; + u32 val32; + + val32 = BIT_RQPN_NE(pg_tbl->nq_num, pg_tbl->exq_num); + rtw_write32(rtwdev, REG_RQPN_NPQ, val32); + val32 = BIT_RQPN_HLP(pg_tbl->hq_num, pg_tbl->lq_num, pubq_num); + rtw_write32(rtwdev, REG_RQPN, val32); + + rtw_write8(rtwdev, REG_TRXFF_BNDY, fifo->rsvd_boundary); + rtw_write16(rtwdev, REG_TRXFF_BNDY + 2, chip->rxff_size - REPORT_BUF - 1); + rtw_write8(rtwdev, REG_DWBCN0_CTRL + 1, fifo->rsvd_boundary); + rtw_write8(rtwdev, REG_BCNQ_BDNY, fifo->rsvd_boundary); + rtw_write8(rtwdev, REG_MGQ_BDNY, fifo->rsvd_boundary); + rtw_write8(rtwdev, REG_WMAC_LBK_BF_HD, fifo->rsvd_boundary); + + rtw_write32_set(rtwdev, REG_AUTO_LLT, BIT_AUTO_INIT_LLT); + + if (!check_hw_ready(rtwdev, REG_AUTO_LLT, BIT_AUTO_INIT_LLT, 0)) + return -EBUSY; + + return 0; +} + static int priority_queue_cfg(struct rtw_dev *rtwdev) { struct rtw_fifo_conf *fifo = &rtwdev->fifo; @@ -1102,28 +1166,10 @@ static int priority_queue_cfg(struct rtw_dev *rtwdev) pubq_num = fifo->acq_pg_num - pg_tbl->hq_num - pg_tbl->lq_num - pg_tbl->nq_num - pg_tbl->exq_num - pg_tbl->gapq_num; - rtw_write16(rtwdev, REG_FIFOPAGE_INFO_1, pg_tbl->hq_num); - rtw_write16(rtwdev, REG_FIFOPAGE_INFO_2, pg_tbl->lq_num); - rtw_write16(rtwdev, REG_FIFOPAGE_INFO_3, pg_tbl->nq_num); - rtw_write16(rtwdev, REG_FIFOPAGE_INFO_4, pg_tbl->exq_num); - rtw_write16(rtwdev, REG_FIFOPAGE_INFO_5, pubq_num); - rtw_write32_set(rtwdev, REG_RQPN_CTRL_2, BIT_LD_RQPN); - - rtw_write16(rtwdev, REG_FIFOPAGE_CTRL_2, fifo->rsvd_boundary); - rtw_write8_set(rtwdev, REG_FWHW_TXQ_CTRL + 2, BIT_EN_WR_FREE_TAIL >> 16); - - rtw_write16(rtwdev, REG_BCNQ_BDNY_V1, fifo->rsvd_boundary); - rtw_write16(rtwdev, REG_FIFOPAGE_CTRL_2 + 2, fifo->rsvd_boundary); - rtw_write16(rtwdev, REG_BCNQ1_BDNY_V1, fifo->rsvd_boundary); - rtw_write32(rtwdev, REG_RXFF_BNDY, chip->rxff_size - C2H_PKT_BUF - 1); - rtw_write8_set(rtwdev, REG_AUTO_LLT_V1, BIT_AUTO_INIT_LLT_V1); - - if (!check_hw_ready(rtwdev, REG_AUTO_LLT_V1, BIT_AUTO_INIT_LLT_V1, 0)) - return -EBUSY; - - rtw_write8(rtwdev, REG_CR + 3, 0); - - return 0; + if (rtw_chip_wcpu_11n(rtwdev)) + return __priority_queue_cfg_legacy(rtwdev, pg_tbl, pubq_num); + else + return __priority_queue_cfg(rtwdev, pg_tbl, pubq_num); } static int init_h2c(struct rtw_dev *rtwdev) @@ -1203,11 +1249,13 @@ static int rtw_drv_info_cfg(struct rtw_dev *rtwdev) u8 value8; rtw_write8(rtwdev, REG_RX_DRVINFO_SZ, PHY_STATUS_SIZE); - value8 = rtw_read8(rtwdev, REG_TRXFF_BNDY + 1); - value8 &= 0xF0; - /* For rxdesc len = 0 issue */ - value8 |= 0xF; - rtw_write8(rtwdev, REG_TRXFF_BNDY + 1, value8); + if (rtw_chip_wcpu_11ac(rtwdev)) { + value8 = rtw_read8(rtwdev, REG_TRXFF_BNDY + 1); + value8 &= 0xF0; + /* For rxdesc len = 0 issue */ + value8 |= 0xF; + rtw_write8(rtwdev, REG_TRXFF_BNDY + 1, value8); + } rtw_write32_set(rtwdev, REG_RCR, BIT_APP_PHYSTS); rtw_write32_clr(rtwdev, REG_WMAC_OPTION_FUNCTION + 4, BIT(8) | BIT(9)); diff --git a/drivers/net/wireless/realtek/rtw88/mac.h b/drivers/net/wireless/realtek/rtw88/mac.h index 592dc830160c..ce64cdf7a565 100644 --- a/drivers/net/wireless/realtek/rtw88/mac.h +++ b/drivers/net/wireless/realtek/rtw88/mac.h @@ -10,6 +10,7 @@ #define SDIO_LOCAL_OFFSET 0x10250000 #define DDMA_POLLING_COUNT 1000 #define C2H_PKT_BUF 256 +#define REPORT_BUF 128 #define PHY_STATUS_SIZE 4 #define ILLEGAL_KEY_GROUP 0xFAAAAA00 diff --git a/drivers/net/wireless/realtek/rtw88/reg.h b/drivers/net/wireless/realtek/rtw88/reg.h index c1e66d656307..00eb6b6a1f5b 100644 --- a/drivers/net/wireless/realtek/rtw88/reg.h +++ b/drivers/net/wireless/realtek/rtw88/reg.h @@ -209,6 +209,19 @@ #define REG_HMEBOX2_EX 0x01F8 #define REG_HMEBOX3_EX 0x01FC +#define REG_RQPN 0x0200 +#define BIT_MASK_HPQ 0xff +#define BIT_SHIFT_HPQ 0 +#define BIT_RQPN_HPQ(x) (((x) & BIT_MASK_HPQ) << BIT_SHIFT_HPQ) +#define BIT_MASK_LPQ 0xff +#define BIT_SHIFT_LPQ 8 +#define BIT_RQPN_LPQ(x) (((x) & BIT_MASK_LPQ) << BIT_SHIFT_LPQ) +#define BIT_MASK_PUBQ 0xff +#define BIT_SHIFT_PUBQ 16 +#define BIT_RQPN_PUBQ(x) (((x) & BIT_MASK_PUBQ) << BIT_SHIFT_PUBQ) +#define BIT_RQPN_HLP(h, l, p) (BIT_LD_RQPN | BIT_RQPN_HPQ(h) | \ + BIT_RQPN_LPQ(l) | BIT_RQPN_PUBQ(p)) + #define REG_FIFOPAGE_CTRL_2 0x0204 #define BIT_BCN_VALID_V1 BIT(15) #define BIT_MASK_BCN_HEAD_1_V1 0xfff @@ -219,6 +232,18 @@ #define REG_TXDMA_OFFSET_CHK 0x020C #define REG_TXDMA_STATUS 0x0210 #define BTI_PAGE_OVF BIT(2) + +#define REG_RQPN_NPQ 0x0214 +#define BIT_MASK_NPQ 0xff +#define BIT_SHIFT_NPQ 0 +#define BIT_MASK_EPQ 0xff +#define BIT_SHIFT_EPQ 16 +#define BIT_RQPN_NPQ(x) (((x) & BIT_MASK_NPQ) << BIT_SHIFT_NPQ) +#define BIT_RQPN_EPQ(x) (((x) & BIT_MASK_EPQ) << BIT_SHIFT_EPQ) +#define BIT_RQPN_NE(n, e) (BIT_RQPN_NPQ(n) | BIT_RQPN_EPQ(e)) + +#define REG_AUTO_LLT 0x0224 +#define BIT_AUTO_INIT_LLT BIT(16) #define REG_RQPN_CTRL_1 0x0228 #define REG_RQPN_CTRL_2 0x022C #define BIT_LD_RQPN BIT(31) @@ -249,6 +274,8 @@ #define REG_HWSEQ_CTRL 0x0423 #define REG_BCNQ_BDNY_V1 0x0424 +#define REG_BCNQ_BDNY 0x0424 +#define REG_MGQ_BDNY 0x0425 #define REG_LIFETIME_EN 0x0426 #define BIT_BA_PARSER_EN BIT(5) #define REG_SPEC_SIFS 0x0428 @@ -264,6 +291,7 @@ #define BIT_CHECK_CCK_EN BIT(7) #define REG_AMPDU_MAX_TIME_V1 0x0455 #define REG_BCNQ1_BDNY_V1 0x0456 +#define REG_WMAC_LBK_BF_HD 0x045D #define REG_TX_HANG_CTRL 0x045E #define BIT_EN_GNT_BT_AWAKE BIT(3) #define BIT_EN_EOF_V1 BIT(2) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index f2d21272b237..c03ed91349e5 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -556,6 +556,32 @@ static const struct rtw_pwr_seq_cmd *card_disable_flow_8723d[] = { NULL }; +static const struct rtw_page_table page_table_8723d[] = { + {12, 2, 2, 0, 1}, + {12, 2, 2, 0, 1}, + {12, 2, 2, 0, 1}, + {12, 2, 2, 0, 1}, + {12, 2, 2, 0, 1}, +}; + +static const struct rtw_rqpn rqpn_table_8723d[] = { + {RTW_DMA_MAPPING_NORMAL, RTW_DMA_MAPPING_NORMAL, + RTW_DMA_MAPPING_LOW, RTW_DMA_MAPPING_LOW, + RTW_DMA_MAPPING_EXTRA, RTW_DMA_MAPPING_HIGH}, + {RTW_DMA_MAPPING_NORMAL, RTW_DMA_MAPPING_NORMAL, + RTW_DMA_MAPPING_LOW, RTW_DMA_MAPPING_LOW, + RTW_DMA_MAPPING_EXTRA, RTW_DMA_MAPPING_HIGH}, + {RTW_DMA_MAPPING_NORMAL, RTW_DMA_MAPPING_NORMAL, + RTW_DMA_MAPPING_NORMAL, RTW_DMA_MAPPING_HIGH, + RTW_DMA_MAPPING_HIGH, RTW_DMA_MAPPING_HIGH}, + {RTW_DMA_MAPPING_NORMAL, RTW_DMA_MAPPING_NORMAL, + RTW_DMA_MAPPING_LOW, RTW_DMA_MAPPING_LOW, + RTW_DMA_MAPPING_HIGH, RTW_DMA_MAPPING_HIGH}, + {RTW_DMA_MAPPING_NORMAL, RTW_DMA_MAPPING_NORMAL, + RTW_DMA_MAPPING_LOW, RTW_DMA_MAPPING_LOW, + RTW_DMA_MAPPING_EXTRA, RTW_DMA_MAPPING_HIGH}, +}; + static const struct rtw_rf_sipi_addr rtw8723d_rf_sipi_addr[] = { [RF_PATH_A] = { .hssi_1 = 0x820, .lssi_read = 0x8a0, .hssi_2 = 0x824, .lssi_read_pi = 0x8b8}, @@ -580,17 +606,22 @@ struct rtw_chip_info rtw8723d_hw_spec = { .phy_efuse_size = 512, .log_efuse_size = 512, .ptct_efuse_size = 96 + 1, + .txff_size = 32768, + .rxff_size = 16384, .txgi_factor = 1, .is_pwr_by_rate_dec = true, .max_power_index = 0x3f, .csi_buf_pg_num = 0, .band = RTW_BAND_2G, + .page_size = 128, .ht_supported = true, .vht_supported = false, .lps_deep_mode_supported = 0, .sys_func_en = 0xFD, .pwr_on_seq = card_enable_flow_8723d, .pwr_off_seq = card_disable_flow_8723d, + .page_table = page_table_8723d, + .rqpn_table = rqpn_table_8723d, .rf_sipi_addr = {0x840, 0x844}, .rf_sipi_read_addr = rtw8723d_rf_sipi_addr, .fix_rf_phy_num = 2, -- cgit v1.2.3-59-g8ed1b From 75e69fb11b40ba1256b14f943c7050682c1f5458 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 22 Apr 2020 11:46:07 +0800 Subject: rtw88: 8723d: initialize mac/bb/rf basic functions Implement rtw_chip_ops::phy_set_param and ::mac_init to initialize mac/bb/rf, and they are used during interface up. The procedure contains power on sequence registers, download firmware, load predefined parameters, mac/bb/rf specific register and etc. Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200422034607.28747-9-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/main.h | 1 + drivers/net/wireless/realtek/rtw88/reg.h | 34 +++++++ drivers/net/wireless/realtek/rtw88/rtw8723d.c | 124 ++++++++++++++++++++++++++ drivers/net/wireless/realtek/rtw88/rtw8723d.h | 3 + 4 files changed, 162 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index 380a670eeeee..157aca641f6d 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -1475,6 +1475,7 @@ struct rtw_efuse { u8 ant_div_cfg; u8 ant_div_type; u8 regd; + u8 afe; u8 lna_type_2g; u8 lna_type_5g; diff --git a/drivers/net/wireless/realtek/rtw88/reg.h b/drivers/net/wireless/realtek/rtw88/reg.h index 00eb6b6a1f5b..9fdfcdc5c5cf 100644 --- a/drivers/net/wireless/realtek/rtw88/reg.h +++ b/drivers/net/wireless/realtek/rtw88/reg.h @@ -6,6 +6,7 @@ #define __RTW_REG_DEF_H__ #define REG_SYS_FUNC_EN 0x0002 +#define BIT_FEN_EN_25_1 BIT(13) #define BIT_FEN_ELDR BIT(12) #define BIT_FEN_CPUEN BIT(2) #define BIT_FEN_BB_GLB_RST BIT(1) @@ -40,6 +41,11 @@ #define BIT_MASK_EF_ADDR 0x3ff #define BIT_MASK_EF_DATA 0xff #define BITS_EF_ADDR (BIT_MASK_EF_ADDR << BIT_SHIFT_EF_ADDR) +#define BITS_PLL 0xf0 + +#define REG_AFE_CTRL3 0x2c +#define BIT_MASK_XTAL 0x00FFF000 +#define BIT_XTAL_GMP_BIT4 BIT(28) #define REG_LDO_EFUSE_CTRL 0x0034 #define BIT_MASK_EFUSE_BANK_SEL (BIT(8) | BIT(9)) @@ -61,6 +67,7 @@ #define BIT_PAPE_SEL_EN BIT(25) #define BIT_DPDT_WL_SEL BIT(24) #define BIT_DPDT_SEL_EN BIT(23) +#define REG_LEDCFG2 0x004E #define REG_PAD_CTRL1 0x0064 #define BIT_PAPE_WLBT_SEL BIT(29) #define BIT_LNAON_WLBT_SEL BIT(28) @@ -76,9 +83,15 @@ #define BIT_LTE_MUX_CTRL_PATH BIT(26) #define REG_HCI_OPT_CTRL 0x0074 +#define REG_AFE_CTRL_4 0x0078 +#define BIT_CK320M_AFE_EN BIT(4) +#define BIT_EN_SYN BIT(15) + #define REG_LDO_SWR_CTRL 0x007C #define LDO_SEL 0xC3 #define SPS_SEL 0x83 +#define BIT_XTA1 BIT(29) +#define BIT_XTA0 BIT(28) #define REG_MCUFW_CTRL 0x0080 #define BIT_ANA_PORT_EN BIT(22) @@ -197,6 +210,7 @@ #define BIT_FS_RXDONE BIT(16) #define REG_PKTBUF_DBG_CTRL 0x0140 #define REG_C2HEVT 0x01A0 +#define REG_MCUTST_1 0x01C0 #define REG_MCUTST_II 0x01C4 #define REG_WOWLAN_WAKE_REASON 0x01C7 #define REG_HMETFR 0x01CC @@ -230,6 +244,7 @@ #define REG_DWBCN0_CTRL 0x0208 #define BIT_BCN_VALID BIT(16) #define REG_TXDMA_OFFSET_CHK 0x020C +#define BIT_DROP_DATA_EN BIT(9) #define REG_TXDMA_STATUS 0x0210 #define BTI_PAGE_OVF BIT(2) @@ -291,6 +306,7 @@ #define BIT_CHECK_CCK_EN BIT(7) #define REG_AMPDU_MAX_TIME_V1 0x0455 #define REG_BCNQ1_BDNY_V1 0x0456 +#define REG_AMPDU_MAX_TIME 0x0456 #define REG_WMAC_LBK_BF_HD 0x045D #define REG_TX_HANG_CTRL 0x045E #define BIT_EN_GNT_BT_AWAKE BIT(3) @@ -306,7 +322,10 @@ #define REG_QUEUE_CTRL 0x04C6 #define BIT_PTA_WL_TX_EN BIT(4) #define BIT_PTA_EDCCA_EN BIT(5) +#define REG_SINGLE_AMPDU_CTRL 0x04C7 +#define BIT_EN_SINGLE_APMDU BIT(7) #define REG_PROT_MODE_CTRL 0x04C8 +#define REG_MAX_AGGR_NUM 0x04CA #define REG_BAR_MODE_CTRL 0x04CC #define REG_PRECNT_CTRL 0x04E5 #define BIT_BTCCA_CTRL (BIT(0) | BIT(1)) @@ -326,6 +345,7 @@ #define BIT_SHIFT_SIFS_OFDM_CTX 8 #define BIT_SHIFT_SIFS_CCK_TRX 16 #define BIT_SHIFT_SIFS_OFDM_TRX 24 +#define REG_AGGR_BREAK_TIME 0x051A #define REG_SLOT 0x051B #define REG_TX_PTCL_CTRL 0x0520 #define BIT_SIFS_BK_EN BIT(12) @@ -337,18 +357,23 @@ #define REG_TBTT_PROHIBIT 0x0540 #define BIT_SHIFT_TBTT_HOLD_TIME_AP 8 #define REG_RD_NAV_NXT 0x0544 +#define REG_NAV_PROT_LEN 0x0546 #define REG_BCN_CTRL 0x0550 #define BIT_DIS_TSF_UDT BIT(4) #define BIT_EN_BCN_FUNCTION BIT(3) +#define BIT_EN_TXBCN_RPT BIT(2) #define REG_BCN_CTRL_CLINT0 0x0551 #define REG_DRVERLYINT 0x0558 #define REG_BCNDMATIM 0x0559 +#define REG_ATIMWND 0x055A #define REG_USTIME_TSF 0x055C #define REG_BCN_MAX_ERR 0x055D #define REG_RXTSF_OFFSET_CCK 0x055E #define REG_MISC_CTRL 0x0577 #define BIT_EN_FREE_CNT BIT(3) #define BIT_DIS_SECOND_CCA (BIT(0) | BIT(1)) +#define REG_HIQ_NO_LMT_EN 0x5A7 +#define BIT_HIQ_NO_LMT_EN_ROOT BIT(0) #define REG_TIMER0_SRC_SEL 0x05B4 #define BIT_TSFT_SEL_TIMER0 (BIT(4) | BIT(5) | BIT(6)) @@ -374,6 +399,7 @@ #define BIT_HTC_LOC_CTRL BIT(14) #define BIT_RPFM_CAM_ENABLE BIT(12) #define BIT_TA_BCN BIT(11) +#define BIT_RCR_ADF BIT(11) #define BIT_DISDECMYPKT BIT(10) #define BIT_AICV BIT(9) #define BIT_ACRC32 BIT(8) @@ -391,6 +417,7 @@ #define REG_MAR 0x0620 #define REG_USTIME_EDCA 0x0638 #define REG_ACKTO_CCK 0x0639 +#define REG_MAC_SPEC_SIFS 0x063A #define REG_RESP_SIFS_CCK 0x063C #define REG_RESP_SIFS_OFDM 0x063E #define REG_ACKTO 0x0640 @@ -433,12 +460,19 @@ #define BIT_LTE_COEX_EN BIT(7) #define REG_BT_STAT_CTRL 0x0778 #define REG_BT_TDMA_TIME 0x0790 +#define REG_LTR_IDLE_LATENCY 0x0798 +#define REG_LTR_ACTIVE_LATENCY 0x079C +#define REG_LTR_CTRL_BASIC 0x07A4 #define REG_WMAC_OPTION_FUNCTION 0x07D0 #define REG_WMAC_OPTION_FUNCTION_1 0x07D4 +#define REG_FPGA0_RFMOD 0x0800 +#define BIT_CCKEN BIT(24) +#define BIT_OFDMEN BIT(25) #define REG_RX_GAIN_EN 0x081c #define REG_RFE_CTRL_E 0x0974 +#define REG_2ND_CCA_CTRL 0x0976 #define REG_DIS_DPD 0x0a70 #define DIS_DPD_MASK GENMASK(9, 0) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index c03ed91349e5..8ca4d5794434 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -37,6 +37,98 @@ static const struct rtw_hw_reg rtw8723d_txagc[] = { [DESC_RATEMCS7] = { .addr = 0xe14, .mask = 0xff000000 }, }; +#define WLAN_TXQ_RPT_EN 0x1F +#define WLAN_SLOT_TIME 0x09 +#define WLAN_RL_VAL 0x3030 +#define WLAN_BAR_VAL 0x0201ffff +#define BIT_MASK_TBTT_HOLD 0x00000fff +#define BIT_SHIFT_TBTT_HOLD 8 +#define BIT_MASK_TBTT_SETUP 0x000000ff +#define BIT_SHIFT_TBTT_SETUP 0 +#define BIT_MASK_TBTT_MASK ((BIT_MASK_TBTT_HOLD << BIT_SHIFT_TBTT_HOLD) | \ + (BIT_MASK_TBTT_SETUP << BIT_SHIFT_TBTT_SETUP)) +#define TBTT_TIME(s, h)((((s) & BIT_MASK_TBTT_SETUP) << BIT_SHIFT_TBTT_SETUP) |\ + (((h) & BIT_MASK_TBTT_HOLD) << BIT_SHIFT_TBTT_HOLD)) +#define WLAN_TBTT_TIME_NORMAL TBTT_TIME(0x04, 0x80) +#define WLAN_TBTT_TIME_STOP_BCN TBTT_TIME(0x04, 0x64) +#define WLAN_PIFS_VAL 0 +#define WLAN_AGG_BRK_TIME 0x16 +#define WLAN_NAV_PROT_LEN 0x0040 +#define WLAN_SPEC_SIFS 0x100a +#define WLAN_RX_PKT_LIMIT 0x17 +#define WLAN_MAX_AGG_NR 0x0A +#define WLAN_AMPDU_MAX_TIME 0x1C +#define WLAN_ANT_SEL 0x82 +#define WLAN_LTR_IDLE_LAT 0x883C883C +#define WLAN_LTR_ACT_LAT 0x880B880B +#define WLAN_LTR_CTRL1 0xCB004010 +#define WLAN_LTR_CTRL2 0x01233425 + +static void rtw8723d_phy_set_param(struct rtw_dev *rtwdev) +{ + u8 xtal_cap; + u32 val32; + + /* power on BB/RF domain */ + rtw_write16_set(rtwdev, REG_SYS_FUNC_EN, + BIT_FEN_EN_25_1 | BIT_FEN_BB_GLB_RST | BIT_FEN_BB_RSTB); + rtw_write8_set(rtwdev, REG_RF_CTRL, + BIT_RF_EN | BIT_RF_RSTB | BIT_RF_SDM_RSTB); + rtw_write8(rtwdev, REG_AFE_CTRL1 + 1, 0x80); + + rtw_phy_load_tables(rtwdev); + + /* post init after header files config */ + rtw_write32_clr(rtwdev, REG_RCR, BIT_RCR_ADF); + rtw_write8_set(rtwdev, REG_HIQ_NO_LMT_EN, BIT_HIQ_NO_LMT_EN_ROOT); + rtw_write16_set(rtwdev, REG_AFE_CTRL_4, BIT_CK320M_AFE_EN | BIT_EN_SYN); + + xtal_cap = rtwdev->efuse.crystal_cap & 0x3F; + rtw_write32_mask(rtwdev, REG_AFE_CTRL3, BIT_MASK_XTAL, + xtal_cap | (xtal_cap << 6)); + rtw_write32_set(rtwdev, REG_FPGA0_RFMOD, BIT_CCKEN | BIT_OFDMEN); + if ((rtwdev->efuse.afe >> 4) == 14) { + rtw_write32_set(rtwdev, REG_AFE_CTRL3, BIT_XTAL_GMP_BIT4); + rtw_write32_clr(rtwdev, REG_AFE_CTRL1, BITS_PLL); + rtw_write32_set(rtwdev, REG_LDO_SWR_CTRL, BIT_XTA1); + rtw_write32_clr(rtwdev, REG_LDO_SWR_CTRL, BIT_XTA0); + } + + rtw_write8(rtwdev, REG_SLOT, WLAN_SLOT_TIME); + rtw_write8(rtwdev, REG_FWHW_TXQ_CTRL + 1, WLAN_TXQ_RPT_EN); + rtw_write16(rtwdev, REG_RETRY_LIMIT, WLAN_RL_VAL); + rtw_write32(rtwdev, REG_BAR_MODE_CTRL, WLAN_BAR_VAL); + rtw_write8(rtwdev, REG_ATIMWND, 0x2); + rtw_write8(rtwdev, REG_BCN_CTRL, + BIT_DIS_TSF_UDT | BIT_EN_BCN_FUNCTION | BIT_EN_TXBCN_RPT); + val32 = rtw_read32(rtwdev, REG_TBTT_PROHIBIT); + val32 &= ~BIT_MASK_TBTT_MASK; + val32 |= WLAN_TBTT_TIME_STOP_BCN; + rtw_write8(rtwdev, REG_TBTT_PROHIBIT, val32); + rtw_write8(rtwdev, REG_PIFS, WLAN_PIFS_VAL); + rtw_write8(rtwdev, REG_AGGR_BREAK_TIME, WLAN_AGG_BRK_TIME); + rtw_write16(rtwdev, REG_NAV_PROT_LEN, WLAN_NAV_PROT_LEN); + rtw_write16(rtwdev, REG_MAC_SPEC_SIFS, WLAN_SPEC_SIFS); + rtw_write16(rtwdev, REG_SIFS, WLAN_SPEC_SIFS); + rtw_write16(rtwdev, REG_SIFS + 2, WLAN_SPEC_SIFS); + rtw_write8(rtwdev, REG_SINGLE_AMPDU_CTRL, BIT_EN_SINGLE_APMDU); + rtw_write8(rtwdev, REG_RX_PKT_LIMIT, WLAN_RX_PKT_LIMIT); + rtw_write8(rtwdev, REG_MAX_AGGR_NUM, WLAN_MAX_AGG_NR); + rtw_write8(rtwdev, REG_AMPDU_MAX_TIME, WLAN_AMPDU_MAX_TIME); + rtw_write8(rtwdev, REG_LEDCFG2, WLAN_ANT_SEL); + + rtw_write32(rtwdev, REG_LTR_IDLE_LATENCY, WLAN_LTR_IDLE_LAT); + rtw_write32(rtwdev, REG_LTR_ACTIVE_LATENCY, WLAN_LTR_ACT_LAT); + rtw_write32(rtwdev, REG_LTR_CTRL_BASIC, WLAN_LTR_CTRL1); + rtw_write32(rtwdev, REG_LTR_CTRL_BASIC + 4, WLAN_LTR_CTRL2); + + rtw_phy_init(rtwdev); + + rtw_write16_set(rtwdev, REG_TXDMA_OFFSET_CHK, BIT_DROP_DATA_EN); + rtw_write32_mask(rtwdev, REG_OFDM0_XAAGC1, MASKBYTE0, 0x50); + rtw_write32_mask(rtwdev, REG_OFDM0_XAAGC1, MASKBYTE0, 0x20); +} + static void rtw8723de_efuse_parsing(struct rtw_efuse *efuse, struct rtw8723d_efuse *map) { @@ -63,6 +155,7 @@ static int rtw8723d_read_efuse(struct rtw_dev *rtwdev, u8 *log_map) efuse->regd = map->rf_board_option & 0x7; efuse->thermal_meter[0] = map->thermal_meter; efuse->thermal_meter_k = map->thermal_meter; + efuse->afe = map->afe; for (i = 0; i < 4; i++) efuse->txpwr_idx_table[i] = map->txpwr_idx_table[i]; @@ -79,6 +172,35 @@ static int rtw8723d_read_efuse(struct rtw_dev *rtwdev, u8 *log_map) return 0; } +#define BIT_CFENDFORM BIT(9) +#define BIT_WMAC_TCR_ERR0 BIT(12) +#define BIT_WMAC_TCR_ERR1 BIT(13) +#define BIT_TCR_CFG (BIT_CFENDFORM | BIT_WMAC_TCR_ERR0 | \ + BIT_WMAC_TCR_ERR1) +#define WLAN_RX_FILTER0 0xFFFF +#define WLAN_RX_FILTER1 0x400 +#define WLAN_RX_FILTER2 0xFFFF +#define WLAN_RCR_CFG 0x700060CE + +static int rtw8723d_mac_init(struct rtw_dev *rtwdev) +{ + rtw_write8(rtwdev, REG_FWHW_TXQ_CTRL + 1, WLAN_TXQ_RPT_EN); + rtw_write32(rtwdev, REG_TCR, BIT_TCR_CFG); + + rtw_write16(rtwdev, REG_RXFLTMAP0, WLAN_RX_FILTER0); + rtw_write16(rtwdev, REG_RXFLTMAP1, WLAN_RX_FILTER1); + rtw_write16(rtwdev, REG_RXFLTMAP2, WLAN_RX_FILTER2); + rtw_write32(rtwdev, REG_RCR, WLAN_RCR_CFG); + + rtw_write32(rtwdev, REG_INT_MIG, 0); + rtw_write32(rtwdev, REG_MCUTST_1, 0x0); + + rtw_write8(rtwdev, REG_MISC_CTRL, BIT_DIS_SECOND_CCA); + rtw_write8(rtwdev, REG_2ND_CCA_CTRL, 0); + + return 0; +} + static void rtw8723d_cfg_ldo25(struct rtw_dev *rtwdev, bool enable) { u8 ldo_pwr; @@ -143,7 +265,9 @@ static void rtw8723d_efuse_grant(struct rtw_dev *rtwdev, bool on) } static struct rtw_chip_ops rtw8723d_ops = { + .phy_set_param = rtw8723d_phy_set_param, .read_efuse = rtw8723d_read_efuse, + .mac_init = rtw8723d_mac_init, .read_rf = rtw_phy_read_rf_sipi, .write_rf = rtw_phy_write_rf_reg_sipi, .set_tx_power_index = rtw8723d_set_tx_power_index, diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.h b/drivers/net/wireless/realtek/rtw88/rtw8723d.h index 1939d9897a26..6321dea83519 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.h +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.h @@ -44,4 +44,7 @@ struct rtw8723d_efuse { struct rtw8723de_efuse e; }; +#define REG_OFDM0_XAAGC1 0x0c50 +#define REG_OFDM0_XBAGC1 0x0c58 + #endif -- cgit v1.2.3-59-g8ed1b From aac392d8553f3fcc8dd42fc8f7af8eb0593ce9ca Mon Sep 17 00:00:00 2001 From: Maharaja Kennadyrajan Date: Wed, 22 Apr 2020 00:28:32 +0530 Subject: ath10k: Fix the invalid tx/rx chainmask configuration The driver is allowing the invalid tx/rx chainmask configuration (other than 1,3,7,15) set by the user. It causes the firmware crash due to the invalid chainmask values. Hence, reject the invalid chainmask values in the driver by not sending the pdev set command to the firmware. Tested hardware: QCA9888 Tested firmware: 10.4-3.10-00047 Signed-off-by: Maharaja Kennadyrajan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1587495512-29813-1-git-send-email-mkenna@codeaurora.org --- drivers/net/wireless/ath/ath10k/mac.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 6791c0035be0..5de7910c24e7 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -4529,17 +4529,18 @@ static int ath10k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) return 0; } -static void ath10k_check_chain_mask(struct ath10k *ar, u32 cm, const char *dbg) +static bool ath10k_check_chain_mask(struct ath10k *ar, u32 cm, const char *dbg) { /* It is not clear that allowing gaps in chainmask * is helpful. Probably it will not do what user * is hoping for, so warn in that case. */ if (cm == 15 || cm == 7 || cm == 3 || cm == 1 || cm == 0) - return; + return true; - ath10k_warn(ar, "mac %s antenna chainmask may be invalid: 0x%x. Suggested values: 15, 7, 3, 1 or 0.\n", + ath10k_warn(ar, "mac %s antenna chainmask is invalid: 0x%x. Suggested values: 15, 7, 3, 1 or 0.\n", dbg, cm); + return false; } static int ath10k_mac_get_vht_cap_bf_sts(struct ath10k *ar) @@ -4722,11 +4723,15 @@ static void ath10k_mac_setup_ht_vht_cap(struct ath10k *ar) static int __ath10k_set_antenna(struct ath10k *ar, u32 tx_ant, u32 rx_ant) { int ret; + bool is_valid_tx_chain_mask, is_valid_rx_chain_mask; lockdep_assert_held(&ar->conf_mutex); - ath10k_check_chain_mask(ar, tx_ant, "tx"); - ath10k_check_chain_mask(ar, rx_ant, "rx"); + is_valid_tx_chain_mask = ath10k_check_chain_mask(ar, tx_ant, "tx"); + is_valid_rx_chain_mask = ath10k_check_chain_mask(ar, rx_ant, "rx"); + + if (!is_valid_tx_chain_mask || !is_valid_rx_chain_mask) + return -EINVAL; ar->cfg_tx_chainmask = tx_ant; ar->cfg_rx_chainmask = rx_ant; -- cgit v1.2.3-59-g8ed1b From 8347784d6f5fae467e82522029ab1290673c50d6 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Wed, 22 Apr 2020 16:47:19 +0800 Subject: ath10k: drop the TX packet which size exceed credit size for sdio sdio chip use DMA buffer to receive TX packet from ath10k, and it has limitation of each buffer, if the packet size exceed the credit size, it will trigger error in firmware. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00017-QCARMSWP-1. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200422084719.3479-1-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/htc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/htc.c b/drivers/net/wireless/ath/ath10k/htc.c index 58ceba75d20a..31df6dd04bf6 100644 --- a/drivers/net/wireless/ath/ath10k/htc.c +++ b/drivers/net/wireless/ath/ath10k/htc.c @@ -846,6 +846,11 @@ int ath10k_htc_send_hl(struct ath10k_htc *htc, struct ath10k_htc_ep *ep = &htc->endpoint[eid]; struct ath10k *ar = htc->ar; + if (sizeof(struct ath10k_htc_hdr) + skb->len > ep->tx_credit_size) { + ath10k_dbg(ar, ATH10K_DBG_HTC, "tx exceed max len %d\n", skb->len); + return -ENOMEM; + } + ath10k_dbg(ar, ATH10K_DBG_HTC, "htc send hl eid %d bundle %d tx count %d len %d\n", eid, ep->bundle_tx, skb_queue_len(&ep->tx_req_head), skb->len); -- cgit v1.2.3-59-g8ed1b From 5d1c9a114a6efba2c8391e39d4ac3e4e5c7b6d32 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 7 Apr 2020 18:59:51 +0300 Subject: net/mlx5: Update vport.c to new cmd interface Do mass update of vport.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ib_virt.c | 2 +- drivers/infiniband/hw/mlx5/mad.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 142 +++++++++++------------- include/linux/mlx5/vport.h | 3 +- 4 files changed, 71 insertions(+), 80 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c index b61165359954..46b2d370fb3f 100644 --- a/drivers/infiniband/hw/mlx5/ib_virt.c +++ b/drivers/infiniband/hw/mlx5/ib_virt.c @@ -134,7 +134,7 @@ int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, if (!out) return -ENOMEM; - err = mlx5_core_query_vport_counter(mdev, true, vf, port, out, out_sz); + err = mlx5_core_query_vport_counter(mdev, true, vf, port, out); if (err) goto ex; diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index f0ab6d7d8497..454ce5de2de7 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -187,8 +187,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u8 port_num, goto done; } - err = mlx5_core_query_vport_counter(mdev, 0, 0, - mdev_port_num, out_cnt, sz); + err = mlx5_core_query_vport_counter(mdev, 0, 0, mdev_port_num, + out_cnt); if (!err) pma_cnt_ext_assign(pma_cnt_ext, out_cnt); } else { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 23f879da9104..c107d92dc118 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -40,10 +40,11 @@ /* Mutex to hold while enabling or disabling RoCE */ static DEFINE_MUTEX(mlx5_roce_en_lock); -static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, - u16 vport, u32 *out, int outlen) +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) { - u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE); @@ -52,14 +53,9 @@ static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, if (vport) MLX5_SET(query_vport_state_in, in, other_vport, 1); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); -} - -u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) -{ - u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0}; - - _mlx5_query_vport_state(mdev, opmod, vport, out, sizeof(out)); + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return 0; return MLX5_GET(query_vport_state_out, out, state); } @@ -67,8 +63,7 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state) { - u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(modify_vport_state_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {}; MLX5_SET(modify_vport_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VPORT_STATE); @@ -77,13 +72,13 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, MLX5_SET(modify_vport_state_in, in, other_vport, other_vport); MLX5_SET(modify_vport_state_in, in, admin_state, state); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, modify_vport_state, in); } static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, - u32 *out, int outlen) + u32 *out) { - u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; MLX5_SET(query_nic_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); @@ -91,26 +86,16 @@ static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, if (vport) MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); -} - -static int mlx5_modify_nic_vport_context(struct mlx5_core_dev *mdev, void *in, - int inlen) -{ - u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {0}; - - MLX5_SET(modify_nic_vport_context_in, in, opcode, - MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - return mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_inout(mdev, query_nic_vport_context, in, out); } int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, u16 vport, u8 *min_inline) { - u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; int err; - err = mlx5_query_nic_vport_context(mdev, vport, out, sizeof(out)); + err = mlx5_query_nic_vport_context(mdev, vport, out); if (!err) *min_inline = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.min_wqe_inline_mode); @@ -139,8 +124,7 @@ EXPORT_SYMBOL_GPL(mlx5_query_min_inline); int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, u16 vport, u8 min_inline) { - u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {0}; - int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {}; void *nic_vport_ctx; MLX5_SET(modify_nic_vport_context_in, in, @@ -152,23 +136,20 @@ int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, in, nic_vport_context); MLX5_SET(nic_vport_context, nic_vport_ctx, min_wqe_inline_mode, min_inline); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - return mlx5_modify_nic_vport_context(mdev, in, inlen); + return mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); } int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, bool other, u8 *addr) { - int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; u8 *out_addr; - u32 *out; int err; - out = kvzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; - out_addr = MLX5_ADDR_OF(query_nic_vport_context_out, out, nic_vport_context.permanent_address); @@ -177,11 +158,10 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); MLX5_SET(query_nic_vport_context_in, in, other_vport, other); - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); + err = mlx5_cmd_exec_inout(mdev, query_nic_vport_context, in, out); if (!err) ether_addr_copy(addr, &out_addr[2]); - kvfree(out); return err; } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_address); @@ -216,8 +196,10 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev, permanent_address); ether_addr_copy(&perm_mac[2], addr); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); @@ -235,7 +217,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + err = mlx5_query_nic_vport_context(mdev, 0, out); if (!err) *mtu = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.mtu); @@ -257,8 +239,10 @@ int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu) MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu, mtu); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); return err; @@ -292,7 +276,7 @@ int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, req_list_size = max_list_size; } - out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + out_sz = MLX5_ST_SZ_BYTES(query_nic_vport_context_in) + req_list_size * MLX5_ST_SZ_BYTES(mac_address_layout); out = kzalloc(out_sz, GFP_KERNEL); @@ -332,7 +316,7 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, u8 addr_list[][ETH_ALEN], int list_size) { - u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {}; void *nic_vport_ctx; int max_list_size; int in_sz; @@ -350,7 +334,6 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + list_size * MLX5_ST_SZ_BYTES(mac_address_layout); - memset(out, 0, sizeof(out)); in = kzalloc(in_sz, GFP_KERNEL); if (!in) return -ENOMEM; @@ -442,7 +425,7 @@ int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out, outlen); + mlx5_query_nic_vport_context(mdev, 0, out); *system_image_guid = MLX5_GET64(query_nic_vport_context_out, out, nic_vport_context.system_image_guid); @@ -462,7 +445,7 @@ int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out, outlen); + mlx5_query_nic_vport_context(mdev, 0, out); *node_guid = MLX5_GET64(query_nic_vport_context_out, out, nic_vport_context.node_guid); @@ -498,8 +481,10 @@ int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, nic_vport_context = MLX5_ADDR_OF(modify_nic_vport_context_in, in, nic_vport_context); MLX5_SET64(nic_vport_context, nic_vport_context, node_guid, node_guid); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); @@ -516,7 +501,7 @@ int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out, outlen); + mlx5_query_nic_vport_context(mdev, 0, out); *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.qkey_violation_counter); @@ -664,7 +649,7 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, struct mlx5_hca_vport_context *rep) { int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); - int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)] = {0}; + int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)] = {}; int is_group_manager; void *out; void *ctx; @@ -691,7 +676,7 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, if (MLX5_CAP_GEN(dev, num_ports) == 2) MLX5_SET(query_hca_vport_context_in, in, port_num, port_num); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + err = mlx5_cmd_exec_inout(dev, query_hca_vport_context, in, out); if (err) goto ex; @@ -788,7 +773,7 @@ int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, vport, out, outlen); + err = mlx5_query_nic_vport_context(mdev, vport, out); if (err) goto out; @@ -825,8 +810,10 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, nic_vport_context.promisc_mc, promisc_mc); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.promisc_all, promisc_all); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); @@ -865,8 +852,10 @@ int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable) if (MLX5_CAP_GEN(mdev, disable_local_lb_uc)) MLX5_SET(modify_nic_vport_context_in, in, field_select.disable_uc_local_lb, 1); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); if (!err) mlx5_core_dbg(mdev, "%s local_lb\n", @@ -888,7 +877,7 @@ int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status) if (!out) return -ENOMEM; - err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + err = mlx5_query_nic_vport_context(mdev, 0, out); if (err) goto out; @@ -925,8 +914,10 @@ static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev, MLX5_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en, state); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(mdev, in, inlen); + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); kvfree(in); @@ -965,16 +956,15 @@ int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) mutex_unlock(&mlx5_roce_en_lock); return err; } -EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); +EXPORT_SYMBOL(mlx5_nic_vport_disable_roce); int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, - int vf, u8 port_num, void *out, - size_t out_sz) + int vf, u8 port_num, void *out) { - int in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in); - int is_group_manager; - void *in; - int err; + int in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in); + int is_group_manager; + void *in; + int err; is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); in = kvzalloc(in_sz, GFP_KERNEL); @@ -997,7 +987,7 @@ int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, if (MLX5_CAP_GEN(dev, num_ports) == 2) MLX5_SET(query_vport_counter_in, in, port_num, port_num); - err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + err = mlx5_cmd_exec_inout(dev, query_vport_counter, in, out); free: kvfree(in); return err; @@ -1008,8 +998,8 @@ int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, u8 other_vport, u64 *rx_discard_vport_down, u64 *tx_discard_vport_down) { - u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {}; int err; MLX5_SET(query_vnic_env_in, in, opcode, @@ -1018,7 +1008,7 @@ int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, MLX5_SET(query_vnic_env_in, in, vport_number, vport); MLX5_SET(query_vnic_env_in, in, other_vport, other_vport); - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(mdev, query_vnic_env, in, out); if (err) return err; @@ -1035,11 +1025,10 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, struct mlx5_hca_vport_context *req) { int in_sz = MLX5_ST_SZ_BYTES(modify_hca_vport_context_in); - u8 out[MLX5_ST_SZ_BYTES(modify_hca_vport_context_out)]; int is_group_manager; + void *ctx; void *in; int err; - void *ctx; mlx5_core_dbg(dev, "vf %d\n", vf); is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); @@ -1047,7 +1036,6 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, if (!in) return -ENOMEM; - memset(out, 0, sizeof(out)); MLX5_SET(modify_hca_vport_context_in, in, opcode, MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT); if (other_vport) { if (is_group_manager) { @@ -1074,7 +1062,7 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, MLX5_SET(hca_vport_context, ctx, cap_mask1, req->cap_mask1); MLX5_SET(hca_vport_context, ctx, cap_mask1_field_select, req->cap_mask1_perm); - err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + err = mlx5_cmd_exec_in(dev, modify_hca_vport_context, in); ex: kfree(in); return err; @@ -1103,8 +1091,10 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.affiliation_criteria, MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria)); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + err = mlx5_cmd_exec_in(port_mdev, modify_nic_vport_context, in); if (err) mlx5_nic_vport_disable_roce(port_mdev); @@ -1129,8 +1119,10 @@ int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev) nic_vport_context.affiliated_vhca_id, 0); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.affiliation_criteria, 0); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); - err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + err = mlx5_cmd_exec_in(port_mdev, modify_nic_vport_context, in); if (!err) mlx5_nic_vport_disable_roce(port_mdev); @@ -1170,4 +1162,4 @@ u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) { return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev); } -EXPORT_SYMBOL(mlx5_eswitch_get_total_vports); +EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 16060fb9b5e5..8170da1e9f70 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -127,8 +127,7 @@ int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, u8 other_vport, u64 *rx_discard_vport_down, u64 *tx_discard_vport_down); int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, - int vf, u8 port_num, void *out, - size_t out_sz); + int vf, u8 port_num, void *out); int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, int vf, -- cgit v1.2.3-59-g8ed1b From d1f620500cde5c72c7b96a19474733c4c6c67f38 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 11:39:14 +0300 Subject: net/mlx5: Update cq.c to new cmd interface Do mass update of cq.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 22 +++++++++------------- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en/health.c | 2 +- include/linux/mlx5/cq.h | 2 +- 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 4477a590b308..8379b24cb838 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -90,8 +90,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen) { int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), c_eqn); - u32 dout[MLX5_ST_SZ_DW(destroy_cq_out)]; - u32 din[MLX5_ST_SZ_DW(destroy_cq_in)]; + u32 din[MLX5_ST_SZ_DW(destroy_cq_in)] = {}; struct mlx5_eq_comp *eq; int err; @@ -141,20 +140,17 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, err_cq_add: mlx5_eq_del_cq(&eq->core, cq); err_cmd: - memset(din, 0, sizeof(din)); - memset(dout, 0, sizeof(dout)); MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ); MLX5_SET(destroy_cq_in, din, cqn, cq->cqn); MLX5_SET(destroy_cq_in, din, uid, cq->uid); - mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); + mlx5_cmd_exec_in(dev, destroy_cq, din); return err; } EXPORT_SYMBOL(mlx5_core_create_cq); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) { - u32 out[MLX5_ST_SZ_DW(destroy_cq_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {}; int err; mlx5_eq_del_cq(mlx5_get_async_eq(dev), cq); @@ -163,7 +159,7 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ); MLX5_SET(destroy_cq_in, in, cqn, cq->cqn); MLX5_SET(destroy_cq_in, in, uid, cq->uid); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_in(dev, destroy_cq, in); if (err) return err; @@ -178,20 +174,20 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) EXPORT_SYMBOL(mlx5_core_destroy_cq); int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - u32 *out, int outlen) + u32 *out) { - u32 in[MLX5_ST_SZ_DW(query_cq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_cq_in)] = {}; MLX5_SET(query_cq_in, in, opcode, MLX5_CMD_OP_QUERY_CQ); MLX5_SET(query_cq_in, in, cqn, cq->cqn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec_inout(dev, query_cq, in, out); } EXPORT_SYMBOL(mlx5_core_query_cq); int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {}; MLX5_SET(modify_cq_in, in, opcode, MLX5_CMD_OP_MODIFY_CQ); MLX5_SET(modify_cq_in, in, uid, cq->uid); @@ -204,7 +200,7 @@ int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, u16 cq_period, u16 cq_max_count) { - u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {}; void *cqc; MLX5_SET(modify_cq_in, in, cqn, cq->cqn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 65fef5a86644..c05e6a2c9126 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -333,7 +333,7 @@ static u64 cq_read_field(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, if (!out) return param; - err = mlx5_core_query_cq(dev, cq, out, outlen); + err = mlx5_core_query_cq(dev, cq, out); if (err) { mlx5_core_warn(dev, "failed to query cq\n"); goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c index 3a199a03d929..7283443868f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c @@ -43,7 +43,7 @@ int mlx5e_reporter_cq_diagnose(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg) void *cqc; int err; - err = mlx5_core_query_cq(priv->mdev, &cq->mcq, out, sizeof(out)); + err = mlx5_core_query_cq(priv->mdev, &cq->mcq, out); if (err) return err; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 40748fc1b11b..b5a9399e07ee 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -188,7 +188,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - u32 *out, int outlen); + u32 *out); int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen); int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, -- cgit v1.2.3-59-g8ed1b From e36fb468d23967683c1f0de644da0928563e604d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 11:55:06 +0300 Subject: net/mlx5: Update debugfs.c to new cmd interface Do mass update of debugfs.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index c05e6a2c9126..6409090b3ec5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -273,20 +273,11 @@ static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, return param; } -static int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, - u32 *out, int outlen) -{ - u32 in[MLX5_ST_SZ_DW(query_eq_in)] = {}; - - MLX5_SET(query_eq_in, in, opcode, MLX5_CMD_OP_QUERY_EQ); - MLX5_SET(query_eq_in, in, eq_number, eq->eqn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); -} - static u64 eq_read_field(struct mlx5_core_dev *dev, struct mlx5_eq *eq, int index) { int outlen = MLX5_ST_SZ_BYTES(query_eq_out); + u32 in[MLX5_ST_SZ_DW(query_eq_in)] = {}; u64 param = 0; void *ctx; u32 *out; @@ -296,7 +287,9 @@ static u64 eq_read_field(struct mlx5_core_dev *dev, struct mlx5_eq *eq, if (!out) return param; - err = mlx5_core_eq_query(dev, eq, out, outlen); + MLX5_SET(query_eq_in, in, opcode, MLX5_CMD_OP_QUERY_EQ); + MLX5_SET(query_eq_in, in, eq_number, eq->eqn); + err = mlx5_cmd_exec_inout(dev, query_eq, in, out); if (err) { mlx5_core_warn(dev, "failed to query eq\n"); goto out; -- cgit v1.2.3-59-g8ed1b From 9aa536ad45ec390d32c563321b26ccd8de52a2d9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 11:56:42 +0300 Subject: net/mlx5: Update ecpf.c to new cmd interface Do mass update of ecpf.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/ecpf.c | 30 ++++++-------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c index d2228e37450f..a894ea98c95a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -8,33 +8,13 @@ bool mlx5_read_embedded_cpu(struct mlx5_core_dev *dev) return (ioread32be(&dev->iseg->initializing) >> MLX5_ECPU_BIT_NUM) & 1; } -static int mlx5_peer_pf_enable_hca(struct mlx5_core_dev *dev) -{ - u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {}; - u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {}; - - MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); - MLX5_SET(enable_hca_in, in, function_id, 0); - MLX5_SET(enable_hca_in, in, embedded_cpu_function, 0); - return mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); -} - -static int mlx5_peer_pf_disable_hca(struct mlx5_core_dev *dev) -{ - u32 out[MLX5_ST_SZ_DW(disable_hca_out)] = {}; - u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {}; - - MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); - MLX5_SET(disable_hca_in, in, function_id, 0); - MLX5_SET(disable_hca_in, in, embedded_cpu_function, 0); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} - static int mlx5_peer_pf_init(struct mlx5_core_dev *dev) { + u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {}; int err; - err = mlx5_peer_pf_enable_hca(dev); + MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); + err = mlx5_cmd_exec_in(dev, enable_hca, in); if (err) mlx5_core_err(dev, "Failed to enable peer PF HCA err(%d)\n", err); @@ -44,9 +24,11 @@ static int mlx5_peer_pf_init(struct mlx5_core_dev *dev) static void mlx5_peer_pf_cleanup(struct mlx5_core_dev *dev) { + u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {}; int err; - err = mlx5_peer_pf_disable_hca(dev); + MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); + err = mlx5_cmd_exec_in(dev, disable_hca, in); if (err) { mlx5_core_err(dev, "Failed to disable peer PF HCA err(%d)\n", err); -- cgit v1.2.3-59-g8ed1b From 49d7fcd127c1ee011aee252985749eb33593488c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 12:25:08 +0300 Subject: net/mlx5: Update eq.c to new cmd interface Do mass update of eq.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index bee419d01af2..4d974b5405b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -101,12 +101,11 @@ struct mlx5_eq_table { static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn) { - u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_eq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_eq_in)] = {}; MLX5_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ); MLX5_SET(destroy_eq_in, in, eq_number, eqn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, destroy_eq, in); } /* caller must eventually call mlx5_cq_put on the returned cq */ -- cgit v1.2.3-59-g8ed1b From a184cda1bb31eb14720c5f09d9698ab1666aa371 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 12:26:41 +0300 Subject: net/mlx5: Update statistics to new cmd interface Do mass update of statistics to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- .../ethernet/mellanox/mlx5/core/en/monitor_stats.c | 46 +++++++--------------- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 17 +++----- 2 files changed, 19 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c index 7cd5b02e0f10..8fe8b4d6ad1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c @@ -38,12 +38,11 @@ int mlx5e_monitor_counter_supported(struct mlx5e_priv *priv) void mlx5e_monitor_counter_arm(struct mlx5e_priv *priv) { - u32 in[MLX5_ST_SZ_DW(arm_monitor_counter_in)] = {}; - u32 out[MLX5_ST_SZ_DW(arm_monitor_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(arm_monitor_counter_in)] = {}; MLX5_SET(arm_monitor_counter_in, in, opcode, MLX5_CMD_OP_ARM_MONITOR_COUNTER); - mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(priv->mdev, arm_monitor_counter, in); } static void mlx5e_monitor_counters_work(struct work_struct *work) @@ -66,19 +65,6 @@ static int mlx5e_monitor_event_handler(struct notifier_block *nb, return NOTIFY_OK; } -static void mlx5e_monitor_counter_start(struct mlx5e_priv *priv) -{ - MLX5_NB_INIT(&priv->monitor_counters_nb, mlx5e_monitor_event_handler, - MONITOR_COUNTER); - mlx5_eq_notifier_register(priv->mdev, &priv->monitor_counters_nb); -} - -static void mlx5e_monitor_counter_stop(struct mlx5e_priv *priv) -{ - mlx5_eq_notifier_unregister(priv->mdev, &priv->monitor_counters_nb); - cancel_work_sync(&priv->monitor_counters_work); -} - static int fill_monitor_counter_ppcnt_set1(int cnt, u32 *in) { enum mlx5_monitor_counter_ppcnt ppcnt_cnt; @@ -118,8 +104,7 @@ static void mlx5e_set_monitor_counter(struct mlx5e_priv *priv) int num_q_counters = MLX5_CAP_GEN(mdev, num_q_monitor_counters); int num_ppcnt_counters = !MLX5_CAP_PCAM_REG(mdev, ppcnt) ? 0 : MLX5_CAP_GEN(mdev, num_ppcnt_monitor_counters); - u32 in[MLX5_ST_SZ_DW(set_monitor_counter_in)] = {}; - u32 out[MLX5_ST_SZ_DW(set_monitor_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(set_monitor_counter_in)] = {}; int q_counter = priv->q_counter; int cnt = 0; @@ -136,34 +121,31 @@ static void mlx5e_set_monitor_counter(struct mlx5e_priv *priv) MLX5_SET(set_monitor_counter_in, in, opcode, MLX5_CMD_OP_SET_MONITOR_COUNTER); - mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(mdev, set_monitor_counter, in); } /* check if mlx5e_monitor_counter_supported before calling this function*/ void mlx5e_monitor_counter_init(struct mlx5e_priv *priv) { INIT_WORK(&priv->monitor_counters_work, mlx5e_monitor_counters_work); - mlx5e_monitor_counter_start(priv); + MLX5_NB_INIT(&priv->monitor_counters_nb, mlx5e_monitor_event_handler, + MONITOR_COUNTER); + mlx5_eq_notifier_register(priv->mdev, &priv->monitor_counters_nb); + mlx5e_set_monitor_counter(priv); mlx5e_monitor_counter_arm(priv); queue_work(priv->wq, &priv->update_stats_work); } -static void mlx5e_monitor_counter_disable(struct mlx5e_priv *priv) +/* check if mlx5e_monitor_counter_supported before calling this function*/ +void mlx5e_monitor_counter_cleanup(struct mlx5e_priv *priv) { - u32 in[MLX5_ST_SZ_DW(set_monitor_counter_in)] = {}; - u32 out[MLX5_ST_SZ_DW(set_monitor_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(set_monitor_counter_in)] = {}; - MLX5_SET(set_monitor_counter_in, in, num_of_counters, 0); MLX5_SET(set_monitor_counter_in, in, opcode, MLX5_CMD_OP_SET_MONITOR_COUNTER); - mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out)); -} - -/* check if mlx5e_monitor_counter_supported before calling this function*/ -void mlx5e_monitor_counter_cleanup(struct mlx5e_priv *priv) -{ - mlx5e_monitor_counter_disable(priv); - mlx5e_monitor_counter_stop(priv); + mlx5_cmd_exec_in(priv->mdev, set_monitor_counter, in); + mlx5_eq_notifier_unregister(priv->mdev, &priv->monitor_counters_nb); + cancel_work_sync(&priv->monitor_counters_work); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index ff4002ebad90..e91a8b22eba6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -491,18 +491,14 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vnic_env) static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(vnic_env) { u32 *out = (u32 *)priv->stats.vnic.query_vnic_env_out; - int outlen = MLX5_ST_SZ_BYTES(query_vnic_env_out); - u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {}; struct mlx5_core_dev *mdev = priv->mdev; if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard)) return; - MLX5_SET(query_vnic_env_in, in, opcode, - MLX5_CMD_OP_QUERY_VNIC_ENV); - MLX5_SET(query_vnic_env_in, in, op_mod, 0); - MLX5_SET(query_vnic_env_in, in, other_vport, 0); - mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); + MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV); + mlx5_cmd_exec_inout(mdev, query_vnic_env, in, out); } #define VPORT_COUNTER_OFF(c) MLX5_BYTE_OFF(query_vport_counter_out, c) @@ -577,15 +573,12 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vport) static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(vport) { - int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); u32 *out = (u32 *)priv->stats.vport.query_vport_out; - u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {}; struct mlx5_core_dev *mdev = priv->mdev; MLX5_SET(query_vport_counter_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_COUNTER); - MLX5_SET(query_vport_counter_in, in, op_mod, 0); - MLX5_SET(query_vport_counter_in, in, other_vport, 0); - mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); + mlx5_cmd_exec_inout(mdev, query_vport_counter, in, out); } #define PPORT_802_3_OFF(c) \ -- cgit v1.2.3-59-g8ed1b From e08a6832f9c19a1b514675ee53a34736647f918a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 12:30:08 +0300 Subject: net/mlx5: Update eswitch to new cmd interface Do mass update of eswitch to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 38 ++++++---------------- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 6 +--- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 18 +++++----- 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 7f618a443bfd..c5eb4e7754a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -84,8 +84,7 @@ mlx5_eswitch_get_vport(struct mlx5_eswitch *esw, u16 vport_num) static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport, u32 events_mask) { - int in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {0}; - int out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {}; void *nic_vport_ctx; MLX5_SET(modify_nic_vport_context_in, in, @@ -108,40 +107,24 @@ static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport, MLX5_SET(nic_vport_context, nic_vport_ctx, event_on_promisc_change, 1); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_nic_vport_context, in); } /* E-Switch vport context HW commands */ int mlx5_eswitch_modify_esw_vport_context(struct mlx5_core_dev *dev, u16 vport, - bool other_vport, - void *in, int inlen) + bool other_vport, void *in) { - u32 out[MLX5_ST_SZ_DW(modify_esw_vport_context_out)] = {0}; - MLX5_SET(modify_esw_vport_context_in, in, opcode, MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT); MLX5_SET(modify_esw_vport_context_in, in, vport_number, vport); MLX5_SET(modify_esw_vport_context_in, in, other_vport, other_vport); - return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); -} - -int mlx5_eswitch_query_esw_vport_context(struct mlx5_core_dev *dev, u16 vport, - bool other_vport, - void *out, int outlen) -{ - u32 in[MLX5_ST_SZ_DW(query_esw_vport_context_in)] = {}; - - MLX5_SET(query_esw_vport_context_in, in, opcode, - MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT); - MLX5_SET(modify_esw_vport_context_in, in, vport_number, vport); - MLX5_SET(modify_esw_vport_context_in, in, other_vport, other_vport); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec_in(dev, modify_esw_vport_context, in); } static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u16 vport, u16 vlan, u8 qos, u8 set_flags) { - u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {}; if (!MLX5_CAP_ESW(dev, vport_cvlan_strip) || !MLX5_CAP_ESW(dev, vport_cvlan_insert_if_not_exist)) @@ -170,8 +153,7 @@ static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u16 vport, MLX5_SET(modify_esw_vport_context_in, in, field_select.vport_cvlan_insert, 1); - return mlx5_eswitch_modify_esw_vport_context(dev, vport, true, - in, sizeof(in)); + return mlx5_eswitch_modify_esw_vport_context(dev, vport, true, in); } /* E-Switch FDB */ @@ -1901,7 +1883,7 @@ const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev) MLX5_SET(query_esw_functions_in, in, opcode, MLX5_CMD_OP_QUERY_ESW_FUNCTIONS); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + err = mlx5_cmd_exec_inout(dev, query_esw_functions, in, out); if (!err) return out; @@ -2783,8 +2765,8 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, { struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); - u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {0}; - struct mlx5_vport_drop_stats stats = {0}; + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {}; + struct mlx5_vport_drop_stats stats = {}; int err = 0; u32 *out; @@ -2801,7 +2783,7 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, MLX5_SET(query_vport_counter_in, in, vport_number, vport->vport); MLX5_SET(query_vport_counter_in, in, other_vport, 1); - err = mlx5_cmd_exec(esw->dev, in, sizeof(in), out, outlen); + err = mlx5_cmd_exec_inout(esw->dev, query_vport_counter, in, out); if (err) goto free_out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 39f42f985fbd..fa2ad172f08c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -329,11 +329,7 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule); int mlx5_eswitch_modify_esw_vport_context(struct mlx5_core_dev *dev, u16 vport, - bool other_vport, - void *in, int inlen); -int mlx5_eswitch_query_esw_vport_context(struct mlx5_core_dev *dev, u16 vport, - bool other_vport, - void *out, int outlen); + bool other_vport, void *in); struct mlx5_flow_spec; struct mlx5_esw_flow_attr; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index f171eb2234b0..dc098bb58973 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -790,7 +790,8 @@ static bool mlx5_eswitch_reg_c1_loopback_supported(struct mlx5_eswitch *esw) static int esw_set_passing_vport_metadata(struct mlx5_eswitch *esw, bool enable) { u32 out[MLX5_ST_SZ_DW(query_esw_vport_context_out)] = {}; - u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {}; + u32 min[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {}; + u32 in[MLX5_ST_SZ_DW(query_esw_vport_context_in)] = {}; u8 curr, wanted; int err; @@ -798,8 +799,9 @@ static int esw_set_passing_vport_metadata(struct mlx5_eswitch *esw, bool enable) !mlx5_eswitch_vport_match_metadata_enabled(esw)) return 0; - err = mlx5_eswitch_query_esw_vport_context(esw->dev, 0, false, - out, sizeof(out)); + MLX5_SET(query_esw_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT); + err = mlx5_cmd_exec_inout(esw->dev, query_esw_vport_context, in, out); if (err) return err; @@ -814,14 +816,12 @@ static int esw_set_passing_vport_metadata(struct mlx5_eswitch *esw, bool enable) else curr &= ~wanted; - MLX5_SET(modify_esw_vport_context_in, in, + MLX5_SET(modify_esw_vport_context_in, min, esw_vport_context.fdb_to_vport_reg_c_id, curr); - - MLX5_SET(modify_esw_vport_context_in, in, + MLX5_SET(modify_esw_vport_context_in, min, field_select.fdb_to_vport_reg_c_id, 1); - err = mlx5_eswitch_modify_esw_vport_context(esw->dev, 0, false, in, - sizeof(in)); + err = mlx5_eswitch_modify_esw_vport_context(esw->dev, 0, false, min); if (!err) { if (enable && (curr & MLX5_FDB_TO_VPORT_REG_C_1)) esw->flags |= MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED; @@ -1474,7 +1474,7 @@ query_vports: out: *mode = mlx5_mode; return 0; -} +} static void esw_destroy_restore_table(struct mlx5_eswitch *esw) { -- cgit v1.2.3-59-g8ed1b From b316e1866fa39aadc1888fc68b73729343cc4058 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 15:59:30 +0300 Subject: net/mlx5: Update FPGA to new cmd interface Do mass update of FPGA to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c | 28 ++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c index 09769401c313..9a37077152aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c @@ -142,15 +142,15 @@ int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query) int mlx5_fpga_create_qp(struct mlx5_core_dev *dev, void *fpga_qpc, u32 *fpga_qpn) { - u32 in[MLX5_ST_SZ_DW(fpga_create_qp_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(fpga_create_qp_out)]; + u32 out[MLX5_ST_SZ_DW(fpga_create_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(fpga_create_qp_in)] = {}; int ret; MLX5_SET(fpga_create_qp_in, in, opcode, MLX5_CMD_OP_FPGA_CREATE_QP); memcpy(MLX5_ADDR_OF(fpga_create_qp_in, in, fpga_qpc), fpga_qpc, MLX5_FLD_SZ_BYTES(fpga_create_qp_in, fpga_qpc)); - ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + ret = mlx5_cmd_exec_inout(dev, fpga_create_qp, in, out); if (ret) return ret; @@ -164,8 +164,7 @@ int mlx5_fpga_modify_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, enum mlx5_fpga_qpc_field_select fields, void *fpga_qpc) { - u32 in[MLX5_ST_SZ_DW(fpga_modify_qp_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(fpga_modify_qp_out)]; + u32 in[MLX5_ST_SZ_DW(fpga_modify_qp_in)] = {}; MLX5_SET(fpga_modify_qp_in, in, opcode, MLX5_CMD_OP_FPGA_MODIFY_QP); MLX5_SET(fpga_modify_qp_in, in, field_select, fields); @@ -173,20 +172,20 @@ int mlx5_fpga_modify_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, memcpy(MLX5_ADDR_OF(fpga_modify_qp_in, in, fpga_qpc), fpga_qpc, MLX5_FLD_SZ_BYTES(fpga_modify_qp_in, fpga_qpc)); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, fpga_modify_qp, in); } int mlx5_fpga_query_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, void *fpga_qpc) { - u32 in[MLX5_ST_SZ_DW(fpga_query_qp_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(fpga_query_qp_out)]; + u32 out[MLX5_ST_SZ_DW(fpga_query_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(fpga_query_qp_in)] = {}; int ret; MLX5_SET(fpga_query_qp_in, in, opcode, MLX5_CMD_OP_FPGA_QUERY_QP); MLX5_SET(fpga_query_qp_in, in, fpga_qpn, fpga_qpn); - ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + ret = mlx5_cmd_exec_inout(dev, fpga_query_qp, in, out); if (ret) return ret; @@ -197,20 +196,19 @@ int mlx5_fpga_query_qp(struct mlx5_core_dev *dev, int mlx5_fpga_destroy_qp(struct mlx5_core_dev *dev, u32 fpga_qpn) { - u32 in[MLX5_ST_SZ_DW(fpga_destroy_qp_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(fpga_destroy_qp_out)]; + u32 in[MLX5_ST_SZ_DW(fpga_destroy_qp_in)] = {}; MLX5_SET(fpga_destroy_qp_in, in, opcode, MLX5_CMD_OP_FPGA_DESTROY_QP); MLX5_SET(fpga_destroy_qp_in, in, fpga_qpn, fpga_qpn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, fpga_destroy_qp, in); } int mlx5_fpga_query_qp_counters(struct mlx5_core_dev *dev, u32 fpga_qpn, bool clear, struct mlx5_fpga_qp_counters *data) { - u32 in[MLX5_ST_SZ_DW(fpga_query_qp_counters_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(fpga_query_qp_counters_out)]; + u32 out[MLX5_ST_SZ_DW(fpga_query_qp_counters_out)] = {}; + u32 in[MLX5_ST_SZ_DW(fpga_query_qp_counters_in)] = {}; int ret; MLX5_SET(fpga_query_qp_counters_in, in, opcode, @@ -218,7 +216,7 @@ int mlx5_fpga_query_qp_counters(struct mlx5_core_dev *dev, u32 fpga_qpn, MLX5_SET(fpga_query_qp_counters_in, in, clear, clear); MLX5_SET(fpga_query_qp_counters_in, in, fpga_qpn, fpga_qpn); - ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + ret = mlx5_cmd_exec_inout(dev, fpga_query_qp_counters, in, out); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From 31a0956ea91567b8f0e39d8420253646304b8571 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 16:01:30 +0300 Subject: net/mlx5: Update fs_core new cmd interface Do mass update of fs_core to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 80 +++++++++--------------- 1 file changed, 31 insertions(+), 49 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 90048697b2ff..304d1e4f0541 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -155,8 +155,7 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, u32 underlay_qpn, bool disconnect) { - u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {}; struct mlx5_core_dev *dev = ns->dev; if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && @@ -167,13 +166,10 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns, MLX5_CMD_OP_SET_FLOW_TABLE_ROOT); MLX5_SET(set_flow_table_root_in, in, table_type, ft->type); - if (disconnect) { + if (disconnect) MLX5_SET(set_flow_table_root_in, in, op_mod, 1); - MLX5_SET(set_flow_table_root_in, in, table_id, 0); - } else { - MLX5_SET(set_flow_table_root_in, in, op_mod, 0); + else MLX5_SET(set_flow_table_root_in, in, table_id, ft->id); - } MLX5_SET(set_flow_table_root_in, in, underlay_qpn, underlay_qpn); if (ft->vport) { @@ -181,7 +177,7 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns, MLX5_SET(set_flow_table_root_in, in, other_vport, 1); } - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, set_flow_table_root, in); } static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, @@ -192,8 +188,8 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, int en_encap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT); int en_decap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); int term = !!(ft->flags & MLX5_FLOW_TABLE_TERMINATION); - u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {}; + u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {}; struct mlx5_core_dev *dev = ns->dev; int err; @@ -239,7 +235,7 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, break; } - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, create_flow_table, in, out); if (!err) ft->id = MLX5_GET(create_flow_table_out, out, table_id); @@ -249,8 +245,7 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, static int mlx5_cmd_destroy_flow_table(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft) { - u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_flow_table_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)] = {}; struct mlx5_core_dev *dev = ns->dev; MLX5_SET(destroy_flow_table_in, in, opcode, @@ -262,15 +257,14 @@ static int mlx5_cmd_destroy_flow_table(struct mlx5_flow_root_namespace *ns, MLX5_SET(destroy_flow_table_in, in, other_vport, 1); } - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, destroy_flow_table, in); } static int mlx5_cmd_modify_flow_table(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, struct mlx5_flow_table *next_ft) { - u32 in[MLX5_ST_SZ_DW(modify_flow_table_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(modify_flow_table_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_flow_table_in)] = {}; struct mlx5_core_dev *dev = ns->dev; MLX5_SET(modify_flow_table_in, in, opcode, @@ -310,7 +304,7 @@ static int mlx5_cmd_modify_flow_table(struct mlx5_flow_root_namespace *ns, } } - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_flow_table, in); } static int mlx5_cmd_create_flow_group(struct mlx5_flow_root_namespace *ns, @@ -318,8 +312,7 @@ static int mlx5_cmd_create_flow_group(struct mlx5_flow_root_namespace *ns, u32 *in, struct mlx5_flow_group *fg) { - u32 out[MLX5_ST_SZ_DW(create_flow_group_out)] = {0}; - int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + u32 out[MLX5_ST_SZ_DW(create_flow_group_out)] = {}; struct mlx5_core_dev *dev = ns->dev; int err; @@ -332,7 +325,7 @@ static int mlx5_cmd_create_flow_group(struct mlx5_flow_root_namespace *ns, MLX5_SET(create_flow_group_in, in, other_vport, 1); } - err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, create_flow_group, in, out); if (!err) fg->id = MLX5_GET(create_flow_group_out, out, group_id); @@ -343,8 +336,7 @@ static int mlx5_cmd_destroy_flow_group(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, struct mlx5_flow_group *fg) { - u32 out[MLX5_ST_SZ_DW(destroy_flow_group_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)] = {}; struct mlx5_core_dev *dev = ns->dev; MLX5_SET(destroy_flow_group_in, in, opcode, @@ -357,7 +349,7 @@ static int mlx5_cmd_destroy_flow_group(struct mlx5_flow_root_namespace *ns, MLX5_SET(destroy_flow_group_in, in, other_vport, 1); } - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, destroy_flow_group, in); } static int mlx5_set_extended_dest(struct mlx5_core_dev *dev, @@ -600,8 +592,7 @@ static int mlx5_cmd_delete_fte(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, struct fs_fte *fte) { - u32 out[MLX5_ST_SZ_DW(delete_fte_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(delete_fte_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(delete_fte_in)] = {}; struct mlx5_core_dev *dev = ns->dev; MLX5_SET(delete_fte_in, in, opcode, MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); @@ -613,22 +604,22 @@ static int mlx5_cmd_delete_fte(struct mlx5_flow_root_namespace *ns, MLX5_SET(delete_fte_in, in, other_vport, 1); } - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, delete_fte, in); } int mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask, u32 *id) { - u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)] = {}; int err; MLX5_SET(alloc_flow_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_FLOW_COUNTER); MLX5_SET(alloc_flow_counter_in, in, flow_counter_bulk, alloc_bitmask); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, alloc_flow_counter, in, out); if (!err) *id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id); return err; @@ -641,21 +632,20 @@ int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id) int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id) { - u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)] = {}; MLX5_SET(dealloc_flow_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_FLOW_COUNTER); MLX5_SET(dealloc_flow_counter_in, in, flow_counter_id, id); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, dealloc_flow_counter, in); } int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id, u64 *packets, u64 *bytes) { u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) + - MLX5_ST_SZ_BYTES(traffic_counter)] = {0}; - u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {0}; + MLX5_ST_SZ_BYTES(traffic_counter)] = {}; + u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {}; void *stats; int err = 0; @@ -683,11 +673,10 @@ int mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, u32 base_id, int bulk_len, u32 *out) { int outlen = mlx5_cmd_fc_get_bulk_query_out_len(bulk_len); - u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {}; MLX5_SET(query_flow_counter_in, in, opcode, MLX5_CMD_OP_QUERY_FLOW_COUNTER); - MLX5_SET(query_flow_counter_in, in, op_mod, 0); MLX5_SET(query_flow_counter_in, in, flow_counter_id, base_id); MLX5_SET(query_flow_counter_in, in, num_of_counters, bulk_len); return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); @@ -700,7 +689,7 @@ static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, enum mlx5_flow_namespace_type namespace, struct mlx5_pkt_reformat *pkt_reformat) { - u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_context_out)]; + u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_context_out)] = {}; struct mlx5_core_dev *dev = ns->dev; void *packet_reformat_context_in; int max_encap_size; @@ -732,7 +721,6 @@ static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, reformat_data); inlen = reformat - (void *)in + size; - memset(in, 0, inlen); MLX5_SET(alloc_packet_reformat_context_in, in, opcode, MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT); MLX5_SET(packet_reformat_context_in, packet_reformat_context_in, @@ -741,7 +729,6 @@ static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, reformat_type, reformat_type); memcpy(reformat, reformat_data, size); - memset(out, 0, sizeof(out)); err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); pkt_reformat->id = MLX5_GET(alloc_packet_reformat_context_out, @@ -753,17 +740,15 @@ static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, static void mlx5_cmd_packet_reformat_dealloc(struct mlx5_flow_root_namespace *ns, struct mlx5_pkt_reformat *pkt_reformat) { - u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_in)]; - u32 out[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_out)]; + u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_in)] = {}; struct mlx5_core_dev *dev = ns->dev; - memset(in, 0, sizeof(in)); MLX5_SET(dealloc_packet_reformat_context_in, in, opcode, MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT); MLX5_SET(dealloc_packet_reformat_context_in, in, packet_reformat_id, pkt_reformat->id); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, dealloc_packet_reformat_context, in); } static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, @@ -771,7 +756,7 @@ static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, void *modify_actions, struct mlx5_modify_hdr *modify_hdr) { - u32 out[MLX5_ST_SZ_DW(alloc_modify_header_context_out)]; + u32 out[MLX5_ST_SZ_DW(alloc_modify_header_context_out)] = {}; int max_actions, actions_size, inlen, err; struct mlx5_core_dev *dev = ns->dev; void *actions_in; @@ -821,7 +806,6 @@ static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, actions_in = MLX5_ADDR_OF(alloc_modify_header_context_in, in, actions); memcpy(actions_in, modify_actions, actions_size); - memset(out, 0, sizeof(out)); err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); modify_hdr->id = MLX5_GET(alloc_modify_header_context_out, out, modify_header_id); @@ -832,17 +816,15 @@ static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, static void mlx5_cmd_modify_header_dealloc(struct mlx5_flow_root_namespace *ns, struct mlx5_modify_hdr *modify_hdr) { - u32 in[MLX5_ST_SZ_DW(dealloc_modify_header_context_in)]; - u32 out[MLX5_ST_SZ_DW(dealloc_modify_header_context_out)]; + u32 in[MLX5_ST_SZ_DW(dealloc_modify_header_context_in)] = {}; struct mlx5_core_dev *dev = ns->dev; - memset(in, 0, sizeof(in)); MLX5_SET(dealloc_modify_header_context_in, in, opcode, MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT); MLX5_SET(dealloc_modify_header_context_in, in, modify_header_id, modify_hdr->id); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, dealloc_modify_header_context, in); } static const struct mlx5_flow_cmds mlx5_flow_cmds = { -- cgit v1.2.3-59-g8ed1b From 59ad21c21fc4025831fbd0c291e2db1247fddf4d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 16:08:35 +0300 Subject: net/mlx5: Update fw.c new cmd interface Do mass update of fw.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 33 +++++++++++----------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 3040e0466681..a5fbe7343508 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -67,26 +67,19 @@ enum { MCQI_FW_STORED_VERSION = 1, }; -static int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev, u32 *out, - int outlen) -{ - u32 in[MLX5_ST_SZ_DW(query_adapter_in)] = {0}; - - MLX5_SET(query_adapter_in, in, opcode, MLX5_CMD_OP_QUERY_ADAPTER); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); -} - int mlx5_query_board_id(struct mlx5_core_dev *dev) { u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_adapter_out); + u32 in[MLX5_ST_SZ_DW(query_adapter_in)] = {}; int err; out = kzalloc(outlen, GFP_KERNEL); if (!out) return -ENOMEM; - err = mlx5_cmd_query_adapter(dev, out, outlen); + MLX5_SET(query_adapter_in, in, opcode, MLX5_CMD_OP_QUERY_ADAPTER); + err = mlx5_cmd_exec_inout(dev, query_adapter, in, out); if (err) goto out; @@ -105,13 +98,15 @@ int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id) { u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_adapter_out); + u32 in[MLX5_ST_SZ_DW(query_adapter_in)] = {}; int err; out = kzalloc(outlen, GFP_KERNEL); if (!out) return -ENOMEM; - err = mlx5_cmd_query_adapter(mdev, out, outlen); + MLX5_SET(query_adapter_in, in, opcode, MLX5_CMD_OP_QUERY_ADAPTER); + err = mlx5_cmd_exec_inout(mdev, query_adapter, in, out); if (err) goto out; @@ -259,8 +254,7 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id) { - u32 out[MLX5_ST_SZ_DW(init_hca_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(init_hca_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(init_hca_in)] = {}; int i; MLX5_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA); @@ -271,16 +265,15 @@ int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id) sw_owner_id[i]); } - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, init_hca, in); } int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev) { - u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {}; MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, teardown_hca, in); } int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev) @@ -315,8 +308,8 @@ int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev) int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev) { unsigned long end, delay_ms = MLX5_FAST_TEARDOWN_WAIT_MS; - u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {}; + u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {}; int state; int ret; @@ -329,7 +322,7 @@ int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev) MLX5_SET(teardown_hca_in, in, profile, MLX5_TEARDOWN_HCA_IN_PROFILE_PREPARE_FAST_TEARDOWN); - ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + ret = mlx5_cmd_exec_inout(dev, teardown_hca, in, out); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From 5d19395f6988c057cb9461075f308a6c16a8c253 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 16:17:01 +0300 Subject: net/mlx5: Update lag.c new cmd interface Do mass update of lag.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 52 +++++++++------------------ 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index 93052b07c76c..c6ad5ca46877 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -47,8 +47,7 @@ static DEFINE_MUTEX(lag_mutex); static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1, u8 remap_port2) { - u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(create_lag_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {}; void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx); MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG); @@ -56,14 +55,13 @@ static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1, MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1); MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, create_lag, in); } static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1, u8 remap_port2) { - u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(modify_lag_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {}; void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx); MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG); @@ -72,52 +70,29 @@ static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1, MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1); MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} - -static int mlx5_cmd_destroy_lag(struct mlx5_core_dev *dev) -{ - u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_lag_out)] = {0}; - - MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG); - - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_lag, in); } int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev) { - u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(create_vport_lag_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {}; MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, create_vport_lag, in); } EXPORT_SYMBOL(mlx5_cmd_create_vport_lag); int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev) { - u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_vport_lag_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {}; MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, destroy_vport_lag, in); } EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag); -static int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, - bool reset, void *out, int out_size) -{ - u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { }; - - MLX5_SET(query_cong_statistics_in, in, opcode, - MLX5_CMD_OP_QUERY_CONG_STATISTICS); - MLX5_SET(query_cong_statistics_in, in, clear, reset); - return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); -} - int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, struct net_device *ndev) { @@ -232,12 +207,14 @@ int mlx5_activate_lag(struct mlx5_lag *ldev, static int mlx5_deactivate_lag(struct mlx5_lag *ldev) { struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {}; bool roce_lag = __mlx5_lag_is_roce(ldev); int err; ldev->flags &= ~MLX5_LAG_MODE_FLAGS; - err = mlx5_cmd_destroy_lag(dev0); + MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG); + err = mlx5_cmd_exec_in(dev0, destroy_lag, in); if (err) { if (roce_lag) { mlx5_core_err(dev0, @@ -758,7 +735,12 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, } for (i = 0; i < num_ports; ++i) { - ret = mlx5_cmd_query_cong_counter(mdev[i], false, out, outlen); + u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {}; + + MLX5_SET(query_cong_statistics_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_STATISTICS); + ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in, + out); if (ret) goto unlock; -- cgit v1.2.3-59-g8ed1b From bb7664d369bfeb2754af8b972ddaa5734e2864a8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 16:30:39 +0300 Subject: net/mlx5: Update gid.c new cmd interface Do mass update of gid.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c index 7722a3f9bb68..a68738c8f4bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c @@ -124,8 +124,7 @@ int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, const u8 *mac, bool vlan, u16 vlan_id, u8 port_num) { #define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) - u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {}; void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); char *addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, in_addr, source_l3_address); @@ -153,6 +152,6 @@ int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, MLX5_SET(set_roce_address_in, in, roce_address_index, index); MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, set_roce_address, in); } EXPORT_SYMBOL(mlx5_core_roce_gid_set); -- cgit v1.2.3-59-g8ed1b From 9d6ed27163163b3236f38f89e3968601f7de6a95 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 16:31:21 +0300 Subject: net/mlx5: Update mpfs.c new cmd interface Do mass update of mpfs.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c index 3118e8d66407..fd8449ff9e17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c @@ -40,8 +40,7 @@ /* HW L2 Table (MPFS) management */ static int set_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index, u8 *mac) { - u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(set_l2_table_entry_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)] = {}; u8 *in_mac_addr; MLX5_SET(set_l2_table_entry_in, in, opcode, MLX5_CMD_OP_SET_L2_TABLE_ENTRY); @@ -50,17 +49,16 @@ static int set_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index, u8 *mac) in_mac_addr = MLX5_ADDR_OF(set_l2_table_entry_in, in, mac_address); ether_addr_copy(&in_mac_addr[2], mac); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, set_l2_table_entry, in); } static int del_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index) { - u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(delete_l2_table_entry_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)] = {}; MLX5_SET(delete_l2_table_entry_in, in, opcode, MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); MLX5_SET(delete_l2_table_entry_in, in, table_index, index); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, delete_l2_table_entry, in); } /* UC L2 table hash node */ -- cgit v1.2.3-59-g8ed1b From 253e790e204f90adf10797ccea58910518a2c77e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 16:32:06 +0300 Subject: net/mlx5: Update vxlan.c new cmd interface Do mass update of vxlan.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c index 148b55c3db7a..82c766a95165 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c @@ -60,24 +60,22 @@ static inline u8 mlx5_vxlan_max_udp_ports(struct mlx5_core_dev *mdev) static int mlx5_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port) { - u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)] = {}; MLX5_SET(add_vxlan_udp_dport_in, in, opcode, MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT); MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, add_vxlan_udp_dport, in); } static int mlx5_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port) { - u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)] = {}; MLX5_SET(delete_vxlan_udp_dport_in, in, opcode, MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); MLX5_SET(delete_vxlan_udp_dport_in, in, vxlan_udp_port, port); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, delete_vxlan_udp_dport, in); } static struct mlx5_vxlan_port* -- cgit v1.2.3-59-g8ed1b From 3ac0e69e69ad17fe792ec68e651c6535d74fcffd Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 16:33:38 +0300 Subject: net/mlx5: Update main.c new cmd interface Do mass update of main.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 35 ++++++++++---------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 0044aa5cc676..061b69ea9cc4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -206,8 +206,7 @@ static void mlx5_set_driver_version(struct mlx5_core_dev *dev) { int driver_ver_sz = MLX5_FLD_SZ_BYTES(set_driver_version_in, driver_version); - u8 in[MLX5_ST_SZ_BYTES(set_driver_version_in)] = {0}; - u8 out[MLX5_ST_SZ_BYTES(set_driver_version_out)] = {0}; + u8 in[MLX5_ST_SZ_BYTES(set_driver_version_in)] = {}; int remaining_size = driver_ver_sz; char *string; @@ -234,7 +233,7 @@ static void mlx5_set_driver_version(struct mlx5_core_dev *dev) MLX5_SET(set_driver_version_in, in, opcode, MLX5_CMD_OP_SET_DRIVER_VERSION); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, set_driver_version, in); } static int set_dma_caps(struct pci_dev *pdev) @@ -366,7 +365,7 @@ static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev, MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); MLX5_SET(query_hca_cap_in, in, op_mod, opmod); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + err = mlx5_cmd_exec_inout(dev, query_hca_cap, in, out); if (err) { mlx5_core_warn(dev, "QUERY_HCA_CAP : type(%x) opmode(%x) Failed(%d)\n", @@ -409,12 +408,9 @@ int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type) static int set_caps(struct mlx5_core_dev *dev, void *in, int opmod) { - u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)] = {}; - MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1); - return mlx5_cmd_exec(dev, in, MLX5_ST_SZ_BYTES(set_hca_cap_in), out, - sizeof(out)); + return mlx5_cmd_exec_in(dev, set_hca_cap, in); } static int handle_hca_cap_atomic(struct mlx5_core_dev *dev, void *set_ctx) @@ -653,26 +649,24 @@ static int mlx5_core_set_hca_defaults(struct mlx5_core_dev *dev) int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id) { - u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {}; MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); MLX5_SET(enable_hca_in, in, function_id, func_id); MLX5_SET(enable_hca_in, in, embedded_cpu_function, dev->caps.embedded_cpu); - return mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + return mlx5_cmd_exec_in(dev, enable_hca, in); } int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id) { - u32 out[MLX5_ST_SZ_DW(disable_hca_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {}; MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); MLX5_SET(disable_hca_in, in, function_id, func_id); MLX5_SET(enable_hca_in, in, embedded_cpu_function, dev->caps.embedded_cpu); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, disable_hca, in); } u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev, @@ -697,14 +691,13 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev, static int mlx5_core_set_issi(struct mlx5_core_dev *dev) { - u32 query_in[MLX5_ST_SZ_DW(query_issi_in)] = {0}; - u32 query_out[MLX5_ST_SZ_DW(query_issi_out)] = {0}; + u32 query_out[MLX5_ST_SZ_DW(query_issi_out)] = {}; + u32 query_in[MLX5_ST_SZ_DW(query_issi_in)] = {}; u32 sup_issi; int err; MLX5_SET(query_issi_in, query_in, opcode, MLX5_CMD_OP_QUERY_ISSI); - err = mlx5_cmd_exec(dev, query_in, sizeof(query_in), - query_out, sizeof(query_out)); + err = mlx5_cmd_exec_inout(dev, query_issi, query_in, query_out); if (err) { u32 syndrome; u8 status; @@ -724,13 +717,11 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev) sup_issi = MLX5_GET(query_issi_out, query_out, supported_issi_dw0); if (sup_issi & (1 << 1)) { - u32 set_in[MLX5_ST_SZ_DW(set_issi_in)] = {0}; - u32 set_out[MLX5_ST_SZ_DW(set_issi_out)] = {0}; + u32 set_in[MLX5_ST_SZ_DW(set_issi_in)] = {}; MLX5_SET(set_issi_in, set_in, opcode, MLX5_CMD_OP_SET_ISSI); MLX5_SET(set_issi_in, set_in, current_issi, 1); - err = mlx5_cmd_exec(dev, set_in, sizeof(set_in), - set_out, sizeof(set_out)); + err = mlx5_cmd_exec_in(dev, set_issi, set_in); if (err) { mlx5_core_err(dev, "Failed to set ISSI to 1 err(%d)\n", err); -- cgit v1.2.3-59-g8ed1b From 62a9fec040831c84a79721ec288851a3fd6f8ec0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 16:43:58 +0300 Subject: net/mlx5: Update mcg.c new cmd interface Do mass update of mcg.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/mcg.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c index 6789fe658037..e019d68062d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c @@ -38,28 +38,26 @@ int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) { - u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {}; void *gid; MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG); MLX5_SET(attach_to_mcg_in, in, qpn, qpn); gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid); memcpy(gid, mgid, sizeof(*mgid)); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, attach_to_mcg, in); } EXPORT_SYMBOL(mlx5_core_attach_mcg); int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) { - u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {}; void *gid; MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); MLX5_SET(detach_from_mcg_in, in, qpn, qpn); gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid); memcpy(gid, mgid, sizeof(*mgid)); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, detach_from_mcg, in); } EXPORT_SYMBOL(mlx5_core_detach_mcg); -- cgit v1.2.3-59-g8ed1b From adda874c957c86c7407930142d944c1546c38260 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 16:50:37 +0300 Subject: net/mlx5: Update mr.c new cmd interface Do mass update of mr.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/mr.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 1feedf335dea..9eb51f06d3ae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -39,7 +39,7 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, u32 *in, int inlen) { - u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {0}; + u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {}; u32 mkey_index; void *mkc; int err; @@ -65,19 +65,18 @@ EXPORT_SYMBOL(mlx5_core_create_mkey); int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey) { - u32 out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {}; MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key)); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, destroy_mkey, in); } EXPORT_SYMBOL(mlx5_core_destroy_mkey); int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, u32 *out, int outlen) { - u32 in[MLX5_ST_SZ_DW(query_mkey_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_mkey_in)] = {}; memset(out, 0, outlen); MLX5_SET(query_mkey_in, in, opcode, MLX5_CMD_OP_QUERY_MKEY); @@ -99,8 +98,8 @@ static inline u32 mlx5_get_psv(u32 *out, int psv_index) int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, int npsvs, u32 *sig_index) { - u32 out[MLX5_ST_SZ_DW(create_psv_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(create_psv_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_psv_out)] = {}; + u32 in[MLX5_ST_SZ_DW(create_psv_in)] = {}; int i, err; if (npsvs > MLX5_MAX_PSVS) @@ -110,7 +109,7 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, MLX5_SET(create_psv_in, in, pd, pdn); MLX5_SET(create_psv_in, in, num_psv, npsvs); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, create_psv, in, out); if (err) return err; @@ -123,11 +122,10 @@ EXPORT_SYMBOL(mlx5_core_create_psv); int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num) { - u32 out[MLX5_ST_SZ_DW(destroy_psv_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_psv_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_psv_in)] = {}; MLX5_SET(destroy_psv_in, in, opcode, MLX5_CMD_OP_DESTROY_PSV); MLX5_SET(destroy_psv_in, in, psvn, psv_num); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, destroy_psv, in); } EXPORT_SYMBOL(mlx5_core_destroy_psv); -- cgit v1.2.3-59-g8ed1b From 86d41641ddd66fc9b2cc0aef0909cee68493a78a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 17:02:07 +0300 Subject: net/mlx5: Update pagealloc.c new cmd interface Do mass update of pagealloc.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index a3959754b927..3d6f617abb7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -135,8 +135,8 @@ static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr) static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, s32 *npages, int boot) { - u32 out[MLX5_ST_SZ_DW(query_pages_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(query_pages_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_pages_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_pages_in)] = {}; int err; MLX5_SET(query_pages_in, in, opcode, MLX5_CMD_OP_QUERY_PAGES); @@ -145,7 +145,7 @@ static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, MLX5_QUERY_PAGES_IN_OP_MOD_INIT_PAGES); MLX5_SET(query_pages_in, in, embedded_cpu_function, mlx5_core_is_ecpf(dev)); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, query_pages, in, out); if (err) return err; @@ -256,8 +256,7 @@ err_mapping: static void page_notify_fail(struct mlx5_core_dev *dev, u16 func_id, bool ec_function) { - u32 out[MLX5_ST_SZ_DW(manage_pages_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {}; int err; MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); @@ -265,7 +264,7 @@ static void page_notify_fail(struct mlx5_core_dev *dev, u16 func_id, MLX5_SET(manage_pages_in, in, function_id, func_id); MLX5_SET(manage_pages_in, in, embedded_cpu_function, ec_function); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_in(dev, manage_pages, in); if (err) mlx5_core_warn(dev, "page notify failed func_id(%d) err(%d)\n", func_id, err); @@ -373,7 +372,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, int *nclaimed, bool ec_function) { int outlen = MLX5_ST_SZ_BYTES(manage_pages_out); - u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {}; int num_claimed; u32 *out; int err; -- cgit v1.2.3-59-g8ed1b From 9b3ca3ec03169e895c414c4e98a49c0cd0cd95ec Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 17:04:30 +0300 Subject: net/mlx5: Update pd.c new cmd interface Do mass update of pd.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/pd.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pd.c b/drivers/net/ethernet/mellanox/mlx5/core/pd.c index b92d6f621c83..aabc53ad8bdd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pd.c @@ -37,12 +37,12 @@ int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn) { - u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; int err; MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, alloc_pd, in, out); if (!err) *pdn = MLX5_GET(alloc_pd_out, out, pd); return err; @@ -51,11 +51,10 @@ EXPORT_SYMBOL(mlx5_core_alloc_pd); int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn) { - u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {}; MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); MLX5_SET(dealloc_pd_in, in, pd, pdn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, dealloc_pd, in); } EXPORT_SYMBOL(mlx5_core_dealloc_pd); -- cgit v1.2.3-59-g8ed1b From 1fb5193434555df68eb065705baf223bca80ad0a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 17:05:05 +0300 Subject: net/mlx5: Update uar.c new cmd interface Do mass update of uar.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/uar.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index 816f9c434359..da481a7c12f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -38,12 +38,12 @@ int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn) { - u32 out[MLX5_ST_SZ_DW(alloc_uar_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(alloc_uar_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_uar_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_uar_in)] = {}; int err; MLX5_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, alloc_uar, in, out); if (!err) *uarn = MLX5_GET(alloc_uar_out, out, uar); return err; @@ -52,12 +52,11 @@ EXPORT_SYMBOL(mlx5_cmd_alloc_uar); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn) { - u32 out[MLX5_ST_SZ_DW(dealloc_uar_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(dealloc_uar_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_uar_in)] = {}; MLX5_SET(dealloc_uar_in, in, opcode, MLX5_CMD_OP_DEALLOC_UAR); MLX5_SET(dealloc_uar_in, in, uar, uarn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, dealloc_uar, in); } EXPORT_SYMBOL(mlx5_cmd_free_uar); -- cgit v1.2.3-59-g8ed1b From fa8110f4451c189b1efd4de6bef02ac6efb1244f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 17:05:31 +0300 Subject: net/mlx5: Update rl.c new cmd interface Do mass update of rl.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/rl.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index c9599f7c5696..99039c47ef33 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -39,8 +39,8 @@ int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, void *ctx, u32 *element_id) { - u32 in[MLX5_ST_SZ_DW(create_scheduling_element_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(create_scheduling_element_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_scheduling_element_in)] = {}; + u32 in[MLX5_ST_SZ_DW(create_scheduling_element_in)] = {}; void *schedc; int err; @@ -52,7 +52,7 @@ int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, hierarchy); memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context)); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, create_scheduling_element, in, out); if (err) return err; @@ -65,8 +65,7 @@ int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, void *ctx, u32 element_id, u32 modify_bitmask) { - u32 in[MLX5_ST_SZ_DW(modify_scheduling_element_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(modify_scheduling_element_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_scheduling_element_in)] = {}; void *schedc; schedc = MLX5_ADDR_OF(modify_scheduling_element_in, in, @@ -81,14 +80,13 @@ int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, hierarchy); memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context)); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_scheduling_element, in); } int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, u32 element_id) { - u32 in[MLX5_ST_SZ_DW(destroy_scheduling_element_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_scheduling_element_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_scheduling_element_in)] = {}; MLX5_SET(destroy_scheduling_element_in, in, opcode, MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT); @@ -97,7 +95,7 @@ int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, MLX5_SET(destroy_scheduling_element_in, in, scheduling_hierarchy, hierarchy); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, destroy_scheduling_element, in); } static bool mlx5_rl_are_equal_raw(struct mlx5_rl_entry *entry, void *rl_in, @@ -144,8 +142,7 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev, struct mlx5_rl_entry *entry, bool set) { - u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {}; - u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {}; + u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {}; void *pp_context; pp_context = MLX5_ADDR_OF(set_pp_rate_limit_in, in, ctx); @@ -155,7 +152,7 @@ static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev, MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, entry->index); if (set) memcpy(pp_context, entry->rl_raw, sizeof(entry->rl_raw)); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, set_pp_rate_limit, in); } bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate) -- cgit v1.2.3-59-g8ed1b From 2276a0dfc17be2cd89cfb8ed0a7ef6fad417aa3e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 21:00:23 +0300 Subject: net/mlx5: Update port.c new cmd interface Do mass update of port.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index cc262b30aed5..9f829e68fc73 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -763,24 +763,23 @@ EXPORT_SYMBOL_GPL(mlx5_query_port_ets_rate_limit); int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode) { - u32 in[MLX5_ST_SZ_DW(set_wol_rol_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(set_wol_rol_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_wol_rol_in)] = {}; MLX5_SET(set_wol_rol_in, in, opcode, MLX5_CMD_OP_SET_WOL_ROL); MLX5_SET(set_wol_rol_in, in, wol_mode_valid, 1); MLX5_SET(set_wol_rol_in, in, wol_mode, wol_mode); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, set_wol_rol, in); } EXPORT_SYMBOL_GPL(mlx5_set_port_wol); int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode) { - u32 in[MLX5_ST_SZ_DW(query_wol_rol_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(query_wol_rol_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_wol_rol_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_wol_rol_in)] = {}; int err; MLX5_SET(query_wol_rol_in, in, opcode, MLX5_CMD_OP_QUERY_WOL_ROL); - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(mdev, query_wol_rol, in, out); if (!err) *wol_mode = MLX5_GET(query_wol_rol_out, out, wol_mode); -- cgit v1.2.3-59-g8ed1b From 7ba294e43595ab728adf2baf0aab378095768bfe Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 21:02:08 +0300 Subject: net/mlx5: Update SW steering new cmd interface Do mass update of SW steering to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Signed-off-by: Leon Romanovsky --- .../ethernet/mellanox/mlx5/core/steering/dr_cmd.c | 33 +++++++++------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c index 461b39376daf..6bd34b293007 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c @@ -18,7 +18,7 @@ int mlx5dr_cmd_query_esw_vport_context(struct mlx5_core_dev *mdev, MLX5_SET(query_esw_vport_context_in, in, other_vport, other_vport); MLX5_SET(query_esw_vport_context_in, in, vport_number, vport_number); - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(mdev, query_esw_vport_context, in, out); if (err) return err; @@ -51,7 +51,7 @@ int mlx5dr_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_vport, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | HCA_CAP_OPMOD_GET_CUR); - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, out_size); + err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out); if (err) { kfree(out); return err; @@ -141,7 +141,7 @@ int mlx5dr_cmd_query_flow_table(struct mlx5_core_dev *dev, MLX5_SET(query_flow_table_in, in, table_type, type); MLX5_SET(query_flow_table_in, in, table_id, table_id); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, query_flow_table, in, out); if (err) return err; @@ -158,12 +158,11 @@ int mlx5dr_cmd_query_flow_table(struct mlx5_core_dev *dev, int mlx5dr_cmd_sync_steering(struct mlx5_core_dev *mdev) { - u32 out[MLX5_ST_SZ_DW(sync_steering_out)] = {}; u32 in[MLX5_ST_SZ_DW(sync_steering_in)] = {}; MLX5_SET(sync_steering_in, in, opcode, MLX5_CMD_OP_SYNC_STEERING); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, sync_steering, in); } int mlx5dr_cmd_set_fte_modify_and_vport(struct mlx5_core_dev *mdev, @@ -214,14 +213,13 @@ int mlx5dr_cmd_del_flow_table_entry(struct mlx5_core_dev *mdev, u32 table_type, u32 table_id) { - u32 out[MLX5_ST_SZ_DW(delete_fte_out)] = {}; u32 in[MLX5_ST_SZ_DW(delete_fte_in)] = {}; MLX5_SET(delete_fte_in, in, opcode, MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); MLX5_SET(delete_fte_in, in, table_type, table_type); MLX5_SET(delete_fte_in, in, table_id, table_id); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, delete_fte, in); } int mlx5dr_cmd_alloc_modify_header(struct mlx5_core_dev *mdev, @@ -263,7 +261,6 @@ out: int mlx5dr_cmd_dealloc_modify_header(struct mlx5_core_dev *mdev, u32 modify_header_id) { - u32 out[MLX5_ST_SZ_DW(dealloc_modify_header_context_out)] = {}; u32 in[MLX5_ST_SZ_DW(dealloc_modify_header_context_in)] = {}; MLX5_SET(dealloc_modify_header_context_in, in, opcode, @@ -271,7 +268,7 @@ int mlx5dr_cmd_dealloc_modify_header(struct mlx5_core_dev *mdev, MLX5_SET(dealloc_modify_header_context_in, in, modify_header_id, modify_header_id); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, dealloc_modify_header_context, in); } int mlx5dr_cmd_create_empty_flow_group(struct mlx5_core_dev *mdev, @@ -292,7 +289,7 @@ int mlx5dr_cmd_create_empty_flow_group(struct mlx5_core_dev *mdev, MLX5_SET(create_flow_group_in, in, table_type, table_type); MLX5_SET(create_flow_group_in, in, table_id, table_id); - err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec_inout(mdev, create_flow_group, in, out); if (err) goto out; @@ -309,14 +306,14 @@ int mlx5dr_cmd_destroy_flow_group(struct mlx5_core_dev *mdev, u32 group_id) { u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)] = {}; - u32 out[MLX5_ST_SZ_DW(destroy_flow_group_out)] = {}; - MLX5_SET(create_flow_group_in, in, opcode, MLX5_CMD_OP_DESTROY_FLOW_GROUP); + MLX5_SET(destroy_flow_group_in, in, opcode, + MLX5_CMD_OP_DESTROY_FLOW_GROUP); MLX5_SET(destroy_flow_group_in, in, table_type, table_type); MLX5_SET(destroy_flow_group_in, in, table_id, table_id); MLX5_SET(destroy_flow_group_in, in, group_id, group_id); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, destroy_flow_group, in); } int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev, @@ -360,7 +357,7 @@ int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev, MLX5_SET(create_flow_table_in, in, flow_table_context.reformat_en, attr->reformat_en); - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(mdev, create_flow_table, in, out); if (err) return err; @@ -379,7 +376,6 @@ int mlx5dr_cmd_destroy_flow_table(struct mlx5_core_dev *mdev, u32 table_id, u32 table_type) { - u32 out[MLX5_ST_SZ_DW(destroy_flow_table_out)] = {}; u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)] = {}; MLX5_SET(destroy_flow_table_in, in, opcode, @@ -387,7 +383,7 @@ int mlx5dr_cmd_destroy_flow_table(struct mlx5_core_dev *mdev, MLX5_SET(destroy_flow_table_in, in, table_type, table_type); MLX5_SET(destroy_flow_table_in, in, table_id, table_id); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(mdev, destroy_flow_table, in); } int mlx5dr_cmd_create_reformat_ctx(struct mlx5_core_dev *mdev, @@ -434,7 +430,6 @@ int mlx5dr_cmd_create_reformat_ctx(struct mlx5_core_dev *mdev, void mlx5dr_cmd_destroy_reformat_ctx(struct mlx5_core_dev *mdev, u32 reformat_id) { - u32 out[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_out)] = {}; u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_in)] = {}; MLX5_SET(dealloc_packet_reformat_context_in, in, opcode, @@ -442,7 +437,7 @@ void mlx5dr_cmd_destroy_reformat_ctx(struct mlx5_core_dev *mdev, MLX5_SET(dealloc_packet_reformat_context_in, in, packet_reformat_id, reformat_id); - mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(mdev, dealloc_packet_reformat_context, in); } int mlx5dr_cmd_query_gid(struct mlx5_core_dev *mdev, u8 vhca_port_num, @@ -458,7 +453,7 @@ int mlx5dr_cmd_query_gid(struct mlx5_core_dev *mdev, u8 vhca_port_num, MLX5_SET(query_roce_address_in, in, roce_address_index, index); MLX5_SET(query_roce_address_in, in, vhca_port_num, vhca_port_num); - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(mdev, query_roce_address, in, out); if (err) return err; -- cgit v1.2.3-59-g8ed1b From e0b4b4722dfac09658d1519b296cf8dc349a2451 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 Apr 2020 21:03:33 +0300 Subject: net/mlx5: Update transobj.c new cmd interface Do mass update of transobj.c to reuse newly introduced mlx5_cmd_exec_in*() interfaces. Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qp.c | 32 +++--- drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 +- .../net/ethernet/mellanox/mlx5/core/en_common.c | 7 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +- .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 29 +++--- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 113 ++++++++------------- include/linux/mlx5/transobj.h | 19 ++-- 9 files changed, 85 insertions(+), 131 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 3ecd1864b3c8..af599c8b88aa 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1255,7 +1255,7 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_sq *sq, u32 tdn, struct ib_pd *pd) { - u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid); @@ -1263,7 +1263,7 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, if (qp->flags & MLX5_IB_QP_UNDERLAY) MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); - return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); + return mlx5_core_create_tis(dev->mdev, in, &sq->tisn); } static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, @@ -1460,9 +1460,8 @@ static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, u32 tdn, - u32 *qp_flags_en, - struct ib_pd *pd, - u32 *out, int outlen) + u32 *qp_flags_en, struct ib_pd *pd, + u32 *out) { u8 lb_flag = 0; u32 *in; @@ -1495,9 +1494,8 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, } MLX5_SET(tirc, tirc, self_lb_block, lb_flag); - - err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen); - + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(dev->mdev, create_tir, in, out); rq->tirn = MLX5_GET(create_tir_out, out, tirn); if (!err && MLX5_GET(tirc, tirc, self_lb_block)) { err = mlx5_ib_enable_lb(dev, false, true); @@ -1557,9 +1555,8 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (err) goto err_destroy_sq; - err = create_raw_packet_qp_tir( - dev, rq, tdn, &qp->flags_en, pd, out, - MLX5_ST_SZ_BYTES(create_tir_out)); + err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en, pd, + out); if (err) goto err_destroy_rq; @@ -1854,7 +1851,8 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); create_tir: - err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen); + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(dev->mdev, create_tir, in, out); qp->rss_qp.tirn = MLX5_GET(create_tir_out, out, tirn); if (!err && MLX5_GET(tirc, tirc, self_lb_block)) { @@ -2933,7 +2931,7 @@ static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); - err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + err = mlx5_core_modify_tis(dev, sq->tisn, in); kvfree(in); @@ -2960,7 +2958,7 @@ static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity); - err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + err = mlx5_core_modify_tis(dev, sq->tisn, in); kvfree(in); @@ -3240,7 +3238,7 @@ static int modify_raw_packet_qp_rq( "RAW PACKET QP counters are not supported on current FW\n"); } - err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in, inlen); + err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in); if (err) goto out; @@ -3303,7 +3301,7 @@ static int modify_raw_packet_qp_sq( MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); } - err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen); + err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in); if (err) { /* Remove new rate from table if failed */ if (new_rate_added) @@ -6444,7 +6442,7 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, "Receive WQ counters are not supported on current FW\n"); } - err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen); + err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in); if (!err) rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 12a61bf82c14..1599b05f3c5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1012,7 +1012,7 @@ int mlx5e_redirect_rqt(struct mlx5e_priv *priv, u32 rqtn, int sz, void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_rss_params *rss_params, const struct mlx5e_tirc_config *ttconfig, void *tirc, bool inner); -void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen); +void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in); struct mlx5e_tirc_config mlx5e_tirc_get_default_config(enum mlx5e_traffic_types tt); struct mlx5e_xsk_param; @@ -1102,8 +1102,8 @@ void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv); void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv); #endif -int mlx5e_create_tir(struct mlx5_core_dev *mdev, - struct mlx5e_tir *tir, u32 *in, int inlen); +int mlx5e_create_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, + u32 *in); void mlx5e_destroy_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir); int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index f7890e0ce96c..af3228b3f303 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -36,12 +36,11 @@ * Global resources are common to all the netdevices crated on the same nic. */ -int mlx5e_create_tir(struct mlx5_core_dev *mdev, - struct mlx5e_tir *tir, u32 *in, int inlen) +int mlx5e_create_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, u32 *in) { int err; - err = mlx5_core_create_tir(mdev, in, inlen, &tir->tirn); + err = mlx5_core_create_tir(mdev, in, &tir->tirn); if (err) return err; @@ -167,7 +166,7 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb) mutex_lock(&mdev->mlx5e_res.td.list_lock); list_for_each_entry(tir, &mdev->mlx5e_res.td.tirs_list, list) { tirn = tir->tirn; - err = mlx5_core_modify_tir(mdev, tirn, in, inlen); + err = mlx5_core_modify_tir(mdev, tirn, in); if (err) goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 6d703ddee4e2..de8250820b06 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1204,7 +1204,7 @@ int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, } if (hash_changed) - mlx5e_modify_tirs_hash(priv, in, inlen); + mlx5e_modify_tirs_hash(priv, in); mutex_unlock(&priv->state_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index 3bc2ac3d53fc..83c9b2bbc4af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -858,7 +858,7 @@ static int mlx5e_set_rss_hash_opt(struct mlx5e_priv *priv, goto out; priv->rss_params.rx_hash_fields[tt] = rx_hash_field; - mlx5e_modify_tirs_hash(priv, in, inlen); + mlx5e_modify_tirs_hash(priv, in); out: mutex_unlock(&priv->state_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 30970b405040..05dbe8b9caac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -721,7 +721,7 @@ int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state) MLX5_SET(modify_rq_in, in, rq_state, curr_state); MLX5_SET(rqc, rqc, state, next_state); - err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + err = mlx5_core_modify_rq(mdev, rq->rqn, in); kvfree(in); @@ -752,7 +752,7 @@ static int mlx5e_modify_rq_scatter_fcs(struct mlx5e_rq *rq, bool enable) MLX5_SET(rqc, rqc, scatter_fcs, enable); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); - err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + err = mlx5_core_modify_rq(mdev, rq->rqn, in); kvfree(in); @@ -781,7 +781,7 @@ static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd) MLX5_SET(rqc, rqc, vsd, vsd); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); - err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + err = mlx5_core_modify_rq(mdev, rq->rqn, in); kvfree(in); @@ -1259,7 +1259,7 @@ int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); } - err = mlx5_core_modify_sq(mdev, sqn, in, inlen); + err = mlx5_core_modify_sq(mdev, sqn, in); kvfree(in); @@ -2698,7 +2698,7 @@ static void mlx5e_update_rx_hash_fields(struct mlx5e_tirc_config *ttconfig, ttconfig->rx_hash_fields = rx_hash_fields; } -void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen) +void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in) { void *tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx); struct mlx5e_rss_params *rss = &priv->rss_params; @@ -2714,7 +2714,7 @@ void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen) mlx5e_update_rx_hash_fields(&ttconfig, tt, rss->rx_hash_fields[tt]); mlx5e_build_indir_tir_ctx_hash(rss, &ttconfig, tirc, false); - mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, inlen); + mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in); } if (!mlx5e_tunnel_inner_ft_supported(priv->mdev)) @@ -2725,8 +2725,7 @@ void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen) mlx5e_update_rx_hash_fields(&ttconfig, tt, rss->rx_hash_fields[tt]); mlx5e_build_indir_tir_ctx_hash(rss, &ttconfig, tirc, true); - mlx5_core_modify_tir(mdev, priv->inner_indir_tir[tt].tirn, in, - inlen); + mlx5_core_modify_tir(mdev, priv->inner_indir_tir[tt].tirn, in); } } @@ -2752,15 +2751,13 @@ static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv) mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc); for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { - err = mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, - inlen); + err = mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in); if (err) goto free_in; } for (ix = 0; ix < priv->max_nch; ix++) { - err = mlx5_core_modify_tir(mdev, priv->direct_tir[ix].tirn, - in, inlen); + err = mlx5_core_modify_tir(mdev, priv->direct_tir[ix].tirn, in); if (err) goto free_in; } @@ -3214,7 +3211,7 @@ int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn) if (mlx5_lag_is_lacp_owner(mdev)) MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1); - return mlx5_core_create_tis(mdev, in, MLX5_ST_SZ_BYTES(create_tis_in), tisn); + return mlx5_core_create_tis(mdev, in, tisn); } void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn) @@ -3332,7 +3329,7 @@ int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc) tir = &priv->indir_tir[tt]; tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); mlx5e_build_indir_tir_ctx(priv, tt, tirc); - err = mlx5e_create_tir(priv->mdev, tir, in, inlen); + err = mlx5e_create_tir(priv->mdev, tir, in); if (err) { mlx5_core_warn(priv->mdev, "create indirect tirs failed, %d\n", err); goto err_destroy_inner_tirs; @@ -3347,7 +3344,7 @@ int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc) tir = &priv->inner_indir_tir[i]; tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); mlx5e_build_inner_indir_tir_ctx(priv, i, tirc); - err = mlx5e_create_tir(priv->mdev, tir, in, inlen); + err = mlx5e_create_tir(priv->mdev, tir, in); if (err) { mlx5_core_warn(priv->mdev, "create inner indirect tirs failed, %d\n", err); goto err_destroy_inner_tirs; @@ -3390,7 +3387,7 @@ int mlx5e_create_direct_tirs(struct mlx5e_priv *priv, struct mlx5e_tir *tirs) tir = &tirs[ix]; tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); mlx5e_build_direct_tir_ctx(priv, tir->rqt.rqtn, tirc); - err = mlx5e_create_tir(priv->mdev, tir, in, inlen); + err = mlx5e_create_tir(priv->mdev, tir, in); if (unlikely(err)) goto err_destroy_ch_tirs; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 438128dde187..88c0e460e995 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -568,7 +568,7 @@ struct mlx5_core_dev *mlx5e_hairpin_get_mdev(struct net *net, int ifindex) static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp) { - u32 in[MLX5_ST_SZ_DW(create_tir_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_tir_in)] = {}; void *tirc; int err; @@ -582,7 +582,7 @@ static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp) MLX5_SET(tirc, tirc, inline_rqn, hp->pair->rqn[0]); MLX5_SET(tirc, tirc, transport_domain, hp->tdn); - err = mlx5_core_create_tir(hp->func_mdev, in, MLX5_ST_SZ_BYTES(create_tir_in), &hp->tirn); + err = mlx5_core_create_tir(hp->func_mdev, in, &hp->tirn); if (err) goto create_tir_err; @@ -666,7 +666,7 @@ static int mlx5e_hairpin_create_indirect_tirs(struct mlx5e_hairpin *hp) mlx5e_build_indir_tir_ctx_hash(&priv->rss_params, &ttconfig, tirc, false); err = mlx5_core_create_tir(hp->func_mdev, in, - MLX5_ST_SZ_BYTES(create_tir_in), &hp->indir_tirn[tt]); + &hp->indir_tirn[tt]); if (err) { mlx5_core_warn(hp->func_mdev, "create indirect tirs failed, %d\n", err); goto err_destroy_tirs; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index b1068500f1df..01cc00ad8acf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -36,14 +36,14 @@ int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) { - u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {}; int err; MLX5_SET(alloc_transport_domain_in, in, opcode, MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, alloc_transport_domain, in, out); if (!err) *tdn = MLX5_GET(alloc_transport_domain_out, out, transport_domain); @@ -54,19 +54,18 @@ EXPORT_SYMBOL(mlx5_core_alloc_transport_domain); void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) { - u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {}; MLX5_SET(dealloc_transport_domain_in, in, opcode, MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, dealloc_transport_domain, in); } EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain); int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) { - u32 out[MLX5_ST_SZ_DW(create_rq_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_rq_out)] = {}; int err; MLX5_SET(create_rq_in, in, opcode, MLX5_CMD_OP_CREATE_RQ); @@ -78,44 +77,39 @@ int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) } EXPORT_SYMBOL(mlx5_core_create_rq); -int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen) +int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in) { - u32 out[MLX5_ST_SZ_DW(modify_rq_out)]; - MLX5_SET(modify_rq_in, in, rqn, rqn); MLX5_SET(modify_rq_in, in, opcode, MLX5_CMD_OP_MODIFY_RQ); - memset(out, 0, sizeof(out)); - return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_rq, in); } EXPORT_SYMBOL(mlx5_core_modify_rq); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) { - u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); MLX5_SET(destroy_rq_in, in, rqn, rqn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_rq, in); } EXPORT_SYMBOL(mlx5_core_destroy_rq); int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out) { - u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {0}; - int outlen = MLX5_ST_SZ_BYTES(query_rq_out); + u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {}; MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ); MLX5_SET(query_rq_in, in, rqn, rqn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec_inout(dev, query_rq, in, out); } EXPORT_SYMBOL(mlx5_core_query_rq); int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn) { - u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {}; int err; MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ); @@ -126,34 +120,30 @@ int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn) return err; } -int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen) +int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in) { - u32 out[MLX5_ST_SZ_DW(modify_sq_out)] = {0}; - MLX5_SET(modify_sq_in, in, sqn, sqn); MLX5_SET(modify_sq_in, in, opcode, MLX5_CMD_OP_MODIFY_SQ); - return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_sq, in); } EXPORT_SYMBOL(mlx5_core_modify_sq); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn) { - u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_sq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {}; MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); MLX5_SET(destroy_sq_in, in, sqn, sqn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_sq, in); } int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out) { - u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {0}; - int outlen = MLX5_ST_SZ_BYTES(query_sq_out); + u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {}; MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ); MLX5_SET(query_sq_in, in, sqn, sqn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + return mlx5_cmd_exec_inout(dev, query_sq, in, out); } EXPORT_SYMBOL(mlx5_core_query_sq); @@ -182,24 +172,13 @@ out: } EXPORT_SYMBOL_GPL(mlx5_core_query_sq_state); -int mlx5_core_create_tir_out(struct mlx5_core_dev *dev, - u32 *in, int inlen, - u32 *out, int outlen) -{ - MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); - - return mlx5_cmd_exec(dev, in, inlen, out, outlen); -} -EXPORT_SYMBOL(mlx5_core_create_tir_out); - -int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, - u32 *tirn) +int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, u32 *tirn) { u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {}; int err; - err = mlx5_core_create_tir_out(dev, in, inlen, - out, sizeof(out)); + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(dev, create_tir, in, out); if (!err) *tirn = MLX5_GET(create_tir_out, out, tirn); @@ -207,35 +186,30 @@ int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, } EXPORT_SYMBOL(mlx5_core_create_tir); -int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, - int inlen) +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in) { - u32 out[MLX5_ST_SZ_DW(modify_tir_out)] = {0}; - MLX5_SET(modify_tir_in, in, tirn, tirn); MLX5_SET(modify_tir_in, in, opcode, MLX5_CMD_OP_MODIFY_TIR); - return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_tir, in); } void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn) { - u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {}; MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); MLX5_SET(destroy_tir_in, in, tirn, tirn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_tir, in); } EXPORT_SYMBOL(mlx5_core_destroy_tir); -int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, - u32 *tisn) +int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, u32 *tisn) { - u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {}; int err; MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS); - err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, create_tis, in, out); if (!err) *tisn = MLX5_GET(create_tis_out, out, tisn); @@ -243,33 +217,29 @@ int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, } EXPORT_SYMBOL(mlx5_core_create_tis); -int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, - int inlen) +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in) { - u32 out[MLX5_ST_SZ_DW(modify_tis_out)] = {0}; - MLX5_SET(modify_tis_in, in, tisn, tisn); MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS); - return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + return mlx5_cmd_exec_in(dev, modify_tis, in); } EXPORT_SYMBOL(mlx5_core_modify_tis); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn) { - u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {}; MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); MLX5_SET(destroy_tis_in, in, tisn, tisn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_tis, in); } EXPORT_SYMBOL(mlx5_core_destroy_tis); int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqtn) { - u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {}; int err; MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT); @@ -284,7 +254,7 @@ EXPORT_SYMBOL(mlx5_core_create_rqt); int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, int inlen) { - u32 out[MLX5_ST_SZ_DW(modify_rqt_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_rqt_out)] = {}; MLX5_SET(modify_rqt_in, in, rqtn, rqtn); MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT); @@ -293,12 +263,11 @@ int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn) { - u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_rqt, in); } EXPORT_SYMBOL(mlx5_core_destroy_rqt); @@ -383,7 +352,7 @@ static int mlx5_hairpin_modify_rq(struct mlx5_core_dev *func_mdev, u32 rqn, int curr_state, int next_state, u16 peer_vhca, u32 peer_sq) { - u32 in[MLX5_ST_SZ_DW(modify_rq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(modify_rq_in)] = {}; void *rqc; rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); @@ -396,8 +365,7 @@ static int mlx5_hairpin_modify_rq(struct mlx5_core_dev *func_mdev, u32 rqn, MLX5_SET(modify_rq_in, in, rq_state, curr_state); MLX5_SET(rqc, rqc, state, next_state); - return mlx5_core_modify_rq(func_mdev, rqn, - in, MLX5_ST_SZ_BYTES(modify_rq_in)); + return mlx5_core_modify_rq(func_mdev, rqn, in); } static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn, @@ -417,8 +385,7 @@ static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn, MLX5_SET(modify_sq_in, in, sq_state, curr_state); MLX5_SET(sqc, sqc, state, next_state); - return mlx5_core_modify_sq(peer_mdev, sqn, - in, MLX5_ST_SZ_BYTES(modify_sq_in)); + return mlx5_core_modify_sq(peer_mdev, sqn, in); } static int mlx5_hairpin_pair_queues(struct mlx5_hairpin *hp) diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h index dc6b1e7cb8c4..028f442530cf 100644 --- a/include/linux/mlx5/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -39,27 +39,20 @@ int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn); void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn); int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn); -int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen); +int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn); int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out); int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn); -int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen); +int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn); int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out); int mlx5_core_query_sq_state(struct mlx5_core_dev *dev, u32 sqn, u8 *state); -int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, - u32 *tirn); -int mlx5_core_create_tir_out(struct mlx5_core_dev *dev, - u32 *in, int inlen, - u32 *out, int outlen); -int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, - int inlen); +int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, u32 *tirn); +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in); void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn); -int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, - u32 *tisn); -int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, - int inlen); +int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, u32 *tisn); +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn); int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqtn); -- cgit v1.2.3-59-g8ed1b From b75326c201242de9495ff98e5d5cff41d7fc0d9d Mon Sep 17 00:00:00 2001 From: Fernando Gont Date: Sun, 19 Apr 2020 09:24:57 -0300 Subject: ipv6: Honor all IPv6 PIO Valid Lifetime values RFC4862 5.5.3 e) prevents received Router Advertisements from reducing the Valid Lifetime of configured addresses to less than two hours, thus preventing hosts from reacting to the information provided by a router that has positive knowledge that a prefix has become invalid. This patch makes hosts honor all Valid Lifetime values, as per draft-gont-6man-slaac-renum-06, Section 4.2. This is meant to help mitigate the problem discussed in draft-ietf-v6ops-slaac-renum. Note: Attacks aiming at disabling an advertised prefix via a Valid Lifetime of 0 are not really more harmful than other attacks that can be performed via forged RA messages, such as those aiming at completely disabling a next-hop router via an RA that advertises a Router Lifetime of 0, or performing a Denial of Service (DoS) attack by advertising illegitimate prefixes via forged PIOs. In scenarios where RA-based attacks are of concern, proper mitigations such as RA-Guard [RFC6105] [RFC7113] should be implemented. Signed-off-by: Fernando Gont Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 -- net/ipv6/addrconf.c | 27 +++++++-------------------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index e0eabe58aa8b..fdb07105384c 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -6,8 +6,6 @@ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ -#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ - #define TEMP_VALID_LIFETIME (7*86400) #define TEMP_PREFERRED_LIFETIME (86400) #define REGEN_MAX_RETRY (3) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..27b4fb6e452b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2564,7 +2564,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, __u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); - int create = 0, update_lft = 0; + int create = 0; if (!ifp && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; @@ -2608,32 +2608,19 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, unsigned long now; u32 stored_lft; - /* update lifetime (RFC2462 5.5.3 e) */ + /* Update lifetime (RFC4862 5.5.3 e) + * We deviate from RFC4862 by honoring all Valid Lifetimes to + * improve the reaction of SLAAC to renumbering events + * (draft-gont-6man-slaac-renum-06, Section 4.2) + */ spin_lock_bh(&ifp->lock); now = jiffies; if (ifp->valid_lft > (now - ifp->tstamp) / HZ) stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!create && stored_lft) { - const u32 minimum_lft = min_t(u32, - stored_lft, MIN_VALID_LIFETIME); - valid_lft = max(valid_lft, minimum_lft); - - /* RFC4862 Section 5.5.3e: - * "Note that the preferred lifetime of the - * corresponding address is always reset to - * the Preferred Lifetime in the received - * Prefix Information option, regardless of - * whether the valid lifetime is also reset or - * ignored." - * - * So we should always update prefered_lft here. - */ - update_lft = 1; - } - if (update_lft) { + if (!create && stored_lft) { ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; ifp->tstamp = now; -- cgit v1.2.3-59-g8ed1b From e131a5634830047923c694b4ce0c3b31745ff01b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 21 Apr 2020 16:41:08 +0300 Subject: net: dsa: add GRO support via gro_cells gro_cells lib is used by different encapsulating netdevices, such as geneve, macsec, vxlan etc. to speed up decapsulated traffic processing. CPU tag is a sort of "encapsulation", and we can use the same mechs to greatly improve overall DSA performance. skbs are passed to the GRO layer after removing CPU tags, so we don't need any new packet offload types as it was firstly proposed by me in the first GRO-over-DSA variant [1]. The size of struct gro_cells is sizeof(void *), so hot struct dsa_slave_priv becomes only 4/8 bytes bigger, and all critical fields remain in one 32-byte cacheline. The other positive side effect is that drivers for network devices that can be shipped as CPU ports of DSA-driven switches can now use napi_gro_frags() to pass skbs to kernel. Packets built that way are completely non-linear and are likely being dropped without GRO. This was tested on to-be-mainlined-soon Ethernet driver that uses napi_gro_frags(), and the overall performance was on par with the variant from [1], sometimes even better due to minimal overhead. net.core.gro_normal_batch tuning may help to push it to the limit on particular setups and platforms. iperf3 IPoE VLAN NAT TCP forwarding (port1.218 -> port0) setup on 1.2 GHz MIPS board: 5.7-rc2 baseline: [ID] Interval Transfer Bitrate Retr [ 5] 0.00-120.01 sec 9.00 GBytes 644 Mbits/sec 413 sender [ 5] 0.00-120.00 sec 8.99 GBytes 644 Mbits/sec receiver Iface RX packets TX packets eth0 7097731 7097702 port0 426050 6671829 port1 6671681 425862 port1.218 6671677 425851 With this patch: [ID] Interval Transfer Bitrate Retr [ 5] 0.00-120.01 sec 12.2 GBytes 870 Mbits/sec 122 sender [ 5] 0.00-120.00 sec 12.2 GBytes 870 Mbits/sec receiver Iface RX packets TX packets eth0 9474792 9474777 port0 455200 353288 port1 9019592 455035 port1.218 353144 455024 v2: - Add some performance examples in the commit message; - No functional changes. [1] https://lore.kernel.org/netdev/20191230143028.27313-1-alobakin@dlink.ru/ Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/dsa/Kconfig | 1 + net/dsa/dsa.c | 2 +- net/dsa/dsa_priv.h | 3 +++ net/dsa/slave.c | 10 +++++++++- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 92663dcb3aa2..739613070d07 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,6 +9,7 @@ menuconfig NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA depends on BRIDGE || BRIDGE=n + select GRO_CELLS select NET_SWITCHDEV select PHYLINK select NET_DEVLINK diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ee2610c4d46a..0384a911779e 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -234,7 +234,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (dsa_skb_defer_rx_timestamp(p, skb)) return 0; - netif_receive_skb(skb); + gro_cells_receive(&p->gcells, skb); return 0; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 904cc7c9b882..6d9a1ef65fa0 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -11,6 +11,7 @@ #include #include #include +#include enum { DSA_NOTIFIER_AGEING_TIME, @@ -77,6 +78,8 @@ struct dsa_slave_priv { struct pcpu_sw_netstats *stats64; + struct gro_cells gcells; + /* DSA port data, such as switch, port index, etc. */ struct dsa_port *dp; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e94eb1aac602..f2c241cf3a80 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1762,6 +1762,11 @@ int dsa_slave_create(struct dsa_port *port) free_netdev(slave_dev); return -ENOMEM; } + + ret = gro_cells_init(&p->gcells, slave_dev); + if (ret) + goto out_free; + p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); p->xmit = cpu_dp->tag_ops->xmit; @@ -1781,7 +1786,7 @@ int dsa_slave_create(struct dsa_port *port) ret = dsa_slave_phy_setup(slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - goto out_free; + goto out_gcells; } dsa_slave_notify(slave_dev, DSA_PORT_REGISTER); @@ -1800,6 +1805,8 @@ out_phy: phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); phylink_destroy(p->dp->pl); +out_gcells: + gro_cells_destroy(&p->gcells); out_free: free_percpu(p->stats64); free_netdev(slave_dev); @@ -1820,6 +1827,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); unregister_netdev(slave_dev); phylink_destroy(dp->pl); + gro_cells_destroy(&p->gcells); free_percpu(p->stats64); free_netdev(slave_dev); } -- cgit v1.2.3-59-g8ed1b From 2196d831205bad6bc5cd328baf0ae02234629695 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 22 Apr 2020 06:16:06 -0700 Subject: qed: Enable device error reporting capability. The patch enables the device to send error messages to root port when an error is detected. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 96356e897c80..38a1d26ca9db 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "qed.h" #include "qed_sriov.h" @@ -129,6 +130,8 @@ static void qed_free_pci(struct qed_dev *cdev) { struct pci_dev *pdev = cdev->pdev; + pci_disable_pcie_error_reporting(pdev); + if (cdev->doorbells && cdev->db_size) iounmap(cdev->doorbells); if (cdev->regview) @@ -231,6 +234,12 @@ static int qed_init_pci(struct qed_dev *cdev, struct pci_dev *pdev) return -ENOMEM; } + /* AER (Advanced Error reporting) configuration */ + rc = pci_enable_pcie_error_reporting(pdev); + if (rc) + DP_VERBOSE(cdev, NETIF_MSG_DRV, + "Failed to configure PCIe AER [%d]\n", rc); + return 0; err2: -- cgit v1.2.3-59-g8ed1b From 731815e720ae7e47a19753e00ea80651b2d52b3b Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 22 Apr 2020 06:16:07 -0700 Subject: qede: Add support for handling the pcie errors. The error recovery is handled by management firmware (MFW) with the help of qed/qede drivers. Upon detecting the errors, driver informs MFW about this event which in turn starts a recovery process. MFW sends ERROR_RECOVERY notification to the driver which performs the required cleanup/recovery from the driver side. Signed-off-by: Sudarsana Reddy Kalluru Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede.h | 1 + drivers/net/ethernet/qlogic/qede/qede_main.c | 68 +++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index 234c6f30effb..1a708f95ce94 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -485,6 +485,7 @@ struct qede_fastpath { #define QEDE_SP_RECOVERY 0 #define QEDE_SP_RX_MODE 1 +#define QEDE_SP_AER 7 #ifdef CONFIG_RFS_ACCEL int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 34fa3917eb33..9b456198cb50 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -60,6 +60,7 @@ #include #include #include +#include #include "qede.h" #include "qede_ptp.h" @@ -124,6 +125,8 @@ static const struct pci_device_id qede_pci_tbl[] = { MODULE_DEVICE_TABLE(pci, qede_pci_tbl); static int qede_probe(struct pci_dev *pdev, const struct pci_device_id *id); +static pci_ers_result_t +qede_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state); #define TX_TIMEOUT (5 * HZ) @@ -203,6 +206,10 @@ static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param) } #endif +static const struct pci_error_handlers qede_err_handler = { + .error_detected = qede_io_error_detected, +}; + static struct pci_driver qede_pci_driver = { .name = "qede", .id_table = qede_pci_tbl, @@ -212,6 +219,7 @@ static struct pci_driver qede_pci_driver = { #ifdef CONFIG_QED_SRIOV .sriov_configure = qede_sriov_configure, #endif + .err_handler = &qede_err_handler, }; static struct qed_eth_cb_ops qede_ll_ops = { @@ -974,7 +982,8 @@ static void qede_sp_task(struct work_struct *work) /* SRIOV must be disabled outside the lock to avoid a deadlock. * The recovery of the active VFs is currently not supported. */ - qede_sriov_configure(edev->pdev, 0); + if (pci_num_vf(edev->pdev)) + qede_sriov_configure(edev->pdev, 0); #endif qede_lock(edev); qede_recovery_handler(edev); @@ -994,6 +1003,17 @@ static void qede_sp_task(struct work_struct *work) } #endif __qede_unlock(edev); + + if (test_and_clear_bit(QEDE_SP_AER, &edev->sp_flags)) { +#ifdef CONFIG_QED_SRIOV + /* SRIOV must be disabled outside the lock to avoid a deadlock. + * The recovery of the active VFs is currently not supported. + */ + if (pci_num_vf(edev->pdev)) + qede_sriov_configure(edev->pdev, 0); +#endif + edev->ops->common->recovery_process(edev->cdev); + } } static void qede_update_pf_params(struct qed_dev *cdev) @@ -2579,3 +2599,49 @@ static void qede_get_eth_tlv_data(void *dev, void *data) etlv->num_txqs_full_set = true; etlv->num_rxqs_full_set = true; } + +/** + * qede_io_error_detected - called when PCI error is detected + * @pdev: Pointer to PCI device + * @state: The current pci connection state + * + * This function is called after a PCI bus error affecting + * this device has been detected. + */ +static pci_ers_result_t +qede_io_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct net_device *dev = pci_get_drvdata(pdev); + struct qede_dev *edev = netdev_priv(dev); + + if (!edev) + return PCI_ERS_RESULT_NONE; + + DP_NOTICE(edev, "IO error detected [%d]\n", state); + + __qede_lock(edev); + if (edev->state == QEDE_STATE_RECOVERY) { + DP_NOTICE(edev, "Device already in the recovery state\n"); + __qede_unlock(edev); + return PCI_ERS_RESULT_NONE; + } + + /* PF handles the recovery of its VFs */ + if (IS_VF(edev)) { + DP_VERBOSE(edev, QED_MSG_IOV, + "VF recovery is handled by its PF\n"); + __qede_unlock(edev); + return PCI_ERS_RESULT_RECOVERED; + } + + /* Close OS Tx */ + netif_tx_disable(edev->ndev); + netif_carrier_off(edev->ndev); + + set_bit(QEDE_SP_AER, &edev->sp_flags); + schedule_delayed_work(&edev->sp_task, 0); + + __qede_unlock(edev); + + return PCI_ERS_RESULT_CAN_RECOVER; +} -- cgit v1.2.3-59-g8ed1b From 6f8b12d661d09b488b9ac879b8eafbd2cc4a1450 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:27 -0700 Subject: net: napi: add hard irqs deferral feature Back in commit 3b47d30396ba ("net: gro: add a per device gro flush timer") we added the ability to arm one high resolution timer, that we used to keep not-complete packets in GRO engine a bit longer, hoping that further frames might be added to them. Since then, we added the napi_complete_done() interface, and commit 364b6055738b ("net: busy-poll: return busypolling status to drivers") allowed drivers to avoid re-arming NIC interrupts if we made a promise that their NAPI poll() handler would be called in the near future. This infrastructure can be leveraged, thanks to a new device parameter, which allows to arm the napi hrtimer, instead of re-arming the device hard IRQ. We have noticed that on some servers with 32 RX queues or more, the chit-chat between the NIC and the host caused by IRQ delivery and re-arming could hurt throughput by ~20% on 100Gbit NIC. In contrast, hrtimers are using local (percpu) resources and might have lower cost. The new tunable, named napi_defer_hard_irqs, is placed in the same hierarchy than gro_flush_timeout (/sys/class/net/ethX/) By default, both gro_flush_timeout and napi_defer_hard_irqs are zero. This patch does not change the prior behavior of gro_flush_timeout if used alone : NIC hard irqs should be rearmed as before. One concrete usage can be : echo 20000 >/sys/class/net/eth1/gro_flush_timeout echo 10 >/sys/class/net/eth1/napi_defer_hard_irqs If at least one packet is retired, then we will reset napi counter to 10 (napi_defer_hard_irqs), ensuring at least 10 periodic scans of the queue. On busy queues, this should avoid NIC hard IRQ, while before this patch IRQ avoidance was only possible if napi->poll() was exhausting its budget and not call napi_complete_done(). This feature also can be used to work around some non-optimal NIC irq coalescing strategies. Having the ability to insert XX usec delays between each napi->poll() can increase cache efficiency, since we increase batch sizes. It also keeps serving cpus not idle too long, reducing tail latencies. Co-developed-by: Luigi Rizzo Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 29 ++++++++++++++++++----------- net/core/net-sysfs.c | 18 ++++++++++++++++++ 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0750b54b3765..5a8d40f1ffe2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -329,6 +329,7 @@ struct napi_struct { unsigned long state; int weight; + int defer_hard_irqs_count; unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL @@ -1995,6 +1996,7 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; + int napi_defer_hard_irqs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/net/core/dev.c b/net/core/dev.c index fb61522b1ce1..67585484ad32 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags, val, new; + unsigned long flags, val, new, timeout = 0; + bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list @@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_bitmask) { - unsigned long timeout = 0; - - if (work_done) + if (work_done) { + if (n->gro_bitmask) timeout = n->dev->gro_flush_timeout; - + n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = n->dev->gro_flush_timeout; + if (timeout) + ret = false; + } + if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); - if (timeout) - hrtimer_start(&n->timer, ns_to_ktime(timeout), - HRTIMER_MODE_REL_PINNED); } gro_normal_list(n); @@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; } - return true; + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; } EXPORT_SYMBOL(napi_complete_done); @@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_bitmask && !napi_disable_pending(napi) && + if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0d9e46de205e..f3b650cd0923 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + dev->napi_defer_hard_irqs = val; + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, -- cgit v1.2.3-59-g8ed1b From 7e417a66b86c110f4b282945dac82e21e0b08328 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:28 -0700 Subject: net: napi: use READ_ONCE()/WRITE_ONCE() gro_flush_timeout and napi_defer_hard_irqs can be read from napi_complete_done() while other cpus write the value, whithout explicit synchronization. Use READ_ONCE()/WRITE_ONCE() to annotate the races. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/core/net-sysfs.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 67585484ad32..afff16849c26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6242,12 +6242,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) { if (n->gro_bitmask) - timeout = n->dev->gro_flush_timeout; - n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + timeout = READ_ONCE(n->dev->gro_flush_timeout); + n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; - timeout = n->dev->gro_flush_timeout; + timeout = READ_ONCE(n->dev->gro_flush_timeout); if (timeout) ret = false; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f3b650cd0923..880e89c894f6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -367,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - dev->gro_flush_timeout = val; + WRITE_ONCE(dev->gro_flush_timeout, val); return 0; } @@ -384,7 +384,7 @@ NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { - dev->napi_defer_hard_irqs = val; + WRITE_ONCE(dev->napi_defer_hard_irqs, val); return 0; } -- cgit v1.2.3-59-g8ed1b From cf4058dbaa18bf8e55b7cb8c04e1c313298cd5b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:29 -0700 Subject: net/mlx4_en: use napi_complete_done() in TX completion In order to benefit from the new napi_defer_hard_irqs feature, we need to use napi_complete_done() variant in this driver. RX path is already using it, this patch implements TX completion side. mlx4_en_process_tx_cq() now returns the amount of retired packets, instead of a boolean, so that mlx4_en_poll_tx_cq() can pass this value to napi_complete_done(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 20 ++++++++++---------- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index db3552f2d087..787139219813 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -946,7 +946,7 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring]; if (xdp_tx_cq->xdp_busy) { clean_complete = mlx4_en_process_tx_cq(dev, xdp_tx_cq, - budget); + budget) < budget; xdp_tx_cq->xdp_busy = !clean_complete; } } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 4d5ca302c067..a99d3ed49ed6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -382,8 +382,8 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) return cnt; } -bool mlx4_en_process_tx_cq(struct net_device *dev, - struct mlx4_en_cq *cq, int napi_budget) +int mlx4_en_process_tx_cq(struct net_device *dev, + struct mlx4_en_cq *cq, int napi_budget) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cq *mcq = &cq->mcq; @@ -405,7 +405,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, u32 ring_cons; if (unlikely(!priv->port_up)) - return true; + return 0; netdev_txq_bql_complete_prefetchw(ring->tx_queue); @@ -480,7 +480,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped); if (cq->type == TX_XDP) - return done < budget; + return done; netdev_tx_completed_queue(ring->tx_queue, packets, bytes); @@ -492,7 +492,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev, ring->wake_queue++; } - return done < budget; + return done; } void mlx4_en_tx_irq(struct mlx4_cq *mcq) @@ -512,14 +512,14 @@ int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget) struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi); struct net_device *dev = cq->dev; struct mlx4_en_priv *priv = netdev_priv(dev); - bool clean_complete; + int work_done; - clean_complete = mlx4_en_process_tx_cq(dev, cq, budget); - if (!clean_complete) + work_done = mlx4_en_process_tx_cq(dev, cq, budget); + if (work_done >= budget) return budget; - napi_complete(napi); - mlx4_en_arm_cq(priv, cq); + if (napi_complete_done(napi, work_done)) + mlx4_en_arm_cq(priv, cq); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 630f15977f09..9f5603612960 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -737,8 +737,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, int budget); int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget); int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget); -bool mlx4_en_process_tx_cq(struct net_device *dev, - struct mlx4_en_cq *cq, int napi_budget); +int mlx4_en_process_tx_cq(struct net_device *dev, + struct mlx4_en_cq *cq, int napi_budget); u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, int index, u64 timestamp, -- cgit v1.2.3-59-g8ed1b From 79d6e755a45486ffb14bf0ed752e6ace20334cda Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:13 +0300 Subject: net: ethernet: ti: cpts: use dev_yy() api for logs Use dev_yy() API instead of pr_yy() for log outputs. Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 729ce09dded9..445f445185df 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -71,7 +71,7 @@ static int cpts_purge_events(struct cpts *cpts) } if (removed) - pr_debug("cpts: event pool cleaned up %d\n", removed); + dev_dbg(cpts->dev, "cpts: event pool cleaned up %d\n", removed); return removed ? 0 : -1; } @@ -150,7 +150,7 @@ static int cpts_fifo_read(struct cpts *cpts, int match) break; if (list_empty(&cpts->pool) && cpts_purge_events(cpts)) { - pr_err("cpts: event pool empty\n"); + dev_warn(cpts->dev, "cpts: event pool empty\n"); return -1; } @@ -178,7 +178,7 @@ static int cpts_fifo_read(struct cpts *cpts, int match) case CPTS_EV_HW: break; default: - pr_err("cpts: unknown event type\n"); + dev_err(cpts->dev, "cpts: unknown event type\n"); break; } if (type == match) @@ -196,7 +196,7 @@ static u64 cpts_systim_read(const struct cyclecounter *cc) cpts_write32(cpts, TS_PUSH, ts_push); if (cpts_fifo_read(cpts, CPTS_EV_PUSH)) - pr_err("cpts: unable to obtain a time stamp\n"); + dev_err(cpts->dev, "cpts: unable to obtain a time stamp\n"); list_for_each_safe(this, next, &cpts->events) { event = list_entry(this, struct cpts_event, list); @@ -307,8 +307,8 @@ static long cpts_overflow_check(struct ptp_clock_info *ptp) } spin_unlock_irqrestore(&cpts->lock, flags); - pr_debug("cpts overflow check at %lld.%09ld\n", - (long long)ts.tv_sec, ts.tv_nsec); + dev_dbg(cpts->dev, "cpts overflow check at %lld.%09ld\n", + (long long)ts.tv_sec, ts.tv_nsec); return (long)delay; } -- cgit v1.2.3-59-g8ed1b From e66dccced0cfd59a4dc4c16409b713332b882fa6 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:14 +0300 Subject: net: ethernet: ti: cpts: separate hw counter read from timecounter Now CPTS HW time reading code is implemented in timecounter->cyclecounter .read() callback and performs following operations: timecounter_read() ->cc.read() -> cpts_systim_read() - request current CPTS HW time CPTS_TS_PUSH.TS_PUSH = 1 - poll CPTS FIFO for CPTS_EV_PUSH event with current HW timestamp This approach need to be changed for the future switch to PTP PHC .gettimex64() callback, which require to separate requesting current CPTS HW time and processing CPTS FIFO. And for the follow up patch, which improves .adjfreq() implementation. This patch moves code accessing CPTS HW out of timecounter code as following: - convert HW timestamp of every CPTS event to PTP time (us) and store it as part struct cpts_event; - add CPTS context field to store current CPTS HW time (counter) value and update it on CPTS_EV_PUSH reception; - move code accessing CPTS HW out of timecounter code and use current CPTS HW time (counter) from CPTS context instead; - ensure timecounter->cycle_last is updated on CPTS_EV_PUSH reception. After this change CPTS timecounter will only perform timekeeper role without actually accessing CPTS HW. Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 53 +++++++++++++++++++++--------------------- drivers/net/ethernet/ti/cpts.h | 2 ++ 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 445f445185df..f40a864d8c36 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -112,10 +112,8 @@ static bool cpts_match_tx_ts(struct cpts *cpts, struct cpts_event *event) (struct cpts_skb_cb_data *)skb->cb; if (cpts_match(skb, class, seqid, mtype)) { - u64 ns = timecounter_cyc2time(&cpts->tc, event->low); - memset(&ssh, 0, sizeof(ssh)); - ssh.hwtstamp = ns_to_ktime(ns); + ssh.hwtstamp = ns_to_ktime(event->timestamp); skb_tstamp_tx(skb, &ssh); found = true; __skb_unlink(skb, &cpts->txq); @@ -158,8 +156,16 @@ static int cpts_fifo_read(struct cpts *cpts, int match) event->tmo = jiffies + 2; event->high = hi; event->low = lo; + event->timestamp = timecounter_cyc2time(&cpts->tc, event->low); type = event_type(event); + + dev_dbg(cpts->dev, "CPTS_EV: %d high:%08X low:%08x\n", + type, event->high, event->low); switch (type) { + case CPTS_EV_PUSH: + WRITE_ONCE(cpts->cur_timestamp, lo); + timecounter_read(&cpts->tc); + break; case CPTS_EV_TX: if (cpts_match_tx_ts(cpts, event)) { /* if the new event matches an existing skb, @@ -168,7 +174,6 @@ static int cpts_fifo_read(struct cpts *cpts, int match) break; } /* fall through */ - case CPTS_EV_PUSH: case CPTS_EV_RX: list_del_init(&event->list); list_add_tail(&event->list, &cpts->events); @@ -189,26 +194,17 @@ static int cpts_fifo_read(struct cpts *cpts, int match) static u64 cpts_systim_read(const struct cyclecounter *cc) { - u64 val = 0; - struct cpts_event *event; - struct list_head *this, *next; struct cpts *cpts = container_of(cc, struct cpts, cc); - cpts_write32(cpts, TS_PUSH, ts_push); - if (cpts_fifo_read(cpts, CPTS_EV_PUSH)) - dev_err(cpts->dev, "cpts: unable to obtain a time stamp\n"); + return READ_ONCE(cpts->cur_timestamp); +} - list_for_each_safe(this, next, &cpts->events) { - event = list_entry(this, struct cpts_event, list); - if (event_type(event) == CPTS_EV_PUSH) { - list_del_init(&event->list); - list_add(&event->list, &cpts->pool); - val = event->low; - break; - } - } +static void cpts_update_cur_time(struct cpts *cpts, int match) +{ + cpts_write32(cpts, TS_PUSH, ts_push); - return val; + if (cpts_fifo_read(cpts, match) && match != -1) + dev_err(cpts->dev, "cpts: unable to obtain a time stamp\n"); } /* PTP clock operations */ @@ -232,7 +228,7 @@ static int cpts_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) spin_lock_irqsave(&cpts->lock, flags); - timecounter_read(&cpts->tc); + cpts_update_cur_time(cpts, CPTS_EV_PUSH); cpts->cc.mult = neg_adj ? mult - diff : mult + diff; @@ -260,6 +256,9 @@ static int cpts_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) struct cpts *cpts = container_of(ptp, struct cpts, info); spin_lock_irqsave(&cpts->lock, flags); + + cpts_update_cur_time(cpts, CPTS_EV_PUSH); + ns = timecounter_read(&cpts->tc); spin_unlock_irqrestore(&cpts->lock, flags); @@ -294,11 +293,14 @@ static long cpts_overflow_check(struct ptp_clock_info *ptp) { struct cpts *cpts = container_of(ptp, struct cpts, info); unsigned long delay = cpts->ov_check_period; - struct timespec64 ts; unsigned long flags; + u64 ns; spin_lock_irqsave(&cpts->lock, flags); - ts = ns_to_timespec64(timecounter_read(&cpts->tc)); + + cpts_update_cur_time(cpts, -1); + + ns = timecounter_read(&cpts->tc); if (!skb_queue_empty(&cpts->txq)) { cpts_purge_txq(cpts); @@ -307,8 +309,7 @@ static long cpts_overflow_check(struct ptp_clock_info *ptp) } spin_unlock_irqrestore(&cpts->lock, flags); - dev_dbg(cpts->dev, "cpts overflow check at %lld.%09ld\n", - (long long)ts.tv_sec, ts.tv_nsec); + dev_dbg(cpts->dev, "cpts overflow check at %lld\n", ns); return (long)delay; } @@ -390,7 +391,7 @@ static u64 cpts_find_ts(struct cpts *cpts, struct sk_buff *skb, int ev_type) seqid = (event->high >> SEQUENCE_ID_SHIFT) & SEQUENCE_ID_MASK; if (ev_type == event_type(event) && cpts_match(skb, class, seqid, mtype)) { - ns = timecounter_cyc2time(&cpts->tc, event->low); + ns = event->timestamp; list_del_init(&event->list); list_add(&event->list, &cpts->pool); break; diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index bb997c11ee15..32ecd1ce4d3b 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -94,6 +94,7 @@ struct cpts_event { unsigned long tmo; u32 high; u32 low; + u64 timestamp; }; struct cpts { @@ -114,6 +115,7 @@ struct cpts { struct cpts_event pool_data[CPTS_MAX_EVENTS]; unsigned long ov_check_period; struct sk_buff_head txq; + u64 cur_timestamp; }; void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb); -- cgit v1.2.3-59-g8ed1b From 0d6df3e613b74fe1a88de89cda63a0352e1dc4eb Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:15 +0300 Subject: net: ethernet: ti: cpts: move tc mult update in cpts_fifo_read() Now CPTS driver .adjfreq() generates request to read CPTS current time (CPTS_EV_PUSH) with intention to process all pending event using previous frequency adjustment values before switching to the new ones. So CPTS_EV_PUSH works as a marker to switch to the new frequency adjustment values. Current code assumes that all job is done in .adjfreq(), but after enabling IRQ this will not be true any more. Hence save new frequency adjustment values (mult) and perform actual freq adjustment in cpts_fifo_read() immediately after CPTS_EV_PUSH is received. Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 8 ++++++-- drivers/net/ethernet/ti/cpts.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index f40a864d8c36..a2974b542bed 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -165,6 +165,10 @@ static int cpts_fifo_read(struct cpts *cpts, int match) case CPTS_EV_PUSH: WRITE_ONCE(cpts->cur_timestamp, lo); timecounter_read(&cpts->tc); + if (cpts->mult_new) { + cpts->cc.mult = cpts->mult_new; + cpts->mult_new = 0; + } break; case CPTS_EV_TX: if (cpts_match_tx_ts(cpts, event)) { @@ -228,9 +232,9 @@ static int cpts_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) spin_lock_irqsave(&cpts->lock, flags); - cpts_update_cur_time(cpts, CPTS_EV_PUSH); + cpts->mult_new = neg_adj ? mult - diff : mult + diff; - cpts->cc.mult = neg_adj ? mult - diff : mult + diff; + cpts_update_cur_time(cpts, CPTS_EV_PUSH); spin_unlock_irqrestore(&cpts->lock, flags); diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index 32ecd1ce4d3b..421630049ee7 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -116,6 +116,7 @@ struct cpts { unsigned long ov_check_period; struct sk_buff_head txq; u64 cur_timestamp; + u32 mult_new; }; void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb); -- cgit v1.2.3-59-g8ed1b From 856e59ab7e6d3c85ee739f3f53341d47c88d454e Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:16 +0300 Subject: net: ethernet: ti: cpts: switch to use new .gettimex64() interface The CPTS HW latches and saves CPTS counter value in CPTS fifo immediately after writing to CPSW_CPTS_PUSH.TS_PUSH (bit 0), so the total time that the driver needs to read the CPTS timestamp is the time required CPSW_CPTS_PUSH write to actually reach HW. Hence switch CPTS driver to implement new .gettimex64() callback for more precise measurement of the offset between a PHC and the system clock which is measured as time between write(CPSW_CPTS_PUSH) read(CPSW_CPTS_PUSH) Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index a2974b542bed..1f738bb3df74 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -203,9 +203,13 @@ static u64 cpts_systim_read(const struct cyclecounter *cc) return READ_ONCE(cpts->cur_timestamp); } -static void cpts_update_cur_time(struct cpts *cpts, int match) +static void cpts_update_cur_time(struct cpts *cpts, int match, + struct ptp_system_timestamp *sts) { + ptp_read_system_prets(sts); cpts_write32(cpts, TS_PUSH, ts_push); + cpts_read32(cpts, ts_push); + ptp_read_system_postts(sts); if (cpts_fifo_read(cpts, match) && match != -1) dev_err(cpts->dev, "cpts: unable to obtain a time stamp\n"); @@ -234,7 +238,7 @@ static int cpts_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) cpts->mult_new = neg_adj ? mult - diff : mult + diff; - cpts_update_cur_time(cpts, CPTS_EV_PUSH); + cpts_update_cur_time(cpts, CPTS_EV_PUSH, NULL); spin_unlock_irqrestore(&cpts->lock, flags); @@ -253,15 +257,17 @@ static int cpts_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) return 0; } -static int cpts_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) +static int cpts_ptp_gettimeex(struct ptp_clock_info *ptp, + struct timespec64 *ts, + struct ptp_system_timestamp *sts) { - u64 ns; - unsigned long flags; struct cpts *cpts = container_of(ptp, struct cpts, info); + unsigned long flags; + u64 ns; spin_lock_irqsave(&cpts->lock, flags); - cpts_update_cur_time(cpts, CPTS_EV_PUSH); + cpts_update_cur_time(cpts, CPTS_EV_PUSH, sts); ns = timecounter_read(&cpts->tc); spin_unlock_irqrestore(&cpts->lock, flags); @@ -302,7 +308,7 @@ static long cpts_overflow_check(struct ptp_clock_info *ptp) spin_lock_irqsave(&cpts->lock, flags); - cpts_update_cur_time(cpts, -1); + cpts_update_cur_time(cpts, -1, NULL); ns = timecounter_read(&cpts->tc); @@ -326,7 +332,7 @@ static const struct ptp_clock_info cpts_info = { .pps = 0, .adjfreq = cpts_ptp_adjfreq, .adjtime = cpts_ptp_adjtime, - .gettime64 = cpts_ptp_gettime, + .gettimex64 = cpts_ptp_gettimeex, .settime64 = cpts_ptp_settime, .enable = cpts_ptp_enable, .do_aux_work = cpts_overflow_check, -- cgit v1.2.3-59-g8ed1b From 3bfd41b57811d76412af57f4884e28ad78c2ab2f Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:17 +0300 Subject: net: ethernet: ti: cpts: optimize packet to event matching Now the CPTS driver performs packet (skb) parsing every time when it needs to match packet to CPTS event (including ptp_classify_raw() calls). This patch optimizes matching process by parsing packet only once upon arrival and stores PTP specific data in skb->cb using the same fromat as in CPTS HW event. As result, all future matching reduces to comparing two u32 values. Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 91 +++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 1f738bb3df74..6efb809d58ed 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -23,15 +23,13 @@ #define CPTS_SKB_TX_WORK_TIMEOUT 1 /* jiffies */ struct cpts_skb_cb_data { + u32 skb_mtype_seqid; unsigned long tmo; }; #define cpts_read32(c, r) readl_relaxed(&c->reg->r) #define cpts_write32(c, v, r) writel_relaxed(v, &c->reg->r) -static int cpts_match(struct sk_buff *skb, unsigned int ptp_class, - u16 ts_seqid, u8 ts_msgtype); - static int event_expired(struct cpts_event *event) { return time_after(jiffies, event->tmo); @@ -97,29 +95,29 @@ static void cpts_purge_txq(struct cpts *cpts) static bool cpts_match_tx_ts(struct cpts *cpts, struct cpts_event *event) { struct sk_buff *skb, *tmp; - u16 seqid; - u8 mtype; bool found = false; + u32 mtype_seqid; - mtype = (event->high >> MESSAGE_TYPE_SHIFT) & MESSAGE_TYPE_MASK; - seqid = (event->high >> SEQUENCE_ID_SHIFT) & SEQUENCE_ID_MASK; + mtype_seqid = event->high & + ((MESSAGE_TYPE_MASK << MESSAGE_TYPE_SHIFT) | + (SEQUENCE_ID_MASK << SEQUENCE_ID_SHIFT) | + (EVENT_TYPE_MASK << EVENT_TYPE_SHIFT)); /* no need to grab txq.lock as access is always done under cpts->lock */ skb_queue_walk_safe(&cpts->txq, skb, tmp) { struct skb_shared_hwtstamps ssh; - unsigned int class = ptp_classify_raw(skb); struct cpts_skb_cb_data *skb_cb = (struct cpts_skb_cb_data *)skb->cb; - if (cpts_match(skb, class, seqid, mtype)) { + if (mtype_seqid == skb_cb->skb_mtype_seqid) { memset(&ssh, 0, sizeof(ssh)); ssh.hwtstamp = ns_to_ktime(event->timestamp); skb_tstamp_tx(skb, &ssh); found = true; __skb_unlink(skb, &cpts->txq); dev_consume_skb_any(skb); - dev_dbg(cpts->dev, "match tx timestamp mtype %u seqid %04x\n", - mtype, seqid); + dev_dbg(cpts->dev, "match tx timestamp mtype_seqid %08x\n", + mtype_seqid); break; } @@ -338,12 +336,15 @@ static const struct ptp_clock_info cpts_info = { .do_aux_work = cpts_overflow_check, }; -static int cpts_match(struct sk_buff *skb, unsigned int ptp_class, - u16 ts_seqid, u8 ts_msgtype) +static int cpts_skb_get_mtype_seqid(struct sk_buff *skb, u32 *mtype_seqid) { - u16 *seqid; - unsigned int offset = 0; + unsigned int ptp_class = ptp_classify_raw(skb); u8 *msgtype, *data = skb->data; + unsigned int offset = 0; + u16 *seqid; + + if (ptp_class == PTP_CLASS_NONE) + return 0; if (ptp_class & PTP_CLASS_VLAN) offset += VLAN_HLEN; @@ -371,22 +372,20 @@ static int cpts_match(struct sk_buff *skb, unsigned int ptp_class, msgtype = data + offset; seqid = (u16 *)(data + offset + OFF_PTP_SEQUENCE_ID); + *mtype_seqid = (*msgtype & MESSAGE_TYPE_MASK) << MESSAGE_TYPE_SHIFT; + *mtype_seqid |= (ntohs(*seqid) & SEQUENCE_ID_MASK) << SEQUENCE_ID_SHIFT; - return (ts_msgtype == (*msgtype & 0xf) && ts_seqid == ntohs(*seqid)); + return 1; } -static u64 cpts_find_ts(struct cpts *cpts, struct sk_buff *skb, int ev_type) +static u64 cpts_find_ts(struct cpts *cpts, struct sk_buff *skb, + int ev_type, u32 skb_mtype_seqid) { - u64 ns = 0; - struct cpts_event *event; struct list_head *this, *next; - unsigned int class = ptp_classify_raw(skb); + struct cpts_event *event; unsigned long flags; - u16 seqid; - u8 mtype; - - if (class == PTP_CLASS_NONE) - return 0; + u32 mtype_seqid; + u64 ns = 0; spin_lock_irqsave(&cpts->lock, flags); cpts_fifo_read(cpts, -1); @@ -397,10 +396,13 @@ static u64 cpts_find_ts(struct cpts *cpts, struct sk_buff *skb, int ev_type) list_add(&event->list, &cpts->pool); continue; } - mtype = (event->high >> MESSAGE_TYPE_SHIFT) & MESSAGE_TYPE_MASK; - seqid = (event->high >> SEQUENCE_ID_SHIFT) & SEQUENCE_ID_MASK; - if (ev_type == event_type(event) && - cpts_match(skb, class, seqid, mtype)) { + + mtype_seqid = event->high & + ((MESSAGE_TYPE_MASK << MESSAGE_TYPE_SHIFT) | + (SEQUENCE_ID_MASK << SEQUENCE_ID_SHIFT) | + (EVENT_TYPE_MASK << EVENT_TYPE_SHIFT)); + + if (mtype_seqid == skb_mtype_seqid) { ns = event->timestamp; list_del_init(&event->list); list_add(&event->list, &cpts->pool); @@ -427,10 +429,21 @@ static u64 cpts_find_ts(struct cpts *cpts, struct sk_buff *skb, int ev_type) void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb) { - u64 ns; + struct cpts_skb_cb_data *skb_cb = (struct cpts_skb_cb_data *)skb->cb; struct skb_shared_hwtstamps *ssh; + int ret; + u64 ns; + + ret = cpts_skb_get_mtype_seqid(skb, &skb_cb->skb_mtype_seqid); + if (!ret) + return; + + skb_cb->skb_mtype_seqid |= (CPTS_EV_RX << EVENT_TYPE_SHIFT); - ns = cpts_find_ts(cpts, skb, CPTS_EV_RX); + dev_dbg(cpts->dev, "%s mtype seqid %08x\n", + __func__, skb_cb->skb_mtype_seqid); + + ns = cpts_find_ts(cpts, skb, CPTS_EV_RX, skb_cb->skb_mtype_seqid); if (!ns) return; ssh = skb_hwtstamps(skb); @@ -441,12 +454,24 @@ EXPORT_SYMBOL_GPL(cpts_rx_timestamp); void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb) { - u64 ns; + struct cpts_skb_cb_data *skb_cb = (struct cpts_skb_cb_data *)skb->cb; struct skb_shared_hwtstamps ssh; + int ret; + u64 ns; if (!(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) return; - ns = cpts_find_ts(cpts, skb, CPTS_EV_TX); + + ret = cpts_skb_get_mtype_seqid(skb, &skb_cb->skb_mtype_seqid); + if (!ret) + return; + + skb_cb->skb_mtype_seqid |= (CPTS_EV_TX << EVENT_TYPE_SHIFT); + + dev_dbg(cpts->dev, "%s mtype seqid %08x\n", + __func__, skb_cb->skb_mtype_seqid); + + ns = cpts_find_ts(cpts, skb, CPTS_EV_TX, skb_cb->skb_mtype_seqid); if (!ns) return; memset(&ssh, 0, sizeof(ssh)); -- cgit v1.2.3-59-g8ed1b From c8f8e47efe66dae775b617982e47a4564d7c4dda Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:18 +0300 Subject: net: ethernet: ti: cpts: move tx timestamp processing to ptp worker only Now the tx timestamp processing happens from different contexts - softirq and thread/PTP worker. Enabling IRQ will add one more hard_irq context. This makes over all defered TX timestamp processing and locking overcomplicated. Move tx timestamp processing to PTP worker always instead. napi_rx->cpts_tx_timestamp if ptp_packet then push to txq ptp_schedule_worker() do_aux_work->cpts_overflow_check cpts_process_events() Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 165 +++++++++++++++++++++++------------------ 1 file changed, 94 insertions(+), 71 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 6efb809d58ed..55ba6b425fb5 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -21,6 +21,8 @@ #include "cpts.h" #define CPTS_SKB_TX_WORK_TIMEOUT 1 /* jiffies */ +#define CPTS_SKB_RX_TX_TMO 100 /*ms */ +#define CPTS_EVENT_RX_TX_TIMEOUT (100) /* ms */ struct cpts_skb_cb_data { u32 skb_mtype_seqid; @@ -92,46 +94,6 @@ static void cpts_purge_txq(struct cpts *cpts) dev_dbg(cpts->dev, "txq cleaned up %d\n", removed); } -static bool cpts_match_tx_ts(struct cpts *cpts, struct cpts_event *event) -{ - struct sk_buff *skb, *tmp; - bool found = false; - u32 mtype_seqid; - - mtype_seqid = event->high & - ((MESSAGE_TYPE_MASK << MESSAGE_TYPE_SHIFT) | - (SEQUENCE_ID_MASK << SEQUENCE_ID_SHIFT) | - (EVENT_TYPE_MASK << EVENT_TYPE_SHIFT)); - - /* no need to grab txq.lock as access is always done under cpts->lock */ - skb_queue_walk_safe(&cpts->txq, skb, tmp) { - struct skb_shared_hwtstamps ssh; - struct cpts_skb_cb_data *skb_cb = - (struct cpts_skb_cb_data *)skb->cb; - - if (mtype_seqid == skb_cb->skb_mtype_seqid) { - memset(&ssh, 0, sizeof(ssh)); - ssh.hwtstamp = ns_to_ktime(event->timestamp); - skb_tstamp_tx(skb, &ssh); - found = true; - __skb_unlink(skb, &cpts->txq); - dev_consume_skb_any(skb); - dev_dbg(cpts->dev, "match tx timestamp mtype_seqid %08x\n", - mtype_seqid); - break; - } - - if (time_after(jiffies, skb_cb->tmo)) { - /* timeout any expired skbs over 1s */ - dev_dbg(cpts->dev, "expiring tx timestamp from txq\n"); - __skb_unlink(skb, &cpts->txq); - dev_consume_skb_any(skb); - } - } - - return found; -} - /* * Returns zero if matching event type was found. */ @@ -151,7 +113,6 @@ static int cpts_fifo_read(struct cpts *cpts, int match) } event = list_first_entry(&cpts->pool, struct cpts_event, list); - event->tmo = jiffies + 2; event->high = hi; event->low = lo; event->timestamp = timecounter_cyc2time(&cpts->tc, event->low); @@ -169,14 +130,10 @@ static int cpts_fifo_read(struct cpts *cpts, int match) } break; case CPTS_EV_TX: - if (cpts_match_tx_ts(cpts, event)) { - /* if the new event matches an existing skb, - * then don't queue it - */ - break; - } - /* fall through */ case CPTS_EV_RX: + event->tmo = jiffies + + msecs_to_jiffies(CPTS_EVENT_RX_TX_TIMEOUT); + list_del_init(&event->list); list_add_tail(&event->list, &cpts->events); break; @@ -297,6 +254,84 @@ static int cpts_ptp_enable(struct ptp_clock_info *ptp, return -EOPNOTSUPP; } +static bool cpts_match_tx_ts(struct cpts *cpts, struct cpts_event *event) +{ + struct sk_buff_head txq_list; + struct sk_buff *skb, *tmp; + unsigned long flags; + bool found = false; + u32 mtype_seqid; + + mtype_seqid = event->high & + ((MESSAGE_TYPE_MASK << MESSAGE_TYPE_SHIFT) | + (SEQUENCE_ID_MASK << SEQUENCE_ID_SHIFT) | + (EVENT_TYPE_MASK << EVENT_TYPE_SHIFT)); + + __skb_queue_head_init(&txq_list); + + spin_lock_irqsave(&cpts->txq.lock, flags); + skb_queue_splice_init(&cpts->txq, &txq_list); + spin_unlock_irqrestore(&cpts->txq.lock, flags); + + skb_queue_walk_safe(&txq_list, skb, tmp) { + struct skb_shared_hwtstamps ssh; + struct cpts_skb_cb_data *skb_cb = + (struct cpts_skb_cb_data *)skb->cb; + + if (mtype_seqid == skb_cb->skb_mtype_seqid) { + memset(&ssh, 0, sizeof(ssh)); + ssh.hwtstamp = ns_to_ktime(event->timestamp); + skb_tstamp_tx(skb, &ssh); + found = true; + __skb_unlink(skb, &txq_list); + dev_consume_skb_any(skb); + dev_dbg(cpts->dev, "match tx timestamp mtype_seqid %08x\n", + mtype_seqid); + break; + } + + if (time_after(jiffies, skb_cb->tmo)) { + /* timeout any expired skbs over 1s */ + dev_dbg(cpts->dev, "expiring tx timestamp from txq\n"); + __skb_unlink(skb, &txq_list); + dev_consume_skb_any(skb); + } + } + + spin_lock_irqsave(&cpts->txq.lock, flags); + skb_queue_splice(&txq_list, &cpts->txq); + spin_unlock_irqrestore(&cpts->txq.lock, flags); + + return found; +} + +static void cpts_process_events(struct cpts *cpts) +{ + struct list_head *this, *next; + struct cpts_event *event; + LIST_HEAD(events_free); + unsigned long flags; + LIST_HEAD(events); + + spin_lock_irqsave(&cpts->lock, flags); + list_splice_init(&cpts->events, &events); + spin_unlock_irqrestore(&cpts->lock, flags); + + list_for_each_safe(this, next, &events) { + event = list_entry(this, struct cpts_event, list); + if (cpts_match_tx_ts(cpts, event) || + time_after(jiffies, event->tmo)) { + list_del_init(&event->list); + list_add(&event->list, &events_free); + } + } + + spin_lock_irqsave(&cpts->lock, flags); + list_splice_tail(&events, &cpts->events); + list_splice_tail(&events_free, &cpts->pool); + spin_unlock_irqrestore(&cpts->lock, flags); +} + static long cpts_overflow_check(struct ptp_clock_info *ptp) { struct cpts *cpts = container_of(ptp, struct cpts, info); @@ -305,17 +340,20 @@ static long cpts_overflow_check(struct ptp_clock_info *ptp) u64 ns; spin_lock_irqsave(&cpts->lock, flags); - cpts_update_cur_time(cpts, -1, NULL); + spin_unlock_irqrestore(&cpts->lock, flags); ns = timecounter_read(&cpts->tc); + cpts_process_events(cpts); + + spin_lock_irqsave(&cpts->txq.lock, flags); if (!skb_queue_empty(&cpts->txq)) { cpts_purge_txq(cpts); if (!skb_queue_empty(&cpts->txq)) delay = CPTS_SKB_TX_WORK_TIMEOUT; } - spin_unlock_irqrestore(&cpts->lock, flags); + spin_unlock_irqrestore(&cpts->txq.lock, flags); dev_dbg(cpts->dev, "cpts overflow check at %lld\n", ns); return (long)delay; @@ -409,19 +447,6 @@ static u64 cpts_find_ts(struct cpts *cpts, struct sk_buff *skb, break; } } - - if (ev_type == CPTS_EV_TX && !ns) { - struct cpts_skb_cb_data *skb_cb = - (struct cpts_skb_cb_data *)skb->cb; - /* Not found, add frame to queue for processing later. - * The periodic FIFO check will handle this. - */ - skb_get(skb); - /* get the timestamp for timeouts */ - skb_cb->tmo = jiffies + msecs_to_jiffies(100); - __skb_queue_tail(&cpts->txq, skb); - ptp_schedule_worker(cpts->clock, 0); - } spin_unlock_irqrestore(&cpts->lock, flags); return ns; @@ -455,9 +480,7 @@ EXPORT_SYMBOL_GPL(cpts_rx_timestamp); void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb) { struct cpts_skb_cb_data *skb_cb = (struct cpts_skb_cb_data *)skb->cb; - struct skb_shared_hwtstamps ssh; int ret; - u64 ns; if (!(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) return; @@ -471,12 +494,12 @@ void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb) dev_dbg(cpts->dev, "%s mtype seqid %08x\n", __func__, skb_cb->skb_mtype_seqid); - ns = cpts_find_ts(cpts, skb, CPTS_EV_TX, skb_cb->skb_mtype_seqid); - if (!ns) - return; - memset(&ssh, 0, sizeof(ssh)); - ssh.hwtstamp = ns_to_ktime(ns); - skb_tstamp_tx(skb, &ssh); + /* Always defer TX TS processing to PTP worker */ + skb_get(skb); + /* get the timestamp for timeouts */ + skb_cb->tmo = jiffies + msecs_to_jiffies(CPTS_SKB_RX_TX_TMO); + skb_queue_tail(&cpts->txq, skb); + ptp_schedule_worker(cpts->clock, 0); } EXPORT_SYMBOL_GPL(cpts_tx_timestamp); -- cgit v1.2.3-59-g8ed1b From ba10742840fbc6e475dc05c7515fd91b7c88e1b2 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:19 +0300 Subject: net: ethernet: ti: cpts: rework locking Now spinlock is used to synchronize everything which is not required. Add mutex and use to sync access to PTP interface and PTP worker and use spinlock only to sync FIFO/events processing. Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 53 ++++++++++++++++++++++++------------------ drivers/net/ethernet/ti/cpts.h | 3 ++- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 55ba6b425fb5..8db9efdf1708 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -99,9 +99,12 @@ static void cpts_purge_txq(struct cpts *cpts) */ static int cpts_fifo_read(struct cpts *cpts, int match) { + struct cpts_event *event; + unsigned long flags; int i, type = -1; u32 hi, lo; - struct cpts_event *event; + + spin_lock_irqsave(&cpts->lock, flags); for (i = 0; i < CPTS_FIFO_DEPTH; i++) { if (cpts_fifo_pop(cpts, &hi, &lo)) @@ -109,7 +112,7 @@ static int cpts_fifo_read(struct cpts *cpts, int match) if (list_empty(&cpts->pool) && cpts_purge_events(cpts)) { dev_warn(cpts->dev, "cpts: event pool empty\n"); - return -1; + break; } event = list_first_entry(&cpts->pool, struct cpts_event, list); @@ -148,6 +151,9 @@ static int cpts_fifo_read(struct cpts *cpts, int match) if (type == match) break; } + + spin_unlock_irqrestore(&cpts->lock, flags); + return type == match ? 0 : -1; } @@ -161,10 +167,15 @@ static u64 cpts_systim_read(const struct cyclecounter *cc) static void cpts_update_cur_time(struct cpts *cpts, int match, struct ptp_system_timestamp *sts) { + unsigned long flags; + + /* use spin_lock_irqsave() here as it has to run very fast */ + spin_lock_irqsave(&cpts->lock, flags); ptp_read_system_prets(sts); cpts_write32(cpts, TS_PUSH, ts_push); cpts_read32(cpts, ts_push); ptp_read_system_postts(sts); + spin_unlock_irqrestore(&cpts->lock, flags); if (cpts_fifo_read(cpts, match) && match != -1) dev_err(cpts->dev, "cpts: unable to obtain a time stamp\n"); @@ -174,11 +185,10 @@ static void cpts_update_cur_time(struct cpts *cpts, int match, static int cpts_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) { - u64 adj; - u32 diff, mult; - int neg_adj = 0; - unsigned long flags; struct cpts *cpts = container_of(ptp, struct cpts, info); + int neg_adj = 0; + u32 diff, mult; + u64 adj; if (ppb < 0) { neg_adj = 1; @@ -189,25 +199,23 @@ static int cpts_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) adj *= ppb; diff = div_u64(adj, 1000000000ULL); - spin_lock_irqsave(&cpts->lock, flags); + mutex_lock(&cpts->ptp_clk_mutex); cpts->mult_new = neg_adj ? mult - diff : mult + diff; cpts_update_cur_time(cpts, CPTS_EV_PUSH, NULL); - spin_unlock_irqrestore(&cpts->lock, flags); - + mutex_unlock(&cpts->ptp_clk_mutex); return 0; } static int cpts_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) { - unsigned long flags; struct cpts *cpts = container_of(ptp, struct cpts, info); - spin_lock_irqsave(&cpts->lock, flags); + mutex_lock(&cpts->ptp_clk_mutex); timecounter_adjtime(&cpts->tc, delta); - spin_unlock_irqrestore(&cpts->lock, flags); + mutex_unlock(&cpts->ptp_clk_mutex); return 0; } @@ -217,15 +225,14 @@ static int cpts_ptp_gettimeex(struct ptp_clock_info *ptp, struct ptp_system_timestamp *sts) { struct cpts *cpts = container_of(ptp, struct cpts, info); - unsigned long flags; u64 ns; - spin_lock_irqsave(&cpts->lock, flags); + mutex_lock(&cpts->ptp_clk_mutex); cpts_update_cur_time(cpts, CPTS_EV_PUSH, sts); ns = timecounter_read(&cpts->tc); - spin_unlock_irqrestore(&cpts->lock, flags); + mutex_unlock(&cpts->ptp_clk_mutex); *ts = ns_to_timespec64(ns); @@ -235,15 +242,14 @@ static int cpts_ptp_gettimeex(struct ptp_clock_info *ptp, static int cpts_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) { - u64 ns; - unsigned long flags; struct cpts *cpts = container_of(ptp, struct cpts, info); + u64 ns; ns = timespec64_to_ns(ts); - spin_lock_irqsave(&cpts->lock, flags); + mutex_lock(&cpts->ptp_clk_mutex); timecounter_init(&cpts->tc, &cpts->cc, ns); - spin_unlock_irqrestore(&cpts->lock, flags); + mutex_unlock(&cpts->ptp_clk_mutex); return 0; } @@ -339,10 +345,9 @@ static long cpts_overflow_check(struct ptp_clock_info *ptp) unsigned long flags; u64 ns; - spin_lock_irqsave(&cpts->lock, flags); - cpts_update_cur_time(cpts, -1, NULL); - spin_unlock_irqrestore(&cpts->lock, flags); + mutex_lock(&cpts->ptp_clk_mutex); + cpts_update_cur_time(cpts, -1, NULL); ns = timecounter_read(&cpts->tc); cpts_process_events(cpts); @@ -356,6 +361,7 @@ static long cpts_overflow_check(struct ptp_clock_info *ptp) spin_unlock_irqrestore(&cpts->txq.lock, flags); dev_dbg(cpts->dev, "cpts overflow check at %lld\n", ns); + mutex_unlock(&cpts->ptp_clk_mutex); return (long)delay; } @@ -425,8 +431,8 @@ static u64 cpts_find_ts(struct cpts *cpts, struct sk_buff *skb, u32 mtype_seqid; u64 ns = 0; - spin_lock_irqsave(&cpts->lock, flags); cpts_fifo_read(cpts, -1); + spin_lock_irqsave(&cpts->lock, flags); list_for_each_safe(this, next, &cpts->events) { event = list_entry(this, struct cpts_event, list); if (event_expired(event)) { @@ -703,6 +709,7 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs, cpts->dev = dev; cpts->reg = (struct cpsw_cpts __iomem *)regs; spin_lock_init(&cpts->lock); + mutex_init(&cpts->ptp_clk_mutex); ret = cpts_of_parse(cpts, node); if (ret) diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index 421630049ee7..f16e14d67f5f 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -104,7 +104,7 @@ struct cpts { int rx_enable; struct ptp_clock_info info; struct ptp_clock *clock; - spinlock_t lock; /* protects time registers */ + spinlock_t lock; /* protects fifo/events */ u32 cc_mult; /* for the nominal frequency */ struct cyclecounter cc; struct timecounter tc; @@ -117,6 +117,7 @@ struct cpts { struct sk_buff_head txq; u64 cur_timestamp; u32 mult_new; + struct mutex ptp_clk_mutex; /* sync PTP interface and worker */ }; void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb); -- cgit v1.2.3-59-g8ed1b From 85624412a03dc61eabddeb1cfbbc8325e3544694 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:20 +0300 Subject: net: ethernet: ti: cpts: add irq support Add CPTS IRQ support, but do not enable it. By default, the CPTS driver will continue working using polling mode which is required for CPTS to continue working on platforms other than CPSW, like Keystone 2. The CPTS IRQ support is required to enable support for HW_TS_PUSH events. The CPSW CPTS IRQ and HW_TS_PUSH events support will be enabled in follow up patches. Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 23 ++++++++++++++++++++++- drivers/net/ethernet/ti/cpts.h | 16 ++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 8db9efdf1708..339796c87bf6 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -99,6 +99,7 @@ static void cpts_purge_txq(struct cpts *cpts) */ static int cpts_fifo_read(struct cpts *cpts, int match) { + bool need_schedule = false; struct cpts_event *event; unsigned long flags; int i, type = -1; @@ -131,6 +132,8 @@ static int cpts_fifo_read(struct cpts *cpts, int match) cpts->cc.mult = cpts->mult_new; cpts->mult_new = 0; } + if (!cpts->irq_poll) + complete(&cpts->ts_push_complete); break; case CPTS_EV_TX: case CPTS_EV_RX: @@ -139,6 +142,7 @@ static int cpts_fifo_read(struct cpts *cpts, int match) list_del_init(&event->list); list_add_tail(&event->list, &cpts->events); + need_schedule = true; break; case CPTS_EV_ROLL: case CPTS_EV_HALF: @@ -154,9 +158,18 @@ static int cpts_fifo_read(struct cpts *cpts, int match) spin_unlock_irqrestore(&cpts->lock, flags); + if (!cpts->irq_poll && need_schedule) + ptp_schedule_worker(cpts->clock, 0); + return type == match ? 0 : -1; } +void cpts_misc_interrupt(struct cpts *cpts) +{ + cpts_fifo_read(cpts, -1); +} +EXPORT_SYMBOL_GPL(cpts_misc_interrupt); + static u64 cpts_systim_read(const struct cyclecounter *cc) { struct cpts *cpts = container_of(cc, struct cpts, cc); @@ -169,6 +182,8 @@ static void cpts_update_cur_time(struct cpts *cpts, int match, { unsigned long flags; + reinit_completion(&cpts->ts_push_complete); + /* use spin_lock_irqsave() here as it has to run very fast */ spin_lock_irqsave(&cpts->lock, flags); ptp_read_system_prets(sts); @@ -177,8 +192,12 @@ static void cpts_update_cur_time(struct cpts *cpts, int match, ptp_read_system_postts(sts); spin_unlock_irqrestore(&cpts->lock, flags); - if (cpts_fifo_read(cpts, match) && match != -1) + if (cpts->irq_poll && cpts_fifo_read(cpts, match) && match != -1) dev_err(cpts->dev, "cpts: unable to obtain a time stamp\n"); + + if (!cpts->irq_poll && + !wait_for_completion_timeout(&cpts->ts_push_complete, HZ)) + dev_err(cpts->dev, "cpts: obtain a time stamp timeout\n"); } /* PTP clock operations */ @@ -708,8 +727,10 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs, cpts->dev = dev; cpts->reg = (struct cpsw_cpts __iomem *)regs; + cpts->irq_poll = true; spin_lock_init(&cpts->lock); mutex_init(&cpts->ptp_clk_mutex); + init_completion(&cpts->ts_push_complete); ret = cpts_of_parse(cpts, node); if (ret) diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index f16e14d67f5f..473d0622e861 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -118,6 +118,8 @@ struct cpts { u64 cur_timestamp; u32 mult_new; struct mutex ptp_clk_mutex; /* sync PTP interface and worker */ + bool irq_poll; + struct completion ts_push_complete; }; void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb); @@ -127,6 +129,7 @@ void cpts_unregister(struct cpts *cpts); struct cpts *cpts_create(struct device *dev, void __iomem *regs, struct device_node *node); void cpts_release(struct cpts *cpts); +void cpts_misc_interrupt(struct cpts *cpts); static inline bool cpts_can_timestamp(struct cpts *cpts, struct sk_buff *skb) { @@ -138,6 +141,11 @@ static inline bool cpts_can_timestamp(struct cpts *cpts, struct sk_buff *skb) return true; } +static inline void cpts_set_irqpoll(struct cpts *cpts, bool en) +{ + cpts->irq_poll = en; +} + #else struct cpts; @@ -173,6 +181,14 @@ static inline bool cpts_can_timestamp(struct cpts *cpts, struct sk_buff *skb) { return false; } + +static inline void cpts_misc_interrupt(struct cpts *cpts) +{ +} + +static inline void cpts_set_irqpoll(struct cpts *cpts, bool en) +{ +} #endif -- cgit v1.2.3-59-g8ed1b From b78aba495df0eaee4f4a779b2354d6e2a43a3d70 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:21 +0300 Subject: net: ethernet: ti: cpts: add support for HW_TS_PUSH events Hence CPTS IRQ support is in place the W_TS_PUSH events can be added. PWM capable DmTimers can be used to generete input signals for CPTS on TI AM335x/AM437x/DRA7 SoCs to be timestamped: AM335x/AM437x: timer4 - timer7 DRA7/AM57xx: timer13 - timer16 Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_priv.c | 5 +++- drivers/net/ethernet/ti/cpts.c | 49 ++++++++++++++++++++++++++++++++++- drivers/net/ethernet/ti/cpts.h | 5 ++-- drivers/net/ethernet/ti/netcp_ethss.c | 3 ++- 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 97a058ca60ac..099208927400 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -28,6 +28,8 @@ #include "cpsw_sl.h" #include "davinci_cpdma.h" +#define CPTS_N_ETX_TS 4 + int (*cpsw_slave_index)(struct cpsw_common *cpsw, struct cpsw_priv *priv); void cpsw_intr_enable(struct cpsw_common *cpsw) @@ -522,7 +524,8 @@ int cpsw_init_common(struct cpsw_common *cpsw, void __iomem *ss_regs, if (!cpts_node) cpts_node = cpsw->dev->of_node; - cpsw->cpts = cpts_create(cpsw->dev, cpts_regs, cpts_node); + cpsw->cpts = cpts_create(cpsw->dev, cpts_regs, cpts_node, + CPTS_N_ETX_TS); if (IS_ERR(cpsw->cpts)) { ret = PTR_ERR(cpsw->cpts); cpdma_ctlr_destroy(cpsw->dma); diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 339796c87bf6..7c55d395de2c 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -32,6 +32,11 @@ struct cpts_skb_cb_data { #define cpts_read32(c, r) readl_relaxed(&c->reg->r) #define cpts_write32(c, v, r) writel_relaxed(v, &c->reg->r) +static int cpts_event_port(struct cpts_event *event) +{ + return (event->high >> PORT_NUMBER_SHIFT) & PORT_NUMBER_MASK; +} + static int event_expired(struct cpts_event *event) { return time_after(jiffies, event->tmo); @@ -99,6 +104,7 @@ static void cpts_purge_txq(struct cpts *cpts) */ static int cpts_fifo_read(struct cpts *cpts, int match) { + struct ptp_clock_event pevent; bool need_schedule = false; struct cpts_event *event; unsigned long flags; @@ -146,7 +152,12 @@ static int cpts_fifo_read(struct cpts *cpts, int match) break; case CPTS_EV_ROLL: case CPTS_EV_HALF: + break; case CPTS_EV_HW: + pevent.timestamp = event->timestamp; + pevent.type = PTP_CLOCK_EXTTS; + pevent.index = cpts_event_port(event) - 1; + ptp_clock_event(cpts->clock, &pevent); break; default: dev_err(cpts->dev, "cpts: unknown event type\n"); @@ -273,9 +284,42 @@ static int cpts_ptp_settime(struct ptp_clock_info *ptp, return 0; } +static int cpts_extts_enable(struct cpts *cpts, u32 index, int on) +{ + u32 v; + + if (((cpts->hw_ts_enable & BIT(index)) >> index) == on) + return 0; + + mutex_lock(&cpts->ptp_clk_mutex); + + v = cpts_read32(cpts, control); + if (on) { + v |= BIT(8 + index); + cpts->hw_ts_enable |= BIT(index); + } else { + v &= ~BIT(8 + index); + cpts->hw_ts_enable &= ~BIT(index); + } + cpts_write32(cpts, v, control); + + mutex_unlock(&cpts->ptp_clk_mutex); + + return 0; +} + static int cpts_ptp_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on) { + struct cpts *cpts = container_of(ptp, struct cpts, info); + + switch (rq->type) { + case PTP_CLK_REQ_EXTTS: + return cpts_extts_enable(cpts, rq->extts.index, on); + default: + break; + } + return -EOPNOTSUPP; } @@ -716,7 +760,7 @@ of_error: } struct cpts *cpts_create(struct device *dev, void __iomem *regs, - struct device_node *node) + struct device_node *node, u32 n_ext_ts) { struct cpts *cpts; int ret; @@ -755,6 +799,9 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs, cpts->cc.mask = CLOCKSOURCE_MASK(32); cpts->info = cpts_info; + if (n_ext_ts) + cpts->info.n_ext_ts = n_ext_ts; + cpts_calc_mult_shift(cpts); /* save cc.mult original value as it can be modified * by cpts_ptp_adjfreq(). diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index 473d0622e861..07222f651d2e 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -120,6 +120,7 @@ struct cpts { struct mutex ptp_clk_mutex; /* sync PTP interface and worker */ bool irq_poll; struct completion ts_push_complete; + u32 hw_ts_enable; }; void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb); @@ -127,7 +128,7 @@ void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb); int cpts_register(struct cpts *cpts); void cpts_unregister(struct cpts *cpts); struct cpts *cpts_create(struct device *dev, void __iomem *regs, - struct device_node *node); + struct device_node *node, u32 n_ext_ts); void cpts_release(struct cpts *cpts); void cpts_misc_interrupt(struct cpts *cpts); @@ -158,7 +159,7 @@ static inline void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb) static inline struct cpts *cpts_create(struct device *dev, void __iomem *regs, - struct device_node *node) + struct device_node *node, u32 n_ext_ts) { return NULL; } diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c index fb36115e9c51..9d6e27fb710e 100644 --- a/drivers/net/ethernet/ti/netcp_ethss.c +++ b/drivers/net/ethernet/ti/netcp_ethss.c @@ -3716,7 +3716,8 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev, if (!cpts_node) cpts_node = of_node_get(node); - gbe_dev->cpts = cpts_create(gbe_dev->dev, gbe_dev->cpts_reg, cpts_node); + gbe_dev->cpts = cpts_create(gbe_dev->dev, gbe_dev->cpts_reg, + cpts_node, 0); of_node_put(cpts_node); if (IS_ENABLED(CONFIG_TI_CPTS) && IS_ERR(gbe_dev->cpts)) { ret = PTR_ERR(gbe_dev->cpts); -- cgit v1.2.3-59-g8ed1b From 84ea9c0a95d7b3e554d6c3d7d719cc57be22e7ad Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 23 Apr 2020 17:20:22 +0300 Subject: net: ethernet: ti: cpsw: enable cpts irq The CPSW misc IRQ need be enabled for CPTS event_pend IRQs processing. This patch adds corresponding support to CPSW driver. Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 21 +++++++++++++++++++++ drivers/net/ethernet/ti/cpsw_new.c | 20 ++++++++++++++++++++ drivers/net/ethernet/ti/cpsw_priv.c | 12 ++++++++++++ drivers/net/ethernet/ti/cpsw_priv.h | 2 ++ 4 files changed, 55 insertions(+) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index c2c5bf87da01..09f98fa2fb4e 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1569,6 +1569,12 @@ static int cpsw_probe(struct platform_device *pdev) return irq; cpsw->irqs_table[1] = irq; + /* get misc irq*/ + irq = platform_get_irq(pdev, 3); + if (irq <= 0) + return irq; + cpsw->misc_irq = irq; + /* * This may be required here for child devices. */ @@ -1703,6 +1709,21 @@ static int cpsw_probe(struct platform_device *pdev) goto clean_unregister_netdev_ret; } + if (!cpsw->cpts) + goto skip_cpts; + + ret = devm_request_irq(&pdev->dev, cpsw->misc_irq, cpsw_misc_interrupt, + 0, dev_name(&pdev->dev), cpsw); + if (ret < 0) { + dev_err(dev, "error attaching misc irq (%d)\n", ret); + goto clean_unregister_netdev_ret; + } + + /* Enable misc CPTS evnt_pend IRQ */ + cpts_set_irqpoll(cpsw->cpts, false); + writel(0x10, &cpsw->wr_regs->misc_en); + +skip_cpts: cpsw_notice(priv, probe, "initialized device (regs %pa, irq %d, pool size %d)\n", &ss_res->start, cpsw->irqs_table[0], descs_pool_size); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 9209e613257d..33c8dd686206 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -1896,6 +1896,11 @@ static int cpsw_probe(struct platform_device *pdev) return irq; cpsw->irqs_table[1] = irq; + irq = platform_get_irq_byname(pdev, "misc"); + if (irq <= 0) + return irq; + cpsw->misc_irq = irq; + platform_set_drvdata(pdev, cpsw); /* This may be required here for child devices. */ pm_runtime_enable(dev); @@ -1975,6 +1980,21 @@ static int cpsw_probe(struct platform_device *pdev) goto clean_unregister_netdev; } + if (!cpsw->cpts) + goto skip_cpts; + + ret = devm_request_irq(dev, cpsw->misc_irq, cpsw_misc_interrupt, + 0, dev_name(&pdev->dev), cpsw); + if (ret < 0) { + dev_err(dev, "error attaching misc irq (%d)\n", ret); + goto clean_unregister_netdev; + } + + /* Enable misc CPTS evnt_pend IRQ */ + cpts_set_irqpoll(cpsw->cpts, false); + writel(0x10, &cpsw->wr_regs->misc_en); + +skip_cpts: ret = cpsw_register_notifiers(cpsw); if (ret) goto clean_unregister_netdev; diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 099208927400..9d098c802c6d 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -114,6 +114,18 @@ irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +irqreturn_t cpsw_misc_interrupt(int irq, void *dev_id) +{ + struct cpsw_common *cpsw = dev_id; + + writel(0, &cpsw->wr_regs->misc_en); + cpdma_ctlr_eoi(cpsw->dma, CPDMA_EOI_MISC); + cpts_misc_interrupt(cpsw->cpts); + writel(0x10, &cpsw->wr_regs->misc_en); + + return IRQ_HANDLED; +} + int cpsw_tx_mq_poll(struct napi_struct *napi_tx, int budget) { struct cpsw_common *cpsw = napi_to_cpsw(napi_tx); diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h index b8d7b924ee3d..bf4e179b4ca4 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.h +++ b/drivers/net/ethernet/ti/cpsw_priv.h @@ -350,6 +350,7 @@ struct cpsw_common { bool rx_irq_disabled; bool tx_irq_disabled; u32 irqs_table[IRQ_NUM]; + int misc_irq; struct cpts *cpts; struct devlink *devlink; int rx_ch_num, tx_ch_num; @@ -442,6 +443,7 @@ int cpsw_run_xdp(struct cpsw_priv *priv, int ch, struct xdp_buff *xdp, struct page *page, int port); irqreturn_t cpsw_tx_interrupt(int irq, void *dev_id); irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id); +irqreturn_t cpsw_misc_interrupt(int irq, void *dev_id); int cpsw_tx_mq_poll(struct napi_struct *napi_tx, int budget); int cpsw_tx_poll(struct napi_struct *napi_tx, int budget); int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget); -- cgit v1.2.3-59-g8ed1b From 3c9143d96852485725d330b9164062d8f2d90a38 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Thu, 23 Apr 2020 13:43:13 +0800 Subject: net: sched : Remove unnecessary cast in kfree Remove unnecassary casts in the argument to kfree. Signed-off-by: Xu Wang Signed-off-by: David S. Miller --- net/sched/em_ipt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index eecfe072c508..18755d29fd15 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -199,7 +199,7 @@ static void em_ipt_destroy(struct tcf_ematch *em) im->match->destroy(&par); } module_put(im->match->me); - kfree((void *)im); + kfree(im); } static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, -- cgit v1.2.3-59-g8ed1b From 8ffe2df6426f874659a3aa1654f45ba83fa91f87 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Thu, 23 Apr 2020 15:27:40 +0800 Subject: qed: Make ll2_cbs static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following sparse warning: drivers/net/ethernet/qlogic/qed/qed_ll2.c:2334:20: warning: symbol 'll2_cbs' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Zou Wei Acked-by: Michal Kalderon  Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 037e5978787e..4afd8572ada6 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -2331,7 +2331,7 @@ static void qed_ll2_register_cb_ops(struct qed_dev *cdev, cdev->ll2->cb_cookie = cookie; } -struct qed_ll2_cbs ll2_cbs = { +static struct qed_ll2_cbs ll2_cbs = { .rx_comp_cb = &qed_ll2b_complete_rx_packet, .rx_release_cb = &qed_ll2b_release_rx_packet, .tx_comp_cb = &qed_ll2b_complete_tx_packet, -- cgit v1.2.3-59-g8ed1b From efcd549da9d7e8194b4d2d2f35eff8ce7b4da684 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 Apr 2020 15:10:16 +0100 Subject: net: phy: bcm54140: fix less than zero comparison on an unsigned Currently the unsigned variable tmp is being checked for an negative error return from the call to bcm_phy_read_rdb and this can never be true since tmp is unsigned. Fix this by making tmp a plain int. Addresses-Coverity: ("Unsigned compared against 0") Fixes: 4406d36dfdf1 ("net: phy: bcm54140: add hwmon support") Signed-off-by: Colin Ian King Reviewed-by: Andrew Lunn Reviewed-by: Michael Walle Signed-off-by: David S. Miller --- drivers/net/phy/bcm54140.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c index aa854477e06a..7341f0126cc4 100644 --- a/drivers/net/phy/bcm54140.c +++ b/drivers/net/phy/bcm54140.c @@ -191,7 +191,8 @@ out: static int bcm54140_hwmon_read_temp(struct device *dev, u32 attr, long *val) { struct phy_device *phydev = dev_get_drvdata(dev); - u16 reg, tmp; + u16 reg; + int tmp; switch (attr) { case hwmon_temp_input: @@ -224,7 +225,8 @@ static int bcm54140_hwmon_read_in(struct device *dev, u32 attr, int channel, long *val) { struct phy_device *phydev = dev_get_drvdata(dev); - u16 bit, reg, tmp; + u16 bit, reg; + int tmp; switch (attr) { case hwmon_in_input: -- cgit v1.2.3-59-g8ed1b From c7c4c44c9a95d87e50ced38f7480e779cb472174 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:02 +0800 Subject: net: openvswitch: expand the meters supported number In kernel datapath of Open vSwitch, there are only 1024 buckets of meter in one datapath. If installing more than 1024 (e.g. 8192) meters, it may lead to the performance drop. But in some case, for example, Open vSwitch used as edge gateway, there should be 20K at least, where meters used for IP address bandwidth limitation. [Open vSwitch userspace datapath has this issue too.] For more scalable meter, this patch use meter array instead of hash tables, and expand/shrink the array when necessary. So we can install more meters than before in the datapath. Introducing the struct *dp_meter_instance, it's easy to expand meter though changing the *ti point in the struct *dp_meter_table. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.h | 2 +- net/openvswitch/meter.c | 240 ++++++++++++++++++++++++++++++++++----------- net/openvswitch/meter.h | 16 ++- 3 files changed, 195 insertions(+), 63 deletions(-) diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index e239a46c2f94..2016dd107939 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -82,7 +82,7 @@ struct datapath { u32 max_headroom; /* Switch meters. */ - struct hlist_head *meters; + struct dp_meter_table meter_tbl; }; /** diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 5010d1ddd4bd..f806ded1dd0a 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -19,8 +19,6 @@ #include "datapath.h" #include "meter.h" -#define METER_HASH_BUCKETS 1024 - static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, @@ -39,6 +37,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) +{ + return id % ti->n_meters; +} + static void ovs_meter_free(struct dp_meter *meter) { if (!meter) @@ -47,40 +50,153 @@ static void ovs_meter_free(struct dp_meter *meter) kfree_rcu(meter, rcu); } -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, - u32 meter_id) -{ - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; -} - /* Call with ovs_mutex or RCU read lock. */ -static struct dp_meter *lookup_meter(const struct datapath *dp, +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, u32 meter_id) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter_id); struct dp_meter *meter; - struct hlist_head *head; - head = meter_hash_bucket(dp, meter_id); - hlist_for_each_entry_rcu(meter, head, dp_hash_node, - lockdep_ovsl_is_held()) { - if (meter->id == meter_id) - return meter; - } + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); + if (meter && likely(meter->id == meter_id)) + return meter; + return NULL; } -static void attach_meter(struct datapath *dp, struct dp_meter *meter) +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) +{ + struct dp_meter_instance *ti; + + ti = kvzalloc(sizeof(*ti) + + sizeof(struct dp_meter *) * size, + GFP_KERNEL); + if (!ti) + return NULL; + + ti->n_meters = size; + + return ti; +} + +static void dp_meter_instance_free(struct dp_meter_instance *ti) +{ + kvfree(ti); +} + +static void dp_meter_instance_free_rcu(struct rcu_head *rcu) +{ + struct dp_meter_instance *ti; + + ti = container_of(rcu, struct dp_meter_instance, rcu); + kvfree(ti); +} + +static int +dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + int n_meters = min(size, ti->n_meters); + struct dp_meter_instance *new_ti; + int i; + + new_ti = dp_meter_instance_alloc(size); + if (!new_ti) + return -ENOMEM; + + for (i = 0; i < n_meters; i++) + new_ti->dp_meters[i] = + rcu_dereference_ovsl(ti->dp_meters[i]); + + rcu_assign_pointer(tbl->ti, new_ti); + call_rcu(&ti->rcu, dp_meter_instance_free_rcu); + + return 0; +} + +static void dp_meter_instance_insert(struct dp_meter_instance *ti, + struct dp_meter *meter) +{ + u32 hash; + + hash = meter_hash(ti, meter->id); + rcu_assign_pointer(ti->dp_meters[hash], meter); +} + +static void dp_meter_instance_remove(struct dp_meter_instance *ti, + struct dp_meter *meter) { - struct hlist_head *head = meter_hash_bucket(dp, meter->id); + u32 hash; - hlist_add_head_rcu(&meter->dp_hash_node, head); + hash = meter_hash(ti, meter->id); + RCU_INIT_POINTER(ti->dp_meters[hash], NULL); } -static void detach_meter(struct dp_meter *meter) +static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter->id); + + /* In generally, slots selected should be empty, because + * OvS uses id-pool to fetch a available id. + */ + if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) + return -EBUSY; + + dp_meter_instance_insert(ti, meter); + + /* That function is thread-safe. */ + if (++tbl->count >= ti->n_meters) + if (dp_meter_instance_realloc(tbl, ti->n_meters * 2)) + goto expand_err; + + return 0; + +expand_err: + dp_meter_instance_remove(ti, meter); + tbl->count--; + return -ENOMEM; +} + +static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) +{ + struct dp_meter_instance *ti; + ASSERT_OVSL(); - if (meter) - hlist_del_rcu(&meter->dp_hash_node); + if (!meter) + return 0; + + ti = rcu_dereference_ovsl(tbl->ti); + dp_meter_instance_remove(ti, meter); + + tbl->count--; + + /* Shrink the meter array if necessary. */ + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && + tbl->count <= (ti->n_meters / 4)) { + int half_size = ti->n_meters / 2; + int i; + + /* Avoid hash collision, don't move slots to other place. + * Make sure there are no references of meters in array + * which will be released. + */ + for (i = half_size; i < ti->n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + goto out; + + if (dp_meter_instance_realloc(tbl, half_size)) + goto shrink_err; + } + +out: + return 0; + +shrink_err: + dp_meter_instance_insert(ti, meter); + tbl->count++; + return -ENOMEM; } static struct sk_buff * @@ -273,6 +389,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct ovs_header *ovs_reply_header; struct ovs_header *ovs_header = info->userhdr; + struct dp_meter_table *meter_tbl; struct datapath *dp; int err; u32 meter_id; @@ -300,12 +417,18 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } + meter_tbl = &dp->meter_tbl; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); - /* Cannot fail after this. */ - old_meter = lookup_meter(dp, meter_id); - detach_meter(old_meter); - attach_meter(dp, meter); + old_meter = lookup_meter(meter_tbl, meter_id); + err = detach_meter(meter_tbl, old_meter); + if (err) + goto exit_unlock; + + err = attach_meter(meter_tbl, meter); + if (err) + goto exit_unlock; + ovs_unlock(); /* Build response with the meter_id and stats from @@ -337,14 +460,14 @@ exit_free_meter: static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; @@ -365,7 +488,7 @@ static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) } /* Locate meter, copy stats. */ - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); if (!meter) { err = -ENOENT; goto exit_unlock; @@ -390,18 +513,17 @@ exit_unlock: static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *old_meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *old_meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; - meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, &ovs_reply_header); @@ -416,14 +538,19 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } - old_meter = lookup_meter(dp, meter_id); + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + old_meter = lookup_meter(&dp->meter_tbl, meter_id); if (old_meter) { spin_lock_bh(&old_meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); spin_unlock_bh(&old_meter->lock); - detach_meter(old_meter); + + err = detach_meter(&dp->meter_tbl, old_meter); + if (err) + goto exit_unlock; } + ovs_unlock(); ovs_meter_free(old_meter); genlmsg_end(reply, ovs_reply_header); @@ -443,16 +570,16 @@ exit_unlock: bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { - struct dp_meter *meter; - struct dp_meter_band *band; long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long long int long_delta_ms; - u32 delta_ms; - u32 cost; + struct dp_meter_band *band; + struct dp_meter *meter; int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; + u32 delta_ms; + u32 cost; - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); /* Do not drop the packet when there is no meter. */ if (!meter) return false; @@ -570,32 +697,27 @@ struct genl_family dp_meter_genl_family __ro_after_init = { int ovs_meters_init(struct datapath *dp) { - int i; + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti; - dp->meters = kmalloc_array(METER_HASH_BUCKETS, - sizeof(struct hlist_head), GFP_KERNEL); - - if (!dp->meters) + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); + if (!ti) return -ENOMEM; - for (i = 0; i < METER_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->meters[i]); + rcu_assign_pointer(tbl->ti, ti); + tbl->count = 0; return 0; } void ovs_meters_exit(struct datapath *dp) { + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); int i; - for (i = 0; i < METER_HASH_BUCKETS; i++) { - struct hlist_head *head = &dp->meters[i]; - struct dp_meter *meter; - struct hlist_node *n; - - hlist_for_each_entry_safe(meter, n, head, dp_hash_node) - kfree(meter); - } + for (i = 0; i < ti->n_meters; i++) + ovs_meter_free(ti->dp_meters[i]); - kfree(dp->meters); + dp_meter_instance_free(ti); } diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index f645913870bd..f52052d30a16 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -13,11 +13,13 @@ #include #include #include +#include #include "flow.h" struct datapath; #define DP_MAX_BANDS 1 +#define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10) struct dp_meter_band { u32 type; @@ -30,9 +32,6 @@ struct dp_meter_band { struct dp_meter { spinlock_t lock; /* Per meter lock */ struct rcu_head rcu; - struct hlist_node dp_hash_node; /*Element in datapath->meters - * hash table. - */ u32 id; u16 kbps:1, keep_stats:1; u16 n_bands; @@ -42,6 +41,17 @@ struct dp_meter { struct dp_meter_band bands[]; }; +struct dp_meter_instance { + struct rcu_head rcu; + u32 n_meters; + struct dp_meter __rcu *dp_meters[]; +}; + +struct dp_meter_table { + struct dp_meter_instance __rcu *ti; + u32 count; +}; + extern struct genl_family dp_meter_genl_family; int ovs_meters_init(struct datapath *dp); void ovs_meters_exit(struct datapath *dp); -- cgit v1.2.3-59-g8ed1b From eb58eebc7fb5e23c9cc7d557c0a9236630591526 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:03 +0800 Subject: net: openvswitch: set max limitation to meters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't allow user to create meter unlimitedly, which may cause to consume a large amount of kernel memory. The max number supported is decided by physical memory and 20K meters as default. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 57 ++++++++++++++++++++++++++++++++++++++++--------- net/openvswitch/meter.h | 2 ++ 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index f806ded1dd0a..372f4565872d 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -137,6 +138,7 @@ static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 hash = meter_hash(ti, meter->id); + int err; /* In generally, slots selected should be empty, because * OvS uses id-pool to fetch a available id. @@ -147,16 +149,24 @@ static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) dp_meter_instance_insert(ti, meter); /* That function is thread-safe. */ - if (++tbl->count >= ti->n_meters) - if (dp_meter_instance_realloc(tbl, ti->n_meters * 2)) - goto expand_err; + tbl->count++; + if (tbl->count >= tbl->max_meters_allowed) { + err = -EFBIG; + goto attach_err; + } + + if (tbl->count >= ti->n_meters && + dp_meter_instance_realloc(tbl, ti->n_meters * 2)) { + err = -ENOMEM; + goto attach_err; + } return 0; -expand_err: +attach_err: dp_meter_instance_remove(ti, meter); tbl->count--; - return -ENOMEM; + return err; } static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) @@ -266,18 +276,32 @@ error: static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { - struct sk_buff *reply; + struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; - int err; + struct sk_buff *reply; + struct datapath *dp; + int err = -EMSGSIZE; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); - if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || - nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, + dp->meter_tbl.max_meters_allowed)) + goto exit_unlock; + + ovs_unlock(); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) goto nla_put_failure; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); @@ -296,9 +320,10 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); +exit_unlock: + ovs_unlock(); nla_put_failure: nlmsg_free(reply); - err = -EMSGSIZE; return err; } @@ -699,15 +724,27 @@ int ovs_meters_init(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti; + unsigned long free_mem_bytes; ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); if (!ti) return -ENOMEM; + /* Allow meters in a datapath to use ~3.12% of physical memory. */ + free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); + tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), + DP_METER_NUM_MAX); + if (!tbl->max_meters_allowed) + goto out_err; + rcu_assign_pointer(tbl->ti, ti); tbl->count = 0; return 0; + +out_err: + dp_meter_instance_free(ti); + return -ENOMEM; } void ovs_meters_exit(struct datapath *dp) diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index f52052d30a16..61a3ca43cd77 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -20,6 +20,7 @@ struct datapath; #define DP_MAX_BANDS 1 #define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10) +#define DP_METER_NUM_MAX (200000UL) struct dp_meter_band { u32 type; @@ -50,6 +51,7 @@ struct dp_meter_instance { struct dp_meter_table { struct dp_meter_instance __rcu *ti; u32 count; + u32 max_meters_allowed; }; extern struct genl_family dp_meter_genl_family; -- cgit v1.2.3-59-g8ed1b From a8e387384f554ea0b63889a7682ffcddee24df8b Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:04 +0800 Subject: net: openvswitch: remove the unnecessary check Before invoking the ovs_meter_cmd_reply_stats, "meter" was checked, so don't check it agin in that function. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 372f4565872d..b7893b0d6423 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -242,12 +242,11 @@ static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) goto error; - if (!meter) - return 0; - if (nla_put(reply, OVS_METER_ATTR_STATS, - sizeof(struct ovs_flow_stats), &meter->stats) || - nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, + sizeof(struct ovs_flow_stats), &meter->stats)) + goto error; + + if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, OVS_METER_ATTR_PAD)) goto error; -- cgit v1.2.3-59-g8ed1b From c77350089052cafa8125169e37463ab7028d6a18 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:05 +0800 Subject: net: openvswitch: make EINVAL return value more obvious Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index b7893b0d6423..e36b464b32a5 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -419,9 +419,8 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) u32 meter_id; bool failed; - if (!a[OVS_METER_ATTR_ID]) { - return -ENODEV; - } + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; meter = dp_meter_create(a); if (IS_ERR_OR_NULL(meter)) -- cgit v1.2.3-59-g8ed1b From e57358873bb5d6caa882b9684f59140912b37dde Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:06 +0800 Subject: net: openvswitch: use u64 for meter bucket When setting the meter rate to 4+Gbps, there is an overflow, the meters don't work as expected. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 2 +- net/openvswitch/meter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index e36b464b32a5..915f31123f23 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -392,7 +392,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * * Start with a full bucket. */ - band->bucket = (band->burst_size + band->rate) * 1000; + band->bucket = (band->burst_size + band->rate) * 1000ULL; band_max_delta_t = band->bucket / band->rate; if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index 61a3ca43cd77..0c33889a8515 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -26,7 +26,7 @@ struct dp_meter_band { u32 type; u32 rate; u32 burst_size; - u32 bucket; /* 1/1000 packets, or in bits */ + u64 bucket; /* 1/1000 packets, or in bits */ struct ovs_flow_stats stats; }; -- cgit v1.2.3-59-g8ed1b From 5cc58a9ecfa1bbf5fb587ec65e42f15dd5051238 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 23 Mar 2020 16:24:00 +0100 Subject: mac80211_hwsim: notify wmediumd of used MAC addresses Currently, wmediumd requires each used MAC address to be configured as a station in the virtual air, but that doesn't make sense as any station could have multiple MAC addresses, and even have randomized ones in scanning, etc. Add some code here to tell wmediumd of used MAC addresses, binding them to the hardware address. Combined with a wmediumd patch that makes it track the addresses this allows configuring just the radio address (42:00:00:00:nn:00 unless the radio was manually created) in wmediumd as a station, and all addresses that the station uses are added/removed dynamically. Tested with random scan, which without this and the corresponding wmediumd change doesn't get anything through as the sender doesn't exist as far as wmediumd is concerned (it's random). Link: https://lore.kernel.org/r/20200323162358.b397b1a1acef.Ice0536e34e5d96c51f97c374ea8af9551347c7e8@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 51 +++++++++++++++++++++++++++++++++++ drivers/net/wireless/mac80211_hwsim.h | 8 ++++++ 2 files changed, 59 insertions(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 7c4b7c31d07a..f1c08b31c564 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1068,6 +1068,47 @@ static int hwsim_unicast_netgroup(struct mac80211_hwsim_data *data, return res; } +static void mac80211_hwsim_config_mac_nl(struct ieee80211_hw *hw, + const u8 *addr, bool add) +{ + struct mac80211_hwsim_data *data = hw->priv; + u32 _portid = READ_ONCE(data->wmediumd); + struct sk_buff *skb; + void *msg_head; + + if (!_portid && !hwsim_virtio_enabled) + return; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return; + + msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, + add ? HWSIM_CMD_ADD_MAC_ADDR : + HWSIM_CMD_DEL_MAC_ADDR); + if (!msg_head) { + pr_debug("mac80211_hwsim: problem with msg_head\n"); + goto nla_put_failure; + } + + if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, + ETH_ALEN, data->addresses[1].addr)) + goto nla_put_failure; + + if (nla_put(skb, HWSIM_ATTR_ADDR_RECEIVER, ETH_ALEN, addr)) + goto nla_put_failure; + + genlmsg_end(skb, msg_head); + + if (hwsim_virtio_enabled) + hwsim_tx_virtio(data, skb); + else + hwsim_unicast_netgroup(data, skb, _portid); + return; +nla_put_failure: + nlmsg_free(skb); +} + static inline u16 trans_tx_rate_flags_ieee2hwsim(struct ieee80211_tx_rate *rate) { u16 result = 0; @@ -1545,6 +1586,9 @@ static int mac80211_hwsim_add_interface(struct ieee80211_hw *hw, vif->addr); hwsim_set_magic(vif); + if (vif->type != NL80211_IFTYPE_MONITOR) + mac80211_hwsim_config_mac_nl(hw, vif->addr, true); + vif->cab_queue = 0; vif->hw_queue[IEEE80211_AC_VO] = 0; vif->hw_queue[IEEE80211_AC_VI] = 1; @@ -1584,6 +1628,8 @@ static void mac80211_hwsim_remove_interface( vif->addr); hwsim_check_magic(vif); hwsim_clear_magic(vif); + if (vif->type != NL80211_IFTYPE_MONITOR) + mac80211_hwsim_config_mac_nl(hw, vif->addr, false); } static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw, @@ -2104,6 +2150,8 @@ static void hw_scan_work(struct work_struct *work) hwsim->hw_scan_vif = NULL; hwsim->tmp_chan = NULL; mutex_unlock(&hwsim->mutex); + mac80211_hwsim_config_mac_nl(hwsim->hw, hwsim->scan_addr, + false); return; } @@ -2177,6 +2225,7 @@ static int mac80211_hwsim_hw_scan(struct ieee80211_hw *hw, memset(hwsim->survey_data, 0, sizeof(hwsim->survey_data)); mutex_unlock(&hwsim->mutex); + mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, true); wiphy_dbg(hw->wiphy, "hwsim hw_scan request\n"); ieee80211_queue_delayed_work(hwsim->hw, &hwsim->hw_scan, 0); @@ -2220,6 +2269,7 @@ static void mac80211_hwsim_sw_scan(struct ieee80211_hw *hw, pr_debug("hwsim sw_scan request, prepping stuff\n"); memcpy(hwsim->scan_addr, mac_addr, ETH_ALEN); + mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, true); hwsim->scanning = true; memset(hwsim->survey_data, 0, sizeof(hwsim->survey_data)); @@ -2236,6 +2286,7 @@ static void mac80211_hwsim_sw_scan_complete(struct ieee80211_hw *hw, pr_debug("hwsim sw_scan_complete\n"); hwsim->scanning = false; + mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, false); eth_zero_addr(hwsim->scan_addr); mutex_unlock(&hwsim->mutex); diff --git a/drivers/net/wireless/mac80211_hwsim.h b/drivers/net/wireless/mac80211_hwsim.h index 28ade92adcb4..9dceed77c5d6 100644 --- a/drivers/net/wireless/mac80211_hwsim.h +++ b/drivers/net/wireless/mac80211_hwsim.h @@ -75,6 +75,12 @@ enum hwsim_tx_control_flags { * @HWSIM_CMD_DEL_RADIO: destroy a radio, reply is multicasted * @HWSIM_CMD_GET_RADIO: fetch information about existing radios, uses: * %HWSIM_ATTR_RADIO_ID + * @HWSIM_CMD_ADD_MAC_ADDR: add a receive MAC address (given in the + * %HWSIM_ATTR_ADDR_RECEIVER attribute) to a device identified by + * %HWSIM_ATTR_ADDR_TRANSMITTER. This lets wmediumd forward frames + * to this receiver address for a given station. + * @HWSIM_CMD_DEL_MAC_ADDR: remove the MAC address again, the attributes + * are the same as to @HWSIM_CMD_ADD_MAC_ADDR. * @__HWSIM_CMD_MAX: enum limit */ enum { @@ -85,6 +91,8 @@ enum { HWSIM_CMD_NEW_RADIO, HWSIM_CMD_DEL_RADIO, HWSIM_CMD_GET_RADIO, + HWSIM_CMD_ADD_MAC_ADDR, + HWSIM_CMD_DEL_MAC_ADDR, __HWSIM_CMD_MAX, }; #define HWSIM_CMD_MAX (_HWSIM_CMD_MAX - 1) -- cgit v1.2.3-59-g8ed1b From 1db364c88695272e3410eb4b5d4595c8cb15db30 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:38:04 +0200 Subject: mac80211: mlme: remove duplicate AID bookkeeping Maintain the connection AID only in sdata->vif.bss_conf.aid, not also in sdata->u.mgd.aid. Keep setting that where we set ifmgd->aid before, which has the side effect of exposing the AID to the driver before the station entry (AP) is marked associated, in case it needs it then. Requested-by: Felix Fietkau Signed-off-by: Johannes Berg Tested-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/20200417123802.085d4a322b0c.I2e7a2ceceea8c6880219f9e9ee4d4ac985fd295a@changeid Signed-off-by: Johannes Berg --- net/mac80211/debugfs_netdev.c | 2 +- net/mac80211/ieee80211_i.h | 2 -- net/mac80211/mlme.c | 7 +++---- net/mac80211/tdls.c | 3 +-- net/mac80211/tx.c | 2 +- 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 3dbe7c5cefd1..d7e955127d5c 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -236,7 +236,7 @@ IEEE80211_IF_FILE_R(hw_queues); /* STA attributes */ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); -IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); +IEEE80211_IF_FILE(aid, vif.bss_conf.aid, DEC); IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS); static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f8ed4f621f7f..934a91bef575 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -450,8 +450,6 @@ struct ieee80211_if_managed { u8 bssid[ETH_ALEN] __aligned(2); - u16 aid; - bool powersave; /* powersave requested for this iface */ bool broken_ap; /* AP is broken -- turn off powersave */ bool have_beacon; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 16d75da0996a..7139335f29c0 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3249,7 +3249,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, return false; } - ifmgd->aid = aid; + sdata->vif.bss_conf.aid = aid; ifmgd->tdls_chan_switch_prohibited = elems->ext_capab && elems->ext_capab_len >= 5 && (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); @@ -3521,9 +3521,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_conf->protected_keep_alive = false; } - /* set AID and assoc capability, + /* set assoc capability (AID was already set earlier), * ieee80211_set_associated() will tell the driver */ - bss_conf->aid = aid; bss_conf->assoc_capability = capab_info; ieee80211_set_associated(sdata, cbss, changed); @@ -3948,7 +3947,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mgmt->bssid, bssid); if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid)) { + ieee80211_check_tim(elems.tim, elems.tim_len, bss_conf->aid)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index fca1f5477396..7ff22f9d6e80 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -226,12 +226,11 @@ static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata, static void ieee80211_tdls_add_aid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 *pos = skb_put(skb, 4); *pos++ = WLAN_EID_AID; *pos++ = 2; /* len */ - put_unaligned_le16(ifmgd->aid, pos); + put_unaligned_le16(sdata->vif.bss_conf.aid, pos); } /* translate numbering in the WMM parameter IE to the mac80211 notation */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 82846aca86d9..3dc1990e15c5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5006,7 +5006,7 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, pspoll = skb_put_zero(skb, sizeof(*pspoll)); pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL); - pspoll->aid = cpu_to_le16(ifmgd->aid); + pspoll->aid = cpu_to_le16(sdata->vif.bss_conf.aid); /* aid in PS-Poll has its two MSBs each set to 1 */ pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); -- cgit v1.2.3-59-g8ed1b From 90e8f58dfc04d1bd48ca155cc55ebf7ba1824864 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 11:18:31 +0200 Subject: mac80211: fix drv_config_iface_filter() behaviour There are two bugs with this, first, it shouldn't be called on an interface that's down, and secondly, it should then be called when the interface comes up. Note that the currently only user (iwlwifi) doesn't seem to care about either of these scenarios. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200417111830.401d82c7a0bf.I5dc7d718816460c2d8d89c7af6c215f9e2b3078f@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 15 +++++++++------ net/mac80211/iface.c | 5 +++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 0f72813fed53..b90f2131ec7a 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3421,12 +3421,15 @@ static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, if (!local->open_count) break; - if (sdata->vif.probe_req_reg == 1) - drv_config_iface_filter(local, sdata, FIF_PROBE_REQ, - FIF_PROBE_REQ); - else if (sdata->vif.probe_req_reg == 0) - drv_config_iface_filter(local, sdata, 0, - FIF_PROBE_REQ); + if (ieee80211_sdata_running(sdata)) { + if (sdata->vif.probe_req_reg == 1) + drv_config_iface_filter(local, sdata, + FIF_PROBE_REQ, + FIF_PROBE_REQ); + else if (sdata->vif.probe_req_reg == 0) + drv_config_iface_filter(local, sdata, 0, + FIF_PROBE_REQ); + } ieee80211_configure_filter(local); break; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d069825705d6..f900c84fb40f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -644,6 +644,11 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) local->fif_probe_req++; } + if (sdata->vif.probe_req_reg) + drv_config_iface_filter(local, sdata, + FIF_PROBE_REQ, + FIF_PROBE_REQ); + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) changed |= ieee80211_reset_erp_info(sdata); -- cgit v1.2.3-59-g8ed1b From de2cc97acba036847cdfb74e336f6e560eb6907c Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Fri, 17 Apr 2020 13:21:33 +0300 Subject: iwlwifi: scan: remove support for fw scan api v13 The fw already supports scan api v14 and the firmware version that supports only v13 was not published, so we can remove support for v13 in the driver. Signed-off-by: Tova Mussai Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200417131727.11883315579a.I4f59100e457c1079c5e4c90e4930d1fa62b7ddd7@changeid --- drivers/net/wireless/intel/iwlwifi/fw/api/scan.h | 26 ----------------- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 36 ------------------------ 2 files changed, 62 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/scan.h b/drivers/net/wireless/intel/iwlwifi/fw/api/scan.h index 3d770f406c38..5cc33a1b7172 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/api/scan.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/api/scan.h @@ -1050,20 +1050,6 @@ struct iwl_scan_req_params_v12 { struct iwl_scan_probe_params_v3 probe_params; } __packed; /* SCAN_REQUEST_PARAMS_API_S_VER_12 */ -/** - * struct iwl_scan_req_params_v13 - * @general_params: &struct iwl_scan_general_params_v10 - * @channel_params: &struct iwl_scan_channel_params_v4 - * @periodic_params: &struct iwl_scan_periodic_parms_v1 - * @probe_params: &struct iwl_scan_probe_params_v4 - */ -struct iwl_scan_req_params_v13 { - struct iwl_scan_general_params_v10 general_params; - struct iwl_scan_channel_params_v4 channel_params; - struct iwl_scan_periodic_parms_v1 periodic_params; - struct iwl_scan_probe_params_v4 probe_params; -} __packed; /* SCAN_REQUEST_PARAMS_API_S_VER_13 */ - /** * struct iwl_scan_req_params_v14 * @general_params: &struct iwl_scan_general_params_v10 @@ -1090,18 +1076,6 @@ struct iwl_scan_req_umac_v12 { struct iwl_scan_req_params_v12 scan_params; } __packed; /* SCAN_REQUEST_CMD_UMAC_API_S_VER_12 */ -/** - * struct iwl_scan_req_umac_v13 - * @uid: scan id, &enum iwl_umac_scan_uid_offsets - * @ooc_priority: out of channel priority - &enum iwl_scan_priority - * @scan_params: scan parameters - */ -struct iwl_scan_req_umac_v13 { - __le32 uid; - __le32 ooc_priority; - struct iwl_scan_req_params_v13 scan_params; -} __packed; /* SCAN_REQUEST_CMD_UMAC_API_S_VER_13 */ - /** * struct iwl_scan_req_umac_v14 * @uid: scan id, &enum iwl_umac_scan_uid_offsets diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index 7a6ad1ff7055..bc48113f0568 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -2051,40 +2051,6 @@ static int iwl_mvm_scan_umac_v12(struct iwl_mvm *mvm, struct ieee80211_vif *vif, return 0; } -static int iwl_mvm_scan_umac_v13(struct iwl_mvm *mvm, struct ieee80211_vif *vif, - struct iwl_mvm_scan_params *params, int type, - int uid) -{ - struct iwl_scan_req_umac_v13 *cmd = mvm->scan_cmd; - struct iwl_scan_req_params_v13 *scan_p = &cmd->scan_params; - int ret; - u16 gen_flags; - u32 bitmap_ssid = 0; - - mvm->scan_uid_status[uid] = type; - - cmd->ooc_priority = cpu_to_le32(iwl_mvm_scan_umac_ooc_priority(params)); - cmd->uid = cpu_to_le32(uid); - - gen_flags = iwl_mvm_scan_umac_flags_v2(mvm, params, vif, type); - iwl_mvm_scan_umac_fill_general_p_v10(mvm, params, vif, - &scan_p->general_params, - gen_flags); - - ret = iwl_mvm_fill_scan_sched_params(params, - scan_p->periodic_params.schedule, - &scan_p->periodic_params.delay); - if (ret) - return ret; - - iwl_mvm_scan_umac_fill_probe_p_v4(params, &scan_p->probe_params, - &bitmap_ssid); - iwl_mvm_scan_umac_fill_ch_p_v4(mvm, params, vif, - &scan_p->channel_params, bitmap_ssid); - - return 0; -} - static int iwl_mvm_scan_umac_v14(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct iwl_mvm_scan_params *params, int type, int uid) @@ -2235,7 +2201,6 @@ struct iwl_scan_umac_handler { static const struct iwl_scan_umac_handler iwl_scan_umac_handlers[] = { /* set the newest version first to shorten the list traverse time */ IWL_SCAN_UMAC_HANDLER(14), - IWL_SCAN_UMAC_HANDLER(13), IWL_SCAN_UMAC_HANDLER(12), }; @@ -2594,7 +2559,6 @@ static int iwl_scan_req_umac_get_size(u8 scan_ver) { switch (scan_ver) { IWL_SCAN_REQ_UMAC_HANDLE_SIZE(14); - IWL_SCAN_REQ_UMAC_HANDLE_SIZE(13); IWL_SCAN_REQ_UMAC_HANDLE_SIZE(12); } -- cgit v1.2.3-59-g8ed1b From 4d797fce783a8eb11dd23463828db84743795046 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 1 Apr 2020 17:25:47 +0300 Subject: cfg80211: Unprotected Beacon frame RX indication Extend cfg80211_rx_unprot_mlme_mgmt() to cover indication of unprotected Beacon frames in addition to the previously used Deauthentication and Disassociation frames. The Beacon frame case is quite similar, but has couple of exceptions: this is used both with fully unprotected and also incorrectly protected frames and there is a rate limit on the events to avoid unnecessary flooding netlink events in case something goes wrong. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200401142548.6990-1-jouni@codeaurora.org [add missing kernel-doc] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++++++-- include/uapi/linux/nl80211.h | 7 +++++++ net/wireless/nl80211.c | 13 +++++++++++-- net/wireless/sme.c | 2 ++ 4 files changed, 30 insertions(+), 4 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 70e48f66dac8..775952677b3d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5045,6 +5045,8 @@ struct cfg80211_cqm_config; * @pmsr_list: (private) peer measurement requests * @pmsr_lock: (private) peer measurements requests/results lock * @pmsr_free_wk: (private) peer measurements cleanup work + * @unprot_beacon_reported: (private) timestamp of last + * unprotected beacon report */ struct wireless_dev { struct wiphy *wiphy; @@ -5121,6 +5123,8 @@ struct wireless_dev { struct list_head pmsr_list; spinlock_t pmsr_lock; struct work_struct pmsr_free_wk; + + unsigned long unprot_beacon_reported; }; static inline u8 *wdev_address(struct wireless_dev *wdev) @@ -6135,12 +6139,16 @@ void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); /** * cfg80211_rx_unprot_mlme_mgmt - notification of unprotected mlme mgmt frame * @dev: network device - * @buf: deauthentication frame (header + body) + * @buf: received management frame (header + body) * @len: length of the frame data * * This function is called whenever a received deauthentication or dissassoc * frame has been dropped in station mode because of MFP being used but the - * frame was not protected. This function may sleep. + * frame was not protected. This is also used to notify reception of a Beacon + * frame that was dropped because it did not include a valid MME MIC while + * beacon protection was enabled (BIGTK configured in station mode). + * + * This function may sleep. */ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 2b691161830f..afdd9802ccb8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1151,6 +1151,11 @@ * @NL80211_CMD_SET_TID_CONFIG: Data frame TID specific configuration * is passed using %NL80211_ATTR_TID_CONFIG attribute. * + * @NL80211_CMD_UNPROT_BEACON: Unprotected or incorrectly protected Beacon + * frame. This event is used to indicate that a received Beacon frame was + * dropped because it did not include a valid MME MIC while beacon + * protection was enabled (BIGTK configured in station mode). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1377,6 +1382,8 @@ enum nl80211_commands { NL80211_CMD_SET_TID_CONFIG, + NL80211_CMD_UNPROT_BEACON, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 692bcd35f809..2127e5344b1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -15542,10 +15542,19 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, if (WARN_ON(len < 2)) return; - if (ieee80211_is_deauth(mgmt->frame_control)) + if (ieee80211_is_deauth(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DEAUTHENTICATE; - else + } else if (ieee80211_is_disassoc(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DISASSOCIATE; + } else if (ieee80211_is_beacon(mgmt->frame_control)) { + if (wdev->unprot_beacon_reported && + elapsed_jiffies_msecs(wdev->unprot_beacon_reported) < 10000) + return; + cmd = NL80211_CMD_UNPROT_BEACON; + wdev->unprot_beacon_reported = jiffies; + } else { + return; + } trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index ac3e60aa1fc8..3554c0d951f4 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -694,6 +694,7 @@ void __cfg80211_connect_result(struct net_device *dev, return; } + wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); @@ -921,6 +922,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, cfg80211_hold_bss(bss_from_pub(info->bss)); wdev->current_bss = bss_from_pub(info->bss); + wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); -- cgit v1.2.3-59-g8ed1b From 9eaf183af741e3d8393eb571ac8aec9ee7d6530e Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 1 Apr 2020 17:25:48 +0300 Subject: mac80211: Report beacon protection failures to user space Report received Beacon frames that do not have a valid MME MIC when beacon protection is enabled. This covers both the cases of no MME in the received frame and invalid MIC in the MME. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200401142548.6990-2-jouni@codeaurora.org Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 91a13aee4378..a724551b8ddf 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1984,8 +1984,12 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + - NUM_DEFAULT_BEACON_KEYS) + NUM_DEFAULT_BEACON_KEYS) { + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, + skb->len); return RX_DROP_MONITOR; /* unexpected BIP keyidx */ + } rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx); if (!rx->key) @@ -2131,6 +2135,10 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED; + if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE)) + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, skb->len); + return result; } @@ -2411,8 +2419,12 @@ static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) return -EACCES; } if (unlikely(ieee80211_is_beacon(fc) && rx->key && - ieee80211_get_mmie_keyidx(rx->skb) < 0)) + ieee80211_get_mmie_keyidx(rx->skb) < 0)) { + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + rx->skb->data, + rx->skb->len); return -EACCES; + } /* * When using MFP, Action frames are not allowed prior to * having configured keys. -- cgit v1.2.3-59-g8ed1b From 6cd536fe62ef58d7c4eac2da07ab0ed7fd19010d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:43:01 +0200 Subject: cfg80211: change internal management frame registration API Almost all drivers below cfg80211 get the API wrong (except for cfg80211) and are unable to cope with multiple registrations for the same frame type, which is valid due to the match filter. This seems to indicate the API is wrong, and we should maintain the full information in cfg80211 instead of the drivers. Change the API to no longer inform the driver about individual registrations and unregistrations, but rather every time about the entire state of the entire wiphy and single wdev, whenever it may have changed. This also simplifies the code in cfg80211 as it no longer has to track exactly what was unregistered and can free things immediately. Signed-off-by: Johannes Berg Acked-by: Arend van Spriel Reviewed-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200417124300.f47f3828afc8.I7f81ef59c2c5a340d7075fb3c6d0e08e8aeffe07@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 26 ++++--- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 19 ++--- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 16 ++--- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 83 +++++++++++----------- include/net/cfg80211.h | 23 ++++-- include/net/mac80211.h | 2 +- net/mac80211/cfg.c | 50 ++++++------- net/mac80211/ieee80211_i.h | 2 +- net/wireless/core.c | 7 +- net/wireless/core.h | 6 +- net/wireless/mlme.c | 72 ++++++++----------- net/wireless/rdev-ops.h | 11 +-- net/wireless/trace.h | 20 +++--- 13 files changed, 159 insertions(+), 178 deletions(-) diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 37cf602d8adf..67f8f2aa7a53 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -3249,22 +3249,19 @@ static int ath6kl_get_antenna(struct wiphy *wiphy, return 0; } -static void ath6kl_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) +static void ath6kl_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct ath6kl_vif *vif = ath6kl_vif_from_wdev(wdev); - ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: frame_type=0x%x reg=%d\n", - __func__, frame_type, reg); - if (frame_type == IEEE80211_STYPE_PROBE_REQ) { - /* - * Note: This notification callback is not allowed to sleep, so - * we cannot send WMI_PROBE_REQ_REPORT_CMD here. Instead, we - * hardcode target to report Probe Request frames all the time. - */ - vif->probe_req_report = reg; - } + /* + * FIXME: send WMI_PROBE_REQ_REPORT_CMD here instead of hardcoding + * the reporting in the target all the time, this callback + * *is* allowed to sleep after all. + */ + vif->probe_req_report = + upd->interface_stypes & BIT(IEEE80211_STYPE_PROBE_REQ >> 4); } static int ath6kl_cfg80211_sscan_start(struct wiphy *wiphy, @@ -3464,7 +3461,8 @@ static struct cfg80211_ops ath6kl_cfg80211_ops = { .remain_on_channel = ath6kl_remain_on_channel, .cancel_remain_on_channel = ath6kl_cancel_remain_on_channel, .mgmt_tx = ath6kl_mgmt_tx, - .mgmt_frame_register = ath6kl_mgmt_frame_register, + .update_mgmt_frame_registrations = + ath6kl_update_mgmt_frame_registrations, .get_antenna = ath6kl_get_antenna, .sched_scan_start = ath6kl_cfg80211_sscan_start, .sched_scan_stop = ath6kl_cfg80211_sscan_stop, diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 2ba165330038..fa846471dac2 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -4979,21 +4979,15 @@ brcmf_cfg80211_change_station(struct wiphy *wiphy, struct net_device *ndev, } static void -brcmf_cfg80211_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) +brcmf_cfg80211_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct brcmf_cfg80211_vif *vif; - u16 mgmt_type; - brcmf_dbg(TRACE, "Enter, frame_type %04x, reg=%d\n", frame_type, reg); - - mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; vif = container_of(wdev, struct brcmf_cfg80211_vif, wdev); - if (reg) - vif->mgmt_rx_reg |= BIT(mgmt_type); - else - vif->mgmt_rx_reg &= ~BIT(mgmt_type); + + vif->mgmt_rx_reg = upd->interface_stypes; } @@ -5408,7 +5402,8 @@ static struct cfg80211_ops brcmf_cfg80211_ops = { .change_station = brcmf_cfg80211_change_station, .sched_scan_start = brcmf_cfg80211_sched_scan_start, .sched_scan_stop = brcmf_cfg80211_sched_scan_stop, - .mgmt_frame_register = brcmf_cfg80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + brcmf_cfg80211_update_mgmt_frame_registrations, .mgmt_tx = brcmf_cfg80211_mgmt_tx, .remain_on_channel = brcmf_p2p_remain_on_channel, .cancel_remain_on_channel = brcmf_cfg80211_cancel_remain_on_channel, diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 1566d2197906..21a17d4017c4 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -269,17 +269,12 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, * CFG802.11 operation handler to register a mgmt frame. */ static void -mwifiex_cfg80211_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) +mwifiex_cfg80211_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); - u32 mask; - - if (reg) - mask = priv->mgmt_frame_mask | BIT(frame_type >> 4); - else - mask = priv->mgmt_frame_mask & ~BIT(frame_type >> 4); + u32 mask = upd->interface_stypes; if (mask != priv->mgmt_frame_mask) { priv->mgmt_frame_mask = mask; @@ -4189,7 +4184,8 @@ static struct cfg80211_ops mwifiex_cfg80211_ops = { .del_key = mwifiex_cfg80211_del_key, .set_default_mgmt_key = mwifiex_cfg80211_set_default_mgmt_key, .mgmt_tx = mwifiex_cfg80211_mgmt_tx, - .mgmt_frame_register = mwifiex_cfg80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + mwifiex_cfg80211_update_mgmt_frame_registrations, .remain_on_channel = mwifiex_cfg80211_remain_on_channel, .cancel_remain_on_channel = mwifiex_cfg80211_cancel_remain_on_channel, .set_default_key = mwifiex_cfg80211_set_default_key, diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 8be17106008d..54cdf3ad09d7 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -389,55 +389,57 @@ static int qtnf_set_wiphy_params(struct wiphy *wiphy, u32 changed) } static void -qtnf_mgmt_frame_register(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg) +qtnf_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); - u16 mgmt_type; - u16 new_mask; - u16 qlink_frame_type = 0; + u16 new_mask = upd->interface_stypes; + u16 old_mask = vif->mgmt_frames_bitmask; + static const struct { + u16 mask, qlink_type; + } updates[] = { + { + .mask = BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_ASSOC_REQ >> 4), + .qlink_type = QLINK_MGMT_FRAME_ASSOC_REQ, + }, + { + .mask = BIT(IEEE80211_STYPE_AUTH >> 4), + .qlink_type = QLINK_MGMT_FRAME_AUTH, + }, + { + .mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + .qlink_type = QLINK_MGMT_FRAME_PROBE_REQ, + }, + { + .mask = BIT(IEEE80211_STYPE_ACTION >> 4), + .qlink_type = QLINK_MGMT_FRAME_ACTION, + }, + }; + unsigned int i; - mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; + if (new_mask == old_mask) + return; - if (reg) - new_mask = vif->mgmt_frames_bitmask | BIT(mgmt_type); - else - new_mask = vif->mgmt_frames_bitmask & ~BIT(mgmt_type); + for (i = 0; i < ARRAY_SIZE(updates); i++) { + u16 mask = updates[i].mask; + u16 qlink_frame_type = updates[i].qlink_type; + bool reg; - if (new_mask == vif->mgmt_frames_bitmask) - return; + /* the ! are here due to the assoc/reassoc merge */ + if (!(new_mask & mask) == !(old_mask & mask)) + continue; - switch (frame_type & IEEE80211_FCTL_STYPE) { - case IEEE80211_STYPE_REASSOC_REQ: - case IEEE80211_STYPE_ASSOC_REQ: - qlink_frame_type = QLINK_MGMT_FRAME_ASSOC_REQ; - break; - case IEEE80211_STYPE_AUTH: - qlink_frame_type = QLINK_MGMT_FRAME_AUTH; - break; - case IEEE80211_STYPE_PROBE_REQ: - qlink_frame_type = QLINK_MGMT_FRAME_PROBE_REQ; - break; - case IEEE80211_STYPE_ACTION: - qlink_frame_type = QLINK_MGMT_FRAME_ACTION; - break; - default: - pr_warn("VIF%u.%u: unsupported frame type: %X\n", - vif->mac->macid, vif->vifid, - (frame_type & IEEE80211_FCTL_STYPE) >> 4); - return; - } + reg = new_mask & mask; - if (qtnf_cmd_send_register_mgmt(vif, qlink_frame_type, reg)) { - pr_warn("VIF%u.%u: failed to %sregister mgmt frame type 0x%x\n", - vif->mac->macid, vif->vifid, reg ? "" : "un", - frame_type); - return; + if (qtnf_cmd_send_register_mgmt(vif, qlink_frame_type, reg)) + pr_warn("VIF%u.%u: failed to %sregister qlink frame type 0x%x\n", + vif->mac->macid, vif->vifid, reg ? "" : "un", + qlink_frame_type); } vif->mgmt_frames_bitmask = new_mask; - pr_debug("VIF%u.%u: %sregistered mgmt frame type 0x%x\n", - vif->mac->macid, vif->vifid, reg ? "" : "un", frame_type); } static int @@ -1017,7 +1019,8 @@ static struct cfg80211_ops qtn_cfg80211_ops = { .change_beacon = qtnf_change_beacon, .stop_ap = qtnf_stop_ap, .set_wiphy_params = qtnf_set_wiphy_params, - .mgmt_frame_register = qtnf_mgmt_frame_register, + .update_mgmt_frame_registrations = + qtnf_update_mgmt_frame_registrations, .mgmt_tx = qtnf_mgmt_tx, .change_station = qtnf_change_station, .del_station = qtnf_del_station, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 775952677b3d..bc273f6d60f2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3384,6 +3384,17 @@ struct cfg80211_update_owe_info { size_t ie_len; }; +/** + * struct mgmt_frame_regs - management frame registrations data + * @global_stypes: bitmap of management frame subtypes registered + * for the entire device + * @interface_stypes: bitmap of management frame subtypes registered + * for the given interface + */ +struct mgmt_frame_regs { + u32 global_stypes, interface_stypes; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -3608,8 +3619,8 @@ struct cfg80211_update_owe_info { * The driver should not call cfg80211_sched_scan_stopped() for a requested * stop (when this method returns 0). * - * @mgmt_frame_register: Notify driver that a management frame type was - * registered. The callback is allowed to sleep. + * @update_mgmt_frame_registrations: Notify the driver that management frame + * registrations were updated. The callback is allowed to sleep. * * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device. * Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may @@ -3932,9 +3943,9 @@ struct cfg80211_ops { struct net_device *dev, u32 rate, u32 pkts, u32 intvl); - void (*mgmt_frame_register)(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg); + void (*update_mgmt_frame_registrations)(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd); int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant); int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant); @@ -5015,6 +5026,7 @@ struct cfg80211_cqm_config; * by cfg80211 on change_interface * @mgmt_registrations: list of registrations for management frames * @mgmt_registrations_lock: lock for the list + * @mgmt_registrations_update_wk: update work to defer from atomic context * @mtx: mutex used to lock data in this struct, may be used by drivers * and some API functions require it held * @beacon_interval: beacon interval used on this device for transmitting @@ -5060,6 +5072,7 @@ struct wireless_dev { struct list_head mgmt_registrations; spinlock_t mgmt_registrations_lock; + struct work_struct mgmt_registrations_update_wk; struct mutex mtx; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b6b4de0e4b5e..f6dc5a38720f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1647,7 +1647,7 @@ struct ieee80211_vif { struct dentry *debugfs_dir; #endif - unsigned int probe_req_reg; + bool probe_req_reg; bool txqs_stopped[IEEE80211_NUM_ACS]; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b90f2131ec7a..e62b4764e82e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3398,44 +3398,35 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, return 0; } -static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, +static void +ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg) + struct mgmt_frame_regs *upd) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); + bool global_change, intf_change; - switch (frame_type) { - case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ: - if (reg) { - local->probe_req_reg++; - sdata->vif.probe_req_reg++; - } else { - if (local->probe_req_reg) - local->probe_req_reg--; + global_change = + local->probe_req_reg != !!(upd->global_stypes & preq_mask); + local->probe_req_reg = upd->global_stypes & preq_mask; - if (sdata->vif.probe_req_reg) - sdata->vif.probe_req_reg--; - } + intf_change = sdata->vif.probe_req_reg != + !!(upd->interface_stypes & preq_mask); + sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; - if (!local->open_count) - break; + if (!local->open_count) + return; - if (ieee80211_sdata_running(sdata)) { - if (sdata->vif.probe_req_reg == 1) - drv_config_iface_filter(local, sdata, - FIF_PROBE_REQ, - FIF_PROBE_REQ); - else if (sdata->vif.probe_req_reg == 0) - drv_config_iface_filter(local, sdata, 0, - FIF_PROBE_REQ); - } + if (intf_change && ieee80211_sdata_running(sdata)) + drv_config_iface_filter(local, sdata, + sdata->vif.probe_req_reg ? + FIF_PROBE_REQ : 0, + FIF_PROBE_REQ); + if (global_change) ieee80211_configure_filter(local); - break; - default: - break; - } } static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) @@ -4020,7 +4011,8 @@ const struct cfg80211_ops mac80211_config_ops = { .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, .set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config, - .mgmt_frame_register = ieee80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + ieee80211_update_mgmt_frame_registrations, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, .set_rekey_data = ieee80211_set_rekey_data, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 934a91bef575..da41ee996d3d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1167,7 +1167,7 @@ struct ieee80211_local { /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; - int probe_req_reg; + bool probe_req_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; diff --git a/net/wireless/core.c b/net/wireless/core.c index 341402b4f178..5757dea2aa94 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -480,9 +480,6 @@ use_default_name: INIT_LIST_HEAD(&rdev->bss_list); INIT_LIST_HEAD(&rdev->sched_scan_req_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); - INIT_LIST_HEAD(&rdev->mlme_unreg); - spin_lock_init(&rdev->mlme_unreg_lock); - INIT_WORK(&rdev->mlme_unreg_wk, cfg80211_mlme_unreg_wk); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT @@ -1030,7 +1027,6 @@ void wiphy_unregister(struct wiphy *wiphy) cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->sched_scan_stop_wk); - flush_work(&rdev->mlme_unreg_wk); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); @@ -1094,6 +1090,7 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); + flush_work(&wdev->mgmt_registrations_update_wk); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: @@ -1238,6 +1235,8 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); spin_lock_init(&wdev->mgmt_registrations_lock); + INIT_WORK(&wdev->mgmt_registrations_update_wk, + cfg80211_mgmt_registrations_update_wk); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); diff --git a/net/wireless/core.h b/net/wireless/core.h index bb897a803ffe..30fb2c35ae43 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -60,10 +60,6 @@ struct cfg80211_registered_device { struct list_head beacon_registrations; spinlock_t beacon_registrations_lock; - struct list_head mlme_unreg; - spinlock_t mlme_unreg_lock; - struct work_struct mlme_unreg_wk; - /* protected by RTNL only */ int num_running_ifaces; int num_running_monitor_ifaces; @@ -386,7 +382,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, int match_len, struct netlink_ext_ack *extack); -void cfg80211_mlme_unreg_wk(struct work_struct *wk); +void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index e4805a3bd310..2e1a21e90b83 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -429,43 +429,37 @@ struct cfg80211_mgmt_registration { u8 match[]; }; -static void -cfg80211_process_mlme_unregistrations(struct cfg80211_registered_device *rdev) +static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct wireless_dev *tmp; struct cfg80211_mgmt_registration *reg; + struct mgmt_frame_regs upd = {}; ASSERT_RTNL(); - spin_lock_bh(&rdev->mlme_unreg_lock); - while ((reg = list_first_entry_or_null(&rdev->mlme_unreg, - struct cfg80211_mgmt_registration, - list))) { - list_del(®->list); - spin_unlock_bh(&rdev->mlme_unreg_lock); - - if (rdev->ops->mgmt_frame_register) { - u16 frame_type = le16_to_cpu(reg->frame_type); + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { + list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) { + u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); - rdev_mgmt_frame_register(rdev, reg->wdev, - frame_type, false); + upd.global_stypes |= mask; + if (tmp == wdev) + upd.interface_stypes |= mask; } - - kfree(reg); - - spin_lock_bh(&rdev->mlme_unreg_lock); } - spin_unlock_bh(&rdev->mlme_unreg_lock); + rcu_read_unlock(); + + rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } -void cfg80211_mlme_unreg_wk(struct work_struct *wk) +void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) { - struct cfg80211_registered_device *rdev; - - rdev = container_of(wk, struct cfg80211_registered_device, - mlme_unreg_wk); + struct wireless_dev *wdev = container_of(wk, struct wireless_dev, + mgmt_registrations_update_wk); rtnl_lock(); - cfg80211_process_mlme_unregistrations(rdev); + cfg80211_mgmt_registrations_update(wdev); rtnl_unlock(); } @@ -473,8 +467,6 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, int match_len, struct netlink_ext_ack *extack) { - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; @@ -534,10 +526,8 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, } } - if (err) { - kfree(nreg); + if (err) goto out; - } memcpy(nreg->match, match_data, match_len); nreg->match_len = match_len; @@ -547,15 +537,12 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, list_add(&nreg->list, &wdev->mgmt_registrations); spin_unlock_bh(&wdev->mgmt_registrations_lock); - /* process all unregistrations to avoid driver confusion */ - cfg80211_process_mlme_unregistrations(rdev); - - if (rdev->ops->mgmt_frame_register) - rdev_mgmt_frame_register(rdev, wdev, frame_type, true); + cfg80211_mgmt_registrations_update(wdev); return 0; out: + kfree(nreg); spin_unlock_bh(&wdev->mgmt_registrations_lock); return err; @@ -574,11 +561,9 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) continue; list_del(®->list); - spin_lock(&rdev->mlme_unreg_lock); - list_add_tail(®->list, &rdev->mlme_unreg); - spin_unlock(&rdev->mlme_unreg_lock); + kfree(reg); - schedule_work(&rdev->mlme_unreg_wk); + schedule_work(&wdev->mgmt_registrations_update_wk); } spin_unlock_bh(&wdev->mgmt_registrations_lock); @@ -594,15 +579,16 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&wdev->mgmt_registrations_lock); - spin_lock(&rdev->mlme_unreg_lock); - list_splice_tail_init(&wdev->mgmt_registrations, &rdev->mlme_unreg); - spin_unlock(&rdev->mlme_unreg_lock); + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { + list_del(®->list); + kfree(reg); + } spin_unlock_bh(&wdev->mgmt_registrations_lock); - cfg80211_process_mlme_unregistrations(rdev); + cfg80211_mgmt_registrations_update(wdev); } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 99462f0c4e08..df5142e86c4f 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -819,13 +819,16 @@ rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, } static inline void -rdev_mgmt_frame_register(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, u16 frame_type, bool reg) +rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { might_sleep(); - trace_rdev_mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); - rdev->ops->mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); + trace_rdev_update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); + if (rdev->ops->update_mgmt_frame_registrations) + rdev->ops->update_mgmt_frame_registrations(&rdev->wiphy, wdev, + upd); trace_rdev_return_void(&rdev->wiphy); } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 839df54cee21..ee736620f1e3 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1582,25 +1582,25 @@ TRACE_EVENT(rdev_set_bitrate_mask, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) ); -TRACE_EVENT(rdev_mgmt_frame_register, +TRACE_EVENT(rdev_update_mgmt_frame_registrations, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg), - TP_ARGS(wiphy, wdev, frame_type, reg), + struct mgmt_frame_regs *upd), + TP_ARGS(wiphy, wdev, upd), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY - __field(u16, frame_type) - __field(bool, reg) + __field(u16, global_stypes) + __field(u16, interface_stypes) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; - __entry->frame_type = frame_type; - __entry->reg = reg; + __entry->global_stypes = upd->global_stypes; + __entry->interface_stypes = upd->interface_stypes; ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", frame_type: 0x%.2x, reg: %s ", - WIPHY_PR_ARG, WDEV_PR_ARG, __entry->frame_type, - __entry->reg ? "true" : "false") + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", global: 0x%.2x, intf: 0x%.2x", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->global_stypes, __entry->interface_stypes) ); TRACE_EVENT(rdev_return_int_tx_rx, -- cgit v1.2.3-59-g8ed1b From 9dba48a6ece79da064655736dc7347a5fcadedef Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:40:15 +0200 Subject: cfg80211: support multicast RX registration For DPP, there's a need to receive multicast action frames, but many drivers need a special filter configuration for this. Support announcing from userspace in the management registration that multicast RX is required, with an extended feature flag if the driver handles this. Signed-off-by: Johannes Berg Reviewed-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200417124013.c46238801048.Ib041d437ce0bff28a0c6d5dc915f68f1d8591002@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 13 +++++++++++++ net/wireless/core.h | 3 ++- net/wireless/mlme.c | 38 ++++++++++++++++++++++++++++++-------- net/wireless/nl80211.c | 10 ++++++++++ 5 files changed, 59 insertions(+), 9 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bc273f6d60f2..dbb9675fe38f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3390,9 +3390,13 @@ struct cfg80211_update_owe_info { * for the entire device * @interface_stypes: bitmap of management frame subtypes registered * for the given interface + * @global_mcast_rx: mcast RX is needed globally for these subtypes + * @interface_mcast_stypes: mcast RX is needed on this interface + * for these subtypes */ struct mgmt_frame_regs { u32 global_stypes, interface_stypes; + u32 global_mcast_stypes, interface_mcast_stypes; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index afdd9802ccb8..e0dc89eceab8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -687,6 +687,10 @@ * four bytes for vendor frames including the OUI. The registration * cannot be dropped, but is removed automatically when the netlink * socket is closed. Multiple registrations can be made. + * The %NL80211_ATTR_RECEIVE_MULTICAST flag attribute can be given if + * %NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS is available, in which + * case the registration can also be modified to include/exclude the + * flag, rather than requiring unregistration to change it. * @NL80211_CMD_REGISTER_ACTION: Alias for @NL80211_CMD_REGISTER_FRAME for * backward compatibility * @NL80211_CMD_FRAME: Management frame TX request and RX notification. This @@ -2477,6 +2481,9 @@ enum nl80211_commands { * no roaming occurs between the reauth threshold and PMK expiration, * disassociation is still forced. * + * @NL80211_ATTR_RECEIVE_MULTICAST: multicast flag for the + * %NL80211_CMD_REGISTER_FRAME command, see the description there. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2952,6 +2959,8 @@ enum nl80211_attrs { NL80211_ATTR_PMK_LIFETIME, NL80211_ATTR_PMK_REAUTH_THRESHOLD, + NL80211_ATTR_RECEIVE_MULTICAST, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5691,6 +5700,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_DEL_IBSS_STA: The driver supports removing stations * in IBSS mode, essentially by dropping their state. * + * @NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS: management frame registrations + * are possible for multicast frames and those will be reported properly. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5742,6 +5754,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH, NL80211_EXT_FEATURE_PROTECTED_TWT, NL80211_EXT_FEATURE_DEL_IBSS_STA, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/core.h b/net/wireless/core.h index 30fb2c35ae43..639d41896573 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -381,7 +381,8 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack); + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack); void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 2e1a21e90b83..409497a3527d 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -426,6 +426,8 @@ struct cfg80211_mgmt_registration { __le16 frame_type; + bool multicast_rx; + u8 match[]; }; @@ -442,10 +444,18 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) { u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); + u32 mcast_mask = 0; + + if (reg->multicast_rx) + mcast_mask = mask; upd.global_stypes |= mask; - if (tmp == wdev) + upd.global_mcast_stypes |= mcast_mask; + + if (tmp == wdev) { upd.interface_stypes |= mask; + upd.interface_mcast_stypes |= mcast_mask; + } } } rcu_read_unlock(); @@ -465,11 +475,13 @@ void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack) + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack) { struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; + bool update_multicast = false; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; @@ -520,6 +532,11 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, continue; if (memcmp(reg->match, match_data, mlen) == 0) { + if (reg->multicast_rx != multicast_rx) { + update_multicast = true; + reg->multicast_rx = multicast_rx; + break; + } NL_SET_ERR_MSG(extack, "Match already configured"); err = -EALREADY; break; @@ -529,12 +546,17 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, if (err) goto out; - memcpy(nreg->match, match_data, match_len); - nreg->match_len = match_len; - nreg->nlportid = snd_portid; - nreg->frame_type = cpu_to_le16(frame_type); - nreg->wdev = wdev; - list_add(&nreg->list, &wdev->mgmt_registrations); + if (update_multicast) { + kfree(nreg); + } else { + memcpy(nreg->match, match_data, match_len); + nreg->match_len = match_len; + nreg->nlportid = snd_portid; + nreg->frame_type = cpu_to_le16(frame_type); + nreg->wdev = wdev; + nreg->multicast_rx = multicast_rx; + list_add(&nreg->list, &wdev->mgmt_registrations); + } spin_unlock_bh(&wdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2127e5344b1a..73a3e885d4dd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -661,6 +661,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT_NO_PREAUTH] = { .type = NLA_FLAG }, [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), + [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -10773,9 +10774,18 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_RECEIVE_MULTICAST] && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS)) { + GENL_SET_ERR_MSG(info, + "multicast RX registrations are not supported"); + return -EOPNOTSUPP; + } + return cfg80211_mlme_register_mgmt(wdev, info->snd_portid, frame_type, nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]), + info->attrs[NL80211_ATTR_RECEIVE_MULTICAST], info->extack); } -- cgit v1.2.3-59-g8ed1b From 155d7c733807190258639c66b36340948f369349 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Apr 2020 14:06:00 +0200 Subject: nl80211: allow client-only BIGTK support The current NL80211_EXT_FEATURE_BEACON_PROTECTION feature flag requires both AP and client support, add a new one called NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT that enables only support in client (and P2P-client) modes. Link: https://lore.kernel.org/r/20200420140559.6ba704053a5a.Ifeb869fb0b48e52fe0cb9c15572b93ac8a924f8d@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 19 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e0dc89eceab8..9679d561f7d0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5690,6 +5690,8 @@ enum nl80211_feature_flags { * * @NL80211_EXT_FEATURE_BEACON_PROTECTION: The driver supports Beacon protection * and can receive key configuration for BIGTK using key indexes 6 and 7. + * @NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT: The driver supports Beacon + * protection as a client only and cannot transmit protected beacons. * * @NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH: The driver can disable the * forwarding of preauth frames over the control port. They are then @@ -5755,6 +5757,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_PROTECTED_TWT, NL80211_EXT_FEATURE_DEL_IBSS_STA, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 73a3e885d4dd..d470d77d2eb6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3905,14 +3905,25 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) }; void *hdr; struct sk_buff *msg; + bool bigtk_support = false; + + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION)) + bigtk_support = true; + + if ((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_STATION || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_CLIENT) && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) + bigtk_support = true; if (info->attrs[NL80211_ATTR_KEY_IDX]) { key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 5 && - !wiphy_ext_feature_isset( - &rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + + if (key_idx >= 6 && key_idx <= 7 && !bigtk_support) { + GENL_SET_ERR_MSG(info, "BIGTK not supported"); return -EINVAL; + } } if (info->attrs[NL80211_ATTR_MAC]) -- cgit v1.2.3-59-g8ed1b From 873b1cf61105a67f01f6fc3758405edb1bd1ba35 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 21 Apr 2020 17:48:15 +0300 Subject: mac80211: Process multicast RX registration for Action frames Convert a user space registration for processing multicast Action frames (NL80211_CMD_REGISTER_FRAME with NL80211_ATTR_RECEIVE_MULTICAST) to a new enum ieee80211_filter_flags bit FIF_MCAST_ACTION so that drivers can update their RX filter parameters appropriately, if needed. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200421144815.19175-1-jouni@codeaurora.org [rename variables to rx_mcast_action_reg indicating action frames only] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/cfg.c | 14 +++++++++++--- net/mac80211/ieee80211_i.h | 1 + net/mac80211/main.c | 3 +++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f6dc5a38720f..f12fe3b0a868 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1620,6 +1620,8 @@ enum ieee80211_vif_flags { * monitor interface (if that is requested.) * @probe_req_reg: probe requests should be reported to mac80211 for this * interface. + * @rx_mcast_action_reg: multicast Action frames should be reported to mac80211 + * for this interface. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). * @txq: the multicast data TX queue (if driver uses the TXQ abstraction) @@ -1648,6 +1650,7 @@ struct ieee80211_vif { #endif bool probe_req_reg; + bool rx_mcast_action_reg; bool txqs_stopped[IEEE80211_NUM_ACS]; @@ -3091,6 +3094,8 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * @FIF_PSPOLL: pass PS Poll frames * * @FIF_PROBE_REQ: pass probe request frames + * + * @FIF_MCAST_ACTION: pass multicast Action frames */ enum ieee80211_filter_flags { FIF_ALLMULTI = 1<<1, @@ -3101,6 +3106,7 @@ enum ieee80211_filter_flags { FIF_OTHER_BSS = 1<<6, FIF_PSPOLL = 1<<7, FIF_PROBE_REQ = 1<<8, + FIF_MCAST_ACTION = 1<<9, }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index e62b4764e82e..f0d43b9cfa43 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3406,15 +3406,23 @@ ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); + u32 action_mask = BIT(IEEE80211_STYPE_ACTION >> 4); bool global_change, intf_change; global_change = - local->probe_req_reg != !!(upd->global_stypes & preq_mask); + (local->probe_req_reg != !!(upd->global_stypes & preq_mask)) || + (local->rx_mcast_action_reg != + !!(upd->global_mcast_stypes & action_mask)); local->probe_req_reg = upd->global_stypes & preq_mask; + local->rx_mcast_action_reg = upd->global_mcast_stypes & action_mask; - intf_change = sdata->vif.probe_req_reg != - !!(upd->interface_stypes & preq_mask); + intf_change = (sdata->vif.probe_req_reg != + !!(upd->interface_stypes & preq_mask)) || + (sdata->vif.rx_mcast_action_reg != + !!(upd->interface_mcast_stypes & action_mask)); sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; + sdata->vif.rx_mcast_action_reg = + upd->interface_mcast_stypes & action_mask; if (!local->open_count) return; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index da41ee996d3d..9407cf44305c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1168,6 +1168,7 @@ struct ieee80211_local { int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; bool probe_req_reg; + bool rx_mcast_action_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0e9ad60fb2b3..a0cb052ea30d 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -64,6 +64,9 @@ void ieee80211_configure_filter(struct ieee80211_local *local) if (local->fif_pspoll) new_flags |= FIF_PSPOLL; + if (local->rx_mcast_action_reg) + new_flags |= FIF_MCAST_ACTION; + spin_lock_bh(&local->filter_lock); changed_flags = local->filter_flags ^ new_flags; -- cgit v1.2.3-59-g8ed1b From 9166cc49767a646990a73380480356416b7794eb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:09:32 +0200 Subject: mac80211: implement Operating Mode Notification extended NSS support Somehow we missed this for a long time, but similar to the extended NSS support in VHT capabilities, we need to have this in Operating Mode notification. Implement it by * parsing the 160/80+80 bit there and setting the bandwidth appropriately * having callers of ieee80211_get_vht_max_nss() pass in the current max NSS value as received in the operating mode notification in order to modify it appropriately depending on the extended NSS bits. This updates all drivers that use it, i.e. only iwlwifi/mvm. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.098483728cfa.I4e8c25d3288441759c2793247197229f0696a37d@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 6 +++--- include/linux/ieee80211.h | 12 +++++++++--- net/mac80211/vht.c | 10 ++++++++-- net/wireless/util.c | 26 ++++++++++++++------------ 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index c1aba2bf73cf..a8c13f6fbce0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -1,10 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * - * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved. + * Copyright(c) 2005 - 2014, 2018 - 2020 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation * * Contact Information: * Intel Linux Wireless @@ -1430,7 +1429,8 @@ static u32 rs_bw_from_sta_bw(struct ieee80211_sta *sta) */ if (ieee80211_get_vht_max_nss(&vht_cap, IEEE80211_VHT_CHANWIDTH_160MHZ, - 0, true) < sta->rx_nss) + 0, true, + sta->rx_nss) < sta->rx_nss) return RATE_MCS_CHAN_WIDTH_80; return RATE_MCS_CHAN_WIDTH_160; case IEEE80211_STA_RX_BW_80: diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 16268ef1cbcc..c326aec535c6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2019 Intel Corporation + * Copyright (c) 2018 - 2020 Intel Corporation */ #ifndef LINUX_IEEE80211_H @@ -859,6 +859,7 @@ enum ieee80211_ht_chanwidth_values { * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width + * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask * (the NSS value is the value of this field + 1) * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift @@ -866,11 +867,12 @@ enum ieee80211_ht_chanwidth_values { * using a beamforming steering matrix */ enum ieee80211_vht_opmode_bits { - IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 3, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 0x03, IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ = 0, IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ = 1, IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ = 2, IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ = 3, + IEEE80211_OPMODE_NOTIF_BW_160_80P80 = 0x04, IEEE80211_OPMODE_NOTIF_RX_NSS_MASK = 0x70, IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT = 4, IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, @@ -1731,6 +1733,9 @@ struct ieee80211_mu_edca_param_set { * @ext_nss_bw_capable: indicates whether or not the local transmitter * (rate scaling algorithm) can deal with the new logic * (dot11VHTExtendedNSSBWCapable) + * @max_vht_nss: current maximum NSS as advertised by the STA in + * operating mode notification, can be 0 in which case the + * capability data will be used to derive this (from MCS support) * * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can * vary for a given BW/MCS. This function parses the data. @@ -1739,7 +1744,8 @@ struct ieee80211_mu_edca_param_set { */ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable); + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss); /* 802.11ax HE MAC capabilities */ #define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 632f07401850..9c6045f9c24d 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -575,15 +575,21 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + else + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: + /* legacy only, no longer used by newer spec */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } diff --git a/net/wireless/util.c b/net/wireless/util.c index 6590efbbcbb9..123d6ce79b8e 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include #include @@ -2030,10 +2030,10 @@ EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable) + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); - int max_vht_nss = 0; int ext_nss_bw; int supp_width; int i, mcs_encoding; @@ -2041,7 +2041,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, if (map == 0xffff) return 0; - if (WARN_ON(mcs > 9)) + if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; @@ -2050,16 +2050,18 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, else mcs_encoding = 2; - /* find max_vht_nss for the given MCS */ - for (i = 7; i >= 0; i--) { - int supp = (map >> (2 * i)) & 3; + if (!max_vht_nss) { + /* find max_vht_nss for the given MCS */ + for (i = 7; i >= 0; i--) { + int supp = (map >> (2 * i)) & 3; - if (supp == 3) - continue; + if (supp == 3) + continue; - if (supp >= mcs_encoding) { - max_vht_nss = i + 1; - break; + if (supp >= mcs_encoding) { + max_vht_nss = i + 1; + break; + } } } -- cgit v1.2.3-59-g8ed1b From d46b4ab870fa29445b701e922e9aa36b15f833ea Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Thu, 26 Mar 2020 15:09:33 +0200 Subject: mac80211: add twt_protected flag to the bss_conf structure Add a flag to the BSS conf whether the BSS and STA support protected TWT. Signed-off-by: Shaul Triebitz Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.1dcb2d16fa74.I74d7c007dad2601d2e39f54612fe6554dd5ab386@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/mlme.c | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f12fe3b0a868..5fb80dd8bbbc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -508,6 +508,7 @@ struct ieee80211_ftm_responder_params { * mode only, set if the AP advertises TWT responder role) * @twt_responder: does this BSS support TWT requester (relevant for managed * mode only, set if the AP advertises TWT responder role) + * @twt_protected: does this BSS support protected TWT frames * @assoc: association status * @ibss_joined: indicates whether this station is part of an IBSS * or not @@ -618,6 +619,7 @@ struct ieee80211_bss_conf { bool he_support; bool twt_requester; bool twt_responder; + bool twt_protected; /* association related data */ bool assoc, ibss_joined; bool ibss_creator; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 7139335f29c0..b77787995723 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3384,10 +3384,19 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta); bss_conf->he_support = sta->sta.he_cap.has_he; + if (elems->rsnx && elems->rsnx_len && + (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && + wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_PROTECTED_TWT)) + bss_conf->twt_protected = true; + else + bss_conf->twt_protected = false; + changed |= ieee80211_recalc_twt_req(sdata, sta, elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; + bss_conf->twt_protected = false; } if (bss_conf->he_support) { -- cgit v1.2.3-59-g8ed1b From a4055e74a2ff7c70ccdb6c36254ad5181464f211 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Thu, 26 Mar 2020 15:09:34 +0200 Subject: mac80211: Don't destroy auth data in case of anti-clogging SAE AP may reject authentication with WLAN_STATUS_ANTI_CLOG_REQUIRED. As the user space will immediately continue the authentication flow, there is no need to destroy the authentication data in this case. This saves unneeded station removal and releasing the channel. Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.7483996157a8.I8040a842874aaf6d209df3fc8a2acb97a0bf508b@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b77787995723..56d61bc9954d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2948,10 +2948,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, } if (status_code != WLAN_STATUS_SUCCESS) { + cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); + + if (auth_alg == WLAN_AUTH_SAE && + status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED) + return; + sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); ieee80211_destroy_auth_data(sdata, false); - cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); -- cgit v1.2.3-59-g8ed1b From 2a392596d8811c6d58c014ec881b159c75a0cf45 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:35 +0200 Subject: cfg80211: Parse HE membership selector This extends the support for drivers that rebuilds IEs in the FW (same as with HT/VHT). Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.20feaabfb484.I886252639604c8e3e84b8ef97962f1b0e4beec81@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 3 ++- net/wireless/nl80211.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c326aec535c6..38f513ce7528 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1067,6 +1067,7 @@ struct ieee80211_mgmt { /* Supported rates membership selectors */ #define BSS_MEMBERSHIP_SELECTOR_HT_PHY 127 #define BSS_MEMBERSHIP_SELECTOR_VHT_PHY 126 +#define BSS_MEMBERSHIP_SELECTOR_HE_PHY 122 /* mgmt header + 1 byte category code */ #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index dbb9675fe38f..e288fdcb3df2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1054,6 +1054,7 @@ enum cfg80211_ap_settings_flags { * @ht_required: stations must support HT * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time + * @he_required: stations must support HE * @flags: flags, as defined in enum cfg80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings * @he_bss_color: BSS Color settings @@ -1083,7 +1084,7 @@ struct cfg80211_ap_settings { const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct ieee80211_he_operation *he_oper; - bool ht_required, vht_required; + bool ht_required, vht_required, he_required; bool twt_responder; u32 flags; struct ieee80211_he_obss_pd he_obss_pd; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d470d77d2eb6..3d27b24c68b2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4738,6 +4738,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, params->ht_required = true; if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) params->vht_required = true; + if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) + params->he_required = true; } } -- cgit v1.2.3-59-g8ed1b From 4826e721103acf42421304330cf48a642fa163bb Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:36 +0200 Subject: mac80211: Skip entries with HE membership selector When parsing supported rates IE. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.ed3e66f8c197.I93aad0e5ddb7ce79f05f8153922acb9aa5076d38@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 56d61bc9954d..c77f47b41356 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3154,15 +3154,16 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, *have_higher_than_11mbit = true; /* - * Skip HT and VHT BSS membership selectors since they're not - * rates. + * Skip HT, VHT and HE BSS membership selectors since they're + * not rates. * * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly * the same. */ if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) || - supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY)) + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY) || + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY)) continue; for (j = 0; j < sband->n_bitrates; j++) { -- cgit v1.2.3-59-g8ed1b From 31d8bb4e07f80935ee9bf599a9d99de7ca90fc5a Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Thu, 26 Mar 2020 15:09:37 +0200 Subject: mac80211: agg-tx: refactor sending addba We move the actual arming the timer and sending ADDBA to a function for the use in different places calling the same logic. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.58a337eb90a1.I75934e6464535fbf43969acc796bc886291e79a5@changeid Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 67 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 33da6f738c99..32f40c4f3120 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -448,6 +448,43 @@ static void sta_addba_resp_timer_expired(struct timer_list *t) ieee80211_stop_tx_ba_session(&sta->sta, tid); } +static void ieee80211_send_addba_with_timeout(struct sta_info *sta, + struct tid_ampdu_tx *tid_tx) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sta->local; + u8 tid = tid_tx->tid; + u16 buf_size; + + /* activate the timer for the recipient's addBA response */ + mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); + ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n", + sta->sta.addr, tid); + + spin_lock_bh(&sta->lock); + sta->ampdu_mlme.last_addba_req_time[tid] = jiffies; + sta->ampdu_mlme.addba_req_num[tid]++; + spin_unlock_bh(&sta->lock); + + if (sta->sta.he_cap.has_he) { + buf_size = local->hw.max_tx_aggregation_subframes; + } else { + /* + * We really should use what the driver told us it will + * transmit as the maximum, but certain APs (e.g. the + * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) + * will crash when we use a lower number. + */ + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + } + + /* send AddBA request */ + ieee80211_send_addba_request(sdata, sta->sta.addr, tid, + tid_tx->dialog_token, + sta->tid_seq[tid] >> 4, + buf_size, tid_tx->timeout); +} + void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) { struct tid_ampdu_tx *tid_tx; @@ -462,7 +499,6 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) .timeout = 0, }; int ret; - u16 buf_size; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -508,32 +544,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) return; } - /* activate the timer for the recipient's addBA response */ - mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); - ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n", - sta->sta.addr, tid); - - spin_lock_bh(&sta->lock); - sta->ampdu_mlme.last_addba_req_time[tid] = jiffies; - sta->ampdu_mlme.addba_req_num[tid]++; - spin_unlock_bh(&sta->lock); - - if (sta->sta.he_cap.has_he) { - buf_size = local->hw.max_tx_aggregation_subframes; - } else { - /* - * We really should use what the driver told us it will - * transmit as the maximum, but certain APs (e.g. the - * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) - * will crash when we use a lower number. - */ - buf_size = IEEE80211_MAX_AMPDU_BUF_HT; - } - - /* send AddBA request */ - ieee80211_send_addba_request(sdata, sta->sta.addr, tid, - tid_tx->dialog_token, params.ssn, - buf_size, tid_tx->timeout); + ieee80211_send_addba_with_timeout(sta, tid_tx); } /* -- cgit v1.2.3-59-g8ed1b From 0c197f16f7bc5ddb43073690a80fb15998ad61e4 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Thu, 26 Mar 2020 15:09:38 +0200 Subject: mac80211: agg-tx: add an option to defer ADDBA transmit Driver tells mac80211 to sends ADDBA with SSN (starting sequence number) from the head of the queue, while the transmission of all the frames in the queue may take a while, which causes the peer to time out. In order to fix this scenario, add an option to defer ADDBA transmit until queue is drained. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.0f27423fec75.If67daab123a27c1cbddef000d6a3f212aa6309ef@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +++++- net/mac80211/agg-tx.c | 12 +++++++++++- net/mac80211/sta_info.h | 2 ++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5fb80dd8bbbc..f3147633dda2 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3125,7 +3125,10 @@ enum ieee80211_filter_flags { * @IEEE80211_AMPDU_RX_START: start RX aggregation * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either - * call ieee80211_start_tx_ba_cb_irqsafe() or return the special + * call ieee80211_start_tx_ba_cb_irqsafe() or + * call ieee80211_start_tx_ba_cb_irqsafe() with status + * %IEEE80211_AMPDU_TX_START_DELAY_ADDBA to delay addba after + * ieee80211_start_tx_ba_cb_irqsafe is called, or just return the special * status %IEEE80211_AMPDU_TX_START_IMMEDIATE. * @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting @@ -3151,6 +3154,7 @@ enum ieee80211_ampdu_mlme_action { }; #define IEEE80211_AMPDU_TX_START_IMMEDIATE 1 +#define IEEE80211_AMPDU_TX_START_DELAY_ADDBA 2 /** * struct ieee80211_ampdu_params - AMPDU action parameters diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 32f40c4f3120..c2d5f512526d 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -483,6 +483,8 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta, tid_tx->dialog_token, sta->tid_seq[tid] >> 4, buf_size, tid_tx->timeout); + + WARN_ON(test_and_set_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)); } void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) @@ -521,7 +523,9 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); - if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { + if (ret == IEEE80211_AMPDU_TX_START_DELAY_ADDBA) { + return; + } else if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { /* * We didn't send the request yet, so don't need to check * here if we already got a response, just mark as driver @@ -765,6 +769,12 @@ void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))) return; + if (!test_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)) { + ieee80211_send_addba_with_timeout(sta, tid_tx); + /* RESPONSE_RECEIVED state whould trigger the flow again */ + return; + } + if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) ieee80211_agg_tx_operational(local, sta, tid); } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 36f1abaab9ff..a5de3aa6ea42 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -3,6 +3,7 @@ * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015-2017 Intel Deutschland GmbH + * Copyright(c) 2020 Intel Corporation */ #ifndef STA_INFO_H @@ -116,6 +117,7 @@ enum ieee80211_sta_info_flags { #define HT_AGG_STATE_WANT_STOP 5 #define HT_AGG_STATE_START_CB 6 #define HT_AGG_STATE_STOP_CB 7 +#define HT_AGG_STATE_SENT_ADDBA 8 DECLARE_EWMA(avg_signal, 10, 8) enum ieee80211_agg_stop_reason { -- cgit v1.2.3-59-g8ed1b From 302ff8b7a2b01cfb7645f112bb259af1c146c57a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:39 +0200 Subject: mac80211: Fail association when AP has no legacy rates The MLME logic had a workaround that allowed to continue an association with an AP even if the AP did not provide any basic rates in its supported rates in the association response, assuming that the first (non basic) legacy rate could be used as a basic rate. However, this did not consider the case where the AP (which is obviously buggy) did not provide any legacy rate. Fix this by failing the association, as this can result in an unexpected failure in the low level driver and FW, e.g., in rate scale logic etc. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.d70a1450d83f.I6e6ce5efda351a8544c0e7bfeee260fe3360d401@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c77f47b41356..59a35c7997c3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5036,8 +5036,16 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, * doesn't happen any more, but keep the workaround so * in case some *other* APs are buggy in different ways * we can connect -- with a warning. + * Allow this workaround only in case the AP provided at least + * one rate. */ - if (!basic_rates && min_rate_index >= 0) { + if (min_rate_index < 0) { + sdata_info(sdata, + "No legacy rates in association response\n"); + + sta_info_free(local, new_sta); + return -EINVAL; + } else if (!basic_rates) { sdata_info(sdata, "No basic rates, using min rate instead\n"); basic_rates = BIT(min_rate_index); -- cgit v1.2.3-59-g8ed1b From dba25b04c61170cf8592f87df2bb086201047473 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:09:40 +0200 Subject: mac80211: minstrel_ht_assign_best_tp_rates: remove redundant test We know this pointer isn't NULL and in fact dereferenced it before, remove the redundant test. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.adf551928846.Iae9015573d6c350cc1b12a311d6d13d086beec6c@changeid Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 694a31978a04..5547111d22bf 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010-2013 Felix Fietkau + * Copyright (C) 2019-2020 Intel Corporation */ #include #include @@ -490,7 +491,7 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); - if (tmp_cck_tp_rate && tmp_cck_tp > tmp_mcs_tp) { + if (tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i], tmp_mcs_tp_rate); -- cgit v1.2.3-59-g8ed1b From bdee75d2ac23a1db30d3c689665a584c20220f97 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:09:41 +0200 Subject: mac80211_hwsim: indicate in IBSS that we have transmitted beacons This is actually true because there's no functional beacon distribution and lets us get active scanning working - without it, mac80211 doesn't respond to probe requests. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.554d1199b309.Id86fd36e3d88d2a75d6e0c6618fd93ce8fe84065@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index f1c08b31c564..05e8203aa6d9 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2464,6 +2464,11 @@ static void mac80211_hwsim_get_et_stats(struct ieee80211_hw *hw, WARN_ON(i != MAC80211_HWSIM_SSTATS_LEN); } +static int mac80211_hwsim_tx_last_beacon(struct ieee80211_hw *hw) +{ + return 1; +} + #define HWSIM_COMMON_OPS \ .tx = mac80211_hwsim_tx, \ .start = mac80211_hwsim_start, \ @@ -2474,6 +2479,7 @@ static void mac80211_hwsim_get_et_stats(struct ieee80211_hw *hw, .config = mac80211_hwsim_config, \ .configure_filter = mac80211_hwsim_configure_filter, \ .bss_info_changed = mac80211_hwsim_bss_info_changed, \ + .tx_last_beacon = mac80211_hwsim_tx_last_beacon, \ .sta_add = mac80211_hwsim_sta_add, \ .sta_remove = mac80211_hwsim_sta_remove, \ .sta_notify = mac80211_hwsim_sta_notify, \ -- cgit v1.2.3-59-g8ed1b From b572510100165ba037ba43dbbb0f05e8da12c741 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:02 -0700 Subject: ieee80211: share 802.11 unit conversion helpers MHZ_TO_KHZ, and KHZ_TO_MHZ are useful to drivers and elsewhere so export these in the common ieee80211 header. Move the power helpers also because we might as well. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-2-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 10 ++++++++++ include/net/regulatory.h | 7 ------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 38f513ce7528..a561db435a4b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3330,6 +3330,16 @@ static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size) #define TU_TO_JIFFIES(x) (usecs_to_jiffies((x) * 1024)) #define TU_TO_EXP_TIME(x) (jiffies + TU_TO_JIFFIES(x)) +/* convert frequencies */ +#define MHZ_TO_KHZ(freq) ((freq) * 1000) +#define KHZ_TO_MHZ(freq) ((freq) / 1000) + +/* convert powers */ +#define DBI_TO_MBI(gain) ((gain) * 100) +#define MBI_TO_DBI(gain) ((gain) / 100) +#define DBM_TO_MBM(gain) ((gain) * 100) +#define MBM_TO_DBM(gain) ((gain) / 100) + /** * ieee80211_action_contains_tpc - checks if the frame contains TPC element * @skb: the skb containing the frame, length will be checked diff --git a/include/net/regulatory.h b/include/net/regulatory.h index 3469750df0f4..09a3099886e5 100644 --- a/include/net/regulatory.h +++ b/include/net/regulatory.h @@ -231,13 +231,6 @@ struct ieee80211_regdomain { struct ieee80211_reg_rule reg_rules[]; }; -#define MHZ_TO_KHZ(freq) ((freq) * 1000) -#define KHZ_TO_MHZ(freq) ((freq) / 1000) -#define DBI_TO_MBI(gain) ((gain) * 100) -#define MBI_TO_DBI(gain) ((gain) / 100) -#define DBM_TO_MBM(gain) ((gain) * 100) -#define MBM_TO_DBM(gain) ((gain) / 100) - #define REG_RULE_EXT(start, end, bw, gain, eirp, dfs_cac, reg_flags) \ { \ .freq_range.start_freq_khz = MHZ_TO_KHZ(start), \ -- cgit v1.2.3-59-g8ed1b From 934f4c7dd3a544bb8000f7436f1f0e12e04ebc37 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:03 -0700 Subject: cfg80211: express channels with a KHz component Some bands (S1G) define channels centered on a non-integer MHz. Give ieee80211_channel and cfg80211_chan_def a freq_offset component where the final frequency can be expressed as: MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; Also provide some helper functions to do the frequency conversion and test for equality. Retain the existing interface to frequency and channel conversion helpers, and expose new ones which handle frequencies in units of KHz. Some internal functions (net/wireless/chan.c) pass around a frequency value. Convert these to units of KHz. mesh, ibss, wext, etc. are currently ignored. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-3-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 92 ++++++++++++++++++++++++++++++++++++++++++++++---- net/wireless/chan.c | 68 +++++++++++++++++++++---------------- net/wireless/reg.c | 40 +++++++++++----------- net/wireless/scan.c | 4 +-- net/wireless/trace.h | 21 +++++++++--- net/wireless/util.c | 32 +++++++++++------- 6 files changed, 182 insertions(+), 75 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e288fdcb3df2..a82fc59a1d82 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -128,6 +128,7 @@ enum ieee80211_channel_flags { * with cfg80211. * * @center_freq: center frequency in MHz + * @freq_offset: offset from @center_freq, in KHz * @hw_value: hardware-specific value for the channel * @flags: channel flags from &enum ieee80211_channel_flags. * @orig_flags: channel flags at registration time, used by regulatory @@ -149,6 +150,7 @@ enum ieee80211_channel_flags { struct ieee80211_channel { enum nl80211_band band; u32 center_freq; + u16 freq_offset; u16 hw_value; u32 flags; int max_antenna_gain; @@ -617,6 +619,7 @@ struct key_params { * If edmg is requested (i.e. the .channels member is non-zero), * chan will define the primary channel and all other * parameters are ignored. + * @freq1_offset: offset from @center_freq1, in KHz */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -624,6 +627,7 @@ struct cfg80211_chan_def { u32 center_freq1; u32 center_freq2; struct ieee80211_edmg edmg; + u16 freq1_offset; }; /** @@ -713,6 +717,7 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1, return (chandef1->chan == chandef2->chan && chandef1->width == chandef2->width && chandef1->center_freq1 == chandef2->center_freq1 && + chandef1->freq1_offset == chandef2->freq1_offset && chandef1->center_freq2 == chandef2->center_freq2); } @@ -5177,30 +5182,92 @@ static inline void *wdev_priv(struct wireless_dev *wdev) * cfg80211 offers a number of utility functions that can be useful. */ +/** + * ieee80211_channel_equal - compare two struct ieee80211_channel + * + * @a: 1st struct ieee80211_channel + * @b: 2nd struct ieee80211_channel + * Return: true if center frequency of @a == @b + */ +static inline bool +ieee80211_channel_equal(struct ieee80211_channel *a, + struct ieee80211_channel *b) +{ + return (a->center_freq == b->center_freq && + a->freq_offset == b->freq_offset); +} + +/** + * ieee80211_channel_to_khz - convert ieee80211_channel to frequency in KHz + * @chan: struct ieee80211_channel to convert + * Return: The corresponding frequency (in KHz) + */ +static inline u32 +ieee80211_channel_to_khz(const struct ieee80211_channel *chan) +{ + return MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; +} + +/** + * ieee80211_channel_to_freq_khz - convert channel number to frequency + * @chan: channel number + * @band: band, necessary due to channel number overlap + * Return: The corresponding frequency (in KHz), or 0 if the conversion failed. + */ +u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band); + /** * ieee80211_channel_to_frequency - convert channel number to frequency * @chan: channel number * @band: band, necessary due to channel number overlap * Return: The corresponding frequency (in MHz), or 0 if the conversion failed. */ -int ieee80211_channel_to_frequency(int chan, enum nl80211_band band); +static inline int +ieee80211_channel_to_frequency(int chan, enum nl80211_band band) +{ + return KHZ_TO_MHZ(ieee80211_channel_to_freq_khz(chan, band)); +} + +/** + * ieee80211_freq_khz_to_channel - convert frequency to channel number + * @freq: center frequency in KHz + * Return: The corresponding channel, or 0 if the conversion failed. + */ +int ieee80211_freq_khz_to_channel(u32 freq); /** * ieee80211_frequency_to_channel - convert frequency to channel number - * @freq: center frequency + * @freq: center frequency in MHz * Return: The corresponding channel, or 0 if the conversion failed. */ -int ieee80211_frequency_to_channel(int freq); +static inline int +ieee80211_frequency_to_channel(int freq) +{ + return ieee80211_freq_khz_to_channel(MHZ_TO_KHZ(freq)); +} + +/** + * ieee80211_get_channel_khz - get channel struct from wiphy for specified + * frequency + * @wiphy: the struct wiphy to get the channel for + * @freq: the center frequency (in KHz) of the channel + * Return: The channel struct from @wiphy at @freq. + */ +struct ieee80211_channel * +ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq); /** * ieee80211_get_channel - get channel struct from wiphy for specified frequency * * @wiphy: the struct wiphy to get the channel for - * @freq: the center frequency of the channel - * + * @freq: the center frequency (in MHz) of the channel * Return: The channel struct from @wiphy at @freq. */ -struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq); +static inline struct ieee80211_channel * +ieee80211_get_channel(struct wiphy *wiphy, int freq) +{ + return ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(freq)); +} /** * ieee80211_get_response_rate - get basic rate for a given rate @@ -7228,6 +7295,19 @@ bool ieee80211_operating_class_to_band(u8 operating_class, bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class); +/** + * ieee80211_chandef_to_khz - convert chandef to frequency in KHz + * + * @chandef: the chandef to convert + * + * Returns the center frequency of chandef (1st segment) in KHz. + */ +static inline u32 +ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef) +{ + return MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; +} + /* * cfg80211_tdls_oper_request - request userspace to perform TDLS operation * @dev: the device on which the operation is requested diff --git a/net/wireless/chan.c b/net/wireless/chan.c index fcac5c6366e1..d60e50a3b910 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -27,6 +27,7 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, return; chandef->chan = chan; + chandef->freq1_offset = chan->freq_offset; chandef->center_freq2 = 0; chandef->edmg.bw_config = 0; chandef->edmg.channels = 0; @@ -153,7 +154,8 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_20_NOHT: - if (chandef->center_freq1 != control_freq) + if (ieee80211_chandef_to_khz(chandef) != + ieee80211_channel_to_khz(chandef->chan)) return false; if (chandef->center_freq2) return false; @@ -386,10 +388,11 @@ static u32 cfg80211_get_start_freq(u32 center_freq, { u32 start_freq; - if (bandwidth <= 20) + bandwidth = MHZ_TO_KHZ(bandwidth); + if (bandwidth <= MHZ_TO_KHZ(20)) start_freq = center_freq; else - start_freq = center_freq - bandwidth/2 + 10; + start_freq = center_freq - bandwidth / 2 + MHZ_TO_KHZ(10); return start_freq; } @@ -399,10 +402,11 @@ static u32 cfg80211_get_end_freq(u32 center_freq, { u32 end_freq; - if (bandwidth <= 20) + bandwidth = MHZ_TO_KHZ(bandwidth); + if (bandwidth <= MHZ_TO_KHZ(20)) end_freq = center_freq; else - end_freq = center_freq + bandwidth/2 - 10; + end_freq = center_freq + bandwidth / 2 - MHZ_TO_KHZ(10); return end_freq; } @@ -417,8 +421,8 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return -EINVAL; @@ -449,8 +453,8 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, return -EINVAL; ret = cfg80211_get_chans_dfs_required(wiphy, - chandef->center_freq1, - width); + ieee80211_chandef_to_khz(chandef), + width); if (ret < 0) return ret; else if (ret > 0) @@ -460,8 +464,8 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, return 0; ret = cfg80211_get_chans_dfs_required(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); if (ret < 0) return ret; else if (ret > 0) @@ -503,8 +507,8 @@ static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy, * DFS_AVAILABLE). Return number of usable channels * (require CAC). Allow DFS and non-DFS channel mix. */ - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return -EINVAL; @@ -536,8 +540,9 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, if (width < 0) return false; - r1 = cfg80211_get_chans_dfs_usable(wiphy, chandef->center_freq1, - width); + r1 = cfg80211_get_chans_dfs_usable(wiphy, + MHZ_TO_KHZ(chandef->center_freq1), + width); if (r1 < 0) return false; @@ -546,8 +551,8 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_80P80: WARN_ON(!chandef->center_freq2); r2 = cfg80211_get_chans_dfs_usable(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); if (r2 < 0) return false; break; @@ -694,8 +699,8 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, * If any channel in between is disabled or has not * had gone through CAC return false */ - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return false; @@ -724,7 +729,8 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, if (width < 0) return false; - r = cfg80211_get_chans_dfs_available(wiphy, chandef->center_freq1, + r = cfg80211_get_chans_dfs_available(wiphy, + MHZ_TO_KHZ(chandef->center_freq1), width); /* If any of channels unavailable for cf1 just return */ @@ -735,8 +741,8 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_80P80: WARN_ON(!chandef->center_freq2); r = cfg80211_get_chans_dfs_available(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); break; default: WARN_ON(chandef->center_freq2); @@ -757,8 +763,8 @@ static unsigned int cfg80211_get_chans_dfs_cac_time(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return 0; @@ -790,14 +796,14 @@ cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, return 0; t1 = cfg80211_get_chans_dfs_cac_time(wiphy, - chandef->center_freq1, + MHZ_TO_KHZ(chandef->center_freq1), width); if (!chandef->center_freq2) return t1; t2 = cfg80211_get_chans_dfs_cac_time(wiphy, - chandef->center_freq2, + MHZ_TO_KHZ(chandef->center_freq2), width); return max(t1, t2); @@ -813,8 +819,8 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c || c->flags & prohibited_flags) return false; } @@ -976,13 +982,15 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, prohibited_flags |= IEEE80211_CHAN_NO_OFDM; - if (!cfg80211_secondary_chans_ok(wiphy, chandef->center_freq1, + if (!cfg80211_secondary_chans_ok(wiphy, + ieee80211_chandef_to_khz(chandef), width, prohibited_flags)) return false; if (!chandef->center_freq2) return true; - return cfg80211_secondary_chans_ok(wiphy, chandef->center_freq2, + return cfg80211_secondary_chans_ok(wiphy, + MHZ_TO_KHZ(chandef->center_freq2), width, prohibited_flags); } EXPORT_SYMBOL(cfg80211_chandef_usable); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index d476d4da0d09..0d74a31ef0ab 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1658,22 +1658,23 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd const struct ieee80211_channel *chan) { const struct ieee80211_freq_range *freq_range = NULL; - u32 max_bandwidth_khz, bw_flags = 0; + u32 max_bandwidth_khz, center_freq_khz, bw_flags = 0; freq_range = ®_rule->freq_range; max_bandwidth_khz = freq_range->max_bandwidth_khz; + center_freq_khz = ieee80211_channel_to_khz(chan); /* Check if auto calculation requested */ if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, - MHZ_TO_KHZ(chan->center_freq), + center_freq_khz, MHZ_TO_KHZ(10))) bw_flags |= IEEE80211_CHAN_NO_10MHZ; if (!cfg80211_does_bw_fit_range(freq_range, - MHZ_TO_KHZ(chan->center_freq), + center_freq_khz, MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; @@ -1710,7 +1711,7 @@ static void handle_channel(struct wiphy *wiphy, flags = chan->orig_flags; - reg_rule = freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq)); + reg_rule = freq_reg_info(wiphy, ieee80211_channel_to_khz(chan)); if (IS_ERR(reg_rule)) { /* * We will disable all channels that do not match our @@ -1729,13 +1730,13 @@ static void handle_channel(struct wiphy *wiphy, if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - pr_debug("Disabling freq %d MHz for good\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz for good\n", + chan->center_freq, chan->freq_offset); chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } else { - pr_debug("Disabling freq %d MHz\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz\n", + chan->center_freq, chan->freq_offset); chan->flags |= IEEE80211_CHAN_DISABLED; } return; @@ -1936,7 +1937,7 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, sband = wiphy->bands[reg_beacon->chan.band]; chan = &sband->channels[chan_idx]; - if (likely(chan->center_freq != reg_beacon->chan.center_freq)) + if (likely(!ieee80211_channel_equal(chan, ®_beacon->chan))) return; if (chan->beacon_found) @@ -2269,18 +2270,18 @@ static void handle_channel_custom(struct wiphy *wiphy, u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; - u32 bw; + u32 bw, center_freq_khz; + center_freq_khz = ieee80211_channel_to_khz(chan); for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { - reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(chan->center_freq), - regd, bw); + reg_rule = freq_reg_info_regd(center_freq_khz, regd, bw); if (!IS_ERR(reg_rule)) break; } if (IS_ERR_OR_NULL(reg_rule)) { - pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz as custom regd has no rule that fits it\n", + chan->center_freq, chan->freq_offset); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { chan->flags |= IEEE80211_CHAN_DISABLED; } else { @@ -3337,8 +3338,8 @@ static bool pending_reg_beacon(struct ieee80211_channel *beacon_chan) struct reg_beacon *pending_beacon; list_for_each_entry(pending_beacon, ®_pending_beacons, list) - if (beacon_chan->center_freq == - pending_beacon->chan.center_freq) + if (ieee80211_channel_equal(beacon_chan, + &pending_beacon->chan)) return true; return false; } @@ -3367,9 +3368,10 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, if (!reg_beacon) return -ENOMEM; - pr_debug("Found new beacon on frequency: %d MHz (Ch %d) on %s\n", - beacon_chan->center_freq, - ieee80211_frequency_to_channel(beacon_chan->center_freq), + pr_debug("Found new beacon on frequency: %d.%03d MHz (Ch %d) on %s\n", + beacon_chan->center_freq, beacon_chan->freq_offset, + ieee80211_freq_khz_to_channel( + ieee80211_channel_to_khz(beacon_chan)), wiphy_name(wiphy)); memcpy(®_beacon->chan, beacon_chan, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 4000382aef48..74ea4cfb39fb 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1322,8 +1322,8 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, return channel; } - freq = ieee80211_channel_to_frequency(channel_number, channel->band); - alt_channel = ieee80211_get_channel(wiphy, freq); + freq = ieee80211_channel_to_freq_khz(channel_number, channel->band); + alt_channel = ieee80211_get_channel_khz(wiphy, freq); if (!alt_channel) { if (channel->band == NL80211_BAND_2GHZ) { /* diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ee736620f1e3..53c887ea67c7 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -112,24 +112,29 @@ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ - __field(u32, center_freq) + __field(u32, center_freq) \ + __field(u16, freq_offset) #define CHAN_ASSIGN(chan) \ do { \ if (chan) { \ __entry->band = chan->band; \ __entry->center_freq = chan->center_freq; \ + __entry->freq_offset = chan->freq_offset; \ } else { \ __entry->band = 0; \ __entry->center_freq = 0; \ + __entry->freq_offset = 0; \ } \ } while (0) -#define CHAN_PR_FMT "band: %d, freq: %u" -#define CHAN_PR_ARG __entry->band, __entry->center_freq +#define CHAN_PR_FMT "band: %d, freq: %u.%03u" +#define CHAN_PR_ARG __entry->band, __entry->center_freq, __entry->freq_offset #define CHAN_DEF_ENTRY __field(enum nl80211_band, band) \ __field(u32, control_freq) \ + __field(u32, freq_offset) \ __field(u32, width) \ __field(u32, center_freq1) \ + __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHAN_DEF_ASSIGN(chandef) \ do { \ @@ -137,21 +142,27 @@ __entry->band = (chandef)->chan->band; \ __entry->control_freq = \ (chandef)->chan->center_freq; \ + __entry->freq_offset = \ + (chandef)->chan->freq_offset; \ __entry->width = (chandef)->width; \ __entry->center_freq1 = (chandef)->center_freq1;\ + __entry->freq1_offset = (chandef)->freq1_offset;\ __entry->center_freq2 = (chandef)->center_freq2;\ } else { \ __entry->band = 0; \ __entry->control_freq = 0; \ + __entry->freq_offset = 0; \ __entry->width = 0; \ __entry->center_freq1 = 0; \ + __entry->freq1_offset = 0; \ __entry->center_freq2 = 0; \ } \ } while (0) #define CHAN_DEF_PR_FMT \ - "band: %d, control freq: %u, width: %d, cf1: %u, cf2: %u" + "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u" #define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq, \ - __entry->width, __entry->center_freq1, \ + __entry->freq_offset, __entry->width, \ + __entry->center_freq1, __entry->freq1_offset, \ __entry->center_freq2 #define SINFO_ENTRY __field(int, generation) \ diff --git a/net/wireless/util.c b/net/wireless/util.c index 123d6ce79b8e..df75e58eca5d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -72,7 +72,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, } EXPORT_SYMBOL(ieee80211_mandatory_rates); -int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) +u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ @@ -81,15 +81,15 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) switch (band) { case NL80211_BAND_2GHZ: if (chan == 14) - return 2484; + return MHZ_TO_KHZ(2484); else if (chan < 14) - return 2407 + chan * 5; + return MHZ_TO_KHZ(2407 + chan * 5); break; case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) - return 4000 + chan * 5; + return MHZ_TO_KHZ(4000 + chan * 5); else - return 5000 + chan * 5; + return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: /* see 802.11ax D4.1 27.3.22.2 */ @@ -98,17 +98,20 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) break; case NL80211_BAND_60GHZ: if (chan < 7) - return 56160 + chan * 2160; + return MHZ_TO_KHZ(56160 + chan * 2160); break; default: ; } return 0; /* not supported */ } -EXPORT_SYMBOL(ieee80211_channel_to_frequency); +EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); -int ieee80211_frequency_to_channel(int freq) +int ieee80211_freq_khz_to_channel(u32 freq) { + /* TODO: just handle MHz for now */ + freq = KHZ_TO_MHZ(freq); + /* see 802.11 17.3.8.3.2 and Annex J */ if (freq == 2484) return 14; @@ -126,9 +129,10 @@ int ieee80211_frequency_to_channel(int freq) else return 0; } -EXPORT_SYMBOL(ieee80211_frequency_to_channel); +EXPORT_SYMBOL(ieee80211_freq_khz_to_channel); -struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) +struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy, + u32 freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; @@ -141,14 +145,16 @@ struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) continue; for (i = 0; i < sband->n_channels; i++) { - if (sband->channels[i].center_freq == freq) - return &sband->channels[i]; + struct ieee80211_channel *chan = &sband->channels[i]; + + if (ieee80211_channel_to_khz(chan) == freq) + return chan; } } return NULL; } -EXPORT_SYMBOL(ieee80211_get_channel); +EXPORT_SYMBOL(ieee80211_get_channel_khz); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { -- cgit v1.2.3-59-g8ed1b From b6011960f392d1de619f10aa5d088c27f1e7526c Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:04 -0700 Subject: mac80211: handle channel frequency offset cfg80211_chan_def and ieee80211_channel recently gained a frequency offset component. Handle this where it makes sense (potentially required by S1G channels). For IBSS, TDLS, CSA, and ROC we return -EOPNOTSUPP if a channel with frequency offset is passed, since they may or may not work. Once someone tests and verifies these commands work on thos types of channels, we can remove that error. join_ocb and join_mesh look harmless because they use a simple ieee80211_vif_use_channel(), which is using an already verified channel, so we let those through. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-4-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 6 ++++++ net/mac80211/chan.c | 1 + net/mac80211/ibss.c | 5 +++++ net/mac80211/main.c | 8 +++++--- net/mac80211/mlme.c | 16 ++++++++++++---- net/mac80211/offchannel.c | 4 ++++ net/mac80211/scan.c | 1 + net/mac80211/tdls.c | 4 ++++ net/mac80211/trace.h | 41 +++++++++++++++++++++++++++++++++-------- 9 files changed, 71 insertions(+), 15 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f0d43b9cfa43..ae3e06375a28 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3287,6 +3287,12 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } + if (params->chandef.chan->freq_offset) { + /* this may work, but is untested */ + err = -EOPNOTSUPP; + goto out; + } + chanctx = container_of(conf, struct ieee80211_chanctx, conf); ch_switch.timestamp = 0; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 9c94baaf693c..e6e192f53e4e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -533,6 +533,7 @@ static void ieee80211_del_chanctx(struct ieee80211_local *local, struct cfg80211_chan_def *chandef = &local->_oper_chandef; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = chandef->chan->center_freq; + chandef->freq1_offset = chandef->chan->freq_offset; chandef->center_freq2 = 0; /* NOTE: Disabling radar is only valid here for diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index d40744903fa9..2479cd48fed0 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1758,6 +1758,11 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, int i; int ret; + if (params->chandef.chan->freq_offset) { + /* this may work, but is untested */ + return -EOPNOTSUPP; + } + ret = cfg80211_chandef_dfs_required(local->hw.wiphy, ¶ms->chandef, sdata->wdev.iftype); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a0cb052ea30d..dfcee5e462da 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -107,13 +107,15 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) chandef.chan = local->tmp_channel; chandef.width = NL80211_CHAN_WIDTH_20_NOHT; chandef.center_freq1 = chandef.chan->center_freq; + chandef.freq1_offset = chandef.chan->freq_offset; } else chandef = local->_oper_chandef; WARN(!cfg80211_chandef_valid(&chandef), - "control:%d MHz width:%d center: %d/%d MHz", - chandef.chan->center_freq, chandef.width, - chandef.center_freq1, chandef.center_freq2); + "control:%d.%03d MHz width:%d center: %d.%03d/%d MHz", + chandef.chan->center_freq, chandef.chan->freq_offset, + chandef.width, chandef.center_freq1, chandef.freq1_offset, + chandef.center_freq2); if (!cfg80211_chandef_identical(&chandef, &local->_oper_chandef)) local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 59a35c7997c3..acc8adf50d69 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -162,6 +162,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, chandef->chan = channel; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = channel->center_freq; + chandef->freq1_offset = channel->freq_offset; if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | @@ -396,9 +397,12 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, return 0; sdata_info(sdata, - "AP %pM changed bandwidth, new config is %d MHz, width %d (%d/%d MHz)\n", - ifmgd->bssid, chandef.chan->center_freq, chandef.width, - chandef.center_freq1, chandef.center_freq2); + "AP %pM changed bandwidth, new config is %d.%03d MHz, " + "width %d (%d.%03d/%d MHz)\n", + ifmgd->bssid, chandef.chan->center_freq, + chandef.chan->freq_offset, chandef.width, + chandef.center_freq1, chandef.freq1_offset, + chandef.center_freq2); if (flags != (ifmgd->flags & (IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT | @@ -1364,10 +1368,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chandef, IEEE80211_CHAN_DISABLED)) { sdata_info(sdata, - "AP %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", + "AP %pM switches to unsupported channel " + "(%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), " + "disconnecting\n", ifmgd->associated->bssid, csa_ie.chandef.chan->center_freq, + csa_ie.chandef.chan->freq_offset, csa_ie.chandef.width, csa_ie.chandef.center_freq1, + csa_ie.chandef.freq1_offset, csa_ie.chandef.center_freq2); ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index c710504ccf1a..db3b8bf75656 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -557,6 +557,10 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, lockdep_assert_held(&local->mtx); + if (channel->freq_offset) + /* this may work, but is untested */ + return -EOPNOTSUPP; + if (local->use_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index fdac8192a519..4d14118dddca 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -896,6 +896,7 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; + local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; switch (scan_req->scan_width) { case NL80211_BSS_CHAN_WIDTH_5: diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 7ff22f9d6e80..8ad420db3766 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1566,6 +1566,10 @@ ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, u32 ch_sw_tm_ie; int ret; + if (chandef->chan->freq_offset) + /* this may work, but is untested */ + return -EOPNOTSUPP; + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, addr); if (!sta) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 427f51a0a994..1b4709694d2a 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -37,32 +37,42 @@ #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" #define CHANDEF_ENTRY __field(u32, control_freq) \ + __field(u32, freq_offset) \ __field(u32, chan_width) \ __field(u32, center_freq1) \ + __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHANDEF_ASSIGN(c) \ __entry->control_freq = (c) ? ((c)->chan ? (c)->chan->center_freq : 0) : 0; \ + __entry->freq_offset = (c) ? ((c)->chan ? (c)->chan->freq_offset : 0) : 0; \ __entry->chan_width = (c) ? (c)->width : 0; \ __entry->center_freq1 = (c) ? (c)->center_freq1 : 0; \ + __entry->freq1_offset = (c) ? (c)->freq1_offset : 0; \ __entry->center_freq2 = (c) ? (c)->center_freq2 : 0; -#define CHANDEF_PR_FMT " control:%d MHz width:%d center: %d/%d MHz" -#define CHANDEF_PR_ARG __entry->control_freq, __entry->chan_width, \ - __entry->center_freq1, __entry->center_freq2 +#define CHANDEF_PR_FMT " control:%d.%03d MHz width:%d center: %d.%03d/%d MHz" +#define CHANDEF_PR_ARG __entry->control_freq, __entry->freq_offset, __entry->chan_width, \ + __entry->center_freq1, __entry->freq1_offset, __entry->center_freq2 #define MIN_CHANDEF_ENTRY \ __field(u32, min_control_freq) \ + __field(u32, min_freq_offset) \ __field(u32, min_chan_width) \ __field(u32, min_center_freq1) \ + __field(u32, min_freq1_offset) \ __field(u32, min_center_freq2) #define MIN_CHANDEF_ASSIGN(c) \ __entry->min_control_freq = (c)->chan ? (c)->chan->center_freq : 0; \ + __entry->min_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0; \ __entry->min_chan_width = (c)->width; \ __entry->min_center_freq1 = (c)->center_freq1; \ + __entry->freq1_offset = (c)->freq1_offset; \ __entry->min_center_freq2 = (c)->center_freq2; -#define MIN_CHANDEF_PR_FMT " min_control:%d MHz min_width:%d min_center: %d/%d MHz" -#define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_chan_width, \ - __entry->min_center_freq1, __entry->min_center_freq2 +#define MIN_CHANDEF_PR_FMT " min_control:%d.%03d MHz min_width:%d min_center: %d.%03d/%d MHz" +#define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_freq_offset, \ + __entry->min_chan_width, \ + __entry->min_center_freq1, __entry->min_freq1_offset, \ + __entry->min_center_freq2 #define CHANCTX_ENTRY CHANDEF_ENTRY \ MIN_CHANDEF_ENTRY \ @@ -412,6 +422,7 @@ TRACE_EVENT(drv_bss_info_changed, __field(s32, cqm_rssi_hyst) __field(u32, channel_width) __field(u32, channel_cfreq1) + __field(u32, channel_cfreq1_offset) __dynamic_array(u32, arp_addr_list, info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? IEEE80211_BSS_ARP_ADDR_LIST_LEN : @@ -452,6 +463,7 @@ TRACE_EVENT(drv_bss_info_changed, __entry->cqm_rssi_hyst = info->cqm_rssi_hyst; __entry->channel_width = info->chandef.width; __entry->channel_cfreq1 = info->chandef.center_freq1; + __entry->channel_cfreq1_offset = info->chandef.freq1_offset; __entry->arp_addr_cnt = info->arp_addr_cnt; memcpy(__get_dynamic_array(arp_addr_list), info->arp_addr_list, sizeof(u32) * (info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? @@ -1223,6 +1235,7 @@ TRACE_EVENT(drv_remain_on_channel, LOCAL_ENTRY VIF_ENTRY __field(int, center_freq) + __field(int, freq_offset) __field(unsigned int, duration) __field(u32, type) ), @@ -1231,14 +1244,16 @@ TRACE_EVENT(drv_remain_on_channel, LOCAL_ASSIGN; VIF_ASSIGN; __entry->center_freq = chan->center_freq; + __entry->freq_offset = chan->freq_offset; __entry->duration = duration; __entry->type = type; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " freq:%dMHz duration:%dms type=%d", + LOCAL_PR_FMT VIF_PR_FMT " freq:%d.%03dMHz duration:%dms type=%d", LOCAL_PR_ARG, VIF_PR_ARG, - __entry->center_freq, __entry->duration, __entry->type + __entry->center_freq, __entry->freq_offset, + __entry->duration, __entry->type ) ); @@ -1546,8 +1561,10 @@ struct trace_vif_entry { struct trace_chandef_entry { u32 control_freq; + u32 freq_offset; u32 chan_width; u32 center_freq1; + u32 freq1_offset; u32 center_freq2; } __packed; @@ -1597,18 +1614,26 @@ TRACE_EVENT(drv_switch_vif_chanctx, sizeof(local_vifs[i].vif.vif_name)); SWITCH_ENTRY_ASSIGN(old_chandef.control_freq, old_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(old_chandef.freq_offset, + old_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(old_chandef.chan_width, old_ctx->def.width); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq1, old_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(old_chandef.freq1_offset, + old_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq2, old_ctx->def.center_freq2); SWITCH_ENTRY_ASSIGN(new_chandef.control_freq, new_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(new_chandef.freq_offset, + new_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(new_chandef.chan_width, new_ctx->def.width); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq1, new_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(new_chandef.freq1_offset, + new_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq2, new_ctx->def.center_freq2); } -- cgit v1.2.3-59-g8ed1b From 3b23c184f72acddad39c40373f165e1a9e384758 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:05 -0700 Subject: mac80211: add freq_offset to RX status RX status needs a KHz component, so add freq_offset. We can reduce the bits for the frequency since 60 GHz isn't supported. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-5-thomas@adapt-ip.com [fix commit message] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 10 +++++++++- net/mac80211/mlme.c | 6 ++++-- net/mac80211/rx.c | 1 + net/mac80211/scan.c | 3 ++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f3147633dda2..2936049f918e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1335,6 +1335,7 @@ enum mac80211_rx_encoding { * @freq: frequency the radio was tuned to when receiving this frame, in MHz * This field must be set for management frames, but isn't strictly needed * for data (other) frames - for those it only affects radiotap reporting. + * @freq_offset: @freq has a positive offset of 500Khz. * @signal: signal strength when receiving this frame, either in dBm, in dB or * unspecified depending on the hardware capabilities flags * @IEEE80211_HW_SIGNAL_* @@ -1365,7 +1366,7 @@ struct ieee80211_rx_status { u32 device_timestamp; u32 ampdu_reference; u32 flag; - u16 freq; + u16 freq: 13, freq_offset: 1; u8 enc_flags; u8 encoding:2, bw:3, he_ru:3; u8 he_gi:2, he_dcm:1; @@ -1381,6 +1382,13 @@ struct ieee80211_rx_status { u8 zero_length_psdu_type; }; +static inline u32 +ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status) +{ + return MHZ_TO_KHZ(rx_status->freq) + + (rx_status->freq_offset ? 500 : 0); +} + /** * struct ieee80211_vendor_radiotap - vendor radiotap data information * @present: presence bitmap for this vendor namespace diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index acc8adf50d69..a259b4487b60 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3683,7 +3683,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, sdata_assert_lock(sdata); - channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + channel = ieee80211_get_channel_khz(local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel) return; @@ -3899,7 +3900,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, return; } - if (rx_status->freq != chanctx_conf->def.chan->center_freq) { + if (ieee80211_rx_status_to_khz(rx_status) != + ieee80211_channel_to_khz(chanctx_conf->def.chan)) { rcu_read_unlock(); return; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a724551b8ddf..eaf8931e4627 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -412,6 +412,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos++; /* IEEE80211_RADIOTAP_CHANNEL */ + /* TODO: frequency offset in KHz */ put_unaligned_le16(status->freq, pos); pos += 2; if (status->bw == RATE_INFO_BW_10) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 4d14118dddca..5db15996524f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -275,7 +275,8 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) return; } - channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + channel = ieee80211_get_channel_khz(local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; -- cgit v1.2.3-59-g8ed1b From be689f68d040702a3521035d267949d3927971f0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Apr 2020 12:01:04 +0200 Subject: cfg80211: reject channels/chandefs with KHz offset >= 1000 This should be covered by the next MHz, make sure that the numbers are always normalized. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200424120103.12b91ecf75f9.I4bf499d58404283bbfacb517d614a816763bccf2@changeid Signed-off-by: Johannes Berg --- net/wireless/chan.c | 3 +++ net/wireless/core.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/wireless/chan.c b/net/wireless/chan.c index d60e50a3b910..e111c08daa0e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -147,6 +147,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) if (!chandef->chan) return false; + if (chandef->freq1_offset >= 1000) + return false; + control_freq = chandef->chan->center_freq; switch (chandef->width) { diff --git a/net/wireless/core.c b/net/wireless/core.c index 5757dea2aa94..b795f363d004 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -834,6 +834,9 @@ int wiphy_register(struct wiphy *wiphy) sband->channels[i].orig_mpwr = sband->channels[i].max_power; sband->channels[i].band = band; + + if (WARN_ON(sband->channels[i].freq_offset >= 1000)) + return -EINVAL; } for (i = 0; i < sband->n_iftype_data; i++) { -- cgit v1.2.3-59-g8ed1b From b6b5c42e3bab939d357d800fd313e3c995164065 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Apr 2020 12:39:46 +0200 Subject: mac80211: fix two missing documentation entries Add documentation for two struct entries that was missing. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200424123945.6b23a26ab5e7.I664440ab5f33442df8103253bf5b9fe84be8d58c@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/sta_info.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 2936049f918e..ecb219e3ec4f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -820,6 +820,8 @@ enum mac80211_tx_info_flags { * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup + * @IEEE80211_TX_CTRL_HW_80211_ENCAP: This frame uses hardware encapsulation + * (header conversion) * * These flags are used in tx_info->control.flags. */ diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index a5de3aa6ea42..49728047dfad 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -69,6 +69,8 @@ * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP. * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX * until pending frames are delivered + * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption, + * so drop all packets without a key later. * * @NUM_WLAN_STA_FLAGS: number of defined flags */ -- cgit v1.2.3-59-g8ed1b From e73f94d1b6f05f6f22434c63de255a9dec6fd23d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 21:14:37 +0800 Subject: batman-adv: remove unused inline function batadv_arp_change_timeout There's no callers in-tree. Signed-off-by: YueHaibing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/distributed-arp-table.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 2bff2f4a325c..4e031661682a 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -163,11 +163,6 @@ static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv, { } -static inline void batadv_arp_change_timeout(struct net_device *soft_iface, - const char *name) -{ -} - static inline int batadv_dat_init(struct batadv_priv *bat_priv) { return 0; -- cgit v1.2.3-59-g8ed1b From c2cf318df87c3745fd0cf76c6a6ec2b85380dbdf Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Fri, 17 Apr 2020 13:21:35 +0300 Subject: iwlwifi: nvm: use iwl_nl80211_band_from_channel_idx Use iwl_nl80211_band_from_channel_idx in iwl_parse_nvm_mcc_info Signed-off-by: Tova Mussai Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200417131727.a64a018f244e.Ie75ac5bb0f0f524d26944800138855ef2228339a@changeid --- drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index 9e9810d2b262..6047b98691eb 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -1168,8 +1168,7 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, for (ch_idx = 0; ch_idx < num_of_ch; ch_idx++) { ch_flags = (u16)__le32_to_cpup(channels + ch_idx); - band = (ch_idx < NUM_2GHZ_CHANNELS) ? - NL80211_BAND_2GHZ : NL80211_BAND_5GHZ; + band = iwl_nl80211_band_from_channel_idx(ch_idx); center_freq = ieee80211_channel_to_frequency(nvm_chan[ch_idx], band); new_rule = false; -- cgit v1.2.3-59-g8ed1b From 4af119509a4807ac450634c73d38757aaf0b3f98 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Tue, 21 Apr 2020 13:33:47 +0300 Subject: iwlwifi: move API version lookup to common code The API version lookup is parsed from a TLV and should be in shared code make make it reusable across all opmodes. Also change the function names from mvm to fw, since this is not mvm-specific anymore. Additionally, since this function is not just a single line of code, it shouldn't be inline. Convert them to actual functions. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200421133326.cf94672dfcdb.I5ede9cc25ee8de7b8d2b5c574f917a18971da734@changeid --- drivers/net/wireless/intel/iwlwifi/Makefile | 3 +- drivers/net/wireless/intel/iwlwifi/fw/img.c | 99 ++++++++++++++++++++++ drivers/net/wireless/intel/iwlwifi/fw/img.h | 19 +---- .../net/wireless/intel/iwlwifi/mvm/ftm-initiator.c | 4 +- .../net/wireless/intel/iwlwifi/mvm/ftm-responder.c | 4 +- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 6 +- drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 4 +- drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 24 +----- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 8 +- 9 files changed, 118 insertions(+), 53 deletions(-) create mode 100644 drivers/net/wireless/intel/iwlwifi/fw/img.c diff --git a/drivers/net/wireless/intel/iwlwifi/Makefile b/drivers/net/wireless/intel/iwlwifi/Makefile index 0aae3fa4128c..fbcd1405aeea 100644 --- a/drivers/net/wireless/intel/iwlwifi/Makefile +++ b/drivers/net/wireless/intel/iwlwifi/Makefile @@ -13,7 +13,8 @@ iwlwifi-$(CONFIG_IWLDVM) += cfg/1000.o cfg/2000.o cfg/5000.o cfg/6000.o iwlwifi-$(CONFIG_IWLMVM) += cfg/7000.o cfg/8000.o cfg/9000.o cfg/22000.o iwlwifi-objs += iwl-dbg-tlv.o iwlwifi-objs += iwl-trans.o -iwlwifi-objs += fw/notif-wait.o + +iwlwifi-objs += fw/img.o fw/notif-wait.o iwlwifi-objs += fw/dbg.o iwlwifi-$(CONFIG_IWLMVM) += fw/paging.o fw/smem.o fw/init.o iwlwifi-$(CONFIG_ACPI) += fw/acpi.o diff --git a/drivers/net/wireless/intel/iwlwifi/fw/img.c b/drivers/net/wireless/intel/iwlwifi/fw/img.c new file mode 100644 index 000000000000..de8cff463dbe --- /dev/null +++ b/drivers/net/wireless/intel/iwlwifi/fw/img.c @@ -0,0 +1,99 @@ +/****************************************************************************** + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2019 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution + * in the file called COPYING. + * + * Contact Information: + * Intel Linux Wireless + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + * BSD LICENSE + * + * Copyright(c) 2019 Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "img.h" + +u8 iwl_fw_lookup_cmd_ver(const struct iwl_fw *fw, u8 grp, u8 cmd) +{ + const struct iwl_fw_cmd_version *entry; + unsigned int i; + + if (!fw->ucode_capa.cmd_versions || + !fw->ucode_capa.n_cmd_versions) + return IWL_FW_CMD_VER_UNKNOWN; + + entry = fw->ucode_capa.cmd_versions; + for (i = 0; i < fw->ucode_capa.n_cmd_versions; i++, entry++) { + if (entry->group == grp && entry->cmd == cmd) + return entry->cmd_ver; + } + + return IWL_FW_CMD_VER_UNKNOWN; +} +EXPORT_SYMBOL_GPL(iwl_fw_lookup_cmd_ver); + +u8 iwl_fw_lookup_notif_ver(const struct iwl_fw *fw, u8 grp, u8 cmd, u8 def) +{ + const struct iwl_fw_cmd_version *entry; + unsigned int i; + + if (!fw->ucode_capa.cmd_versions || + !fw->ucode_capa.n_cmd_versions) + return def; + + entry = fw->ucode_capa.cmd_versions; + for (i = 0; i < fw->ucode_capa.n_cmd_versions; i++, entry++) { + if (entry->group == grp && entry->cmd == cmd) { + if (entry->notif_ver == IWL_FW_CMD_VER_UNKNOWN) + return def; + return entry->notif_ver; + } + } + + return def; +} +EXPORT_SYMBOL_GPL(iwl_fw_lookup_notif_ver); diff --git a/drivers/net/wireless/intel/iwlwifi/fw/img.h b/drivers/net/wireless/intel/iwlwifi/fw/img.h index 90ca5f929cf9..a8630bf90b63 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/img.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/img.h @@ -313,22 +313,7 @@ iwl_get_ucode_image(const struct iwl_fw *fw, enum iwl_ucode_type ucode_type) return &fw->img[ucode_type]; } -static inline u8 iwl_mvm_lookup_cmd_ver(const struct iwl_fw *fw, u8 grp, u8 cmd) -{ - const struct iwl_fw_cmd_version *entry; - unsigned int i; - - if (!fw->ucode_capa.cmd_versions || - !fw->ucode_capa.n_cmd_versions) - return IWL_FW_CMD_VER_UNKNOWN; - - entry = fw->ucode_capa.cmd_versions; - for (i = 0; i < fw->ucode_capa.n_cmd_versions; i++, entry++) { - if (entry->group == grp && entry->cmd == cmd) - return entry->cmd_ver; - } - - return IWL_FW_CMD_VER_UNKNOWN; -} +u8 iwl_fw_lookup_cmd_ver(const struct iwl_fw *fw, u8 grp, u8 cmd); +u8 iwl_fw_lookup_notif_ver(const struct iwl_fw *fw, u8 grp, u8 cmd, u8 def); #endif /* __iwl_fw_img_h__ */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c index 9e21f5e5d364..cdb87139100d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c @@ -508,8 +508,8 @@ int iwl_mvm_ftm_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif, return -EBUSY; if (new_api) { - u8 cmd_ver = iwl_mvm_lookup_cmd_ver(mvm->fw, LOCATION_GROUP, - TOF_RANGE_REQ_CMD); + u8 cmd_ver = iwl_fw_lookup_cmd_ver(mvm->fw, LOCATION_GROUP, + TOF_RANGE_REQ_CMD); if (cmd_ver == 8) err = iwl_mvm_ftm_start_v8(mvm, vif, req); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-responder.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-responder.c index 834564198409..0b6c32098b5a 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-responder.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-responder.c @@ -136,8 +136,8 @@ iwl_mvm_ftm_responder_cmd(struct iwl_mvm *mvm, IWL_TOF_RESPONDER_CMD_VALID_STA_ID), .sta_id = mvmvif->bcast_sta.sta_id, }; - u8 cmd_ver = iwl_mvm_lookup_cmd_ver(mvm->fw, LOCATION_GROUP, - TOF_RESPONDER_CONFIG_CMD); + u8 cmd_ver = iwl_fw_lookup_cmd_ver(mvm->fw, LOCATION_GROUP, + TOF_RESPONDER_CONFIG_CMD); int err; lockdep_assert_held(&mvm->mutex); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index a4038f289ab3..8fe78ce37771 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -102,9 +102,9 @@ static int iwl_set_soc_latency(struct iwl_mvm *mvm) if (!mvm->trans->trans_cfg->integrated) cmd.flags = cpu_to_le32(SOC_CONFIG_CMD_FLAGS_DISCRETE); - if (iwl_mvm_lookup_cmd_ver(mvm->fw, IWL_ALWAYS_LONG_GROUP, - SCAN_REQ_UMAC) >= 2 && - (mvm->trans->trans_cfg->low_latency_xtal)) + if (iwl_fw_lookup_cmd_ver(mvm->fw, IWL_ALWAYS_LONG_GROUP, + SCAN_REQ_UMAC) >= 2 && + mvm->trans->trans_cfg->low_latency_xtal) cmd.flags |= cpu_to_le32(SOC_CONFIG_CMD_FLAGS_LOW_LATENCY); cmd.latency = cpu_to_le32(mvm->trans->trans_cfg->xtal_latency); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index afcf2b98a9cb..9e2a0858108c 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -2149,8 +2149,8 @@ iwl_mvm_set_chan_info_chandef(struct iwl_mvm *mvm, static inline int iwl_umac_scan_get_max_profiles(const struct iwl_fw *fw) { - u8 ver = iwl_mvm_lookup_cmd_ver(fw, IWL_ALWAYS_LONG_GROUP, - SCAN_OFFLOAD_UPDATE_PROFILES_CMD); + u8 ver = iwl_fw_lookup_cmd_ver(fw, IWL_ALWAYS_LONG_GROUP, + SCAN_OFFLOAD_UPDATE_PROFILES_CMD); return (ver == IWL_FW_CMD_VER_UNKNOWN || ver < 3) ? IWL_SCAN_MAX_PROFILES : IWL_SCAN_MAX_PROFILES_V2; } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index dfe02440d474..b00f4a8b8424 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -612,27 +612,6 @@ static const struct iwl_fw_runtime_ops iwl_mvm_fwrt_ops = { .d3_debug_enable = iwl_mvm_d3_debug_enable, }; -static u8 iwl_mvm_lookup_notif_ver(struct iwl_mvm *mvm, u8 grp, u8 cmd, u8 def) -{ - const struct iwl_fw_cmd_version *entry; - unsigned int i; - - if (!mvm->fw->ucode_capa.cmd_versions || - !mvm->fw->ucode_capa.n_cmd_versions) - return def; - - entry = mvm->fw->ucode_capa.cmd_versions; - for (i = 0; i < mvm->fw->ucode_capa.n_cmd_versions; i++, entry++) { - if (entry->group == grp && entry->cmd == cmd) { - if (entry->notif_ver == IWL_FW_CMD_VER_UNKNOWN) - return def; - return entry->notif_ver; - } - } - - return def; -} - static struct iwl_op_mode * iwl_op_mode_mvm_start(struct iwl_trans *trans, const struct iwl_cfg *cfg, const struct iwl_fw *fw, struct dentry *dbgfs_dir) @@ -745,7 +724,8 @@ iwl_op_mode_mvm_start(struct iwl_trans *trans, const struct iwl_cfg *cfg, INIT_DELAYED_WORK(&mvm->cs_tx_unblock_dwork, iwl_mvm_tx_unblock_dwork); mvm->cmd_ver.d0i3_resp = - iwl_mvm_lookup_notif_ver(mvm, LEGACY_GROUP, D0I3_END_CMD, 0); + iwl_fw_lookup_notif_ver(mvm->fw, LEGACY_GROUP, D0I3_END_CMD, + 0); /* we only support version 1 */ if (WARN_ON_ONCE(mvm->cmd_ver.d0i3_resp > 1)) goto out_free; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index bc48113f0568..51a061b138ba 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -2228,8 +2228,8 @@ static int iwl_mvm_build_scan_cmd(struct iwl_mvm *mvm, hcmd->id = iwl_cmd_id(SCAN_REQ_UMAC, IWL_ALWAYS_LONG_GROUP, 0); - scan_ver = iwl_mvm_lookup_cmd_ver(mvm->fw, IWL_ALWAYS_LONG_GROUP, - SCAN_REQ_UMAC); + scan_ver = iwl_fw_lookup_cmd_ver(mvm->fw, IWL_ALWAYS_LONG_GROUP, + SCAN_REQ_UMAC); for (i = 0; i < ARRAY_SIZE(iwl_scan_umac_handlers); i++) { const struct iwl_scan_umac_handler *ver_handler = @@ -2568,8 +2568,8 @@ static int iwl_scan_req_umac_get_size(u8 scan_ver) int iwl_mvm_scan_size(struct iwl_mvm *mvm) { int base_size, tail_size; - u8 scan_ver = iwl_mvm_lookup_cmd_ver(mvm->fw, IWL_ALWAYS_LONG_GROUP, - SCAN_REQ_UMAC); + u8 scan_ver = iwl_fw_lookup_cmd_ver(mvm->fw, IWL_ALWAYS_LONG_GROUP, + SCAN_REQ_UMAC); base_size = iwl_scan_req_umac_get_size(scan_ver); if (base_size) -- cgit v1.2.3-59-g8ed1b From 7a99c877ae8e2f1b4bd9811addd337900d24b3ae Mon Sep 17 00:00:00 2001 From: Shahar S Matityahu Date: Fri, 17 Apr 2020 13:21:36 +0300 Subject: iwlwifi: dbg: support multiple dumps in legacy dump flow Support multiple debug data collection triggers in legacy flow. Utilize the already existing Yoyo infra so the change is rather simple. Signed-off-by: Shahar S Matityahu Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200417131727.5be6a1923cbe.I10701236b03f66328041f2a38f5f0f22a26fd40b@changeid --- drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 97 +++++++++++++++-------- drivers/net/wireless/intel/iwlwifi/fw/dbg.h | 11 --- drivers/net/wireless/intel/iwlwifi/fw/runtime.h | 14 +++- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 1 - 4 files changed, 72 insertions(+), 51 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c index 14ac7153a3e7..8daa83cdc72c 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c @@ -818,7 +818,8 @@ static void iwl_dump_paging(struct iwl_fw_runtime *fwrt, static struct iwl_fw_error_dump_file * iwl_fw_error_dump_file(struct iwl_fw_runtime *fwrt, - struct iwl_fw_dump_ptrs *fw_error_dump) + struct iwl_fw_dump_ptrs *fw_error_dump, + struct iwl_fwrt_dump_data *data) { struct iwl_fw_error_dump_file *dump_file; struct iwl_fw_error_dump_data *dump_data; @@ -900,15 +901,15 @@ iwl_fw_error_dump_file(struct iwl_fw_runtime *fwrt, } /* If we only want a monitor dump, reset the file length */ - if (fwrt->dump.monitor_only) { + if (data->monitor_only) { file_len = sizeof(*dump_file) + sizeof(*dump_data) * 2 + sizeof(*dump_info) + sizeof(*dump_smem_cfg); } if (iwl_fw_dbg_type_on(fwrt, IWL_FW_ERROR_DUMP_ERROR_INFO) && - fwrt->dump.desc) + data->desc) file_len += sizeof(*dump_data) + sizeof(*dump_trig) + - fwrt->dump.desc->len; + data->desc->len; dump_file = vzalloc(file_len); if (!dump_file) @@ -984,19 +985,19 @@ iwl_fw_error_dump_file(struct iwl_fw_runtime *fwrt, iwl_read_radio_regs(fwrt, &dump_data); if (iwl_fw_dbg_type_on(fwrt, IWL_FW_ERROR_DUMP_ERROR_INFO) && - fwrt->dump.desc) { + data->desc) { dump_data->type = cpu_to_le32(IWL_FW_ERROR_DUMP_ERROR_INFO); dump_data->len = cpu_to_le32(sizeof(*dump_trig) + - fwrt->dump.desc->len); + data->desc->len); dump_trig = (void *)dump_data->data; - memcpy(dump_trig, &fwrt->dump.desc->trig_desc, - sizeof(*dump_trig) + fwrt->dump.desc->len); + memcpy(dump_trig, &data->desc->trig_desc, + sizeof(*dump_trig) + data->desc->len); dump_data = iwl_fw_error_next_data(dump_data); } /* In case we only want monitor dump, skip to dump trasport data */ - if (fwrt->dump.monitor_only) + if (data->monitor_only) goto out; if (iwl_fw_dbg_type_on(fwrt, IWL_FW_ERROR_DUMP_MEM)) { @@ -2172,7 +2173,21 @@ static u32 iwl_dump_ini_file_gen(struct iwl_fw_runtime *fwrt, return le32_to_cpu(hdr->file_len); } -static void iwl_fw_error_dump(struct iwl_fw_runtime *fwrt) +static inline void iwl_fw_free_dump_desc(struct iwl_fw_runtime *fwrt, + const struct iwl_fw_dump_desc **desc) +{ + if (desc && *desc != &iwl_dump_desc_assert) + kfree(*desc); + + *desc = NULL; + fwrt->dump.lmac_err_id[0] = 0; + if (fwrt->smem_cfg.num_lmacs > 1) + fwrt->dump.lmac_err_id[1] = 0; + fwrt->dump.umac_err_id = 0; +} + +static void iwl_fw_error_dump(struct iwl_fw_runtime *fwrt, + struct iwl_fwrt_dump_data *dump_data) { struct iwl_fw_dump_ptrs fw_error_dump = {}; struct iwl_fw_error_dump_file *dump_file; @@ -2180,11 +2195,11 @@ static void iwl_fw_error_dump(struct iwl_fw_runtime *fwrt) u32 file_len; u32 dump_mask = fwrt->fw->dbg.dump_mask; - dump_file = iwl_fw_error_dump_file(fwrt, &fw_error_dump); + dump_file = iwl_fw_error_dump_file(fwrt, &fw_error_dump, dump_data); if (!dump_file) - goto out; + return; - if (fwrt->dump.monitor_only) + if (dump_data->monitor_only) dump_mask &= IWL_FW_ERROR_DUMP_FW_MONITOR; fw_error_dump.trans_ptr = iwl_trans_dump_data(fwrt->trans, dump_mask); @@ -2213,9 +2228,6 @@ static void iwl_fw_error_dump(struct iwl_fw_runtime *fwrt) } vfree(fw_error_dump.fwrt_ptr); vfree(fw_error_dump.trans_ptr); - -out: - iwl_fw_free_dump_desc(fwrt); } static void iwl_dump_ini_list_free(struct list_head *list) @@ -2244,7 +2256,7 @@ static void iwl_fw_error_ini_dump(struct iwl_fw_runtime *fwrt, u32 file_len = iwl_dump_ini_file_gen(fwrt, dump_data, &dump_list); if (!file_len) - goto out; + return; sg_dump_data = alloc_sgtable(file_len); if (sg_dump_data) { @@ -2261,9 +2273,6 @@ static void iwl_fw_error_ini_dump(struct iwl_fw_runtime *fwrt, GFP_KERNEL); } iwl_dump_ini_list_free(&dump_list); - -out: - iwl_fw_error_dump_data_free(dump_data); } const struct iwl_fw_dump_desc iwl_dump_desc_assert = { @@ -2278,27 +2287,40 @@ int iwl_fw_dbg_collect_desc(struct iwl_fw_runtime *fwrt, bool monitor_only, unsigned int delay) { + struct iwl_fwrt_wk_data *wk_data; + unsigned long idx; + if (iwl_trans_dbg_ini_valid(fwrt->trans)) { - iwl_fw_free_dump_desc(fwrt); + iwl_fw_free_dump_desc(fwrt, &desc); return 0; } - /* use wks[0] since dump flow prior to ini does not need to support - * consecutive triggers collection + /* + * Check there is an available worker. + * ffz return value is undefined if no zero exists, + * so check against ~0UL first. */ - if (test_and_set_bit(fwrt->dump.wks[0].idx, &fwrt->dump.active_wks)) + if (fwrt->dump.active_wks == ~0UL) return -EBUSY; - if (WARN_ON(fwrt->dump.desc)) - iwl_fw_free_dump_desc(fwrt); + idx = ffz(fwrt->dump.active_wks); + + if (idx >= IWL_FW_RUNTIME_DUMP_WK_NUM || + test_and_set_bit(fwrt->dump.wks[idx].idx, &fwrt->dump.active_wks)) + return -EBUSY; + + wk_data = &fwrt->dump.wks[idx]; + + if (WARN_ON(wk_data->dump_data.desc)) + iwl_fw_free_dump_desc(fwrt, &wk_data->dump_data.desc); + + wk_data->dump_data.desc = desc; + wk_data->dump_data.monitor_only = monitor_only; IWL_WARN(fwrt, "Collecting data: trigger %d fired.\n", le32_to_cpu(desc->trig_desc.type)); - fwrt->dump.desc = desc; - fwrt->dump.monitor_only = monitor_only; - - schedule_delayed_work(&fwrt->dump.wks[0].wk, usecs_to_jiffies(delay)); + schedule_delayed_work(&wk_data->wk, usecs_to_jiffies(delay)); return 0; } @@ -2504,14 +2526,14 @@ IWL_EXPORT_SYMBOL(iwl_fw_start_dbg_conf); static void iwl_fw_dbg_collect_sync(struct iwl_fw_runtime *fwrt, u8 wk_idx) { struct iwl_fw_dbg_params params = {0}; + struct iwl_fwrt_dump_data *dump_data = + &fwrt->dump.wks[wk_idx].dump_data; if (!test_bit(wk_idx, &fwrt->dump.active_wks)) return; - if (fwrt->ops && fwrt->ops->fw_running && - !fwrt->ops->fw_running(fwrt->ops_ctx)) { - IWL_ERR(fwrt, "Firmware not running - cannot dump error\n"); - iwl_fw_free_dump_desc(fwrt); + if (!test_bit(STATUS_DEVICE_ENABLED, &fwrt->trans->status)) { + IWL_ERR(fwrt, "Device is not enabled - cannot dump error\n"); goto out; } @@ -2527,12 +2549,17 @@ static void iwl_fw_dbg_collect_sync(struct iwl_fw_runtime *fwrt, u8 wk_idx) if (iwl_trans_dbg_ini_valid(fwrt->trans)) iwl_fw_error_ini_dump(fwrt, &fwrt->dump.wks[wk_idx].dump_data); else - iwl_fw_error_dump(fwrt); + iwl_fw_error_dump(fwrt, &fwrt->dump.wks[wk_idx].dump_data); IWL_DEBUG_FW_INFO(fwrt, "WRT: Data collection done\n"); iwl_fw_dbg_stop_restart_recording(fwrt, ¶ms, false); out: + if (iwl_trans_dbg_ini_valid(fwrt->trans)) + iwl_fw_error_dump_data_free(dump_data); + else + iwl_fw_free_dump_desc(fwrt, &dump_data->desc); + clear_bit(wk_idx, &fwrt->dump.active_wks); } diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.h b/drivers/net/wireless/intel/iwlwifi/fw/dbg.h index 9d3513213f5f..11558df36b94 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.h @@ -98,17 +98,6 @@ struct iwl_fw_dbg_params { extern const struct iwl_fw_dump_desc iwl_dump_desc_assert; -static inline void iwl_fw_free_dump_desc(struct iwl_fw_runtime *fwrt) -{ - if (fwrt->dump.desc != &iwl_dump_desc_assert) - kfree(fwrt->dump.desc); - fwrt->dump.desc = NULL; - fwrt->dump.lmac_err_id[0] = 0; - if (fwrt->smem_cfg.num_lmacs > 1) - fwrt->dump.lmac_err_id[1] = 0; - fwrt->dump.umac_err_id = 0; -} - int iwl_fw_dbg_collect_desc(struct iwl_fw_runtime *fwrt, const struct iwl_fw_dump_desc *desc, bool monitor_only, unsigned int delay); diff --git a/drivers/net/wireless/intel/iwlwifi/fw/runtime.h b/drivers/net/wireless/intel/iwlwifi/fw/runtime.h index da0d90e2b537..9906d9b9bdd5 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/runtime.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/runtime.h @@ -98,8 +98,16 @@ struct iwl_fwrt_shared_mem_cfg { * @fw_pkt: packet received from FW */ struct iwl_fwrt_dump_data { - struct iwl_fw_ini_trigger_tlv *trig; - struct iwl_rx_packet *fw_pkt; + union { + struct { + struct iwl_fw_ini_trigger_tlv *trig; + struct iwl_rx_packet *fw_pkt; + }; + struct { + const struct iwl_fw_dump_desc *desc; + bool monitor_only; + }; + }; }; /** @@ -162,8 +170,6 @@ struct iwl_fw_runtime { /* debug */ struct { - const struct iwl_fw_dump_desc *desc; - bool monitor_only; struct iwl_fwrt_wk_data wks[IWL_FW_RUNTIME_DUMP_WK_NUM]; unsigned long active_wks; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 7aa1350b093e..853ba7b8bf3f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1264,7 +1264,6 @@ static void iwl_mvm_mac_stop(struct ieee80211_hw *hw) cancel_delayed_work_sync(&mvm->cs_tx_unblock_dwork); cancel_delayed_work_sync(&mvm->scan_timeout_dwork); - iwl_fw_free_dump_desc(&mvm->fwrt); mutex_lock(&mvm->mutex); __iwl_mvm_mac_stop(mvm); -- cgit v1.2.3-59-g8ed1b From 250380c9b8e5a1d893a8012a33667343dc75e17e Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Fri, 17 Apr 2020 13:21:37 +0300 Subject: iwlwifi: support version 9 of WOWLAN_GET_STATUS notification Add support for the new WOWLAN_GET_STATUS notification that contains a new element that informs the driver of TIDs whose BA sessions were closed during suspend. Note that the new functionality of handling these closed sessions is not implemented in this patch it. It only aligns to the new API version. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200417131727.b02153b94c1d.Ieb6291586d60f372d5a505604b18227ef97e7202@changeid --- drivers/net/wireless/intel/iwlwifi/fw/api/d3.h | 39 +++++++++++++++++++++++++- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 29 +++++++++++++------ 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/d3.h b/drivers/net/wireless/intel/iwlwifi/fw/api/d3.h index 3643b6ba6385..c4562e1f8d18 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/api/d3.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/api/d3.h @@ -618,7 +618,7 @@ struct iwl_wowlan_status_v6 { * @wake_packet_bufsize: wakeup packet buffer size * @wake_packet: wakeup packet */ -struct iwl_wowlan_status { +struct iwl_wowlan_status_v7 { struct iwl_wowlan_gtk_status gtk[WOWLAN_GTK_KEYS_NUM]; struct iwl_wowlan_igtk_status igtk[WOWLAN_IGTK_KEYS_NUM]; __le64 replay_ctr; @@ -634,6 +634,43 @@ struct iwl_wowlan_status { u8 wake_packet[]; /* can be truncated from _length to _bufsize */ } __packed; /* WOWLAN_STATUSES_API_S_VER_7 */ +/** + * struct iwl_wowlan_status - WoWLAN status + * @gtk: GTK data + * @igtk: IGTK data + * @replay_ctr: GTK rekey replay counter + * @pattern_number: number of the matched pattern + * @non_qos_seq_ctr: non-QoS sequence counter to use next + * @qos_seq_ctr: QoS sequence counters to use next + * @wakeup_reasons: wakeup reasons, see &enum iwl_wowlan_wakeup_reason + * @num_of_gtk_rekeys: number of GTK rekeys + * @transmitted_ndps: number of transmitted neighbor discovery packets + * @received_beacons: number of received beacons + * @wake_packet_length: wakeup packet length + * @wake_packet_bufsize: wakeup packet buffer size + * @tid_tear_down: bit mask of tids whose BA sessions were closed + * in suspend state + * @reserved: unused + * @wake_packet: wakeup packet + */ +struct iwl_wowlan_status { + struct iwl_wowlan_gtk_status gtk[WOWLAN_GTK_KEYS_NUM]; + struct iwl_wowlan_igtk_status igtk[WOWLAN_IGTK_KEYS_NUM]; + __le64 replay_ctr; + __le16 pattern_number; + __le16 non_qos_seq_ctr; + __le16 qos_seq_ctr[8]; + __le32 wakeup_reasons; + __le32 num_of_gtk_rekeys; + __le32 transmitted_ndps; + __le32 received_beacons; + __le32 wake_packet_length; + __le32 wake_packet_bufsize; + u8 tid_tear_down; + u8 reserved[3]; + u8 wake_packet[]; /* can be truncated from _length to _bufsize */ +} __packed; /* WOWLAN_STATUSES_API_S_VER_9 */ + static inline u8 iwlmvm_wowlan_gtk_idx(struct iwl_wowlan_gtk_status *gtk) { return gtk->key_flags & IWL_WOWLAN_GTK_IDX_MASK; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 122ca7624073..222775714859 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -1517,12 +1517,14 @@ out: struct iwl_wowlan_status *iwl_mvm_send_wowlan_get_status(struct iwl_mvm *mvm) { - struct iwl_wowlan_status *v7, *status; + struct iwl_wowlan_status_v7 *v7; + struct iwl_wowlan_status *status; struct iwl_host_cmd cmd = { .id = WOWLAN_GET_STATUSES, .flags = CMD_WANT_SKB, }; - int ret, len, status_size; + int ret, len, status_size, data_size; + u8 notif_ver; lockdep_assert_held(&mvm->mutex); @@ -1532,13 +1534,12 @@ struct iwl_wowlan_status *iwl_mvm_send_wowlan_get_status(struct iwl_mvm *mvm) return ERR_PTR(ret); } + len = iwl_rx_packet_payload_len(cmd.resp_pkt); if (!fw_has_api(&mvm->fw->ucode_capa, IWL_UCODE_TLV_API_WOWLAN_KEY_MATERIAL)) { struct iwl_wowlan_status_v6 *v6 = (void *)cmd.resp_pkt->data; - int data_size; status_size = sizeof(*v6); - len = iwl_rx_packet_payload_len(cmd.resp_pkt); if (len < status_size) { IWL_ERR(mvm, "Invalid WoWLAN status response!\n"); @@ -1593,23 +1594,33 @@ struct iwl_wowlan_status *iwl_mvm_send_wowlan_get_status(struct iwl_mvm *mvm) } v7 = (void *)cmd.resp_pkt->data; - status_size = sizeof(*v7); - len = iwl_rx_packet_payload_len(cmd.resp_pkt); + notif_ver = iwl_fw_lookup_notif_ver(mvm->fw, LEGACY_GROUP, + WOWLAN_GET_STATUSES, 0); + + status_size = sizeof(*status); + + if (notif_ver == IWL_FW_CMD_VER_UNKNOWN || notif_ver < 9) + status_size = sizeof(*v7); if (len < status_size) { IWL_ERR(mvm, "Invalid WoWLAN status response!\n"); status = ERR_PTR(-EIO); goto out_free_resp; } + data_size = ALIGN(le32_to_cpu(v7->wake_packet_bufsize), 4); - if (len != (status_size + - ALIGN(le32_to_cpu(v7->wake_packet_bufsize), 4))) { + if (len != (status_size + data_size)) { IWL_ERR(mvm, "Invalid WoWLAN status response!\n"); status = ERR_PTR(-EIO); goto out_free_resp; } - status = kmemdup(v7, len, GFP_KERNEL); + status = kzalloc(sizeof(*status) + data_size, GFP_KERNEL); + if (!status) + goto out_free_resp; + + memcpy(status, v7, status_size); + memcpy(status->wake_packet, (u8 *)v7 + status_size, data_size); out_free_resp: iwl_free_resp(&cmd); -- cgit v1.2.3-59-g8ed1b From df67a1bea0378488a0454f2be2609349fda86727 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 13:21:38 +0300 Subject: iwlwifi: pcie: use seq_file for tx_queue debugfs file On newer hardware, the tx_queue debugfs file would need to allocate 37.5kib data since there are 512 queues, which is too much. Rather than resorting to kludges like kvmalloc(), use the seq_file API to print out the data. While at it, also fix a NULL pointer dereference here, the txq pointer from trans_pcie->txqs[] may be NULL if that queue hasn't been allocated. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200417131727.491cf8224c49.I7f154d81e5becef3b5ff22d7c6e36170bde0d7d5@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 121 +++++++++++++++++------- 1 file changed, 89 insertions(+), 32 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index e4cbd8daa7c6..3c33c01cda60 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -70,6 +70,7 @@ #include #include #include +#include #include "iwl-drv.h" #include "iwl-trans.h" @@ -2544,44 +2545,94 @@ static const struct file_operations iwl_dbgfs_##name##_ops = { \ .llseek = generic_file_llseek, \ }; -static ssize_t iwl_dbgfs_tx_queue_read(struct file *file, - char __user *user_buf, - size_t count, loff_t *ppos) +struct iwl_dbgfs_tx_queue_priv { + struct iwl_trans *trans; +}; + +struct iwl_dbgfs_tx_queue_state { + loff_t pos; +}; + +static void *iwl_dbgfs_tx_queue_seq_start(struct seq_file *seq, loff_t *pos) { - struct iwl_trans *trans = file->private_data; + struct iwl_dbgfs_tx_queue_priv *priv = seq->private; + struct iwl_dbgfs_tx_queue_state *state; + + if (*pos >= priv->trans->trans_cfg->base_params->num_of_queues) + return NULL; + + state = kmalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return NULL; + state->pos = *pos; + return state; +} + +static void *iwl_dbgfs_tx_queue_seq_next(struct seq_file *seq, + void *v, loff_t *pos) +{ + struct iwl_dbgfs_tx_queue_priv *priv = seq->private; + struct iwl_dbgfs_tx_queue_state *state = v; + + *pos = ++state->pos; + + if (*pos >= priv->trans->trans_cfg->base_params->num_of_queues) + return NULL; + + return state; +} + +static void iwl_dbgfs_tx_queue_seq_stop(struct seq_file *seq, void *v) +{ + kfree(v); +} + +static int iwl_dbgfs_tx_queue_seq_show(struct seq_file *seq, void *v) +{ + struct iwl_dbgfs_tx_queue_priv *priv = seq->private; + struct iwl_dbgfs_tx_queue_state *state = v; + struct iwl_trans *trans = priv->trans; struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq; - char *buf; - int pos = 0; - int cnt; - int ret; - size_t bufsz; + struct iwl_txq *txq = trans_pcie->txq[state->pos]; + + seq_printf(seq, "hwq %.3u: used=%d stopped=%d ", + (unsigned int)state->pos, + !!test_bit(state->pos, trans_pcie->queue_used), + !!test_bit(state->pos, trans_pcie->queue_stopped)); + if (txq) + seq_printf(seq, + "read=%u write=%u need_update=%d frozen=%d", + txq->read_ptr, txq->write_ptr, + txq->need_update, txq->frozen); + else + seq_puts(seq, "(unallocated)"); - bufsz = sizeof(char) * 75 * - trans->trans_cfg->base_params->num_of_queues; + if (state->pos == trans_pcie->cmd_queue) + seq_puts(seq, " (HCMD)"); + seq_puts(seq, "\n"); - if (!trans_pcie->txq_memory) - return -EAGAIN; + return 0; +} - buf = kzalloc(bufsz, GFP_KERNEL); - if (!buf) +static const struct seq_operations iwl_dbgfs_tx_queue_seq_ops = { + .start = iwl_dbgfs_tx_queue_seq_start, + .next = iwl_dbgfs_tx_queue_seq_next, + .stop = iwl_dbgfs_tx_queue_seq_stop, + .show = iwl_dbgfs_tx_queue_seq_show, +}; + +static int iwl_dbgfs_tx_queue_open(struct inode *inode, struct file *filp) +{ + struct iwl_dbgfs_tx_queue_priv *priv; + + priv = __seq_open_private(filp, &iwl_dbgfs_tx_queue_seq_ops, + sizeof(*priv)); + + if (!priv) return -ENOMEM; - for (cnt = 0; - cnt < trans->trans_cfg->base_params->num_of_queues; - cnt++) { - txq = trans_pcie->txq[cnt]; - pos += scnprintf(buf + pos, bufsz - pos, - "hwq %.2d: read=%u write=%u use=%d stop=%d need_update=%d frozen=%d%s\n", - cnt, txq->read_ptr, txq->write_ptr, - !!test_bit(cnt, trans_pcie->queue_used), - !!test_bit(cnt, trans_pcie->queue_stopped), - txq->need_update, txq->frozen, - (cnt == trans_pcie->cmd_queue ? " HCMD" : "")); - } - ret = simple_read_from_buffer(user_buf, count, ppos, buf, pos); - kfree(buf); - return ret; + priv->trans = inode->i_private; + return 0; } static ssize_t iwl_dbgfs_rx_queue_read(struct file *file, @@ -2914,9 +2965,15 @@ static ssize_t iwl_dbgfs_monitor_data_read(struct file *file, DEBUGFS_READ_WRITE_FILE_OPS(interrupt); DEBUGFS_READ_FILE_OPS(fh_reg); DEBUGFS_READ_FILE_OPS(rx_queue); -DEBUGFS_READ_FILE_OPS(tx_queue); DEBUGFS_WRITE_FILE_OPS(csr); DEBUGFS_READ_WRITE_FILE_OPS(rfkill); +static const struct file_operations iwl_dbgfs_tx_queue_ops = { + .owner = THIS_MODULE, + .open = iwl_dbgfs_tx_queue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; static const struct file_operations iwl_dbgfs_monitor_data_ops = { .read = iwl_dbgfs_monitor_data_read, -- cgit v1.2.3-59-g8ed1b From 95a9e44f8fb2626e4d0cb642ae6b5f6f30c5fb58 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 13:21:39 +0300 Subject: iwlwifi: pcie: add n_window/ampdu to tx_queue debugfs Add the n_window and ampdu parameters so we can see them. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200417131727.a2cc1f36008f.Iea23802bb64a08de410223e9af4431dfcadf121b@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 3c33c01cda60..06785c46c50d 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -2601,9 +2601,10 @@ static int iwl_dbgfs_tx_queue_seq_show(struct seq_file *seq, void *v) !!test_bit(state->pos, trans_pcie->queue_stopped)); if (txq) seq_printf(seq, - "read=%u write=%u need_update=%d frozen=%d", + "read=%u write=%u need_update=%d frozen=%d n_window=%d ampdu=%d", txq->read_ptr, txq->write_ptr, - txq->need_update, txq->frozen); + txq->need_update, txq->frozen, + txq->n_window, txq->ampdu); else seq_puts(seq, "(unallocated)"); -- cgit v1.2.3-59-g8ed1b From 161158d7af3f67b15ee681c2b26b99ba461da9a6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 13:21:40 +0300 Subject: iwlwifi: pcie: gen2: minor code cleanups in byte table update One line should be indented less, otherwise it looks like it belongs into the parentheses, which clearly it doesn't; also some variables can move into their respective if branches. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200417131727.a4858aa0441b.I0e70e4a5493fe6b8db6390f9349ff0e7888ab240@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 86fc00167817..a30f6b080201 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -90,9 +90,7 @@ static void iwl_pcie_gen2_update_byte_tbl(struct iwl_trans_pcie *trans_pcie, struct iwl_txq *txq, u16 byte_cnt, int num_tbs) { - struct iwlagn_scd_bc_tbl *scd_bc_tbl = txq->bc_tbl.addr; struct iwl_trans *trans = iwl_trans_pcie_get_trans(trans_pcie); - struct iwl_gen3_bc_tbl *scd_bc_tbl_gen3 = txq->bc_tbl.addr; int idx = iwl_pcie_get_cmd_index(txq, txq->write_ptr); u8 filled_tfd_size, num_fetch_chunks; u16 len = byte_cnt; @@ -102,7 +100,7 @@ static void iwl_pcie_gen2_update_byte_tbl(struct iwl_trans_pcie *trans_pcie, return; filled_tfd_size = offsetof(struct iwl_tfh_tfd, tbs) + - num_tbs * sizeof(struct iwl_tfh_tb); + num_tbs * sizeof(struct iwl_tfh_tb); /* * filled_tfd_size contains the number of filled bytes in the TFD. * Dividing it by 64 will give the number of chunks to fetch @@ -114,12 +112,16 @@ static void iwl_pcie_gen2_update_byte_tbl(struct iwl_trans_pcie *trans_pcie, num_fetch_chunks = DIV_ROUND_UP(filled_tfd_size, 64) - 1; if (trans->trans_cfg->device_family >= IWL_DEVICE_FAMILY_AX210) { + struct iwl_gen3_bc_tbl *scd_bc_tbl_gen3 = txq->bc_tbl.addr; + /* Starting from AX210, the HW expects bytes */ WARN_ON(trans_pcie->bc_table_dword); WARN_ON(len > 0x3FFF); bc_ent = cpu_to_le16(len | (num_fetch_chunks << 14)); scd_bc_tbl_gen3->tfd_offset[idx] = bc_ent; } else { + struct iwlagn_scd_bc_tbl *scd_bc_tbl = txq->bc_tbl.addr; + /* Before AX210, the HW expects DW */ WARN_ON(!trans_pcie->bc_table_dword); len = DIV_ROUND_UP(len, 4); -- cgit v1.2.3-59-g8ed1b From a548c69d2e0f010cd7b77404f94660a6c789abc8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 13:21:41 +0300 Subject: iwlwifi: mvm: add DCM flag to rate pretty-print It's useful to know if DCM was enabled, add this flag to the rate pretty-printer. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200417131727.891bb9741eca.Ia66448f7e00be9e4c9ea7147b90d4fcd5f1d3845@changeid --- drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index c1aba2bf73cf..1b6cbcf57d5f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -3740,11 +3740,12 @@ int rs_pretty_print_rate(char *buf, int bufsz, const u32 rate) } return scnprintf(buf, bufsz, - "0x%x: %s | ANT: %s BW: %s MCS: %d NSS: %d %s%s%s%s", + "0x%x: %s | ANT: %s BW: %s MCS: %d NSS: %d %s%s%s%s%s", rate, type, rs_pretty_ant(ant), bw, mcs, nss, (rate & RATE_MCS_SGI_MSK) ? "SGI " : "NGI ", (rate & RATE_MCS_STBC_MSK) ? "STBC " : "", (rate & RATE_MCS_LDPC_MSK) ? "LDPC " : "", + (rate & RATE_HE_DUAL_CARRIER_MODE_MSK) ? "DCM " : "", (rate & RATE_MCS_BF_MSK) ? "BF " : ""); } -- cgit v1.2.3-59-g8ed1b From f05f8edd90f1f637b60c4ed07a4f387052c84cbb Mon Sep 17 00:00:00 2001 From: Shahar S Matityahu Date: Fri, 17 Apr 2020 13:21:42 +0300 Subject: iwlwifi: yoyo: support IWL_FW_INI_TIME_POINT_HOST_ALIVE_TIMEOUT time point Allow the driver to perform dump collection in case of alive notification timeout in yoyo mode. Signed-off-by: Shahar S Matityahu Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200417131727.bd46e6240590.Ibda6d9d330a1ae49670152cede34629b280f6cf9@changeid --- drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 42 +++++++++++++++++++---------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c index 8daa83cdc72c..39c8332be3ac 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c @@ -2329,26 +2329,40 @@ IWL_EXPORT_SYMBOL(iwl_fw_dbg_collect_desc); int iwl_fw_dbg_error_collect(struct iwl_fw_runtime *fwrt, enum iwl_fw_dbg_trigger trig_type) { - int ret; - struct iwl_fw_dump_desc *iwl_dump_error_desc; - if (!test_bit(STATUS_DEVICE_ENABLED, &fwrt->trans->status)) return -EIO; - iwl_dump_error_desc = kmalloc(sizeof(*iwl_dump_error_desc), GFP_KERNEL); - if (!iwl_dump_error_desc) - return -ENOMEM; + if (iwl_trans_dbg_ini_valid(fwrt->trans)) { + if (trig_type != FW_DBG_TRIGGER_ALIVE_TIMEOUT) + return -EIO; - iwl_dump_error_desc->trig_desc.type = cpu_to_le32(trig_type); - iwl_dump_error_desc->len = 0; + iwl_dbg_tlv_time_point(fwrt, + IWL_FW_INI_TIME_POINT_HOST_ALIVE_TIMEOUT, + NULL); + } else { + struct iwl_fw_dump_desc *iwl_dump_error_desc; + int ret; - ret = iwl_fw_dbg_collect_desc(fwrt, iwl_dump_error_desc, false, 0); - if (ret) - kfree(iwl_dump_error_desc); - else - iwl_trans_sync_nmi(fwrt->trans); + iwl_dump_error_desc = + kmalloc(sizeof(*iwl_dump_error_desc), GFP_KERNEL); - return ret; + if (!iwl_dump_error_desc) + return -ENOMEM; + + iwl_dump_error_desc->trig_desc.type = cpu_to_le32(trig_type); + iwl_dump_error_desc->len = 0; + + ret = iwl_fw_dbg_collect_desc(fwrt, iwl_dump_error_desc, + false, 0); + if (ret) { + kfree(iwl_dump_error_desc); + return ret; + } + } + + iwl_trans_sync_nmi(fwrt->trans); + + return 0; } IWL_EXPORT_SYMBOL(iwl_fw_dbg_error_collect); -- cgit v1.2.3-59-g8ed1b From 63417549fc8ea5d84cf7172e72a7452938755874 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 18 Apr 2020 11:08:46 +0300 Subject: iwlwifi: pcie: move iwl_pcie_ctxt_info_alloc_dma() to user There's no need for this to be an inline in the header file, only the context-info.c file ever uses it. Move it there. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.818a06457888.Ib4f55280cd14d7edab37f2992b381c9b6ca4cd7a@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c | 15 +++++++++++++++ drivers/net/wireless/intel/iwlwifi/pcie/internal.h | 16 ---------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c index acd01d86f101..b65405009d02 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c @@ -93,6 +93,21 @@ static void *iwl_pcie_ctxt_info_dma_alloc_coherent(struct iwl_trans *trans, return _iwl_pcie_ctxt_info_dma_alloc_coherent(trans, size, phys, 0); } +static int iwl_pcie_ctxt_info_alloc_dma(struct iwl_trans *trans, + const struct fw_desc *sec, + struct iwl_dram_data *dram) +{ + dram->block = iwl_pcie_ctxt_info_dma_alloc_coherent(trans, sec->len, + &dram->physical); + if (!dram->block) + return -ENOMEM; + + dram->size = sec->len; + memcpy(dram->block, sec->data, sec->len); + + return 0; +} + void iwl_pcie_ctxt_info_free_paging(struct iwl_trans *trans) { struct iwl_self_init_dram *dram = &trans->init_dram; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index 595e6873d56e..abe649af689c 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -792,22 +792,6 @@ static inline int iwl_pcie_get_num_sections(const struct fw_img *fw, return i; } -static inline int iwl_pcie_ctxt_info_alloc_dma(struct iwl_trans *trans, - const struct fw_desc *sec, - struct iwl_dram_data *dram) -{ - dram->block = dma_alloc_coherent(trans->dev, sec->len, - &dram->physical, - GFP_KERNEL); - if (!dram->block) - return -ENOMEM; - - dram->size = sec->len; - memcpy(dram->block, sec->data, sec->len); - - return 0; -} - static inline void iwl_pcie_ctxt_info_free_fw_img(struct iwl_trans *trans) { struct iwl_self_init_dram *dram = &trans->init_dram; -- cgit v1.2.3-59-g8ed1b From c4ace42659b572c597e37a91902036378fe0f973 Mon Sep 17 00:00:00 2001 From: Gil Adam Date: Sat, 18 Apr 2020 11:08:47 +0300 Subject: iwlwifi: mvm: add framework for specific phy configuration Add framework for supporting specific PHY filter configuration, which allows for application of various FW defined PHY filters (one per antenna). Change phy_cfg_cmd to the new API (ver3). Reading of configuration from platform's ACPI tables to be added later when tables are defined. Signed-off-by: Gil Adam Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.242a8f979592.I13c77a8a8dbf1a169b5052c7af1f8401ff3991ad@changeid --- drivers/net/wireless/intel/iwlwifi/fw/api/config.h | 39 ++++++++++++++-- drivers/net/wireless/intel/iwlwifi/mvm/constants.h | 4 ++ drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 54 ++++++++++++++++++++-- 3 files changed, 91 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/config.h b/drivers/net/wireless/intel/iwlwifi/fw/api/config.h index 5e88fa2e6fb7..546fa60ed9fd 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/api/config.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/api/config.h @@ -8,7 +8,7 @@ * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018 - 2019 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -31,7 +31,7 @@ * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018 - 2019 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -119,16 +119,49 @@ enum iwl_calib_cfg { IWL_CALIB_CFG_AGC_IDX = BIT(18), }; +/** + * struct iwl_phy_specific_cfg - specific PHY filter configuration + * + * Sent as part of the phy configuration command (v3) to configure specific FW + * defined PHY filters that can be applied to each antenna. + * + * @filter_cfg_chain_a: filter config id for LMAC1 chain A + * @filter_cfg_chain_b: filter config id for LMAC1 chain B + * @filter_cfg_chain_c: filter config id for LMAC2 chain A + * @filter_cfg_chain_d: filter config id for LMAC2 chain B + * values: 0 - no filter; 0xffffffff - reserved; otherwise - filter id + */ +struct iwl_phy_specific_cfg { + __le32 filter_cfg_chain_a; + __le32 filter_cfg_chain_b; + __le32 filter_cfg_chain_c; + __le32 filter_cfg_chain_d; +} __packed; /* PHY_SPECIFIC_CONFIGURATION_API_VER_1*/ + /** * struct iwl_phy_cfg_cmd - Phy configuration command + * * @phy_cfg: PHY configuration value, uses &enum iwl_fw_phy_cfg * @calib_control: calibration control data */ -struct iwl_phy_cfg_cmd { +struct iwl_phy_cfg_cmd_v1 { __le32 phy_cfg; struct iwl_calib_ctrl calib_control; } __packed; +/** + * struct iwl_phy_cfg_cmd_v3 - Phy configuration command (v3) + * + * @phy_cfg: PHY configuration value, uses &enum iwl_fw_phy_cfg + * @calib_control: calibration control data + * @phy_specific_cfg: configure predefined PHY filters + */ +struct iwl_phy_cfg_cmd_v3 { + __le32 phy_cfg; + struct iwl_calib_ctrl calib_control; + struct iwl_phy_specific_cfg phy_specific_cfg; +} __packed; /* PHY_CONFIGURATION_CMD_API_S_VER_3 */ + /* * enum iwl_dc2dc_config_id - flag ids * diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/constants.h b/drivers/net/wireless/intel/iwlwifi/mvm/constants.h index 58df25e2fb32..b0268f44b2ea 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/constants.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/constants.h @@ -155,5 +155,9 @@ #define IWL_MVM_USE_TWT false #define IWL_MVM_AMPDU_CONSEC_DROPS_DELBA 10 #define IWL_MVM_USE_NSSN_SYNC 0 +#define IWL_MVM_PHY_FILTER_CHAIN_A 0 +#define IWL_MVM_PHY_FILTER_CHAIN_B 0 +#define IWL_MVM_PHY_FILTER_CHAIN_C 0 +#define IWL_MVM_PHY_FILTER_CHAIN_D 0 #endif /* __MVM_CONSTANTS_H */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index 8fe78ce37771..2bc15ef13bb5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -550,10 +550,49 @@ error: return ret; } +#ifdef CONFIG_ACPI +static void iwl_mvm_phy_filter_init(struct iwl_mvm *mvm, + struct iwl_phy_specific_cfg *phy_filters) +{ + /* + * TODO: read specific phy config from BIOS + * ACPI table for this feature has not been defined yet, + * so for now we use hardcoded values. + */ + + if (IWL_MVM_PHY_FILTER_CHAIN_A) { + phy_filters->filter_cfg_chain_a = + cpu_to_le32(IWL_MVM_PHY_FILTER_CHAIN_A); + } + if (IWL_MVM_PHY_FILTER_CHAIN_B) { + phy_filters->filter_cfg_chain_b = + cpu_to_le32(IWL_MVM_PHY_FILTER_CHAIN_B); + } + if (IWL_MVM_PHY_FILTER_CHAIN_C) { + phy_filters->filter_cfg_chain_c = + cpu_to_le32(IWL_MVM_PHY_FILTER_CHAIN_C); + } + if (IWL_MVM_PHY_FILTER_CHAIN_D) { + phy_filters->filter_cfg_chain_d = + cpu_to_le32(IWL_MVM_PHY_FILTER_CHAIN_D); + } +} + +#else /* CONFIG_ACPI */ + +static void iwl_mvm_phy_filter_init(struct iwl_mvm *mvm, + struct iwl_phy_specific_cfg *phy_filters) +{ +} +#endif /* CONFIG_ACPI */ + static int iwl_send_phy_cfg_cmd(struct iwl_mvm *mvm) { - struct iwl_phy_cfg_cmd phy_cfg_cmd; + struct iwl_phy_cfg_cmd_v3 phy_cfg_cmd; enum iwl_ucode_type ucode_type = mvm->fwrt.cur_fw_img; + struct iwl_phy_specific_cfg phy_filters = {}; + u8 cmd_ver; + size_t cmd_size; if (iwl_mvm_has_unified_ucode(mvm) && !mvm->trans->cfg->tx_with_siso_diversity) @@ -580,11 +619,20 @@ static int iwl_send_phy_cfg_cmd(struct iwl_mvm *mvm) phy_cfg_cmd.calib_control.flow_trigger = mvm->fw->default_calib[ucode_type].flow_trigger; + cmd_ver = iwl_fw_lookup_cmd_ver(mvm->fw, IWL_ALWAYS_LONG_GROUP, + PHY_CONFIGURATION_CMD); + if (cmd_ver == 3) { + iwl_mvm_phy_filter_init(mvm, &phy_filters); + memcpy(&phy_cfg_cmd.phy_specific_cfg, &phy_filters, + sizeof(struct iwl_phy_specific_cfg)); + } + IWL_DEBUG_INFO(mvm, "Sending Phy CFG command: 0x%x\n", phy_cfg_cmd.phy_cfg); - + cmd_size = (cmd_ver == 3) ? sizeof(struct iwl_phy_cfg_cmd_v3) : + sizeof(struct iwl_phy_cfg_cmd_v1); return iwl_mvm_send_cmd_pdu(mvm, PHY_CONFIGURATION_CMD, 0, - sizeof(phy_cfg_cmd), &phy_cfg_cmd); + cmd_size, &phy_cfg_cmd); } int iwl_run_init_mvm_ucode(struct iwl_mvm *mvm, bool read_nvm) -- cgit v1.2.3-59-g8ed1b From 9dede812455041eef3fa308af015825ce9c701d0 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 18 Apr 2020 11:08:48 +0300 Subject: iwlwifi: remove deprecated and unused iwl_mvm_keyinfo struct This struct hasn't been used in years and is just a remnant of an API support removal that missed this structured. Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.93860da2d12a.Ifeca3b3313e3f14330317bc3e3d62f7d991ec955@changeid --- drivers/net/wireless/intel/iwlwifi/fw/api/sta.h | 26 ------------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/sta.h b/drivers/net/wireless/intel/iwlwifi/fw/api/sta.h index 970e9e508ad0..c010e6febbf4 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/api/sta.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/api/sta.h @@ -245,32 +245,6 @@ enum iwl_sta_sleep_flag { #define STA_KEY_LEN_WEP40 (5) #define STA_KEY_LEN_WEP104 (13) -/** - * struct iwl_mvm_keyinfo - key information - * @key_flags: type &enum iwl_sta_key_flag - * @tkip_rx_tsc_byte2: TSC[2] for key mix ph1 detection - * @reserved1: reserved - * @tkip_rx_ttak: 10-byte unicast TKIP TTAK for Rx - * @key_offset: key offset in the fw's key table - * @reserved2: reserved - * @key: 16-byte unicast decryption key - * @tx_secur_seq_cnt: initial RSC / PN needed for replay check - * @hw_tkip_mic_rx_key: byte: MIC Rx Key - used for TKIP only - * @hw_tkip_mic_tx_key: byte: MIC Tx Key - used for TKIP only - */ -struct iwl_mvm_keyinfo { - __le16 key_flags; - u8 tkip_rx_tsc_byte2; - u8 reserved1; - __le16 tkip_rx_ttak[5]; - u8 key_offset; - u8 reserved2; - u8 key[16]; - __le64 tx_secur_seq_cnt; - __le64 hw_tkip_mic_rx_key; - __le64 hw_tkip_mic_tx_key; -} __packed; - #define IWL_ADD_STA_STATUS_MASK 0xFF #define IWL_ADD_STA_BAID_VALID_MASK 0x8000 #define IWL_ADD_STA_BAID_MASK 0x7F00 -- cgit v1.2.3-59-g8ed1b From 9efab1ad3ffb5b5ecbe24ea5ace420a9b7466338 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sat, 18 Apr 2020 11:08:49 +0300 Subject: iwlwifi: remove fw_monitor module parameter This module parameter is no longer useful now that other debug infrastructure was added to iwlwifi. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.03bd49c3432b.Ie62047d3b364b19c8c3584ea37790220466f2a8d@changeid --- drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 4 ---- drivers/net/wireless/intel/iwlwifi/iwl-modparams.h | 2 -- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 15 +-------------- 3 files changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c index ff52e69c1c80..9ff12c207f25 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c @@ -1872,10 +1872,6 @@ module_param_named(power_level, iwlwifi_mod_params.power_level, int, 0444); MODULE_PARM_DESC(power_level, "default power save level (range from 1 - 5, default: 1)"); -module_param_named(fw_monitor, iwlwifi_mod_params.fw_monitor, bool, 0444); -MODULE_PARM_DESC(fw_monitor, - "firmware monitor - to debug FW (default: false - needs lots of memory)"); - module_param_named(disable_11ac, iwlwifi_mod_params.disable_11ac, bool, 0444); MODULE_PARM_DESC(disable_11ac, "Disable VHT capabilities (default: false)"); diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h b/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h index 82e5cac23d8d..b094cc1e9be0 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h @@ -115,7 +115,6 @@ enum iwl_uapsd_disable { * @nvm_file: specifies a external NVM file * @uapsd_disable: disable U-APSD, see &enum iwl_uapsd_disable, default = * IWL_DISABLE_UAPSD_BSS | IWL_DISABLE_UAPSD_P2P_CLIENT - * @fw_monitor: allow to use firmware monitor * @disable_11ac: disable VHT capabilities, default = false. * @remove_when_gone: remove an inaccessible device from the PCIe bus. * @enable_ini: enable new FW debug infratructure (INI TLVs) @@ -135,7 +134,6 @@ struct iwl_mod_params { int antenna_coupling; char *nvm_file; u32 uapsd_disable; - bool fw_monitor; bool disable_11ac; /** * @disable_11ax: disable HE capabilities, default = false diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 06785c46c50d..a0daae058c1c 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -1019,21 +1019,8 @@ static int iwl_pcie_load_given_ucode(struct iwl_trans *trans, return ret; } - /* supported for 7000 only for the moment */ - if (iwlwifi_mod_params.fw_monitor && - trans->trans_cfg->device_family == IWL_DEVICE_FAMILY_7000) { - struct iwl_dram_data *fw_mon = &trans->dbg.fw_mon; - - iwl_pcie_alloc_fw_monitor(trans, 0); - if (fw_mon->size) { - iwl_write_prph(trans, MON_BUFF_BASE_ADDR, - fw_mon->physical >> 4); - iwl_write_prph(trans, MON_BUFF_END_ADDR, - (fw_mon->physical + fw_mon->size) >> 4); - } - } else if (iwl_pcie_dbg_on(trans)) { + if (iwl_pcie_dbg_on(trans)) iwl_pcie_apply_destination(trans); - } iwl_enable_interrupts(trans); -- cgit v1.2.3-59-g8ed1b From 28dd7ccdc56fbde66d49a36dc1fce06730586681 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Sat, 18 Apr 2020 11:08:50 +0300 Subject: iwlwifi: acpi: read TAS table from ACPI and send it to the FW Read the Time Averaged SAR (TAS) table from ACPI and if TAS feature is enabled in the FW send the black list countries which TAS is disabled in to the FW Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.40a327d32cfd.I7203f3afc8186cca34c48a1a116baac1df5eff4e@changeid --- drivers/net/wireless/intel/iwlwifi/fw/acpi.c | 76 ++++++++++++++++++++++ drivers/net/wireless/intel/iwlwifi/fw/acpi.h | 17 +++++ .../net/wireless/intel/iwlwifi/fw/api/nvm-reg.h | 15 +++++ drivers/net/wireless/intel/iwlwifi/fw/file.h | 1 + drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 39 +++++++++++ drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 1 + 6 files changed, 149 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c index ba2aff3af0fe..344eba82a902 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c @@ -151,6 +151,82 @@ found: } IWL_EXPORT_SYMBOL(iwl_acpi_get_wifi_pkg); +int iwl_acpi_get_tas(struct iwl_fw_runtime *fwrt, + __le32 *black_list_array, + int *black_list_size) +{ + union acpi_object *wifi_pkg, *data; + int ret, tbl_rev, i; + bool enabled; + + data = iwl_acpi_get_object(fwrt->dev, ACPI_WTAS_METHOD); + if (IS_ERR(data)) + return PTR_ERR(data); + + wifi_pkg = iwl_acpi_get_wifi_pkg(fwrt->dev, data, + ACPI_WTAS_WIFI_DATA_SIZE, + &tbl_rev); + if (IS_ERR(wifi_pkg)) { + ret = PTR_ERR(wifi_pkg); + goto out_free; + } + + if (wifi_pkg->package.elements[0].type != ACPI_TYPE_INTEGER || + tbl_rev != 0) { + ret = -EINVAL; + goto out_free; + } + + enabled = !!wifi_pkg->package.elements[0].integer.value; + + if (!enabled) { + *black_list_size = -1; + IWL_DEBUG_RADIO(fwrt, "TAS not enabled\n"); + ret = 0; + goto out_free; + } + + if (wifi_pkg->package.elements[1].type != ACPI_TYPE_INTEGER || + wifi_pkg->package.elements[1].integer.value > + APCI_WTAS_BLACK_LIST_MAX) { + IWL_DEBUG_RADIO(fwrt, "TAS invalid array size %llu\n", + wifi_pkg->package.elements[1].integer.value); + ret = -EINVAL; + goto out_free; + } + *black_list_size = wifi_pkg->package.elements[1].integer.value; + + IWL_DEBUG_RADIO(fwrt, "TAS array size %d\n", *black_list_size); + if (*black_list_size > APCI_WTAS_BLACK_LIST_MAX) { + IWL_DEBUG_RADIO(fwrt, "TAS invalid array size value %u\n", + *black_list_size); + ret = -EINVAL; + goto out_free; + } + + for (i = 0; i < *black_list_size; i++) { + u32 country; + + if (wifi_pkg->package.elements[2 + i].type != + ACPI_TYPE_INTEGER) { + IWL_DEBUG_RADIO(fwrt, + "TAS invalid array elem %d\n", 2 + i); + ret = -EINVAL; + goto out_free; + } + + country = wifi_pkg->package.elements[2 + i].integer.value; + black_list_array[i] = cpu_to_le32(country); + IWL_DEBUG_RADIO(fwrt, "TAS black list country %d\n", country); + } + + ret = 0; +out_free: + kfree(data); + return ret; +} +IWL_EXPORT_SYMBOL(iwl_acpi_get_tas); + int iwl_acpi_get_mcc(struct device *dev, char *mcc) { union acpi_object *wifi_pkg, *data; diff --git a/drivers/net/wireless/intel/iwlwifi/fw/acpi.h b/drivers/net/wireless/intel/iwlwifi/fw/acpi.h index 5590e5cc8fbb..6a646dc524e1 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/acpi.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/acpi.h @@ -64,6 +64,7 @@ #include "fw/api/commands.h" #include "fw/api/power.h" #include "fw/api/phy.h" +#include "fw/api/nvm-reg.h" #include "fw/img.h" #include "iwl-trans.h" @@ -75,6 +76,7 @@ #define ACPI_SPLC_METHOD "SPLC" #define ACPI_ECKV_METHOD "ECKV" #define ACPI_PPAG_METHOD "PPAG" +#define ACPI_WTAS_METHOD "WTAS" #define ACPI_WIFI_DOMAIN (0x07) @@ -96,6 +98,12 @@ #define ACPI_SPLC_WIFI_DATA_SIZE 2 #define ACPI_ECKV_WIFI_DATA_SIZE 2 +/* + * 1 type, 1 enabled, 1 black list size, 16 black list array + */ +#define APCI_WTAS_BLACK_LIST_MAX 16 +#define ACPI_WTAS_WIFI_DATA_SIZE (3 + APCI_WTAS_BLACK_LIST_MAX) + #define ACPI_WGDS_NUM_BANDS 2 #define ACPI_WGDS_TABLE_SIZE 3 @@ -174,6 +182,9 @@ int iwl_validate_sar_geo_profile(struct iwl_fw_runtime *fwrt, int iwl_sar_geo_init(struct iwl_fw_runtime *fwrt, struct iwl_per_chain_offset_group *table); +int iwl_acpi_get_tas(struct iwl_fw_runtime *fwrt, __le32 *black_list_array, + int *black_list_size); + #else /* CONFIG_ACPI */ static inline void *iwl_acpi_get_object(struct device *dev, acpi_string method) @@ -250,5 +261,11 @@ static inline int iwl_sar_geo_init(struct iwl_fw_runtime *fwrt, return -ENOENT; } +static inline int iwl_acpi_get_tas(struct iwl_fw_runtime *fwrt, + __le32 *black_list_array, + int *black_list_size) +{ + return -ENOENT; +} #endif /* CONFIG_ACPI */ #endif /* __iwl_fw_acpi__ */ diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h b/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h index 97b49843e318..2d230a7893c2 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h @@ -80,6 +80,11 @@ enum iwl_regulatory_and_nvm_subcmd_ids { * response is &struct iwl_nvm_get_info_rsp */ NVM_GET_INFO = 0x2, + + /** + * @TAS_CONFIG: &struct iwl_tas_config_cmd + */ + TAS_CONFIG = 0x3, }; /** @@ -431,4 +436,14 @@ enum iwl_mcc_source { MCC_SOURCE_GETTING_MCC_TEST_MODE = 0x11, }; +#define IWL_TAS_BLACK_LIST_MAX 16 +/** + * struct iwl_tas_config_cmd - configures the TAS + * @black_list_size: size of relevant field in black_list_array + * @black_list_array: black list countries (without TAS) + */ +struct iwl_tas_config_cmd { + __le32 black_list_size; + __le32 black_list_array[IWL_TAS_BLACK_LIST_MAX]; +} __packed; /* TAS_CONFIG_CMD_API_S_VER_2 */ #endif /* __iwl_fw_api_nvm_reg_h__ */ diff --git a/drivers/net/wireless/intel/iwlwifi/fw/file.h b/drivers/net/wireless/intel/iwlwifi/fw/file.h index 35f42e529a6d..1fb45fd30ffa 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/file.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/file.h @@ -449,6 +449,7 @@ enum iwl_ucode_tlv_capa { IWL_UCODE_TLV_CAPA_CS_MODIFY = (__force iwl_ucode_tlv_capa_t)49, IWL_UCODE_TLV_CAPA_SET_LTR_GEN2 = (__force iwl_ucode_tlv_capa_t)50, IWL_UCODE_TLV_CAPA_SET_PPAG = (__force iwl_ucode_tlv_capa_t)52, + IWL_UCODE_TLV_CAPA_TAS_CFG = (__force iwl_ucode_tlv_capa_t)53, IWL_UCODE_TLV_CAPA_SESSION_PROT_CMD = (__force iwl_ucode_tlv_capa_t)54, /* set 2 */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index 2bc15ef13bb5..bf3eaadfb343 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -979,6 +979,40 @@ static int iwl_mvm_ppag_init(struct iwl_mvm *mvm) return iwl_mvm_ppag_send_cmd(mvm); } +static void iwl_mvm_tas_init(struct iwl_mvm *mvm) +{ + int ret; + struct iwl_tas_config_cmd cmd = {}; + int list_size; + + BUILD_BUG_ON(ARRAY_SIZE(cmd.black_list_array) < + APCI_WTAS_BLACK_LIST_MAX); + + if (!fw_has_capa(&mvm->fw->ucode_capa, IWL_UCODE_TLV_CAPA_TAS_CFG)) { + IWL_DEBUG_RADIO(mvm, "TAS not enabled in FW\n"); + return; + } + + ret = iwl_acpi_get_tas(&mvm->fwrt, cmd.black_list_array, &list_size); + if (ret < 0) { + IWL_DEBUG_RADIO(mvm, + "TAS table invalid or unavailable. (%d)\n", + ret); + return; + } + + if (list_size < 0) + return; + + /* list size if TAS enabled can only be non-negative */ + cmd.black_list_size = cpu_to_le32((u32)list_size); + + ret = iwl_mvm_send_cmd_pdu(mvm, WIDE_ID(REGULATORY_AND_NVM_GROUP, + TAS_CONFIG), + 0, sizeof(cmd), &cmd); + if (ret < 0) + IWL_DEBUG_RADIO(mvm, "failed to send TAS_CONFIG (%d)\n", ret); +} #else /* CONFIG_ACPI */ inline int iwl_mvm_sar_select_profile(struct iwl_mvm *mvm, @@ -1006,6 +1040,10 @@ static int iwl_mvm_ppag_init(struct iwl_mvm *mvm) { return 0; } + +static void iwl_mvm_tas_init(struct iwl_mvm *mvm) +{ +} #endif /* CONFIG_ACPI */ void iwl_mvm_send_recovery_cmd(struct iwl_mvm *mvm, u32 flags) @@ -1333,6 +1371,7 @@ int iwl_mvm_up(struct iwl_mvm *mvm) if (ret < 0) goto error; + iwl_mvm_tas_init(mvm); iwl_mvm_leds_sync(mvm); IWL_DEBUG_INFO(mvm, "RT uCode started.\n"); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index b00f4a8b8424..d0afc806706d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -505,6 +505,7 @@ static const struct iwl_hcmd_names iwl_mvm_prot_offload_names[] = { static const struct iwl_hcmd_names iwl_mvm_regulatory_and_nvm_names[] = { HCMD_NAME(NVM_ACCESS_COMPLETE), HCMD_NAME(NVM_GET_INFO), + HCMD_NAME(TAS_CONFIG), }; static const struct iwl_hcmd_arr iwl_mvm_groups[] = { -- cgit v1.2.3-59-g8ed1b From e819a80a9764aea789ec6a25d3858d2a5d9ac7bc Mon Sep 17 00:00:00 2001 From: Ihab Zhaika Date: Sat, 18 Apr 2020 11:08:51 +0300 Subject: iwlwifi: add new cards for AX family add few PCI ID'S for AX family. Signed-off-by: Ihab Zhaika Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.5eae2261b70c.I0369619a562c4e4008e2f0a3afb9ed5d4c9b49d4@changeid --- drivers/net/wireless/intel/iwlwifi/cfg/22000.c | 21 ++++++++++++++++++--- drivers/net/wireless/intel/iwlwifi/iwl-config.h | 1 + drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 7 ++++++- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c index bc49cdd819df..1af4ba2d30cb 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c @@ -90,7 +90,8 @@ #define IWL_22000_SO_A_GF_A_FW_PRE "iwlwifi-so-a0-gf-a0-" #define IWL_22000_TY_A_GF_A_FW_PRE "iwlwifi-ty-a0-gf-a0-" #define IWL_22000_SO_A_GF4_A_FW_PRE "iwlwifi-so-a0-gf4-a0-" -#define IWL_22000_SOSNJ_A_GF4_A_FW_PRE "iwlwifi-SoSnj-a0-gf4-a0-" +#define IWL_SNJ_A_GF4_A_FW_PRE "iwlwifi-SoSnj-a0-gf4-a0-" +#define IWL_SNJ_A_GF_A_FW_PRE "iwlwifi-SoSnj-a0-gf-a0-" #define IWL_22000_HR_MODULE_FIRMWARE(api) \ IWL_22000_HR_FW_PRE __stringify(api) ".ucode" @@ -120,6 +121,10 @@ IWL_22000_SO_A_GF_A_FW_PRE __stringify(api) ".ucode" #define IWL_22000_TY_A_GF_A_MODULE_FIRMWARE(api) \ IWL_22000_TY_A_GF_A_FW_PRE __stringify(api) ".ucode" +#define IWL_SNJ_A_GF4_A_MODULE_FIRMWARE(api) \ + IWL_SNJ_A_GF4_A_FW_PRE __stringify(api) ".ucode" +#define IWL_SNJ_A_GF_A_MODULE_FIRMWARE(api) \ + IWL_SNJ_A_GF_A_FW_PRE __stringify(api) ".ucode" static const struct iwl_base_params iwl_22000_base_params = { .eeprom_size = OTP_LOW_IMAGE_SIZE_32K, @@ -553,8 +558,16 @@ const struct iwl_cfg iwlax411_2ax_cfg_so_gf4_a0 = { }; const struct iwl_cfg iwlax411_2ax_cfg_sosnj_gf4_a0 = { - .name = "Intel(R) Wi-Fi 7 AX411 160MHz", - .fw_name_pre = IWL_22000_SOSNJ_A_GF4_A_FW_PRE, + .name = "Intel(R) Wi-Fi 6 AX411 160MHz", + .fw_name_pre = IWL_SNJ_A_GF4_A_FW_PRE, + .uhb_supported = true, + IWL_DEVICE_AX210, + .num_rbds = IWL_NUM_RBDS_AX210_HE, +}; + +const struct iwl_cfg iwlax211_cfg_snj_gf_a0 = { + .name = "Intel(R) Wi-Fi 6 AX211 160MHz", + .fw_name_pre = IWL_SNJ_A_GF_A_FW_PRE, .uhb_supported = true, IWL_DEVICE_AX210, .num_rbds = IWL_NUM_RBDS_AX210_HE, @@ -573,3 +586,5 @@ MODULE_FIRMWARE(IWL_22000_SO_A_JF_B_MODULE_FIRMWARE(IWL_22000_UCODE_API_MAX)); MODULE_FIRMWARE(IWL_22000_SO_A_HR_B_MODULE_FIRMWARE(IWL_22000_UCODE_API_MAX)); MODULE_FIRMWARE(IWL_22000_SO_A_GF_A_MODULE_FIRMWARE(IWL_22000_UCODE_API_MAX)); MODULE_FIRMWARE(IWL_22000_TY_A_GF_A_MODULE_FIRMWARE(IWL_22000_UCODE_API_MAX)); +MODULE_FIRMWARE(IWL_SNJ_A_GF4_A_MODULE_FIRMWARE(IWL_22000_UCODE_API_MAX)); +MODULE_FIRMWARE(IWL_SNJ_A_GF_A_MODULE_FIRMWARE(IWL_22000_UCODE_API_MAX)); diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index d5d984d7ce83..3ed8ead8a9bb 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -625,6 +625,7 @@ extern const struct iwl_cfg iwlax211_2ax_cfg_so_gf_a0; extern const struct iwl_cfg iwlax210_2ax_cfg_ty_gf_a0; extern const struct iwl_cfg iwlax411_2ax_cfg_so_gf4_a0; extern const struct iwl_cfg iwlax411_2ax_cfg_sosnj_gf4_a0; +extern const struct iwl_cfg iwlax211_cfg_snj_gf_a0; #endif /* CPTCFG_IWLMVM || CPTCFG_IWLFMAC */ #endif /* __IWL_CONFIG_H__ */ diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 6744c0281ffb..ab849aa4434e 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -539,12 +539,17 @@ static const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x2725, 0x0310, iwlax210_2ax_cfg_ty_gf_a0)}, {IWL_PCI_DEVICE(0x2725, 0x0510, iwlax210_2ax_cfg_ty_gf_a0)}, {IWL_PCI_DEVICE(0x2725, 0x0A10, iwlax210_2ax_cfg_ty_gf_a0)}, - {IWL_PCI_DEVICE(0x2725, 0x00B0, iwlax411_2ax_cfg_so_gf4_a0)}, + {IWL_PCI_DEVICE(0x2725, 0x00B0, iwlax411_2ax_cfg_sosnj_gf4_a0)}, + {IWL_PCI_DEVICE(0x2726, 0x0090, iwlax211_cfg_snj_gf_a0)}, + {IWL_PCI_DEVICE(0x2726, 0x00B0, iwlax411_2ax_cfg_sosnj_gf4_a0)}, + {IWL_PCI_DEVICE(0x2726, 0x0510, iwlax211_cfg_snj_gf_a0)}, {IWL_PCI_DEVICE(0x7A70, 0x0090, iwlax211_2ax_cfg_so_gf_a0)}, + {IWL_PCI_DEVICE(0x7A70, 0x00B0, iwlax411_2ax_cfg_so_gf4_a0)}, {IWL_PCI_DEVICE(0x7A70, 0x0310, iwlax211_2ax_cfg_so_gf_a0)}, {IWL_PCI_DEVICE(0x7A70, 0x0510, iwlax211_2ax_cfg_so_gf_a0)}, {IWL_PCI_DEVICE(0x7A70, 0x0A10, iwlax211_2ax_cfg_so_gf_a0)}, {IWL_PCI_DEVICE(0x7AF0, 0x0090, iwlax211_2ax_cfg_so_gf_a0)}, + {IWL_PCI_DEVICE(0x7AF0, 0x00B0, iwlax411_2ax_cfg_so_gf4_a0)}, {IWL_PCI_DEVICE(0x7AF0, 0x0310, iwlax211_2ax_cfg_so_gf_a0)}, {IWL_PCI_DEVICE(0x7AF0, 0x0510, iwlax211_2ax_cfg_so_gf_a0)}, {IWL_PCI_DEVICE(0x7AF0, 0x0A10, iwlax211_2ax_cfg_so_gf_a0)}, -- cgit v1.2.3-59-g8ed1b From 4ee27edd389174b3e1a9c84a1b3a678d9e9f0934 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 18 Apr 2020 11:08:52 +0300 Subject: iwlwifi: pcie: add cfgs for SoCs with device ID 0x4FD0 A new device ID needs to be added to the list to support new SoCs. Add it and support all subsystem IDs that other Qu devices support. Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.5e5ce668ff8b.I20a9c8b3470aaabaa54361a5641637e5a14d8321@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index ab849aa4434e..bc2cdf40028e 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -526,6 +526,7 @@ static const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x06F0, PCI_ANY_ID, iwl_qu_trans_cfg)}, {IWL_PCI_DEVICE(0x34F0, PCI_ANY_ID, iwl_qu_trans_cfg)}, {IWL_PCI_DEVICE(0x3DF0, PCI_ANY_ID, iwl_qu_trans_cfg)}, + {IWL_PCI_DEVICE(0x4DF0, PCI_ANY_ID, iwl_qu_trans_cfg)}, {IWL_PCI_DEVICE(0x43F0, PCI_ANY_ID, iwl_qu_long_latency_trans_cfg)}, {IWL_PCI_DEVICE(0xA0F0, PCI_ANY_ID, iwl_qu_long_latency_trans_cfg)}, @@ -662,6 +663,19 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_DEV_INFO(0x3DF0, 0x4070, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0x3DF0, 0x4244, iwl_ax101_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x0044, iwl_ax101_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x0070, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x0074, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x0078, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x007C, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x0244, iwl_ax101_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x0310, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x1651, killer1650s_2ax_cfg_qu_b0_hr_b0, NULL), + IWL_DEV_INFO(0x4DF0, 0x1652, killer1650i_2ax_cfg_qu_b0_hr_b0, NULL), + IWL_DEV_INFO(0x4DF0, 0x2074, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x4070, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x4DF0, 0x4244, iwl_ax101_cfg_qu_hr, NULL), + IWL_DEV_INFO(0x2720, 0x0000, iwl22000_2ax_cfg_qnj_hr_b0, NULL), IWL_DEV_INFO(0x2720, 0x0040, iwl22000_2ax_cfg_qnj_hr_b0, NULL), IWL_DEV_INFO(0x2720, 0x0044, iwl22000_2ax_cfg_qnj_hr_b0, NULL), -- cgit v1.2.3-59-g8ed1b From 2d39683e739940b852e579b8d5005c25a8912c0a Mon Sep 17 00:00:00 2001 From: Ihab Zhaika Date: Sat, 18 Apr 2020 11:08:53 +0300 Subject: iwlwifi: update few product names in AX family update the product names of few structs in AX family. Signed-off-by: Ihab Zhaika Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.738dabad8732.I5673eaf8a016b8aa27ab8bab02121108fa723783@changeid --- drivers/net/wireless/intel/iwlwifi/cfg/22000.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c index 1af4ba2d30cb..35fa89d28fef 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c @@ -527,14 +527,14 @@ const struct iwl_cfg iwlax210_2ax_cfg_so_jf_a0 = { }; const struct iwl_cfg iwlax210_2ax_cfg_so_hr_a0 = { - .name = "Intel(R) Wi-Fi 7 AX210 160MHz", + .name = "Intel(R) Wi-Fi 6 AX210 160MHz", .fw_name_pre = IWL_22000_SO_A_HR_B_FW_PRE, IWL_DEVICE_AX210, .num_rbds = IWL_NUM_RBDS_AX210_HE, }; const struct iwl_cfg iwlax211_2ax_cfg_so_gf_a0 = { - .name = "Intel(R) Wi-Fi 7 AX211 160MHz", + .name = "Intel(R) Wi-Fi 6 AX211 160MHz", .fw_name_pre = IWL_22000_SO_A_GF_A_FW_PRE, .uhb_supported = true, IWL_DEVICE_AX210, @@ -542,7 +542,7 @@ const struct iwl_cfg iwlax211_2ax_cfg_so_gf_a0 = { }; const struct iwl_cfg iwlax210_2ax_cfg_ty_gf_a0 = { - .name = "Intel(R) Wi-Fi 7 AX210 160MHz", + .name = "Intel(R) Wi-Fi 6 AX210 160MHz", .fw_name_pre = IWL_22000_TY_A_GF_A_FW_PRE, .uhb_supported = true, IWL_DEVICE_AX210, @@ -550,7 +550,7 @@ const struct iwl_cfg iwlax210_2ax_cfg_ty_gf_a0 = { }; const struct iwl_cfg iwlax411_2ax_cfg_so_gf4_a0 = { - .name = "Intel(R) Wi-Fi 7 AX411 160MHz", + .name = "Intel(R) Wi-Fi 6 AX411 160MHz", .fw_name_pre = IWL_22000_SO_A_GF4_A_FW_PRE, .uhb_supported = true, IWL_DEVICE_AX210, -- cgit v1.2.3-59-g8ed1b From 0928df0a868c010c1dfb5269a23ffa2f9adc876b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 18 Apr 2020 11:08:54 +0300 Subject: iwlwifi: mvm: tell firmware about required LTR delay Some (integrated) devices need a longer LTR delay than the firmware would typically apply, tell it about that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.24276ae2ad61.I8831a538f75893d5cee47b4a81f4b9b7fd0e8bea@changeid --- drivers/net/wireless/intel/iwlwifi/cfg/22000.c | 2 ++ drivers/net/wireless/intel/iwlwifi/fw/api/soc.h | 12 ++++++++---- drivers/net/wireless/intel/iwlwifi/iwl-config.h | 17 ++++++++++++----- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 14 ++++++++++++++ 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c index 35fa89d28fef..1f30d8fdf35d 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c @@ -243,6 +243,7 @@ const struct iwl_cfg_trans_params iwl_qu_trans_cfg = { .base_params = &iwl_22000_base_params, .integrated = true, .xtal_latency = 5000, + .ltr_delay = IWL_CFG_TRANS_LTR_DELAY_200US, }; const struct iwl_cfg_trans_params iwl_qu_long_latency_trans_cfg = { @@ -255,6 +256,7 @@ const struct iwl_cfg_trans_params iwl_qu_long_latency_trans_cfg = { .integrated = true, .xtal_latency = 12000, .low_latency_xtal = true, + .ltr_delay = IWL_CFG_TRANS_LTR_DELAY_2500US, }; const struct iwl_cfg_trans_params iwl_qnj_trans_cfg = { diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/soc.h b/drivers/net/wireless/intel/iwlwifi/fw/api/soc.h index aadca78e9846..0c6d7b3e1324 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/api/soc.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/api/soc.h @@ -5,10 +5,9 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2019 Intel Deutschland GmbH + * Copyright(c) 2012 - 2014, 2019 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -28,10 +27,9 @@ * * BSD LICENSE * - * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2019 Intel Deutschland GmbH + * Copyright(c) 2012 - 2014, 2019 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -68,6 +66,12 @@ #define SOC_CONFIG_CMD_FLAGS_DISCRETE BIT(0) #define SOC_CONFIG_CMD_FLAGS_LOW_LATENCY BIT(1) +#define SOC_FLAGS_LTR_APPLY_DELAY_MASK 0xc +#define SOC_FLAGS_LTR_APPLY_DELAY_NONE 0 +#define SOC_FLAGS_LTR_APPLY_DELAY_200 1 +#define SOC_FLAGS_LTR_APPLY_DELAY_2500 2 +#define SOC_FLAGS_LTR_APPLY_DELAY_1820 3 + /** * struct iwl_soc_configuration_cmd - Set device stabilization latency * diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index 3ed8ead8a9bb..9b31fcc37ace 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -5,9 +5,8 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2007 - 2014 Intel Corporation. All rights reserved. * Copyright (C) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2007 - 2014, 2018 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -27,9 +26,8 @@ * * BSD LICENSE * - * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved. * Copyright (C) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2005 - 2014, 2018 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -284,6 +282,13 @@ struct iwl_pwr_tx_backoff { u32 backoff; }; +enum iwl_cfg_trans_ltr_delay { + IWL_CFG_TRANS_LTR_DELAY_NONE = 0, + IWL_CFG_TRANS_LTR_DELAY_200US = 1, + IWL_CFG_TRANS_LTR_DELAY_2500US = 2, + IWL_CFG_TRANS_LTR_DELAY_1820US = 3, +}; + /** * struct iwl_cfg_trans - information needed to start the trans * @@ -304,6 +309,7 @@ struct iwl_pwr_tx_backoff { * @mq_rx_supported: multi-queue rx support * @integrated: discrete or integrated * @low_latency_xtal: use the low latency xtal if supported + * @ltr_delay: LTR delay parameter, &enum iwl_cfg_trans_ltr_delay. */ struct iwl_cfg_trans_params { const struct iwl_base_params *base_params; @@ -317,7 +323,8 @@ struct iwl_cfg_trans_params { mq_rx_supported:1, integrated:1, low_latency_xtal:1, - bisr_workaround:1; + bisr_workaround:1, + ltr_delay:2; }; /** diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index bf3eaadfb343..d6598339c55c 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -102,6 +102,20 @@ static int iwl_set_soc_latency(struct iwl_mvm *mvm) if (!mvm->trans->trans_cfg->integrated) cmd.flags = cpu_to_le32(SOC_CONFIG_CMD_FLAGS_DISCRETE); + BUILD_BUG_ON(IWL_CFG_TRANS_LTR_DELAY_NONE != + SOC_FLAGS_LTR_APPLY_DELAY_NONE); + BUILD_BUG_ON(IWL_CFG_TRANS_LTR_DELAY_200US != + SOC_FLAGS_LTR_APPLY_DELAY_200); + BUILD_BUG_ON(IWL_CFG_TRANS_LTR_DELAY_2500US != + SOC_FLAGS_LTR_APPLY_DELAY_2500); + BUILD_BUG_ON(IWL_CFG_TRANS_LTR_DELAY_1820US != + SOC_FLAGS_LTR_APPLY_DELAY_1820); + + if (mvm->trans->trans_cfg->ltr_delay != IWL_CFG_TRANS_LTR_DELAY_NONE && + !WARN_ON(!mvm->trans->trans_cfg->integrated)) + cmd.flags |= le32_encode_bits(mvm->trans->trans_cfg->ltr_delay, + SOC_FLAGS_LTR_APPLY_DELAY_MASK); + if (iwl_fw_lookup_cmd_ver(mvm->fw, IWL_ALWAYS_LONG_GROUP, SCAN_REQ_UMAC) >= 2 && mvm->trans->trans_cfg->low_latency_xtal) -- cgit v1.2.3-59-g8ed1b From 9c9613f0ee07b8da049914d92fc735a0e954d777 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 18 Apr 2020 11:08:55 +0300 Subject: iwlwifi: pcie: add new structure for Qu devices with medium latency Some Qu devices require an intermediate amount of time to wake up and for LTR notifications, so add a new structure with the correct values for them and change the corresponding devices to use it. Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.d6df2bcee78f.Ie008b0c8f03340a466c1ef981bfd25359c9de90d@changeid --- drivers/net/wireless/intel/iwlwifi/cfg/22000.c | 22 +++++++++++++++++----- drivers/net/wireless/intel/iwlwifi/iwl-config.h | 3 ++- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 7 ++++--- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c index 1f30d8fdf35d..7db4472a1ec5 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c @@ -234,6 +234,15 @@ static const struct iwl_ht_params iwl_22000_ht_params = { }, \ } +const struct iwl_cfg_trans_params iwl_qnj_trans_cfg = { + .mq_rx_supported = true, + .use_tfh = true, + .rf_id = true, + .gen2 = true, + .device_family = IWL_DEVICE_FAMILY_22000, + .base_params = &iwl_22000_base_params, +}; + const struct iwl_cfg_trans_params iwl_qu_trans_cfg = { .mq_rx_supported = true, .use_tfh = true, @@ -246,7 +255,7 @@ const struct iwl_cfg_trans_params iwl_qu_trans_cfg = { .ltr_delay = IWL_CFG_TRANS_LTR_DELAY_200US, }; -const struct iwl_cfg_trans_params iwl_qu_long_latency_trans_cfg = { +const struct iwl_cfg_trans_params iwl_qu_medium_latency_trans_cfg = { .mq_rx_supported = true, .use_tfh = true, .rf_id = true, @@ -254,18 +263,21 @@ const struct iwl_cfg_trans_params iwl_qu_long_latency_trans_cfg = { .device_family = IWL_DEVICE_FAMILY_22000, .base_params = &iwl_22000_base_params, .integrated = true, - .xtal_latency = 12000, - .low_latency_xtal = true, - .ltr_delay = IWL_CFG_TRANS_LTR_DELAY_2500US, + .xtal_latency = 1820, + .ltr_delay = IWL_CFG_TRANS_LTR_DELAY_1820US, }; -const struct iwl_cfg_trans_params iwl_qnj_trans_cfg = { +const struct iwl_cfg_trans_params iwl_qu_long_latency_trans_cfg = { .mq_rx_supported = true, .use_tfh = true, .rf_id = true, .gen2 = true, .device_family = IWL_DEVICE_FAMILY_22000, .base_params = &iwl_22000_base_params, + .integrated = true, + .xtal_latency = 12000, + .low_latency_xtal = true, + .ltr_delay = IWL_CFG_TRANS_LTR_DELAY_2500US, }; /* diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index 9b31fcc37ace..efb10a7f4d4f 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -513,9 +513,10 @@ struct iwl_dev_info { extern const struct iwl_cfg_trans_params iwl9000_trans_cfg; extern const struct iwl_cfg_trans_params iwl9560_trans_cfg; extern const struct iwl_cfg_trans_params iwl9560_shared_clk_trans_cfg; +extern const struct iwl_cfg_trans_params iwl_qnj_trans_cfg; extern const struct iwl_cfg_trans_params iwl_qu_trans_cfg; +extern const struct iwl_cfg_trans_params iwl_qu_medium_latency_trans_cfg; extern const struct iwl_cfg_trans_params iwl_qu_long_latency_trans_cfg; -extern const struct iwl_cfg_trans_params iwl_qnj_trans_cfg; extern const struct iwl_cfg_trans_params iwl_ax200_trans_cfg; extern const char iwl9162_name[]; extern const char iwl9260_name[]; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index bc2cdf40028e..2d78f8504bd5 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -524,9 +524,10 @@ static const struct pci_device_id iwl_hw_card_ids[] = { /* Qu devices */ {IWL_PCI_DEVICE(0x02F0, PCI_ANY_ID, iwl_qu_trans_cfg)}, {IWL_PCI_DEVICE(0x06F0, PCI_ANY_ID, iwl_qu_trans_cfg)}, - {IWL_PCI_DEVICE(0x34F0, PCI_ANY_ID, iwl_qu_trans_cfg)}, - {IWL_PCI_DEVICE(0x3DF0, PCI_ANY_ID, iwl_qu_trans_cfg)}, - {IWL_PCI_DEVICE(0x4DF0, PCI_ANY_ID, iwl_qu_trans_cfg)}, + + {IWL_PCI_DEVICE(0x34F0, PCI_ANY_ID, iwl_qu_medium_latency_trans_cfg)}, + {IWL_PCI_DEVICE(0x3DF0, PCI_ANY_ID, iwl_qu_medium_latency_trans_cfg)}, + {IWL_PCI_DEVICE(0x4DF0, PCI_ANY_ID, iwl_qu_medium_latency_trans_cfg)}, {IWL_PCI_DEVICE(0x43F0, PCI_ANY_ID, iwl_qu_long_latency_trans_cfg)}, {IWL_PCI_DEVICE(0xA0F0, PCI_ANY_ID, iwl_qu_long_latency_trans_cfg)}, -- cgit v1.2.3-59-g8ed1b From 62bee4862bfa03c4d4ec9205111be99210923f49 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 18 Apr 2020 11:08:56 +0300 Subject: iwlwifi: pcie: add new structs for So devices with long latency Some So devices have a longer wake latency. To support this properly, add new cfg structs for them so the driver will inform the FW about the need to use another xtal and use a higher wait value during state transitions. Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.daf515618f57.I80e60006b108e1586e3c56669635c670597fe08d@changeid --- drivers/net/wireless/intel/iwlwifi/cfg/22000.c | 20 ++++++++++++++++++++ drivers/net/wireless/intel/iwlwifi/iwl-config.h | 2 ++ drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 10 +++++----- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c index 7db4472a1ec5..2f741f5e3a7d 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c @@ -555,6 +555,16 @@ const struct iwl_cfg iwlax211_2ax_cfg_so_gf_a0 = { .num_rbds = IWL_NUM_RBDS_AX210_HE, }; +const struct iwl_cfg iwlax211_2ax_cfg_so_gf_a0_long = { + .name = "Intel(R) Wi-Fi 6 AX211 160MHz", + .fw_name_pre = IWL_22000_SO_A_GF_A_FW_PRE, + .uhb_supported = true, + IWL_DEVICE_AX210, + .num_rbds = IWL_NUM_RBDS_AX210_HE, + .trans.xtal_latency = 12000, + .trans.low_latency_xtal = true, +}; + const struct iwl_cfg iwlax210_2ax_cfg_ty_gf_a0 = { .name = "Intel(R) Wi-Fi 6 AX210 160MHz", .fw_name_pre = IWL_22000_TY_A_GF_A_FW_PRE, @@ -571,6 +581,16 @@ const struct iwl_cfg iwlax411_2ax_cfg_so_gf4_a0 = { .num_rbds = IWL_NUM_RBDS_AX210_HE, }; +const struct iwl_cfg iwlax411_2ax_cfg_so_gf4_a0_long = { + .name = "Intel(R) Wi-Fi 6 AX411 160MHz", + .fw_name_pre = IWL_22000_SO_A_GF4_A_FW_PRE, + .uhb_supported = true, + IWL_DEVICE_AX210, + .num_rbds = IWL_NUM_RBDS_AX210_HE, + .trans.xtal_latency = 12000, + .trans.low_latency_xtal = true, +}; + const struct iwl_cfg iwlax411_2ax_cfg_sosnj_gf4_a0 = { .name = "Intel(R) Wi-Fi 6 AX411 160MHz", .fw_name_pre = IWL_SNJ_A_GF4_A_FW_PRE, diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index efb10a7f4d4f..3a9a33851793 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -630,8 +630,10 @@ extern const struct iwl_cfg iwl22000_2ax_cfg_qnj_hr_b0; extern const struct iwl_cfg iwlax210_2ax_cfg_so_jf_a0; extern const struct iwl_cfg iwlax210_2ax_cfg_so_hr_a0; extern const struct iwl_cfg iwlax211_2ax_cfg_so_gf_a0; +extern const struct iwl_cfg iwlax211_2ax_cfg_so_gf_a0_long; extern const struct iwl_cfg iwlax210_2ax_cfg_ty_gf_a0; extern const struct iwl_cfg iwlax411_2ax_cfg_so_gf4_a0; +extern const struct iwl_cfg iwlax411_2ax_cfg_so_gf4_a0_long; extern const struct iwl_cfg iwlax411_2ax_cfg_sosnj_gf4_a0; extern const struct iwl_cfg iwlax211_cfg_snj_gf_a0; #endif /* CPTCFG_IWLMVM || CPTCFG_IWLFMAC */ diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 2d78f8504bd5..2083eb4f2f15 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -545,11 +545,11 @@ static const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x2726, 0x0090, iwlax211_cfg_snj_gf_a0)}, {IWL_PCI_DEVICE(0x2726, 0x00B0, iwlax411_2ax_cfg_sosnj_gf4_a0)}, {IWL_PCI_DEVICE(0x2726, 0x0510, iwlax211_cfg_snj_gf_a0)}, - {IWL_PCI_DEVICE(0x7A70, 0x0090, iwlax211_2ax_cfg_so_gf_a0)}, - {IWL_PCI_DEVICE(0x7A70, 0x00B0, iwlax411_2ax_cfg_so_gf4_a0)}, - {IWL_PCI_DEVICE(0x7A70, 0x0310, iwlax211_2ax_cfg_so_gf_a0)}, - {IWL_PCI_DEVICE(0x7A70, 0x0510, iwlax211_2ax_cfg_so_gf_a0)}, - {IWL_PCI_DEVICE(0x7A70, 0x0A10, iwlax211_2ax_cfg_so_gf_a0)}, + {IWL_PCI_DEVICE(0x7A70, 0x0090, iwlax211_2ax_cfg_so_gf_a0_long)}, + {IWL_PCI_DEVICE(0x7A70, 0x00B0, iwlax411_2ax_cfg_so_gf4_a0_long)}, + {IWL_PCI_DEVICE(0x7A70, 0x0310, iwlax211_2ax_cfg_so_gf_a0_long)}, + {IWL_PCI_DEVICE(0x7A70, 0x0510, iwlax211_2ax_cfg_so_gf_a0_long)}, + {IWL_PCI_DEVICE(0x7A70, 0x0A10, iwlax211_2ax_cfg_so_gf_a0_long)}, {IWL_PCI_DEVICE(0x7AF0, 0x0090, iwlax211_2ax_cfg_so_gf_a0)}, {IWL_PCI_DEVICE(0x7AF0, 0x00B0, iwlax411_2ax_cfg_so_gf4_a0)}, {IWL_PCI_DEVICE(0x7AF0, 0x0310, iwlax211_2ax_cfg_so_gf_a0)}, -- cgit v1.2.3-59-g8ed1b From cbc636557d2d20c4fb808c14df545b3c407a53d6 Mon Sep 17 00:00:00 2001 From: Gil Adam Date: Sat, 18 Apr 2020 11:08:57 +0300 Subject: iwlwifi: debug: set NPK buffer in context info When buffer destination for ini debug is configured to NPK (TB22DTF) set the appropriate bit in the context info struct. Signed-off-by: Gil Adam Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200418110539.3c9f0fa6033f.Id1d6c191f85efe0d6cf35434bfb186ffd46ff64c@changeid --- drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c | 26 ++++++++---- .../wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c | 47 ++++++++++++---------- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c index bf2f00b89214..9eb8fbfaa2a2 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c @@ -5,7 +5,7 @@ * * GPL LICENSE SUMMARY * - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -28,7 +28,7 @@ * * BSD LICENSE * - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -170,14 +170,24 @@ static int iwl_dbg_tlv_alloc_buf_alloc(struct iwl_trans *trans, if (le32_to_cpu(tlv->length) != sizeof(*alloc) || (buf_location != IWL_FW_INI_LOCATION_SRAM_PATH && - buf_location != IWL_FW_INI_LOCATION_DRAM_PATH)) + buf_location != IWL_FW_INI_LOCATION_DRAM_PATH && + buf_location != IWL_FW_INI_LOCATION_NPK_PATH)) { + IWL_ERR(trans, + "WRT: Invalid allocation TLV\n"); + return -EINVAL; + } + + if ((buf_location == IWL_FW_INI_LOCATION_SRAM_PATH || + buf_location == IWL_FW_INI_LOCATION_NPK_PATH) && + alloc_id != IWL_FW_INI_ALLOCATION_ID_DBGC1) { + IWL_ERR(trans, + "WRT: Allocation TLV for SMEM/NPK path must have id %u (current: %u)\n", + IWL_FW_INI_ALLOCATION_ID_DBGC1, alloc_id); return -EINVAL; + } - if ((buf_location == IWL_FW_INI_LOCATION_SRAM_PATH && - alloc_id != IWL_FW_INI_ALLOCATION_ID_DBGC1) || - (buf_location == IWL_FW_INI_LOCATION_DRAM_PATH && - (alloc_id == IWL_FW_INI_ALLOCATION_INVALID || - alloc_id >= IWL_FW_INI_ALLOCATION_NUM))) { + if (alloc_id == IWL_FW_INI_ALLOCATION_INVALID || + alloc_id >= IWL_FW_INI_ALLOCATION_NUM) { IWL_ERR(trans, "WRT: Invalid allocation id %u for allocation TLV\n", alloc_id); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c index 01f248ba8fec..27e94e6140b3 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c @@ -5,7 +5,7 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2018 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -18,7 +18,7 @@ * * BSD LICENSE * - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2018 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -84,32 +84,35 @@ iwl_pcie_ctxt_info_dbg_enable(struct iwl_trans *trans, fw_mon_cfg = &trans->dbg.fw_mon_cfg[alloc_id]; - if (le32_to_cpu(fw_mon_cfg->buf_location) == - IWL_FW_INI_LOCATION_SRAM_PATH) { + switch (le32_to_cpu(fw_mon_cfg->buf_location)) { + case IWL_FW_INI_LOCATION_SRAM_PATH: dbg_flags |= IWL_PRPH_SCRATCH_EDBG_DEST_INTERNAL; - IWL_DEBUG_FW(trans, - "WRT: Applying SMEM buffer destination\n"); - - goto out; - } - - if (le32_to_cpu(fw_mon_cfg->buf_location) == - IWL_FW_INI_LOCATION_DRAM_PATH && - trans->dbg.fw_mon_ini[alloc_id].num_frags) { - struct iwl_dram_data *frag = - &trans->dbg.fw_mon_ini[alloc_id].frags[0]; - - dbg_flags |= IWL_PRPH_SCRATCH_EDBG_DEST_DRAM; + "WRT: Applying SMEM buffer destination\n"); + break; + case IWL_FW_INI_LOCATION_NPK_PATH: + dbg_flags |= IWL_PRPH_SCRATCH_EDBG_DEST_TB22DTF; IWL_DEBUG_FW(trans, - "WRT: Applying DRAM destination (alloc_id=%u)\n", - alloc_id); + "WRT: Applying NPK buffer destination\n"); + break; - dbg_cfg->hwm_base_addr = cpu_to_le64(frag->physical); - dbg_cfg->hwm_size = cpu_to_le32(frag->size); + case IWL_FW_INI_LOCATION_DRAM_PATH: + if (trans->dbg.fw_mon_ini[alloc_id].num_frags) { + struct iwl_dram_data *frag = + &trans->dbg.fw_mon_ini[alloc_id].frags[0]; + dbg_flags |= IWL_PRPH_SCRATCH_EDBG_DEST_DRAM; + dbg_cfg->hwm_base_addr = cpu_to_le64(frag->physical); + dbg_cfg->hwm_size = cpu_to_le32(frag->size); + IWL_DEBUG_FW(trans, + "WRT: Applying DRAM destination (alloc_id=%u, num_frags=%u)\n", + alloc_id, + trans->dbg.fw_mon_ini[alloc_id].num_frags); + } + break; + default: + IWL_ERR(trans, "WRT: Invalid buffer destination\n"); } - out: if (dbg_flags) *control_flags |= IWL_PRPH_SCRATCH_EARLY_DEBUG_EN | dbg_flags; -- cgit v1.2.3-59-g8ed1b From 8146458fcd7942861c754b85b5464ef2a8cfacbb Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Fri, 24 Apr 2020 18:43:41 +0300 Subject: mlxsw: spectrum_span: Reduce nesting in mlxsw_sp_span_entry_configure() Use early return to avoid unnecessary nesting. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_span.c | 29 ++++++++++++++-------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 9fb2e9d93929..e7be1bfe7f75 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -622,18 +622,27 @@ mlxsw_sp_span_entry_configure(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_span_entry *span_entry, struct mlxsw_sp_span_parms sparms) { - if (sparms.dest_port) { - if (sparms.dest_port->mlxsw_sp != mlxsw_sp) { - netdev_err(span_entry->to_dev, "Cannot mirror to %s, which belongs to a different mlxsw instance", - sparms.dest_port->dev->name); - sparms.dest_port = NULL; - } else if (span_entry->ops->configure(span_entry, sparms)) { - netdev_err(span_entry->to_dev, "Failed to offload mirror to %s", - sparms.dest_port->dev->name); - sparms.dest_port = NULL; - } + int err; + + if (!sparms.dest_port) + goto set_parms; + + if (sparms.dest_port->mlxsw_sp != mlxsw_sp) { + netdev_err(span_entry->to_dev, "Cannot mirror to %s, which belongs to a different mlxsw instance", + sparms.dest_port->dev->name); + sparms.dest_port = NULL; + goto set_parms; + } + + err = span_entry->ops->configure(span_entry, sparms); + if (err) { + netdev_err(span_entry->to_dev, "Failed to offload mirror to %s", + sparms.dest_port->dev->name); + sparms.dest_port = NULL; + goto set_parms; } +set_parms: span_entry->parms = sparms; } -- cgit v1.2.3-59-g8ed1b From 7f9b099bd9d3143d3c1a0ea74b586c5189e58750 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Fri, 24 Apr 2020 18:43:42 +0300 Subject: mlxsw: spectrum_span: Rename parms() to parms_set() Use a more meaningful name for parms() function. Signed-off-by: Amit Cohen Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 14 +++++++------- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index e7be1bfe7f75..eb4a1c0f2788 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -130,7 +130,7 @@ mlxsw_sp_span_entry_phys_deconfigure(struct mlxsw_sp_span_entry *span_entry) static const struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_phys = { .can_handle = mlxsw_sp_port_dev_check, - .parms = mlxsw_sp_span_entry_phys_parms, + .parms_set = mlxsw_sp_span_entry_phys_parms, .configure = mlxsw_sp_span_entry_phys_configure, .deconfigure = mlxsw_sp_span_entry_phys_deconfigure, }; @@ -418,7 +418,7 @@ mlxsw_sp_span_entry_gretap4_deconfigure(struct mlxsw_sp_span_entry *span_entry) static const struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_gretap4 = { .can_handle = netif_is_gretap, - .parms = mlxsw_sp_span_entry_gretap4_parms, + .parms_set = mlxsw_sp_span_entry_gretap4_parms, .configure = mlxsw_sp_span_entry_gretap4_configure, .deconfigure = mlxsw_sp_span_entry_gretap4_deconfigure, }; @@ -519,7 +519,7 @@ mlxsw_sp_span_entry_gretap6_deconfigure(struct mlxsw_sp_span_entry *span_entry) static const struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_gretap6 = { .can_handle = netif_is_ip6gretap, - .parms = mlxsw_sp_span_entry_gretap6_parms, + .parms_set = mlxsw_sp_span_entry_gretap6_parms, .configure = mlxsw_sp_span_entry_gretap6_configure, .deconfigure = mlxsw_sp_span_entry_gretap6_deconfigure, }; @@ -575,7 +575,7 @@ mlxsw_sp_span_entry_vlan_deconfigure(struct mlxsw_sp_span_entry *span_entry) static const struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_vlan = { .can_handle = mlxsw_sp_span_vlan_can_handle, - .parms = mlxsw_sp_span_entry_vlan_parms, + .parms_set = mlxsw_sp_span_entry_vlan_parms, .configure = mlxsw_sp_span_entry_vlan_configure, .deconfigure = mlxsw_sp_span_entry_vlan_deconfigure, }; @@ -612,7 +612,7 @@ mlxsw_sp_span_entry_nop_deconfigure(struct mlxsw_sp_span_entry *span_entry) } static const struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_nop = { - .parms = mlxsw_sp_span_entry_nop_parms, + .parms_set = mlxsw_sp_span_entry_nop_parms, .configure = mlxsw_sp_span_entry_nop_configure, .deconfigure = mlxsw_sp_span_entry_nop_deconfigure, }; @@ -970,7 +970,7 @@ int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from, return -EOPNOTSUPP; } - err = ops->parms(to_dev, &sparms); + err = ops->parms_set(to_dev, &sparms); if (err) return err; @@ -1026,7 +1026,7 @@ static void mlxsw_sp_span_respin_work(struct work_struct *work) if (!curr->ref_count) continue; - err = curr->ops->parms(curr->to_dev, &sparms); + err = curr->ops->parms_set(curr->to_dev, &sparms); if (err) continue; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h index 59724335525f..01273e54ba20 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h @@ -49,8 +49,8 @@ struct mlxsw_sp_span_entry { struct mlxsw_sp_span_entry_ops { bool (*can_handle)(const struct net_device *to_dev); - int (*parms)(const struct net_device *to_dev, - struct mlxsw_sp_span_parms *sparmsp); + int (*parms_set)(const struct net_device *to_dev, + struct mlxsw_sp_span_parms *sparmsp); int (*configure)(struct mlxsw_sp_span_entry *span_entry, struct mlxsw_sp_span_parms sparms); void (*deconfigure)(struct mlxsw_sp_span_entry *span_entry); -- cgit v1.2.3-59-g8ed1b From c0c2899cf66ee7a68e5b3a7a135089622e005008 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 24 Apr 2020 18:43:43 +0300 Subject: mlxsw: spectrum_span: Remove unnecessary debug prints To the best of my knowledge, these debug prints were never used. Remove them. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index eb4a1c0f2788..14c5edc71239 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -978,9 +978,6 @@ int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from, if (!span_entry) return -ENOBUFS; - netdev_dbg(from->dev, "Adding inspected port to SPAN entry %d\n", - span_entry->id); - err = mlxsw_sp_span_inspected_port_add(from, span_entry, type, bind); if (err) goto err_port_bind; @@ -1004,8 +1001,6 @@ void mlxsw_sp_span_mirror_del(struct mlxsw_sp_port *from, int span_id, return; } - netdev_dbg(from->dev, "removing inspected port from SPAN entry %d\n", - span_entry->id); mlxsw_sp_span_inspected_port_del(from, span_entry, type, bind); } -- cgit v1.2.3-59-g8ed1b From 4c00dafc59c7cc25a381abf7671b203a2fcfca71 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 24 Apr 2020 18:43:44 +0300 Subject: mlxsw: spectrum_span: Use 'refcount_t' for reference counting 'refcount_t' is very useful for catching over/under flows. Convert the SPAN agent objects to use it instead of 'int' for their reference count. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 16 ++++++++-------- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 14c5edc71239..235556be58f5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -664,7 +665,7 @@ mlxsw_sp_span_entry_create(struct mlxsw_sp *mlxsw_sp, /* find a free entry to use */ for (i = 0; i < mlxsw_sp->span->entries_count; i++) { - if (!mlxsw_sp->span->entries[i].ref_count) { + if (!refcount_read(&mlxsw_sp->span->entries[i].ref_count)) { span_entry = &mlxsw_sp->span->entries[i]; break; } @@ -674,7 +675,7 @@ mlxsw_sp_span_entry_create(struct mlxsw_sp *mlxsw_sp, atomic_inc(&mlxsw_sp->span->active_entries_count); span_entry->ops = ops; - span_entry->ref_count = 1; + refcount_set(&span_entry->ref_count, 1); span_entry->to_dev = to_dev; mlxsw_sp_span_entry_configure(mlxsw_sp, span_entry, sparms); @@ -697,7 +698,7 @@ mlxsw_sp_span_entry_find_by_port(struct mlxsw_sp *mlxsw_sp, for (i = 0; i < mlxsw_sp->span->entries_count; i++) { struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span->entries[i]; - if (curr->ref_count && curr->to_dev == to_dev) + if (refcount_read(&curr->ref_count) && curr->to_dev == to_dev) return curr; } return NULL; @@ -718,7 +719,7 @@ mlxsw_sp_span_entry_find_by_id(struct mlxsw_sp *mlxsw_sp, int span_id) for (i = 0; i < mlxsw_sp->span->entries_count; i++) { struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span->entries[i]; - if (curr->ref_count && curr->id == span_id) + if (refcount_read(&curr->ref_count) && curr->id == span_id) return curr; } return NULL; @@ -735,7 +736,7 @@ mlxsw_sp_span_entry_get(struct mlxsw_sp *mlxsw_sp, span_entry = mlxsw_sp_span_entry_find_by_port(mlxsw_sp, to_dev); if (span_entry) { /* Already exists, just take a reference */ - span_entry->ref_count++; + refcount_inc(&span_entry->ref_count); return span_entry; } @@ -745,8 +746,7 @@ mlxsw_sp_span_entry_get(struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_span_entry_put(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_span_entry *span_entry) { - WARN_ON(!span_entry->ref_count); - if (--span_entry->ref_count == 0) + if (refcount_dec_and_test(&span_entry->ref_count)) mlxsw_sp_span_entry_destroy(mlxsw_sp, span_entry); return 0; } @@ -1018,7 +1018,7 @@ static void mlxsw_sp_span_respin_work(struct work_struct *work) struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span->entries[i]; struct mlxsw_sp_span_parms sparms = {NULL}; - if (!curr->ref_count) + if (!refcount_read(&curr->ref_count)) continue; err = curr->ops->parms_set(curr->to_dev, &sparms); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h index 01273e54ba20..d23abdf957fa 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h @@ -6,6 +6,7 @@ #include #include +#include #include "spectrum_router.h" @@ -43,7 +44,7 @@ struct mlxsw_sp_span_entry { const struct mlxsw_sp_span_entry_ops *ops; struct mlxsw_sp_span_parms parms; struct list_head bound_ports_list; - int ref_count; + refcount_t ref_count; int id; }; -- cgit v1.2.3-59-g8ed1b From 4780dbdbd957c204b62680161f39bb0bc4daf3a0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 24 Apr 2020 18:43:45 +0300 Subject: mlxsw: spectrum_span: Replace zero-length array with flexible-array member In a similar fashion to commit e99f8e7f88b5 ("mlxsw: Replace zero-length array with flexible-array member"), use a flexible-array member to get a compiler warning in case the flexible array does not occur last in the structure. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 235556be58f5..ae3c8a1e9a43 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -22,7 +22,7 @@ struct mlxsw_sp_span { struct mlxsw_sp *mlxsw_sp; atomic_t active_entries_count; int entries_count; - struct mlxsw_sp_span_entry entries[0]; + struct mlxsw_sp_span_entry entries[]; }; static void mlxsw_sp_span_respin_work(struct work_struct *work); -- cgit v1.2.3-59-g8ed1b From b70ba69ef1f72e58c8e43b5689a37a66a7b31d11 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 23 Apr 2020 16:57:45 +0200 Subject: net: sched: report ndo_setup_tc failures via extack Help end-users of the 'tc' command to see if the drivers ndo_setup_tc function call fails. Troubleshooting when this happens is non-trivial (see full process here[1]), and results in net_device getting assigned the 'qdisc noop', which will drop all TX packets on the interface. [1]: https://github.com/xdp-project/xdp-project/blob/master/areas/arm64/board_nxp_ls1088/nxp-board04-troubleshoot-qdisc.org Signed-off-by: Jesper Dangaard Brouer Tested-by: Ioana Ciornei Signed-off-by: David S. Miller --- net/sched/cls_api.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 55bd1429678f..11b683c45c28 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -735,8 +735,11 @@ static int tcf_block_offload_cmd(struct tcf_block *block, INIT_LIST_HEAD(&bo.cb_list); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); - if (err < 0) + if (err < 0) { + if (err != -EOPNOTSUPP) + NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); return err; + } return tcf_block_setup(block, &bo); } -- cgit v1.2.3-59-g8ed1b From b89c1e6bdc73f5775e118eb2ab778e75b262b30c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 23 Apr 2020 16:57:50 +0200 Subject: dpaa2-eth: fix return codes used in ndo_setup_tc Drivers ndo_setup_tc call should return -EOPNOTSUPP, when it cannot support the qdisc type. Other return values will result in failing the qdisc setup. This lead to qdisc noop getting assigned, which will drop all TX packets on the interface. Fixes: ab1e6de2bd49 ("dpaa2-eth: Add mqprio support") Signed-off-by: Jesper Dangaard Brouer Tested-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 9d4061bba0b8..d271c016229d 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -2021,7 +2021,7 @@ static int dpaa2_eth_setup_tc(struct net_device *net_dev, int i; if (type != TC_SETUP_QDISC_MQPRIO) - return -EINVAL; + return -EOPNOTSUPP; mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; num_queues = dpaa2_eth_queue_count(priv); @@ -2033,7 +2033,7 @@ static int dpaa2_eth_setup_tc(struct net_device *net_dev, if (num_tc > dpaa2_eth_tc_count(priv)) { netdev_err(net_dev, "Max %d traffic classes supported\n", dpaa2_eth_tc_count(priv)); - return -EINVAL; + return -EOPNOTSUPP; } if (!num_tc) { -- cgit v1.2.3-59-g8ed1b From 5c05c1dbb177293636a3f5ea4caa872dfcf50ccd Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 23 Apr 2020 17:02:56 +0100 Subject: net: phylink, dsa: eliminate phylink_fixed_state_cb() Move the callback into the phylink_config structure, rather than providing a callback to set this up. Signed-off-by: Russell King Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 46 +++++++++++++++------------------------------- include/linux/phylink.h | 6 +++--- net/dsa/slave.c | 20 +++++++++++--------- 3 files changed, 29 insertions(+), 43 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 34ca12aec61b..0f23bec431c1 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -480,8 +480,8 @@ static void phylink_get_fixed_state(struct phylink *pl, struct phylink_link_state *state) { *state = pl->link_config; - if (pl->get_fixed_state) - pl->get_fixed_state(pl->netdev, state); + if (pl->config->get_fixed_state) + pl->config->get_fixed_state(pl->config, state); else if (pl->link_gpio) state->link = !!gpiod_get_value_cansleep(pl->link_gpio); @@ -1044,32 +1044,6 @@ void phylink_disconnect_phy(struct phylink *pl) } EXPORT_SYMBOL_GPL(phylink_disconnect_phy); -/** - * phylink_fixed_state_cb() - allow setting a fixed link callback - * @pl: a pointer to a &struct phylink returned from phylink_create() - * @cb: callback to execute to determine the fixed link state. - * - * The MAC driver should call this driver when the state of its link - * can be determined through e.g: an out of band MMIO register. - */ -int phylink_fixed_state_cb(struct phylink *pl, - void (*cb)(struct net_device *dev, - struct phylink_link_state *state)) -{ - /* It does not make sense to let the link be overriden unless we use - * MLO_AN_FIXED - */ - if (pl->cfg_link_an_mode != MLO_AN_FIXED) - return -EINVAL; - - mutex_lock(&pl->state_mutex); - pl->get_fixed_state = cb; - mutex_unlock(&pl->state_mutex); - - return 0; -} -EXPORT_SYMBOL_GPL(phylink_fixed_state_cb); - /** * phylink_mac_change() - notify phylink of a change in MAC state * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -1106,6 +1080,8 @@ static irqreturn_t phylink_link_handler(int irq, void *data) */ void phylink_start(struct phylink *pl) { + bool poll = false; + ASSERT_RTNL(); phylink_info(pl, "configuring for %s/%s link mode\n", @@ -1142,10 +1118,18 @@ void phylink_start(struct phylink *pl) irq = 0; } if (irq <= 0) - mod_timer(&pl->link_poll, jiffies + HZ); + poll = true; + } + + switch (pl->cfg_link_an_mode) { + case MLO_AN_FIXED: + poll |= pl->config->poll_fixed_state; + break; + case MLO_AN_INBAND: + poll |= pl->config->pcs_poll; + break; } - if ((pl->cfg_link_an_mode == MLO_AN_FIXED && pl->get_fixed_state) || - pl->config->pcs_poll) + if (poll) mod_timer(&pl->link_poll, jiffies + HZ); if (pl->phydev) phy_start(pl->phydev); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 3f8d37ec5503..cc5b452a184e 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -67,6 +67,9 @@ struct phylink_config { struct device *dev; enum phylink_op_type type; bool pcs_poll; + bool poll_fixed_state; + void (*get_fixed_state)(struct phylink_config *config, + struct phylink_link_state *state); }; /** @@ -366,9 +369,6 @@ void phylink_destroy(struct phylink *); int phylink_connect_phy(struct phylink *, struct phy_device *); int phylink_of_phy_connect(struct phylink *, struct device_node *, u32 flags); void phylink_disconnect_phy(struct phylink *); -int phylink_fixed_state_cb(struct phylink *, - void (*cb)(struct net_device *dev, - struct phylink_link_state *)); void phylink_mac_change(struct phylink *, bool up); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f2c241cf3a80..1035230771ae 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1590,10 +1590,10 @@ void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); -static void dsa_slave_phylink_fixed_state(struct net_device *dev, +static void dsa_slave_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { - struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would @@ -1633,6 +1633,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) dp->pl_config.dev = &slave_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; + /* The get_fixed_state callback takes precedence over polling the + * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set + * this if the switch provides such a callback. + */ + if (ds->ops->phylink_fixed_state) { + dp->pl_config.get_fixed_state = dsa_slave_phylink_fixed_state; + dp->pl_config.poll_fixed_state = true; + } + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { @@ -1641,13 +1650,6 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return PTR_ERR(dp->pl); } - /* Register only if the switch provides such a callback, since this - * callback takes precedence over polling the link GPIO in PHYLINK - * (see phylink_get_fixed_state). - */ - if (ds->ops->phylink_fixed_state) - phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state); - if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); -- cgit v1.2.3-59-g8ed1b From d70c47c8dc6902db19555b7ff7e6eeb264d4ac06 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 23 Apr 2020 21:34:33 +0200 Subject: net: phy: make phy_suspend a no-op if PHY is suspended already Gently handle the case that phy_suspend() is called whilst PHY is in power-down. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index ac2784192472..206d98502b13 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1524,6 +1524,9 @@ int phy_suspend(struct phy_device *phydev) struct phy_driver *phydrv = phydev->drv; int ret; + if (phydev->suspended) + return 0; + /* If the device has WOL enabled, we cannot suspend the PHY */ phy_ethtool_get_wol(phydev, &wol); if (wol.wolopts || (netdev && netdev->wol_enabled)) -- cgit v1.2.3-59-g8ed1b From 3194915486b2bc3f77745774f1731b78f32ff688 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 23 Apr 2020 21:35:36 +0200 Subject: net: phy: remove genphy_no_soft_reset Since 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") we don't need genphy_no_soft_reset() any longer. Not setting callback soft_reset results in a no-op now. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/cortina.c | 1 - drivers/net/phy/marvell10g.c | 2 -- drivers/net/phy/phy-c45.c | 1 - drivers/net/phy/phy_device.c | 1 - drivers/net/phy/teranetics.c | 1 - include/linux/phy.h | 4 ---- 6 files changed, 10 deletions(-) diff --git a/drivers/net/phy/cortina.c b/drivers/net/phy/cortina.c index 856cdc36aacd..aac51362c0fe 100644 --- a/drivers/net/phy/cortina.c +++ b/drivers/net/phy/cortina.c @@ -82,7 +82,6 @@ static struct phy_driver cortina_driver[] = { .features = PHY_10GBIT_FEATURES, .config_aneg = gen10g_config_aneg, .read_status = cortina_read_status, - .soft_reset = genphy_no_soft_reset, .probe = cortina_probe, }, }; diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 95e3f4644aeb..80cbc77ffd55 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -727,7 +727,6 @@ static struct phy_driver mv3310_drivers[] = { .phy_id_mask = MARVELL_PHY_ID_MASK, .name = "mv88x3310", .get_features = mv3310_get_features, - .soft_reset = genphy_no_soft_reset, .config_init = mv3310_config_init, .probe = mv3310_probe, .suspend = mv3310_suspend, @@ -745,7 +744,6 @@ static struct phy_driver mv3310_drivers[] = { .probe = mv3310_probe, .suspend = mv3310_suspend, .resume = mv3310_resume, - .soft_reset = genphy_no_soft_reset, .config_init = mv3310_config_init, .config_aneg = mv3310_config_aneg, .aneg_done = mv3310_aneg_done, diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 67ba47ae5284..defe09d94422 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -564,6 +564,5 @@ struct phy_driver genphy_c45_driver = { .phy_id = 0xffffffff, .phy_id_mask = 0xffffffff, .name = "Generic Clause 45 PHY", - .soft_reset = genphy_no_soft_reset, .read_status = genphy_c45_read_status, }; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 206d98502b13..c8f8fd9908fe 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2630,7 +2630,6 @@ static struct phy_driver genphy_driver = { .phy_id = 0xffffffff, .phy_id_mask = 0xffffffff, .name = "Generic PHY", - .soft_reset = genphy_no_soft_reset, .get_features = genphy_read_abilities, .suspend = genphy_suspend, .resume = genphy_resume, diff --git a/drivers/net/phy/teranetics.c b/drivers/net/phy/teranetics.c index beb054b931ee..8057ea8dbc21 100644 --- a/drivers/net/phy/teranetics.c +++ b/drivers/net/phy/teranetics.c @@ -78,7 +78,6 @@ static struct phy_driver teranetics_driver[] = { .phy_id_mask = 0xffffffff, .name = "Teranetics TN2020", .features = PHY_10GBIT_FEATURES, - .soft_reset = genphy_no_soft_reset, .aneg_done = teranetics_aneg_done, .config_aneg = gen10g_config_aneg, .read_status = teranetics_read_status, diff --git a/include/linux/phy.h b/include/linux/phy.h index 3941a6bcba10..e2bfb9240587 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1251,10 +1251,6 @@ static inline int genphy_config_aneg(struct phy_device *phydev) return __genphy_config_aneg(phydev, false); } -static inline int genphy_no_soft_reset(struct phy_device *phydev) -{ - return 0; -} static inline int genphy_no_ack_interrupt(struct phy_device *phydev) { return 0; -- cgit v1.2.3-59-g8ed1b From 9576e9fa1c02aea3cf1e42eadcbeb12ccf5e87de Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 23 Apr 2020 21:38:42 +0200 Subject: net: phy: clear phydev->suspended after soft reset If a soft reset is triggered whilst PHY is in power-down, then phydev->suspended will remain set. Seems we didn't face any issue yet caused by this, but better reset the suspended flag after soft reset. See also the following from 22.2.4.1.1 Resetting a PHY is accomplished by setting bit 0.15 to a logic one. This action shall set the status and control registers to their default states. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index c8f8fd9908fe..7e1ddd5745d2 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1082,8 +1082,12 @@ int phy_init_hw(struct phy_device *phydev) if (!phydev->drv) return 0; - if (phydev->drv->soft_reset) + if (phydev->drv->soft_reset) { ret = phydev->drv->soft_reset(phydev); + /* see comment in genphy_soft_reset for an explanation */ + if (!ret) + phydev->suspended = 0; + } if (ret < 0) return ret; @@ -2157,6 +2161,12 @@ int genphy_soft_reset(struct phy_device *phydev) if (ret < 0) return ret; + /* Clause 22 states that setting bit BMCR_RESET sets control registers + * to their default value. Therefore the POWER DOWN bit is supposed to + * be cleared after soft reset. + */ + phydev->suspended = 0; + ret = phy_poll_reset(phydev); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From 10395e99f4a6f67d53b89f143f610ee954c24531 Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Fri, 24 Apr 2020 17:00:15 +0800 Subject: net/mlxfw: Remove unneeded semicolon Fixes coccicheck warning: drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c:79:2-3: Unneeded semicolon drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c:162:2-3: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: Zheng Bin Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c index 046a0cb82ed8..7a04c626a2aa 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c @@ -76,7 +76,7 @@ static int mlxfw_fsm_state_err(struct mlxfw_dev *mlxfw_dev, case MLXFW_FSM_STATE_ERR_MAX: MLXFW_ERR_MSG(mlxfw_dev, extack, "unknown error", err); break; - }; + } return mlxfw_fsm_state_errno[fsm_state_err]; }; @@ -159,7 +159,7 @@ mlxfw_fsm_reactivate_err(struct mlxfw_dev *mlxfw_dev, case MLXFW_FSM_REACTIVATE_STATUS_MAX: MLXFW_REACT_ERR("unexpected error", err); break; - }; + } return -EREMOTEIO; }; -- cgit v1.2.3-59-g8ed1b From d9e4171a4a2cf20e803eb31a92d6854ea7002c38 Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Fri, 24 Apr 2020 17:04:28 +0800 Subject: net: atlantic: Remove unneeded semicolon Fixes coccicheck warning: drivers/net/ethernet/aquantia/atlantic/aq_macsec.c:404:2-3: Unneeded semicolon drivers/net/ethernet/aquantia/atlantic/aq_macsec.c:420:2-3: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: Zheng Bin Acked-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_macsec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c index 0b3e234a54aa..91870ceaf3fe 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c @@ -401,7 +401,7 @@ static u32 aq_sc_idx_max(const enum aq_macsec_sc_sa sc_sa) break; default: break; - }; + } return result; } @@ -417,7 +417,7 @@ static u32 aq_to_hw_sc_idx(const u32 sc_idx, const enum aq_macsec_sc_sa sc_sa) return sc_idx; default: WARN_ONCE(true, "Invalid sc_sa"); - }; + } return sc_idx; } -- cgit v1.2.3-59-g8ed1b From 7f023ec91c3cb188ab8a52478f3145ccd4daef68 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 17:04:50 +0800 Subject: net: sched: remove unused inline function qdisc_reset_all_tx There's no callers in-tree anymore. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 25d2ec4c8f00..1862bf5a105b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -710,11 +710,6 @@ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) } } -static inline void qdisc_reset_all_tx(struct net_device *dev) -{ - qdisc_reset_all_tx_gt(dev, 0); -} - /* Are all TX queues of the device empty? */ static inline bool qdisc_all_tx_empty(const struct net_device *dev) { -- cgit v1.2.3-59-g8ed1b From 5d7163a117876f95f44f6f9fb9f028daead42243 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 17:06:29 +0800 Subject: net: ipv6: remove unused inline function ip6_set_txhash commit 877d1f6291f8 ("net: Set sk_txhash from a random number") left behind this, remove it. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 1bf8065fe871..955badd1e8ff 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -908,7 +908,6 @@ static inline int ip6_default_np_autolabel(struct net *net) } } #else -static inline void ip6_set_txhash(struct sock *sk) { } static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) -- cgit v1.2.3-59-g8ed1b From 6033cebdfff9b10192eb254e8cc60fedd595ea7f Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 24 Apr 2020 16:03:48 +0800 Subject: ptp: idt82p33: remove unnecessary comparison The type of loaddr is u8 which is always '<=' 0xff, so the loaddr <= 0xff is always true, we can remove this comparison. Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: David S. Miller --- drivers/ptp/ptp_idt82p33.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_idt82p33.c b/drivers/ptp/ptp_idt82p33.c index 31ea811b6d5f..179f6c472e50 100644 --- a/drivers/ptp/ptp_idt82p33.c +++ b/drivers/ptp/ptp_idt82p33.c @@ -881,7 +881,7 @@ static int idt82p33_load_firmware(struct idt82p33 *idt82p33) /* Page size 128, last 4 bytes of page skipped */ if (((loaddr > 0x7b) && (loaddr <= 0x7f)) - || ((loaddr > 0xfb) && (loaddr <= 0xff))) + || loaddr > 0xfb) continue; err = idt82p33_write(idt82p33, _ADDR(page, loaddr), -- cgit v1.2.3-59-g8ed1b From 1ac0e6c292983c2665383ae9efcfe1f8b53271e0 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 24 Apr 2020 10:23:06 +0800 Subject: net: hns3: refine for unicast MAC VLAN space management Currently, firmware helps manage the unicast MAC VLAN table space for each PF. PF just needs to tell firmware its wanted space when initializing, and unnecessary to free it when un-intializing. So this patch removes the umv space free handle, and removes the forward statement of hclge_set_umv_space() by defining hclge_init_umv_space() after it. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 72 ++++++++-------------- 1 file changed, 24 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 0618f22e6f14..ccf269a9a3b1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -62,8 +62,6 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev); static void hclge_sync_vlan_filter(struct hclge_dev *hdev); static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev); static bool hclge_get_hw_reset_stat(struct hnae3_handle *handle); -static int hclge_set_umv_space(struct hclge_dev *hdev, u16 space_size, - u16 *allocated_size, bool is_alloc); static void hclge_rfs_filter_expire(struct hclge_dev *hdev); static void hclge_clear_arfs_rules(struct hnae3_handle *handle); static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev, @@ -7196,50 +7194,6 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport, return cfg_status; } -static int hclge_init_umv_space(struct hclge_dev *hdev) -{ - u16 allocated_size = 0; - int ret; - - ret = hclge_set_umv_space(hdev, hdev->wanted_umv_size, &allocated_size, - true); - if (ret) - return ret; - - if (allocated_size < hdev->wanted_umv_size) - dev_warn(&hdev->pdev->dev, - "Alloc umv space failed, want %u, get %u\n", - hdev->wanted_umv_size, allocated_size); - - mutex_init(&hdev->umv_mutex); - hdev->max_umv_size = allocated_size; - /* divide max_umv_size by (hdev->num_req_vfs + 2), in order to - * preserve some unicast mac vlan table entries shared by pf - * and its vfs. - */ - hdev->priv_umv_size = hdev->max_umv_size / (hdev->num_req_vfs + 2); - hdev->share_umv_size = hdev->priv_umv_size + - hdev->max_umv_size % (hdev->num_req_vfs + 2); - - return 0; -} - -static int hclge_uninit_umv_space(struct hclge_dev *hdev) -{ - int ret; - - if (hdev->max_umv_size > 0) { - ret = hclge_set_umv_space(hdev, hdev->max_umv_size, NULL, - false); - if (ret) - return ret; - hdev->max_umv_size = 0; - } - mutex_destroy(&hdev->umv_mutex); - - return 0; -} - static int hclge_set_umv_space(struct hclge_dev *hdev, u16 space_size, u16 *allocated_size, bool is_alloc) { @@ -7268,6 +7222,30 @@ static int hclge_set_umv_space(struct hclge_dev *hdev, u16 space_size, return 0; } +static int hclge_init_umv_space(struct hclge_dev *hdev) +{ + u16 allocated_size = 0; + int ret; + + ret = hclge_set_umv_space(hdev, hdev->wanted_umv_size, &allocated_size, + true); + if (ret) + return ret; + + if (allocated_size < hdev->wanted_umv_size) + dev_warn(&hdev->pdev->dev, + "failed to alloc umv space, want %u, get %u\n", + hdev->wanted_umv_size, allocated_size); + + mutex_init(&hdev->umv_mutex); + hdev->max_umv_size = allocated_size; + hdev->priv_umv_size = hdev->max_umv_size / (hdev->num_alloc_vport + 1); + hdev->share_umv_size = hdev->priv_umv_size + + hdev->max_umv_size % (hdev->num_alloc_vport + 1); + + return 0; +} + static void hclge_reset_umv_space(struct hclge_dev *hdev) { struct hclge_vport *vport; @@ -10041,8 +10019,6 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) if (mac->phydev) mdiobus_unregister(mac->mdio_bus); - hclge_uninit_umv_space(hdev); - /* Disable MISC vector(vector0) */ hclge_enable_vector(&hdev->misc_vector, false); synchronize_irq(hdev->misc_vector.vector_irq); -- cgit v1.2.3-59-g8ed1b From c1c5f66ee0dea782de9c95e6d63f19c355389ee8 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 24 Apr 2020 10:23:07 +0800 Subject: net: hns3: remove unnecessary parameter 'is_alloc' in hclge_set_umv_space() Since hclge_set_umv_space() is only called by hclge_init_umv_space(), so parameter 'is_alloc' is redundant. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index ccf269a9a3b1..fe6e60a41925 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -7195,7 +7195,7 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport, } static int hclge_set_umv_space(struct hclge_dev *hdev, u16 space_size, - u16 *allocated_size, bool is_alloc) + u16 *allocated_size) { struct hclge_umv_spc_alc_cmd *req; struct hclge_desc desc; @@ -7203,20 +7203,17 @@ static int hclge_set_umv_space(struct hclge_dev *hdev, u16 space_size, req = (struct hclge_umv_spc_alc_cmd *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MAC_VLAN_ALLOCATE, false); - if (!is_alloc) - hnae3_set_bit(req->allocate, HCLGE_UMV_SPC_ALC_B, 1); req->space_size = cpu_to_le32(space_size); ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { - dev_err(&hdev->pdev->dev, - "%s umv space failed for cmd_send, ret =%d\n", - is_alloc ? "allocate" : "free", ret); + dev_err(&hdev->pdev->dev, "failed to set umv space, ret = %d\n", + ret); return ret; } - if (is_alloc && allocated_size) + if (allocated_size) *allocated_size = le32_to_cpu(desc.data[1]); return 0; @@ -7227,8 +7224,7 @@ static int hclge_init_umv_space(struct hclge_dev *hdev) u16 allocated_size = 0; int ret; - ret = hclge_set_umv_space(hdev, hdev->wanted_umv_size, &allocated_size, - true); + ret = hclge_set_umv_space(hdev, hdev->wanted_umv_size, &allocated_size); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From 4c58f592470192d2b5ce4cfd2f7ff0ea2624c073 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 24 Apr 2020 10:23:08 +0800 Subject: net: hns3: replace num_req_vfs with num_alloc_vport in hclge_reset_umv_space() Like the calculation elsewhere, replaces num_req_vfs with num_alloc_vport in hclge_reset_umv_space(). Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index fe6e60a41925..a268004c8e0e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -7254,7 +7254,7 @@ static void hclge_reset_umv_space(struct hclge_dev *hdev) mutex_lock(&hdev->umv_mutex); hdev->share_umv_size = hdev->priv_umv_size + - hdev->max_umv_size % (hdev->num_req_vfs + 2); + hdev->max_umv_size % (hdev->num_alloc_vport + 1); mutex_unlock(&hdev->umv_mutex); } -- cgit v1.2.3-59-g8ed1b From ee4bcd3b7ae40bd77732eb1ba14aa26d6c514525 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 24 Apr 2020 10:23:09 +0800 Subject: net: hns3: refactor the MAC address configure Currently, the HNS3 driver sync and unsync MAC address in function hns3_set_rx_mode(). For PF, it adds and deletes MAC address directly in the path of dev_set_rx_mode(). If failed, it won't retry until next calling of hns3_set_rx_mode(). On the other hand, if request add and remove a same address many times at a short interval, each request must be done one by one, can't be merged. For VF, it sends mailbox messages to PF to request adding or deleting MAC address in the path of function hns3_set_rx_mode(), no matter the address is configured success. This patch refines it by recording the MAC address in function hns3_set_rx_mode(), and updating MAC address in the service task. If failed, it will retry by the next calling of periodical service task. It also uses some state to mark the state of each MAC address in the MAC list, which can help merge configure request for a same address. With these changes, when global reset or IMP reset occurs, we can restore the MAC table with the MAC list. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 79 +-- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 592 +++++++++++++++++---- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 27 +- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 42 +- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 313 ++++++++++- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 25 + 6 files changed, 860 insertions(+), 218 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index ac3a48a24d86..341e8b5cd219 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -40,7 +40,6 @@ } while (0) static void hns3_clear_all_ring(struct hnae3_handle *h, bool force); -static void hns3_remove_hw_addr(struct net_device *netdev); static const char hns3_driver_name[] = "hns3"; static const char hns3_driver_string[] = @@ -548,6 +547,13 @@ static int hns3_nic_uc_unsync(struct net_device *netdev, { struct hnae3_handle *h = hns3_get_handle(netdev); + /* need ignore the request of removing device address, because + * we store the device address and other addresses of uc list + * in the function's mac filter list. + */ + if (ether_addr_equal(addr, netdev->dev_addr)) + return 0; + if (h->ae_algo->ops->rm_uc_addr) return h->ae_algo->ops->rm_uc_addr(h, addr); @@ -3907,9 +3913,11 @@ static int hns3_init_mac_addr(struct net_device *netdev) eth_hw_addr_random(netdev); dev_warn(priv->dev, "using random MAC address %pM\n", netdev->dev_addr); - } else { + } else if (!ether_addr_equal(netdev->dev_addr, mac_addr_temp)) { ether_addr_copy(netdev->dev_addr, mac_addr_temp); ether_addr_copy(netdev->perm_addr, mac_addr_temp); + } else { + return 0; } if (h->ae_algo->ops->set_mac_addr) @@ -4119,8 +4127,6 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset) struct hns3_nic_priv *priv = netdev_priv(netdev); int ret; - hns3_remove_hw_addr(netdev); - if (netdev->reg_state != NETREG_UNINITIALIZED) unregister_netdev(netdev); @@ -4191,56 +4197,6 @@ static int hns3_client_setup_tc(struct hnae3_handle *handle, u8 tc) return hns3_nic_set_real_num_queue(ndev); } -static int hns3_recover_hw_addr(struct net_device *ndev) -{ - struct netdev_hw_addr_list *list; - struct netdev_hw_addr *ha, *tmp; - int ret = 0; - - netif_addr_lock_bh(ndev); - /* go through and sync uc_addr entries to the device */ - list = &ndev->uc; - list_for_each_entry_safe(ha, tmp, &list->list, list) { - ret = hns3_nic_uc_sync(ndev, ha->addr); - if (ret) - goto out; - } - - /* go through and sync mc_addr entries to the device */ - list = &ndev->mc; - list_for_each_entry_safe(ha, tmp, &list->list, list) { - ret = hns3_nic_mc_sync(ndev, ha->addr); - if (ret) - goto out; - } - -out: - netif_addr_unlock_bh(ndev); - return ret; -} - -static void hns3_remove_hw_addr(struct net_device *netdev) -{ - struct netdev_hw_addr_list *list; - struct netdev_hw_addr *ha, *tmp; - - hns3_nic_uc_unsync(netdev, netdev->dev_addr); - - netif_addr_lock_bh(netdev); - /* go through and unsync uc_addr entries to the device */ - list = &netdev->uc; - list_for_each_entry_safe(ha, tmp, &list->list, list) - hns3_nic_uc_unsync(netdev, ha->addr); - - /* go through and unsync mc_addr entries to the device */ - list = &netdev->mc; - list_for_each_entry_safe(ha, tmp, &list->list, list) - if (ha->refcount > 1) - hns3_nic_mc_unsync(netdev, ha->addr); - - netif_addr_unlock_bh(netdev); -} - static void hns3_clear_tx_ring(struct hns3_enet_ring *ring) { while (ring->next_to_clean != ring->next_to_use) { @@ -4411,10 +4367,8 @@ static int hns3_reset_notify_down_enet(struct hnae3_handle *handle) * from table space. Hence, for function reset software intervention is * required to delete the entries */ - if (hns3_dev_ongoing_func_reset(ae_dev)) { - hns3_remove_hw_addr(ndev); + if (hns3_dev_ongoing_func_reset(ae_dev)) hns3_del_all_fd_rules(ndev, false); - } if (!netif_running(ndev)) return 0; @@ -4482,6 +4436,9 @@ static int hns3_reset_notify_init_enet(struct hnae3_handle *handle) goto err_init_irq_fail; } + if (!hns3_is_phys_func(handle->pdev)) + hns3_init_mac_addr(netdev); + ret = hns3_client_start(handle); if (ret) { dev_err(priv->dev, "hns3_client_start fail! ret=%d\n", ret); @@ -4513,14 +4470,6 @@ static int hns3_reset_notify_restore_enet(struct hnae3_handle *handle) bool vlan_filter_enable; int ret; - ret = hns3_init_mac_addr(netdev); - if (ret) - return ret; - - ret = hns3_recover_hw_addr(netdev); - if (ret) - return ret; - ret = hns3_update_promisc_mode(netdev, handle->netdev_flags); if (ret) return ret; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index a268004c8e0e..c3205ae620ce 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -68,6 +68,8 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev, unsigned long *addr); static int hclge_set_default_loopback(struct hclge_dev *hdev); +static void hclge_sync_mac_table(struct hclge_dev *hdev); + static struct hnae3_ae_algo ae_algo; static struct workqueue_struct *hclge_wq; @@ -1685,6 +1687,7 @@ static int hclge_alloc_vport(struct hclge_dev *hdev) INIT_LIST_HEAD(&vport->vlan_list); INIT_LIST_HEAD(&vport->uc_mac_list); INIT_LIST_HEAD(&vport->mc_mac_list); + spin_lock_init(&vport->mac_list_lock); if (i == 0) ret = hclge_vport_setup(vport, tqp_main_vport); @@ -3971,6 +3974,7 @@ static void hclge_periodic_service_task(struct hclge_dev *hdev) * updated when it is triggered by mbx. */ hclge_update_link_status(hdev); + hclge_sync_mac_table(hdev); if (time_is_after_jiffies(hdev->last_serv_processed + HZ)) { delta = jiffies - hdev->last_serv_processed; @@ -6922,8 +6926,16 @@ static void hclge_ae_stop(struct hnae3_handle *handle) int hclge_vport_start(struct hclge_vport *vport) { + struct hclge_dev *hdev = vport->back; + set_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); vport->last_active_jiffies = jiffies; + + if (test_bit(vport->vport_id, hdev->vport_config_block)) + hclge_restore_mac_table_common(vport); + + clear_bit(vport->vport_id, hdev->vport_config_block); + return 0; } @@ -7291,12 +7303,106 @@ static void hclge_update_umv_space(struct hclge_vport *vport, bool is_free) mutex_unlock(&hdev->umv_mutex); } +static struct hclge_mac_node *hclge_find_mac_node(struct list_head *list, + const u8 *mac_addr) +{ + struct hclge_mac_node *mac_node, *tmp; + + list_for_each_entry_safe(mac_node, tmp, list, node) + if (ether_addr_equal(mac_addr, mac_node->mac_addr)) + return mac_node; + + return NULL; +} + +static void hclge_update_mac_node(struct hclge_mac_node *mac_node, + enum HCLGE_MAC_NODE_STATE state) +{ + switch (state) { + /* from set_rx_mode or tmp_add_list */ + case HCLGE_MAC_TO_ADD: + if (mac_node->state == HCLGE_MAC_TO_DEL) + mac_node->state = HCLGE_MAC_ACTIVE; + break; + /* only from set_rx_mode */ + case HCLGE_MAC_TO_DEL: + if (mac_node->state == HCLGE_MAC_TO_ADD) { + list_del(&mac_node->node); + kfree(mac_node); + } else { + mac_node->state = HCLGE_MAC_TO_DEL; + } + break; + /* only from tmp_add_list, the mac_node->state won't be + * ACTIVE. + */ + case HCLGE_MAC_ACTIVE: + if (mac_node->state == HCLGE_MAC_TO_ADD) + mac_node->state = HCLGE_MAC_ACTIVE; + + break; + } +} + +int hclge_update_mac_list(struct hclge_vport *vport, + enum HCLGE_MAC_NODE_STATE state, + enum HCLGE_MAC_ADDR_TYPE mac_type, + const unsigned char *addr) +{ + struct hclge_dev *hdev = vport->back; + struct hclge_mac_node *mac_node; + struct list_head *list; + + list = (mac_type == HCLGE_MAC_ADDR_UC) ? + &vport->uc_mac_list : &vport->mc_mac_list; + + spin_lock_bh(&vport->mac_list_lock); + + /* if the mac addr is already in the mac list, no need to add a new + * one into it, just check the mac addr state, convert it to a new + * new state, or just remove it, or do nothing. + */ + mac_node = hclge_find_mac_node(list, addr); + if (mac_node) { + hclge_update_mac_node(mac_node, state); + spin_unlock_bh(&vport->mac_list_lock); + set_bit(HCLGE_VPORT_STATE_MAC_TBL_CHANGE, &vport->state); + return 0; + } + + /* if this address is never added, unnecessary to delete */ + if (state == HCLGE_MAC_TO_DEL) { + spin_unlock_bh(&vport->mac_list_lock); + dev_err(&hdev->pdev->dev, + "failed to delete address %pM from mac list\n", + addr); + return -ENOENT; + } + + mac_node = kzalloc(sizeof(*mac_node), GFP_ATOMIC); + if (!mac_node) { + spin_unlock_bh(&vport->mac_list_lock); + return -ENOMEM; + } + + set_bit(HCLGE_VPORT_STATE_MAC_TBL_CHANGE, &vport->state); + + mac_node->state = state; + ether_addr_copy(mac_node->mac_addr, addr); + list_add_tail(&mac_node->node, list); + + spin_unlock_bh(&vport->mac_list_lock); + + return 0; +} + static int hclge_add_uc_addr(struct hnae3_handle *handle, const unsigned char *addr) { struct hclge_vport *vport = hclge_get_vport(handle); - return hclge_add_uc_addr_common(vport, addr); + return hclge_update_mac_list(vport, HCLGE_MAC_TO_ADD, HCLGE_MAC_ADDR_UC, + addr); } int hclge_add_uc_addr_common(struct hclge_vport *vport, @@ -7367,7 +7473,8 @@ static int hclge_rm_uc_addr(struct hnae3_handle *handle, { struct hclge_vport *vport = hclge_get_vport(handle); - return hclge_rm_uc_addr_common(vport, addr); + return hclge_update_mac_list(vport, HCLGE_MAC_TO_DEL, HCLGE_MAC_ADDR_UC, + addr); } int hclge_rm_uc_addr_common(struct hclge_vport *vport, @@ -7392,6 +7499,8 @@ int hclge_rm_uc_addr_common(struct hclge_vport *vport, ret = hclge_remove_mac_vlan_tbl(vport, &req); if (!ret) hclge_update_umv_space(vport, true); + else if (ret == -ENOENT) + ret = 0; return ret; } @@ -7401,7 +7510,8 @@ static int hclge_add_mc_addr(struct hnae3_handle *handle, { struct hclge_vport *vport = hclge_get_vport(handle); - return hclge_add_mc_addr_common(vport, addr); + return hclge_update_mac_list(vport, HCLGE_MAC_TO_ADD, HCLGE_MAC_ADDR_MC, + addr); } int hclge_add_mc_addr_common(struct hclge_vport *vport, @@ -7444,7 +7554,8 @@ static int hclge_rm_mc_addr(struct hnae3_handle *handle, { struct hclge_vport *vport = hclge_get_vport(handle); - return hclge_rm_mc_addr_common(vport, addr); + return hclge_update_mac_list(vport, HCLGE_MAC_TO_DEL, HCLGE_MAC_ADDR_MC, + addr); } int hclge_rm_mc_addr_common(struct hclge_vport *vport, @@ -7479,111 +7590,328 @@ int hclge_rm_mc_addr_common(struct hclge_vport *vport, /* Not all the vfid is zero, update the vfid */ status = hclge_add_mac_vlan_tbl(vport, &req, desc); - } else { - /* Maybe this mac address is in mta table, but it cannot be - * deleted here because an entry of mta represents an address - * range rather than a specific address. the delete action to - * all entries will take effect in update_mta_status called by - * hns3_nic_set_rx_mode. - */ + } else if (status == -ENOENT) { status = 0; } return status; } -void hclge_add_vport_mac_table(struct hclge_vport *vport, const u8 *mac_addr, - enum HCLGE_MAC_ADDR_TYPE mac_type) +static void hclge_sync_vport_mac_list(struct hclge_vport *vport, + struct list_head *list, + int (*sync)(struct hclge_vport *, + const unsigned char *)) { - struct hclge_vport_mac_addr_cfg *mac_cfg; - struct list_head *list; + struct hclge_mac_node *mac_node, *tmp; + int ret; - if (!vport->vport_id) - return; + list_for_each_entry_safe(mac_node, tmp, list, node) { + ret = sync(vport, mac_node->mac_addr); + if (!ret) { + mac_node->state = HCLGE_MAC_ACTIVE; + } else { + set_bit(HCLGE_VPORT_STATE_MAC_TBL_CHANGE, + &vport->state); + break; + } + } +} - mac_cfg = kzalloc(sizeof(*mac_cfg), GFP_KERNEL); - if (!mac_cfg) - return; +static void hclge_unsync_vport_mac_list(struct hclge_vport *vport, + struct list_head *list, + int (*unsync)(struct hclge_vport *, + const unsigned char *)) +{ + struct hclge_mac_node *mac_node, *tmp; + int ret; - mac_cfg->hd_tbl_status = true; - memcpy(mac_cfg->mac_addr, mac_addr, ETH_ALEN); + list_for_each_entry_safe(mac_node, tmp, list, node) { + ret = unsync(vport, mac_node->mac_addr); + if (!ret || ret == -ENOENT) { + list_del(&mac_node->node); + kfree(mac_node); + } else { + set_bit(HCLGE_VPORT_STATE_MAC_TBL_CHANGE, + &vport->state); + break; + } + } +} - list = (mac_type == HCLGE_MAC_ADDR_UC) ? - &vport->uc_mac_list : &vport->mc_mac_list; +static void hclge_sync_from_add_list(struct list_head *add_list, + struct list_head *mac_list) +{ + struct hclge_mac_node *mac_node, *tmp, *new_node; - list_add_tail(&mac_cfg->node, list); + list_for_each_entry_safe(mac_node, tmp, add_list, node) { + /* if the mac address from tmp_add_list is not in the + * uc/mc_mac_list, it means have received a TO_DEL request + * during the time window of adding the mac address into mac + * table. if mac_node state is ACTIVE, then change it to TO_DEL, + * then it will be removed at next time. else it must be TO_ADD, + * this address hasn't been added into mac table, + * so just remove the mac node. + */ + new_node = hclge_find_mac_node(mac_list, mac_node->mac_addr); + if (new_node) { + hclge_update_mac_node(new_node, mac_node->state); + list_del(&mac_node->node); + kfree(mac_node); + } else if (mac_node->state == HCLGE_MAC_ACTIVE) { + mac_node->state = HCLGE_MAC_TO_DEL; + list_del(&mac_node->node); + list_add_tail(&mac_node->node, mac_list); + } else { + list_del(&mac_node->node); + kfree(mac_node); + } + } } -void hclge_rm_vport_mac_table(struct hclge_vport *vport, const u8 *mac_addr, - bool is_write_tbl, - enum HCLGE_MAC_ADDR_TYPE mac_type) +static void hclge_sync_from_del_list(struct list_head *del_list, + struct list_head *mac_list) { - struct hclge_vport_mac_addr_cfg *mac_cfg, *tmp; - struct list_head *list; - bool uc_flag, mc_flag; + struct hclge_mac_node *mac_node, *tmp, *new_node; - list = (mac_type == HCLGE_MAC_ADDR_UC) ? - &vport->uc_mac_list : &vport->mc_mac_list; + list_for_each_entry_safe(mac_node, tmp, del_list, node) { + new_node = hclge_find_mac_node(mac_list, mac_node->mac_addr); + if (new_node) { + /* If the mac addr exists in the mac list, it means + * received a new TO_ADD request during the time window + * of configuring the mac address. For the mac node + * state is TO_ADD, and the address is already in the + * in the hardware(due to delete fail), so we just need + * to change the mac node state to ACTIVE. + */ + new_node->state = HCLGE_MAC_ACTIVE; + list_del(&mac_node->node); + kfree(mac_node); + } else { + list_del(&mac_node->node); + list_add_tail(&mac_node->node, mac_list); + } + } +} - uc_flag = is_write_tbl && mac_type == HCLGE_MAC_ADDR_UC; - mc_flag = is_write_tbl && mac_type == HCLGE_MAC_ADDR_MC; +static void hclge_sync_vport_mac_table(struct hclge_vport *vport, + enum HCLGE_MAC_ADDR_TYPE mac_type) +{ + struct hclge_mac_node *mac_node, *tmp, *new_node; + struct list_head tmp_add_list, tmp_del_list; + struct list_head *list; - list_for_each_entry_safe(mac_cfg, tmp, list, node) { - if (ether_addr_equal(mac_cfg->mac_addr, mac_addr)) { - if (uc_flag && mac_cfg->hd_tbl_status) - hclge_rm_uc_addr_common(vport, mac_addr); + INIT_LIST_HEAD(&tmp_add_list); + INIT_LIST_HEAD(&tmp_del_list); - if (mc_flag && mac_cfg->hd_tbl_status) - hclge_rm_mc_addr_common(vport, mac_addr); + /* move the mac addr to the tmp_add_list and tmp_del_list, then + * we can add/delete these mac addr outside the spin lock + */ + list = (mac_type == HCLGE_MAC_ADDR_UC) ? + &vport->uc_mac_list : &vport->mc_mac_list; - list_del(&mac_cfg->node); - kfree(mac_cfg); + spin_lock_bh(&vport->mac_list_lock); + + list_for_each_entry_safe(mac_node, tmp, list, node) { + switch (mac_node->state) { + case HCLGE_MAC_TO_DEL: + list_del(&mac_node->node); + list_add_tail(&mac_node->node, &tmp_del_list); + break; + case HCLGE_MAC_TO_ADD: + new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); + if (!new_node) + goto stop_traverse; + ether_addr_copy(new_node->mac_addr, mac_node->mac_addr); + new_node->state = mac_node->state; + list_add_tail(&new_node->node, &tmp_add_list); + break; + default: break; } } + +stop_traverse: + spin_unlock_bh(&vport->mac_list_lock); + + /* delete first, in order to get max mac table space for adding */ + if (mac_type == HCLGE_MAC_ADDR_UC) { + hclge_unsync_vport_mac_list(vport, &tmp_del_list, + hclge_rm_uc_addr_common); + hclge_sync_vport_mac_list(vport, &tmp_add_list, + hclge_add_uc_addr_common); + } else { + hclge_unsync_vport_mac_list(vport, &tmp_del_list, + hclge_rm_mc_addr_common); + hclge_sync_vport_mac_list(vport, &tmp_add_list, + hclge_add_mc_addr_common); + } + + /* if some mac addresses were added/deleted fail, move back to the + * mac_list, and retry at next time. + */ + spin_lock_bh(&vport->mac_list_lock); + + hclge_sync_from_del_list(&tmp_del_list, list); + hclge_sync_from_add_list(&tmp_add_list, list); + + spin_unlock_bh(&vport->mac_list_lock); +} + +static bool hclge_need_sync_mac_table(struct hclge_vport *vport) +{ + struct hclge_dev *hdev = vport->back; + + if (test_bit(vport->vport_id, hdev->vport_config_block)) + return false; + + if (test_and_clear_bit(HCLGE_VPORT_STATE_MAC_TBL_CHANGE, &vport->state)) + return true; + + return false; +} + +static void hclge_sync_mac_table(struct hclge_dev *hdev) +{ + int i; + + for (i = 0; i < hdev->num_alloc_vport; i++) { + struct hclge_vport *vport = &hdev->vport[i]; + + if (!hclge_need_sync_mac_table(vport)) + continue; + + hclge_sync_vport_mac_table(vport, HCLGE_MAC_ADDR_UC); + hclge_sync_vport_mac_table(vport, HCLGE_MAC_ADDR_MC); + } } void hclge_rm_vport_all_mac_table(struct hclge_vport *vport, bool is_del_list, enum HCLGE_MAC_ADDR_TYPE mac_type) { - struct hclge_vport_mac_addr_cfg *mac_cfg, *tmp; - struct list_head *list; + int (*unsync)(struct hclge_vport *vport, const unsigned char *addr); + struct hclge_mac_node *mac_cfg, *tmp; + struct hclge_dev *hdev = vport->back; + struct list_head tmp_del_list, *list; + int ret; - list = (mac_type == HCLGE_MAC_ADDR_UC) ? - &vport->uc_mac_list : &vport->mc_mac_list; + if (mac_type == HCLGE_MAC_ADDR_UC) { + list = &vport->uc_mac_list; + unsync = hclge_rm_uc_addr_common; + } else { + list = &vport->mc_mac_list; + unsync = hclge_rm_mc_addr_common; + } - list_for_each_entry_safe(mac_cfg, tmp, list, node) { - if (mac_type == HCLGE_MAC_ADDR_UC && mac_cfg->hd_tbl_status) - hclge_rm_uc_addr_common(vport, mac_cfg->mac_addr); + INIT_LIST_HEAD(&tmp_del_list); - if (mac_type == HCLGE_MAC_ADDR_MC && mac_cfg->hd_tbl_status) - hclge_rm_mc_addr_common(vport, mac_cfg->mac_addr); + if (!is_del_list) + set_bit(vport->vport_id, hdev->vport_config_block); - mac_cfg->hd_tbl_status = false; - if (is_del_list) { + spin_lock_bh(&vport->mac_list_lock); + + list_for_each_entry_safe(mac_cfg, tmp, list, node) { + switch (mac_cfg->state) { + case HCLGE_MAC_TO_DEL: + case HCLGE_MAC_ACTIVE: list_del(&mac_cfg->node); - kfree(mac_cfg); + list_add_tail(&mac_cfg->node, &tmp_del_list); + break; + case HCLGE_MAC_TO_ADD: + if (is_del_list) { + list_del(&mac_cfg->node); + kfree(mac_cfg); + } + break; } } + + spin_unlock_bh(&vport->mac_list_lock); + + list_for_each_entry_safe(mac_cfg, tmp, &tmp_del_list, node) { + ret = unsync(vport, mac_cfg->mac_addr); + if (!ret || ret == -ENOENT) { + /* clear all mac addr from hardware, but remain these + * mac addr in the mac list, and restore them after + * vf reset finished. + */ + if (!is_del_list && + mac_cfg->state == HCLGE_MAC_ACTIVE) { + mac_cfg->state = HCLGE_MAC_TO_ADD; + } else { + list_del(&mac_cfg->node); + kfree(mac_cfg); + } + } else if (is_del_list) { + mac_cfg->state = HCLGE_MAC_TO_DEL; + } + } + + spin_lock_bh(&vport->mac_list_lock); + + hclge_sync_from_del_list(&tmp_del_list, list); + + spin_unlock_bh(&vport->mac_list_lock); +} + +/* remove all mac address when uninitailize */ +static void hclge_uninit_vport_mac_list(struct hclge_vport *vport, + enum HCLGE_MAC_ADDR_TYPE mac_type) +{ + struct hclge_mac_node *mac_node, *tmp; + struct hclge_dev *hdev = vport->back; + struct list_head tmp_del_list, *list; + + INIT_LIST_HEAD(&tmp_del_list); + + list = (mac_type == HCLGE_MAC_ADDR_UC) ? + &vport->uc_mac_list : &vport->mc_mac_list; + + spin_lock_bh(&vport->mac_list_lock); + + list_for_each_entry_safe(mac_node, tmp, list, node) { + switch (mac_node->state) { + case HCLGE_MAC_TO_DEL: + case HCLGE_MAC_ACTIVE: + list_del(&mac_node->node); + list_add_tail(&mac_node->node, &tmp_del_list); + break; + case HCLGE_MAC_TO_ADD: + list_del(&mac_node->node); + kfree(mac_node); + break; + } + } + + spin_unlock_bh(&vport->mac_list_lock); + + if (mac_type == HCLGE_MAC_ADDR_UC) + hclge_unsync_vport_mac_list(vport, &tmp_del_list, + hclge_rm_uc_addr_common); + else + hclge_unsync_vport_mac_list(vport, &tmp_del_list, + hclge_rm_mc_addr_common); + + if (!list_empty(&tmp_del_list)) + dev_warn(&hdev->pdev->dev, + "uninit %s mac list for vport %u not completely.\n", + mac_type == HCLGE_MAC_ADDR_UC ? "uc" : "mc", + vport->vport_id); + + list_for_each_entry_safe(mac_node, tmp, &tmp_del_list, node) { + list_del(&mac_node->node); + kfree(mac_node); + } } -void hclge_uninit_vport_mac_table(struct hclge_dev *hdev) +static void hclge_uninit_mac_table(struct hclge_dev *hdev) { - struct hclge_vport_mac_addr_cfg *mac, *tmp; struct hclge_vport *vport; int i; for (i = 0; i < hdev->num_alloc_vport; i++) { vport = &hdev->vport[i]; - list_for_each_entry_safe(mac, tmp, &vport->uc_mac_list, node) { - list_del(&mac->node); - kfree(mac); - } - - list_for_each_entry_safe(mac, tmp, &vport->mc_mac_list, node) { - list_del(&mac->node); - kfree(mac); - } + hclge_uninit_vport_mac_list(vport, HCLGE_MAC_ADDR_UC); + hclge_uninit_vport_mac_list(vport, HCLGE_MAC_ADDR_MC); } } @@ -7747,12 +8075,57 @@ static void hclge_get_mac_addr(struct hnae3_handle *handle, u8 *p) ether_addr_copy(p, hdev->hw.mac.mac_addr); } +int hclge_update_mac_node_for_dev_addr(struct hclge_vport *vport, + const u8 *old_addr, const u8 *new_addr) +{ + struct list_head *list = &vport->uc_mac_list; + struct hclge_mac_node *old_node, *new_node; + + new_node = hclge_find_mac_node(list, new_addr); + if (!new_node) { + new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); + if (!new_node) + return -ENOMEM; + + new_node->state = HCLGE_MAC_TO_ADD; + ether_addr_copy(new_node->mac_addr, new_addr); + list_add(&new_node->node, list); + } else { + if (new_node->state == HCLGE_MAC_TO_DEL) + new_node->state = HCLGE_MAC_ACTIVE; + + /* make sure the new addr is in the list head, avoid dev + * addr may be not re-added into mac table for the umv space + * limitation after global/imp reset which will clear mac + * table by hardware. + */ + list_move(&new_node->node, list); + } + + if (old_addr && !ether_addr_equal(old_addr, new_addr)) { + old_node = hclge_find_mac_node(list, old_addr); + if (old_node) { + if (old_node->state == HCLGE_MAC_TO_ADD) { + list_del(&old_node->node); + kfree(old_node); + } else { + old_node->state = HCLGE_MAC_TO_DEL; + } + } + } + + set_bit(HCLGE_VPORT_STATE_MAC_TBL_CHANGE, &vport->state); + + return 0; +} + static int hclge_set_mac_addr(struct hnae3_handle *handle, void *p, bool is_first) { const unsigned char *new_addr = (const unsigned char *)p; struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_dev *hdev = vport->back; + unsigned char *old_addr = NULL; int ret; /* mac addr check */ @@ -7760,39 +8133,42 @@ static int hclge_set_mac_addr(struct hnae3_handle *handle, void *p, is_broadcast_ether_addr(new_addr) || is_multicast_ether_addr(new_addr)) { dev_err(&hdev->pdev->dev, - "Change uc mac err! invalid mac:%pM.\n", + "change uc mac err! invalid mac: %pM.\n", new_addr); return -EINVAL; } - if ((!is_first || is_kdump_kernel()) && - hclge_rm_uc_addr(handle, hdev->hw.mac.mac_addr)) - dev_warn(&hdev->pdev->dev, - "remove old uc mac address fail.\n"); - - ret = hclge_add_uc_addr(handle, new_addr); + ret = hclge_pause_addr_cfg(hdev, new_addr); if (ret) { dev_err(&hdev->pdev->dev, - "add uc mac address fail, ret =%d.\n", + "failed to configure mac pause address, ret = %d\n", ret); - - if (!is_first && - hclge_add_uc_addr(handle, hdev->hw.mac.mac_addr)) - dev_err(&hdev->pdev->dev, - "restore uc mac address fail.\n"); - - return -EIO; + return ret; } - ret = hclge_pause_addr_cfg(hdev, new_addr); + if (!is_first) + old_addr = hdev->hw.mac.mac_addr; + + spin_lock_bh(&vport->mac_list_lock); + ret = hclge_update_mac_node_for_dev_addr(vport, old_addr, new_addr); if (ret) { dev_err(&hdev->pdev->dev, - "configure mac pause address fail, ret =%d.\n", - ret); - return -EIO; - } + "failed to change the mac addr:%pM, ret = %d\n", + new_addr, ret); + spin_unlock_bh(&vport->mac_list_lock); + + if (!is_first) + hclge_pause_addr_cfg(hdev, old_addr); + return ret; + } + /* we must update dev addr with spin lock protect, preventing dev addr + * being removed by set_rx_mode path. + */ ether_addr_copy(hdev->hw.mac.mac_addr, new_addr); + spin_unlock_bh(&vport->mac_list_lock); + + hclge_task_schedule(hdev, 0); return 0; } @@ -8408,6 +8784,37 @@ static void hclge_restore_vlan_table(struct hnae3_handle *handle) } } +/* For global reset and imp reset, hardware will clear the mac table, + * so we change the mac address state from ACTIVE to TO_ADD, then they + * can be restored in the service task after reset complete. Furtherly, + * the mac addresses with state TO_DEL or DEL_FAIL are unnecessary to + * be restored after reset, so just remove these mac nodes from mac_list. + */ +static void hclge_mac_node_convert_for_reset(struct list_head *list) +{ + struct hclge_mac_node *mac_node, *tmp; + + list_for_each_entry_safe(mac_node, tmp, list, node) { + if (mac_node->state == HCLGE_MAC_ACTIVE) { + mac_node->state = HCLGE_MAC_TO_ADD; + } else if (mac_node->state == HCLGE_MAC_TO_DEL) { + list_del(&mac_node->node); + kfree(mac_node); + } + } +} + +void hclge_restore_mac_table_common(struct hclge_vport *vport) +{ + spin_lock_bh(&vport->mac_list_lock); + + hclge_mac_node_convert_for_reset(&vport->uc_mac_list); + hclge_mac_node_convert_for_reset(&vport->mc_mac_list); + set_bit(HCLGE_VPORT_STATE_MAC_TBL_CHANGE, &vport->state); + + spin_unlock_bh(&vport->mac_list_lock); +} + int hclge_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable) { struct hclge_vport *vport = hclge_get_vport(handle); @@ -9899,6 +10306,15 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev) set_bit(HCLGE_STATE_DOWN, &hdev->state); hclge_stats_clear(hdev); + /* NOTE: pf reset needn't to clear or restore pf and vf table entry. + * so here should not clean table in memory. + */ + if (hdev->reset_type == HNAE3_IMP_RESET || + hdev->reset_type == HNAE3_GLOBAL_RESET) { + bitmap_set(hdev->vport_config_block, 0, hdev->num_alloc_vport); + hclge_reset_umv_space(hdev); + } + memset(hdev->vlan_table, 0, sizeof(hdev->vlan_table)); memset(hdev->vf_vlan_full, 0, sizeof(hdev->vf_vlan_full)); @@ -9914,8 +10330,6 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev) return ret; } - hclge_reset_umv_space(hdev); - ret = hclge_mac_init(hdev); if (ret) { dev_err(&pdev->dev, "Mac init error, ret = %d\n", ret); @@ -10011,6 +10425,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_clear_vf_vlan(hdev); hclge_misc_affinity_teardown(hdev); hclge_state_uninit(hdev); + hclge_uninit_mac_table(hdev); if (mac->phydev) mdiobus_unregister(mac->mdio_bus); @@ -10028,7 +10443,6 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_misc_irq_uninit(hdev); hclge_pci_uninit(hdev); mutex_destroy(&hdev->vport_lock); - hclge_uninit_vport_mac_table(hdev); hclge_uninit_vport_vlan_table(hdev); ae_dev->priv = NULL; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index a58c26200ea0..5fcbc3d23f21 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -630,9 +630,15 @@ struct hclge_fd_ad_data { u16 rule_id; }; -struct hclge_vport_mac_addr_cfg { +enum HCLGE_MAC_NODE_STATE { + HCLGE_MAC_TO_ADD, + HCLGE_MAC_TO_DEL, + HCLGE_MAC_ACTIVE +}; + +struct hclge_mac_node { struct list_head node; - int hd_tbl_status; + enum HCLGE_MAC_NODE_STATE state; u8 mac_addr[ETH_ALEN]; }; @@ -805,6 +811,8 @@ struct hclge_dev { unsigned long vlan_table[VLAN_N_VID][BITS_TO_LONGS(HCLGE_VPORT_NUM)]; unsigned long vf_vlan_full[BITS_TO_LONGS(HCLGE_VPORT_NUM)]; + unsigned long vport_config_block[BITS_TO_LONGS(HCLGE_VPORT_NUM)]; + struct hclge_fd_cfg fd_cfg; struct hlist_head fd_rule_list; spinlock_t fd_rule_lock; /* protect fd_rule_list and fd_bmap */ @@ -866,6 +874,7 @@ struct hclge_rss_tuple_cfg { enum HCLGE_VPORT_STATE { HCLGE_VPORT_STATE_ALIVE, + HCLGE_VPORT_STATE_MAC_TBL_CHANGE, HCLGE_VPORT_STATE_MAX }; @@ -922,6 +931,7 @@ struct hclge_vport { u32 mps; /* Max packet size */ struct hclge_vf_info vf_info; + spinlock_t mac_list_lock; /* protect mac address need to add/detele */ struct list_head uc_mac_list; /* Store VF unicast table */ struct list_head mc_mac_list; /* Store VF multicast table */ struct list_head vlan_list; /* Store VF vlan table */ @@ -977,16 +987,17 @@ int hclge_dbg_run_cmd(struct hnae3_handle *handle, const char *cmd_buf); u16 hclge_covert_handle_qid_global(struct hnae3_handle *handle, u16 queue_id); int hclge_notify_client(struct hclge_dev *hdev, enum hnae3_reset_notify_type type); -void hclge_add_vport_mac_table(struct hclge_vport *vport, const u8 *mac_addr, - enum HCLGE_MAC_ADDR_TYPE mac_type); -void hclge_rm_vport_mac_table(struct hclge_vport *vport, const u8 *mac_addr, - bool is_write_tbl, - enum HCLGE_MAC_ADDR_TYPE mac_type); +int hclge_update_mac_list(struct hclge_vport *vport, + enum HCLGE_MAC_NODE_STATE state, + enum HCLGE_MAC_ADDR_TYPE mac_type, + const unsigned char *addr); +int hclge_update_mac_node_for_dev_addr(struct hclge_vport *vport, + const u8 *old_addr, const u8 *new_addr); void hclge_rm_vport_all_mac_table(struct hclge_vport *vport, bool is_del_list, enum HCLGE_MAC_ADDR_TYPE mac_type); -void hclge_uninit_vport_mac_table(struct hclge_dev *hdev); void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list); void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev); +void hclge_restore_mac_table_common(struct hclge_vport *vport); int hclge_update_port_base_vlan_cfg(struct hclge_vport *vport, u16 state, struct hclge_vlan_info *vlan_info); int hclge_push_vf_port_base_vlan_info(struct hclge_vport *vport, u8 vfid, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 103c2ec777b0..0efc04562ba6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -275,26 +275,17 @@ static int hclge_set_vf_uc_mac_addr(struct hclge_vport *vport, if (!is_valid_ether_addr(mac_addr)) return -EINVAL; - hclge_rm_uc_addr_common(vport, old_addr); - status = hclge_add_uc_addr_common(vport, mac_addr); - if (status) { - hclge_add_uc_addr_common(vport, old_addr); - } else { - hclge_rm_vport_mac_table(vport, mac_addr, - false, HCLGE_MAC_ADDR_UC); - hclge_add_vport_mac_table(vport, mac_addr, - HCLGE_MAC_ADDR_UC); - } + spin_lock_bh(&vport->mac_list_lock); + status = hclge_update_mac_node_for_dev_addr(vport, old_addr, + mac_addr); + spin_unlock_bh(&vport->mac_list_lock); + hclge_task_schedule(hdev, 0); } else if (mbx_req->msg.subcode == HCLGE_MBX_MAC_VLAN_UC_ADD) { - status = hclge_add_uc_addr_common(vport, mac_addr); - if (!status) - hclge_add_vport_mac_table(vport, mac_addr, - HCLGE_MAC_ADDR_UC); + status = hclge_update_mac_list(vport, HCLGE_MAC_TO_ADD, + HCLGE_MAC_ADDR_UC, mac_addr); } else if (mbx_req->msg.subcode == HCLGE_MBX_MAC_VLAN_UC_REMOVE) { - status = hclge_rm_uc_addr_common(vport, mac_addr); - if (!status) - hclge_rm_vport_mac_table(vport, mac_addr, - false, HCLGE_MAC_ADDR_UC); + status = hclge_update_mac_list(vport, HCLGE_MAC_TO_DEL, + HCLGE_MAC_ADDR_UC, mac_addr); } else { dev_err(&hdev->pdev->dev, "failed to set unicast mac addr, unknown subcode %u\n", @@ -310,18 +301,13 @@ static int hclge_set_vf_mc_mac_addr(struct hclge_vport *vport, { const u8 *mac_addr = (const u8 *)(mbx_req->msg.data); struct hclge_dev *hdev = vport->back; - int status; if (mbx_req->msg.subcode == HCLGE_MBX_MAC_VLAN_MC_ADD) { - status = hclge_add_mc_addr_common(vport, mac_addr); - if (!status) - hclge_add_vport_mac_table(vport, mac_addr, - HCLGE_MAC_ADDR_MC); + hclge_update_mac_list(vport, HCLGE_MAC_TO_ADD, + HCLGE_MAC_ADDR_MC, mac_addr); } else if (mbx_req->msg.subcode == HCLGE_MBX_MAC_VLAN_MC_REMOVE) { - status = hclge_rm_mc_addr_common(vport, mac_addr); - if (!status) - hclge_rm_vport_mac_table(vport, mac_addr, - false, HCLGE_MAC_ADDR_MC); + hclge_update_mac_list(vport, HCLGE_MAC_TO_DEL, + HCLGE_MAC_ADDR_MC, mac_addr); } else { dev_err(&hdev->pdev->dev, "failed to set mcast mac addr, unknown subcode %u\n", @@ -329,7 +315,7 @@ static int hclge_set_vf_mc_mac_addr(struct hclge_vport *vport, return -EIO; } - return status; + return 0; } int hclge_push_vf_port_base_vlan_info(struct hclge_vport *vport, u8 vfid, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index e02d427131ee..05d485a48706 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1245,10 +1245,12 @@ static int hclgevf_set_mac_addr(struct hnae3_handle *handle, void *p, int status; hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_UNICAST, 0); - send_msg.subcode = is_first ? HCLGE_MBX_MAC_VLAN_UC_ADD : - HCLGE_MBX_MAC_VLAN_UC_MODIFY; + send_msg.subcode = HCLGE_MBX_MAC_VLAN_UC_MODIFY; ether_addr_copy(send_msg.data, new_mac_addr); - ether_addr_copy(&send_msg.data[ETH_ALEN], old_mac_addr); + if (is_first && !hdev->has_pf_mac) + eth_zero_addr(&send_msg.data[ETH_ALEN]); + else + ether_addr_copy(&send_msg.data[ETH_ALEN], old_mac_addr); status = hclgevf_send_mbx_msg(hdev, &send_msg, true, NULL, 0); if (!status) ether_addr_copy(hdev->hw.mac.mac_addr, new_mac_addr); @@ -1256,54 +1258,302 @@ static int hclgevf_set_mac_addr(struct hnae3_handle *handle, void *p, return status; } -static int hclgevf_add_uc_addr(struct hnae3_handle *handle, - const unsigned char *addr) +static struct hclgevf_mac_addr_node * +hclgevf_find_mac_node(struct list_head *list, const u8 *mac_addr) +{ + struct hclgevf_mac_addr_node *mac_node, *tmp; + + list_for_each_entry_safe(mac_node, tmp, list, node) + if (ether_addr_equal(mac_addr, mac_node->mac_addr)) + return mac_node; + + return NULL; +} + +static void hclgevf_update_mac_node(struct hclgevf_mac_addr_node *mac_node, + enum HCLGEVF_MAC_NODE_STATE state) +{ + switch (state) { + /* from set_rx_mode or tmp_add_list */ + case HCLGEVF_MAC_TO_ADD: + if (mac_node->state == HCLGEVF_MAC_TO_DEL) + mac_node->state = HCLGEVF_MAC_ACTIVE; + break; + /* only from set_rx_mode */ + case HCLGEVF_MAC_TO_DEL: + if (mac_node->state == HCLGEVF_MAC_TO_ADD) { + list_del(&mac_node->node); + kfree(mac_node); + } else { + mac_node->state = HCLGEVF_MAC_TO_DEL; + } + break; + /* only from tmp_add_list, the mac_node->state won't be + * HCLGEVF_MAC_ACTIVE + */ + case HCLGEVF_MAC_ACTIVE: + if (mac_node->state == HCLGEVF_MAC_TO_ADD) + mac_node->state = HCLGEVF_MAC_ACTIVE; + break; + } +} + +static int hclgevf_update_mac_list(struct hnae3_handle *handle, + enum HCLGEVF_MAC_NODE_STATE state, + enum HCLGEVF_MAC_ADDR_TYPE mac_type, + const unsigned char *addr) { struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); - struct hclge_vf_to_pf_msg send_msg; + struct hclgevf_mac_addr_node *mac_node; + struct list_head *list; - hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_UNICAST, - HCLGE_MBX_MAC_VLAN_UC_ADD); - ether_addr_copy(send_msg.data, addr); - return hclgevf_send_mbx_msg(hdev, &send_msg, false, NULL, 0); + list = (mac_type == HCLGEVF_MAC_ADDR_UC) ? + &hdev->mac_table.uc_mac_list : &hdev->mac_table.mc_mac_list; + + spin_lock_bh(&hdev->mac_table.mac_list_lock); + + /* if the mac addr is already in the mac list, no need to add a new + * one into it, just check the mac addr state, convert it to a new + * new state, or just remove it, or do nothing. + */ + mac_node = hclgevf_find_mac_node(list, addr); + if (mac_node) { + hclgevf_update_mac_node(mac_node, state); + spin_unlock_bh(&hdev->mac_table.mac_list_lock); + return 0; + } + /* if this address is never added, unnecessary to delete */ + if (state == HCLGEVF_MAC_TO_DEL) { + spin_unlock_bh(&hdev->mac_table.mac_list_lock); + return -ENOENT; + } + + mac_node = kzalloc(sizeof(*mac_node), GFP_ATOMIC); + if (!mac_node) { + spin_unlock_bh(&hdev->mac_table.mac_list_lock); + return -ENOMEM; + } + + mac_node->state = state; + ether_addr_copy(mac_node->mac_addr, addr); + list_add_tail(&mac_node->node, list); + + spin_unlock_bh(&hdev->mac_table.mac_list_lock); + return 0; +} + +static int hclgevf_add_uc_addr(struct hnae3_handle *handle, + const unsigned char *addr) +{ + return hclgevf_update_mac_list(handle, HCLGEVF_MAC_TO_ADD, + HCLGEVF_MAC_ADDR_UC, addr); } static int hclgevf_rm_uc_addr(struct hnae3_handle *handle, const unsigned char *addr) { - struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); - struct hclge_vf_to_pf_msg send_msg; - - hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_UNICAST, - HCLGE_MBX_MAC_VLAN_UC_REMOVE); - ether_addr_copy(send_msg.data, addr); - return hclgevf_send_mbx_msg(hdev, &send_msg, false, NULL, 0); + return hclgevf_update_mac_list(handle, HCLGEVF_MAC_TO_DEL, + HCLGEVF_MAC_ADDR_UC, addr); } static int hclgevf_add_mc_addr(struct hnae3_handle *handle, const unsigned char *addr) { - struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); - struct hclge_vf_to_pf_msg send_msg; - - hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_MULTICAST, - HCLGE_MBX_MAC_VLAN_MC_ADD); - ether_addr_copy(send_msg.data, addr); - return hclgevf_send_mbx_msg(hdev, &send_msg, false, NULL, 0); + return hclgevf_update_mac_list(handle, HCLGEVF_MAC_TO_ADD, + HCLGEVF_MAC_ADDR_MC, addr); } static int hclgevf_rm_mc_addr(struct hnae3_handle *handle, const unsigned char *addr) { - struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); + return hclgevf_update_mac_list(handle, HCLGEVF_MAC_TO_DEL, + HCLGEVF_MAC_ADDR_MC, addr); +} + +static int hclgevf_add_del_mac_addr(struct hclgevf_dev *hdev, + struct hclgevf_mac_addr_node *mac_node, + enum HCLGEVF_MAC_ADDR_TYPE mac_type) +{ struct hclge_vf_to_pf_msg send_msg; + u8 code, subcode; - hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_MULTICAST, - HCLGE_MBX_MAC_VLAN_MC_REMOVE); - ether_addr_copy(send_msg.data, addr); + if (mac_type == HCLGEVF_MAC_ADDR_UC) { + code = HCLGE_MBX_SET_UNICAST; + if (mac_node->state == HCLGEVF_MAC_TO_ADD) + subcode = HCLGE_MBX_MAC_VLAN_UC_ADD; + else + subcode = HCLGE_MBX_MAC_VLAN_UC_REMOVE; + } else { + code = HCLGE_MBX_SET_MULTICAST; + if (mac_node->state == HCLGEVF_MAC_TO_ADD) + subcode = HCLGE_MBX_MAC_VLAN_MC_ADD; + else + subcode = HCLGE_MBX_MAC_VLAN_MC_REMOVE; + } + + hclgevf_build_send_msg(&send_msg, code, subcode); + ether_addr_copy(send_msg.data, mac_node->mac_addr); return hclgevf_send_mbx_msg(hdev, &send_msg, false, NULL, 0); } +static void hclgevf_config_mac_list(struct hclgevf_dev *hdev, + struct list_head *list, + enum HCLGEVF_MAC_ADDR_TYPE mac_type) +{ + struct hclgevf_mac_addr_node *mac_node, *tmp; + int ret; + + list_for_each_entry_safe(mac_node, tmp, list, node) { + ret = hclgevf_add_del_mac_addr(hdev, mac_node, mac_type); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to configure mac %pM, state = %d, ret = %d\n", + mac_node->mac_addr, mac_node->state, ret); + return; + } + if (mac_node->state == HCLGEVF_MAC_TO_ADD) { + mac_node->state = HCLGEVF_MAC_ACTIVE; + } else { + list_del(&mac_node->node); + kfree(mac_node); + } + } +} + +static void hclgevf_sync_from_add_list(struct list_head *add_list, + struct list_head *mac_list) +{ + struct hclgevf_mac_addr_node *mac_node, *tmp, *new_node; + + list_for_each_entry_safe(mac_node, tmp, add_list, node) { + /* if the mac address from tmp_add_list is not in the + * uc/mc_mac_list, it means have received a TO_DEL request + * during the time window of sending mac config request to PF + * If mac_node state is ACTIVE, then change its state to TO_DEL, + * then it will be removed at next time. If is TO_ADD, it means + * send TO_ADD request failed, so just remove the mac node. + */ + new_node = hclgevf_find_mac_node(mac_list, mac_node->mac_addr); + if (new_node) { + hclgevf_update_mac_node(new_node, mac_node->state); + list_del(&mac_node->node); + kfree(mac_node); + } else if (mac_node->state == HCLGEVF_MAC_ACTIVE) { + mac_node->state = HCLGEVF_MAC_TO_DEL; + list_del(&mac_node->node); + list_add_tail(&mac_node->node, mac_list); + } else { + list_del(&mac_node->node); + kfree(mac_node); + } + } +} + +static void hclgevf_sync_from_del_list(struct list_head *del_list, + struct list_head *mac_list) +{ + struct hclgevf_mac_addr_node *mac_node, *tmp, *new_node; + + list_for_each_entry_safe(mac_node, tmp, del_list, node) { + new_node = hclgevf_find_mac_node(mac_list, mac_node->mac_addr); + if (new_node) { + /* If the mac addr is exist in the mac list, it means + * received a new request TO_ADD during the time window + * of sending mac addr configurrequest to PF, so just + * change the mac state to ACTIVE. + */ + new_node->state = HCLGEVF_MAC_ACTIVE; + list_del(&mac_node->node); + kfree(mac_node); + } else { + list_del(&mac_node->node); + list_add_tail(&mac_node->node, mac_list); + } + } +} + +static void hclgevf_clear_list(struct list_head *list) +{ + struct hclgevf_mac_addr_node *mac_node, *tmp; + + list_for_each_entry_safe(mac_node, tmp, list, node) { + list_del(&mac_node->node); + kfree(mac_node); + } +} + +static void hclgevf_sync_mac_list(struct hclgevf_dev *hdev, + enum HCLGEVF_MAC_ADDR_TYPE mac_type) +{ + struct hclgevf_mac_addr_node *mac_node, *tmp, *new_node; + struct list_head tmp_add_list, tmp_del_list; + struct list_head *list; + + INIT_LIST_HEAD(&tmp_add_list); + INIT_LIST_HEAD(&tmp_del_list); + + /* move the mac addr to the tmp_add_list and tmp_del_list, then + * we can add/delete these mac addr outside the spin lock + */ + list = (mac_type == HCLGEVF_MAC_ADDR_UC) ? + &hdev->mac_table.uc_mac_list : &hdev->mac_table.mc_mac_list; + + spin_lock_bh(&hdev->mac_table.mac_list_lock); + + list_for_each_entry_safe(mac_node, tmp, list, node) { + switch (mac_node->state) { + case HCLGEVF_MAC_TO_DEL: + list_del(&mac_node->node); + list_add_tail(&mac_node->node, &tmp_del_list); + break; + case HCLGEVF_MAC_TO_ADD: + new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); + if (!new_node) + goto stop_traverse; + + ether_addr_copy(new_node->mac_addr, mac_node->mac_addr); + new_node->state = mac_node->state; + list_add_tail(&new_node->node, &tmp_add_list); + break; + default: + break; + } + } + +stop_traverse: + spin_unlock_bh(&hdev->mac_table.mac_list_lock); + + /* delete first, in order to get max mac table space for adding */ + hclgevf_config_mac_list(hdev, &tmp_del_list, mac_type); + hclgevf_config_mac_list(hdev, &tmp_add_list, mac_type); + + /* if some mac addresses were added/deleted fail, move back to the + * mac_list, and retry at next time. + */ + spin_lock_bh(&hdev->mac_table.mac_list_lock); + + hclgevf_sync_from_del_list(&tmp_del_list, list); + hclgevf_sync_from_add_list(&tmp_add_list, list); + + spin_unlock_bh(&hdev->mac_table.mac_list_lock); +} + +static void hclgevf_sync_mac_table(struct hclgevf_dev *hdev) +{ + hclgevf_sync_mac_list(hdev, HCLGEVF_MAC_ADDR_UC); + hclgevf_sync_mac_list(hdev, HCLGEVF_MAC_ADDR_MC); +} + +static void hclgevf_uninit_mac_list(struct hclgevf_dev *hdev) +{ + spin_lock_bh(&hdev->mac_table.mac_list_lock); + + hclgevf_clear_list(&hdev->mac_table.uc_mac_list); + hclgevf_clear_list(&hdev->mac_table.mc_mac_list); + + spin_unlock_bh(&hdev->mac_table.mac_list_lock); +} + static int hclgevf_set_vlan_filter(struct hnae3_handle *handle, __be16 proto, u16 vlan_id, bool is_kill) @@ -1951,6 +2201,8 @@ static void hclgevf_periodic_service_task(struct hclgevf_dev *hdev) hclgevf_sync_vlan_filter(hdev); + hclgevf_sync_mac_table(hdev); + hdev->last_serv_processed = jiffies; out: @@ -2313,6 +2565,10 @@ static void hclgevf_state_init(struct hclgevf_dev *hdev) mutex_init(&hdev->mbx_resp.mbx_mutex); sema_init(&hdev->reset_sem, 1); + spin_lock_init(&hdev->mac_table.mac_list_lock); + INIT_LIST_HEAD(&hdev->mac_table.uc_mac_list); + INIT_LIST_HEAD(&hdev->mac_table.mc_mac_list); + /* bring the device down */ set_bit(HCLGEVF_STATE_DOWN, &hdev->state); } @@ -2846,6 +3102,7 @@ static void hclgevf_uninit_hdev(struct hclgevf_dev *hdev) hclgevf_pci_uninit(hdev); hclgevf_cmd_uninit(hdev); + hclgevf_uninit_mac_list(hdev); } static int hclgevf_init_ae_dev(struct hnae3_ae_dev *ae_dev) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 3b88d866facc..0222d9b87a42 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -234,6 +234,29 @@ struct hclgevf_rst_stats { u32 rst_fail_cnt; /* the number of VF reset fail */ }; +enum HCLGEVF_MAC_ADDR_TYPE { + HCLGEVF_MAC_ADDR_UC, + HCLGEVF_MAC_ADDR_MC +}; + +enum HCLGEVF_MAC_NODE_STATE { + HCLGEVF_MAC_TO_ADD, + HCLGEVF_MAC_TO_DEL, + HCLGEVF_MAC_ACTIVE +}; + +struct hclgevf_mac_addr_node { + struct list_head node; + enum HCLGEVF_MAC_NODE_STATE state; + u8 mac_addr[ETH_ALEN]; +}; + +struct hclgevf_mac_table_cfg { + spinlock_t mac_list_lock; /* protect mac address need to add/detele */ + struct list_head uc_mac_list; + struct list_head mc_mac_list; +}; + struct hclgevf_dev { struct pci_dev *pdev; struct hnae3_ae_dev *ae_dev; @@ -282,6 +305,8 @@ struct hclgevf_dev { unsigned long vlan_del_fail_bmap[BITS_TO_LONGS(VLAN_N_VID)]; + struct hclgevf_mac_table_cfg mac_table; + bool mbx_event_pending; struct hclgevf_mbx_resp_status mbx_resp; /* mailbox response */ struct hclgevf_mbx_arq_ring arq; /* mailbox async rx queue */ -- cgit v1.2.3-59-g8ed1b From f671237a4b4521dfde5f96c2b088287712e72f4b Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 24 Apr 2020 10:23:10 +0800 Subject: net: hns3: add support for dumping UC and MC MAC list This patch adds support for dumping entries of UC and MC MAC list, which help checking whether a MAC address being added into hardware or not. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 2 + .../ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 51 ++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index c934f328c040..fe7fb565da19 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -262,6 +262,8 @@ static void hns3_dbg_help(struct hnae3_handle *h) dev_info(&h->pdev->dev, "dump mac tnl status\n"); dev_info(&h->pdev->dev, "dump loopback\n"); dev_info(&h->pdev->dev, "dump qs shaper [qs id]\n"); + dev_info(&h->pdev->dev, "dump uc mac list \n"); + dev_info(&h->pdev->dev, "dump mc mac list \n"); memset(printf_buf, 0, HNS3_DBG_BUF_LEN); strncat(printf_buf, "dump reg [[bios common] [ssu ]", diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 66c1ad3a156b..6cfa8253eefc 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -1441,6 +1441,49 @@ static void hclge_dbg_dump_qs_shaper(struct hclge_dev *hdev, hclge_dbg_dump_qs_shaper_single(hdev, qsid); } +static int hclge_dbg_dump_mac_list(struct hclge_dev *hdev, const char *cmd_buf, + bool is_unicast) +{ + struct hclge_mac_node *mac_node, *tmp; + struct hclge_vport *vport; + struct list_head *list; + u32 func_id; + int ret; + + ret = kstrtouint(cmd_buf, 0, &func_id); + if (ret < 0) { + dev_err(&hdev->pdev->dev, + "dump mac list: bad command string, ret = %d\n", ret); + return -EINVAL; + } + + if (func_id >= hdev->num_alloc_vport) { + dev_err(&hdev->pdev->dev, + "function id(%u) is out of range(0-%u)\n", func_id, + hdev->num_alloc_vport - 1); + return -EINVAL; + } + + vport = &hdev->vport[func_id]; + + list = is_unicast ? &vport->uc_mac_list : &vport->mc_mac_list; + + dev_info(&hdev->pdev->dev, "vport %u %s mac list:\n", + func_id, is_unicast ? "uc" : "mc"); + dev_info(&hdev->pdev->dev, "mac address state\n"); + + spin_lock_bh(&vport->mac_list_lock); + + list_for_each_entry_safe(mac_node, tmp, list, node) { + dev_info(&hdev->pdev->dev, "%pM %d\n", + mac_node->mac_addr, mac_node->state); + } + + spin_unlock_bh(&vport->mac_list_lock); + + return 0; +} + int hclge_dbg_run_cmd(struct hnae3_handle *handle, const char *cmd_buf) { #define DUMP_REG "dump reg" @@ -1485,6 +1528,14 @@ int hclge_dbg_run_cmd(struct hnae3_handle *handle, const char *cmd_buf) } else if (strncmp(cmd_buf, "dump qs shaper", 14) == 0) { hclge_dbg_dump_qs_shaper(hdev, &cmd_buf[sizeof("dump qs shaper")]); + } else if (strncmp(cmd_buf, "dump uc mac list", 16) == 0) { + hclge_dbg_dump_mac_list(hdev, + &cmd_buf[sizeof("dump uc mac list")], + true); + } else if (strncmp(cmd_buf, "dump mc mac list", 16) == 0) { + hclge_dbg_dump_mac_list(hdev, + &cmd_buf[sizeof("dump mc mac list")], + false); } else { dev_info(&hdev->pdev->dev, "unknown command\n"); return -EINVAL; -- cgit v1.2.3-59-g8ed1b From c631c696823cdddbf3c683c3b78812ecba31c350 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 24 Apr 2020 10:23:11 +0800 Subject: net: hns3: refactor the promisc mode setting As the HNS3 driver doesn't update the MAC address directly in function hns3_set_rx_mode() now, it can't know whether the MAC table is full from __dev_uc_sync() and __dev_mc_sync(), so it's senseless to handle the overflow promisc here. This patch removes the handle of overflow promisc from function hns3_set_rx_mode(), and updates the promisc mode in the service task. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 3 + drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 42 +++------- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 1 + drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 2 +- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 89 ++++++++++++++++++---- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 4 + .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 26 +++++++ .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 1 + 8 files changed, 122 insertions(+), 46 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 5587605d6deb..a56f8d623349 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -270,6 +270,8 @@ struct hnae3_ae_dev { * Set loopback * set_promisc_mode * Set promisc mode + * request_update_promisc_mode + * request to hclge(vf) to update promisc mode * set_mtu() * set mtu * get_pauseparam() @@ -408,6 +410,7 @@ struct hnae3_ae_ops { int (*set_promisc_mode)(struct hnae3_handle *handle, bool en_uc_pmc, bool en_mc_pmc); + void (*request_update_promisc_mode)(struct hnae3_handle *handle); int (*set_mtu)(struct hnae3_handle *handle, int new_mtu); void (*get_pauseparam)(struct hnae3_handle *handle, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 341e8b5cd219..6b9535c17647 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -601,34 +601,25 @@ static void hns3_nic_set_rx_mode(struct net_device *netdev) { struct hnae3_handle *h = hns3_get_handle(netdev); u8 new_flags; - int ret; new_flags = hns3_get_netdev_flags(netdev); - ret = __dev_uc_sync(netdev, hns3_nic_uc_sync, hns3_nic_uc_unsync); - if (ret) { - netdev_err(netdev, "sync uc address fail\n"); - if (ret == -ENOSPC) - new_flags |= HNAE3_OVERFLOW_UPE; - } - - if (netdev->flags & IFF_MULTICAST) { - ret = __dev_mc_sync(netdev, hns3_nic_mc_sync, - hns3_nic_mc_unsync); - if (ret) { - netdev_err(netdev, "sync mc address fail\n"); - if (ret == -ENOSPC) - new_flags |= HNAE3_OVERFLOW_MPE; - } - } + __dev_uc_sync(netdev, hns3_nic_uc_sync, hns3_nic_uc_unsync); + __dev_mc_sync(netdev, hns3_nic_mc_sync, hns3_nic_mc_unsync); /* User mode Promisc mode enable and vlan filtering is disabled to - * let all packets in. MAC-VLAN Table overflow Promisc enabled and - * vlan fitering is enabled + * let all packets in. */ - hns3_enable_vlan_filter(netdev, new_flags & HNAE3_VLAN_FLTR); h->netdev_flags = new_flags; - hns3_update_promisc_mode(netdev, new_flags); + hns3_request_update_promisc_mode(h); +} + +void hns3_request_update_promisc_mode(struct hnae3_handle *handle) +{ + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + + if (ops->request_update_promisc_mode) + ops->request_update_promisc_mode(handle); } int hns3_update_promisc_mode(struct net_device *netdev, u8 promisc_flags) @@ -4467,15 +4458,6 @@ err_put_ring: static int hns3_reset_notify_restore_enet(struct hnae3_handle *handle) { struct net_device *netdev = handle->kinfo.netdev; - bool vlan_filter_enable; - int ret; - - ret = hns3_update_promisc_mode(netdev, handle->netdev_flags); - if (ret) - return ret; - - vlan_filter_enable = netdev->flags & IFF_PROMISC ? false : true; - hns3_enable_vlan_filter(netdev, vlan_filter_enable); if (handle->ae_algo->ops->restore_vlan_table) handle->ae_algo->ops->restore_vlan_table(handle); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 4b3f0abf0715..53bc0edf9b6f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -658,6 +658,7 @@ void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector, void hns3_enable_vlan_filter(struct net_device *netdev, bool enable); int hns3_update_promisc_mode(struct net_device *netdev, u8 promisc_flags); +void hns3_request_update_promisc_mode(struct hnae3_handle *handle); #ifdef CONFIG_HNS3_DCB void hns3_dcbnl_setup(struct hnae3_handle *handle); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 6a0734be4a1a..4d9c85f049dc 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -99,7 +99,7 @@ static int hns3_lp_setup(struct net_device *ndev, enum hnae3_loop loop, bool en) h->ae_algo->ops->set_promisc_mode(h, true, true); } else { /* recover promisc mode before loopback test */ - hns3_update_promisc_mode(ndev, h->netdev_flags); + hns3_request_update_promisc_mode(h); vlan_filter_enable = ndev->flags & IFF_PROMISC ? false : true; hns3_enable_vlan_filter(ndev, vlan_filter_enable); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c3205ae620ce..71ff0fa64f46 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -69,6 +69,7 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev, static int hclge_set_default_loopback(struct hclge_dev *hdev); static void hclge_sync_mac_table(struct hclge_dev *hdev); +static void hclge_sync_promisc_mode(struct hclge_dev *hdev); static struct hnae3_ae_algo ae_algo; @@ -3975,6 +3976,7 @@ static void hclge_periodic_service_task(struct hclge_dev *hdev) */ hclge_update_link_status(hdev); hclge_sync_mac_table(hdev); + hclge_sync_promisc_mode(hdev); if (time_is_after_jiffies(hdev->last_serv_processed + HZ)) { delta = jiffies - hdev->last_serv_processed; @@ -4724,7 +4726,8 @@ static int hclge_cmd_set_promisc_mode(struct hclge_dev *hdev, ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) dev_err(&hdev->pdev->dev, - "Set promisc mode fail, status is %d.\n", ret); + "failed to set vport %d promisc mode, ret = %d.\n", + param->vf_id, ret); return ret; } @@ -4774,6 +4777,14 @@ static int hclge_set_promisc_mode(struct hnae3_handle *handle, bool en_uc_pmc, en_bc_pmc); } +static void hclge_request_update_promisc_mode(struct hnae3_handle *handle) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + + set_bit(HCLGE_STATE_PROMISC_CHANGED, &hdev->state); +} + static int hclge_get_fd_mode(struct hclge_dev *hdev, u8 *fd_mode) { struct hclge_get_fd_mode_cmd *req; @@ -6972,17 +6983,11 @@ static int hclge_get_mac_vlan_cmd_status(struct hclge_vport *vport, } if (op == HCLGE_MAC_VLAN_ADD) { - if ((!resp_code) || (resp_code == 1)) { + if (!resp_code || resp_code == 1) return 0; - } else if (resp_code == HCLGE_ADD_UC_OVERFLOW) { - dev_err(&hdev->pdev->dev, - "add mac addr failed for uc_overflow.\n"); - return -ENOSPC; - } else if (resp_code == HCLGE_ADD_MC_OVERFLOW) { - dev_err(&hdev->pdev->dev, - "add mac addr failed for mc_overflow.\n"); + else if (resp_code == HCLGE_ADD_UC_OVERFLOW || + resp_code == HCLGE_ADD_MC_OVERFLOW) return -ENOSPC; - } dev_err(&hdev->pdev->dev, "add mac addr failed for undefined, code=%u.\n", @@ -7448,8 +7453,9 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport, return ret; } - dev_err(&hdev->pdev->dev, "UC MAC table full(%u)\n", - hdev->priv_umv_size); + if (!(vport->overflow_promisc_flags & HNAE3_OVERFLOW_UPE)) + dev_err(&hdev->pdev->dev, "UC MAC table full(%u)\n", + hdev->priv_umv_size); return -ENOSPC; } @@ -7543,7 +7549,9 @@ int hclge_add_mc_addr_common(struct hclge_vport *vport, return status; status = hclge_add_mac_vlan_tbl(vport, &req, desc); - if (status == -ENOSPC) + /* if already overflow, not to print each time */ + if (status == -ENOSPC && + !(vport->overflow_promisc_flags & HNAE3_OVERFLOW_MPE)) dev_err(&hdev->pdev->dev, "mc mac vlan table is full\n"); return status; @@ -7638,12 +7646,16 @@ static void hclge_unsync_vport_mac_list(struct hclge_vport *vport, } } -static void hclge_sync_from_add_list(struct list_head *add_list, +static bool hclge_sync_from_add_list(struct list_head *add_list, struct list_head *mac_list) { struct hclge_mac_node *mac_node, *tmp, *new_node; + bool all_added = true; list_for_each_entry_safe(mac_node, tmp, add_list, node) { + if (mac_node->state == HCLGE_MAC_TO_ADD) + all_added = false; + /* if the mac address from tmp_add_list is not in the * uc/mc_mac_list, it means have received a TO_DEL request * during the time window of adding the mac address into mac @@ -7666,6 +7678,8 @@ static void hclge_sync_from_add_list(struct list_head *add_list, kfree(mac_node); } } + + return all_added; } static void hclge_sync_from_del_list(struct list_head *del_list, @@ -7693,12 +7707,30 @@ static void hclge_sync_from_del_list(struct list_head *del_list, } } +static void hclge_update_overflow_flags(struct hclge_vport *vport, + enum HCLGE_MAC_ADDR_TYPE mac_type, + bool is_all_added) +{ + if (mac_type == HCLGE_MAC_ADDR_UC) { + if (is_all_added) + vport->overflow_promisc_flags &= ~HNAE3_OVERFLOW_UPE; + else + vport->overflow_promisc_flags |= HNAE3_OVERFLOW_UPE; + } else { + if (is_all_added) + vport->overflow_promisc_flags &= ~HNAE3_OVERFLOW_MPE; + else + vport->overflow_promisc_flags |= HNAE3_OVERFLOW_MPE; + } +} + static void hclge_sync_vport_mac_table(struct hclge_vport *vport, enum HCLGE_MAC_ADDR_TYPE mac_type) { struct hclge_mac_node *mac_node, *tmp, *new_node; struct list_head tmp_add_list, tmp_del_list; struct list_head *list; + bool all_added; INIT_LIST_HEAD(&tmp_add_list); INIT_LIST_HEAD(&tmp_del_list); @@ -7752,9 +7784,11 @@ stop_traverse: spin_lock_bh(&vport->mac_list_lock); hclge_sync_from_del_list(&tmp_del_list, list); - hclge_sync_from_add_list(&tmp_add_list, list); + all_added = hclge_sync_from_add_list(&tmp_add_list, list); spin_unlock_bh(&vport->mac_list_lock); + + hclge_update_overflow_flags(vport, mac_type, all_added); } static bool hclge_need_sync_mac_table(struct hclge_vport *vport) @@ -11052,6 +11086,30 @@ static int hclge_gro_en(struct hnae3_handle *handle, bool enable) return hclge_config_gro(hdev, enable); } +static void hclge_sync_promisc_mode(struct hclge_dev *hdev) +{ + struct hclge_vport *vport = &hdev->vport[0]; + struct hnae3_handle *handle = &vport->nic; + u8 tmp_flags = 0; + int ret; + + if (vport->last_promisc_flags != vport->overflow_promisc_flags) { + set_bit(HCLGE_STATE_PROMISC_CHANGED, &hdev->state); + vport->last_promisc_flags = vport->overflow_promisc_flags; + } + + if (test_bit(HCLGE_STATE_PROMISC_CHANGED, &hdev->state)) { + tmp_flags = handle->netdev_flags | vport->last_promisc_flags; + ret = hclge_set_promisc_mode(handle, tmp_flags & HNAE3_UPE, + tmp_flags & HNAE3_MPE); + if (!ret) { + clear_bit(HCLGE_STATE_PROMISC_CHANGED, &hdev->state); + hclge_enable_vlan_filter(handle, + tmp_flags & HNAE3_VLAN_FLTR); + } + } +} + static const struct hnae3_ae_ops hclge_ops = { .init_ae_dev = hclge_init_ae_dev, .uninit_ae_dev = hclge_uninit_ae_dev, @@ -11064,6 +11122,7 @@ static const struct hnae3_ae_ops hclge_ops = { .get_vector = hclge_get_vector, .put_vector = hclge_put_vector, .set_promisc_mode = hclge_set_promisc_mode, + .request_update_promisc_mode = hclge_request_update_promisc_mode, .set_loopback = hclge_set_loopback, .start = hclge_ae_start, .stop = hclge_ae_stop, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 5fcbc3d23f21..85180f4a9b4d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -217,6 +217,7 @@ enum HCLGE_DEV_STATE { HCLGE_STATE_STATISTICS_UPDATING, HCLGE_STATE_CMD_DISABLE, HCLGE_STATE_LINK_UPDATING, + HCLGE_STATE_PROMISC_CHANGED, HCLGE_STATE_RST_FAIL, HCLGE_STATE_MAX }; @@ -931,6 +932,9 @@ struct hclge_vport { u32 mps; /* Max packet size */ struct hclge_vf_info vf_info; + u8 overflow_promisc_flags; + u8 last_promisc_flags; + spinlock_t mac_list_lock; /* protect mac address need to add/detele */ struct list_head uc_mac_list; /* Store VF unicast table */ struct list_head mc_mac_list; /* Store VF multicast table */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 05d485a48706..fea197fd77cb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1164,6 +1164,27 @@ static int hclgevf_set_promisc_mode(struct hnae3_handle *handle, bool en_uc_pmc, en_bc_pmc); } +static void hclgevf_request_update_promisc_mode(struct hnae3_handle *handle) +{ + struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); + + set_bit(HCLGEVF_STATE_PROMISC_CHANGED, &hdev->state); +} + +static void hclgevf_sync_promisc_mode(struct hclgevf_dev *hdev) +{ + struct hnae3_handle *handle = &hdev->nic; + bool en_uc_pmc = handle->netdev_flags & HNAE3_UPE; + bool en_mc_pmc = handle->netdev_flags & HNAE3_MPE; + int ret; + + if (test_bit(HCLGEVF_STATE_PROMISC_CHANGED, &hdev->state)) { + ret = hclgevf_set_promisc_mode(handle, en_uc_pmc, en_mc_pmc); + if (!ret) + clear_bit(HCLGEVF_STATE_PROMISC_CHANGED, &hdev->state); + } +} + static int hclgevf_tqp_enable(struct hclgevf_dev *hdev, unsigned int tqp_id, int stream_id, bool enable) { @@ -2203,6 +2224,8 @@ static void hclgevf_periodic_service_task(struct hclgevf_dev *hdev) hclgevf_sync_mac_table(hdev); + hclgevf_sync_promisc_mode(hdev); + hdev->last_serv_processed = jiffies; out: @@ -2986,6 +3009,8 @@ static int hclgevf_reset_hdev(struct hclgevf_dev *hdev) return ret; } + set_bit(HCLGEVF_STATE_PROMISC_CHANGED, &hdev->state); + dev_info(&hdev->pdev->dev, "Reset done\n"); return 0; @@ -3470,6 +3495,7 @@ static const struct hnae3_ae_ops hclgevf_ops = { .set_timer_task = hclgevf_set_timer_task, .get_link_mode = hclgevf_get_link_mode, .set_promisc_mode = hclgevf_set_promisc_mode, + .request_update_promisc_mode = hclgevf_request_update_promisc_mode, }; static struct hnae3_ae_algo ae_algovf = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 0222d9b87a42..f19583c4bc9b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -148,6 +148,7 @@ enum hclgevf_states { HCLGEVF_STATE_MBX_HANDLING, HCLGEVF_STATE_CMD_DISABLE, HCLGEVF_STATE_LINK_UPDATING, + HCLGEVF_STATE_PROMISC_CHANGED, HCLGEVF_STATE_RST_FAIL, }; -- cgit v1.2.3-59-g8ed1b From 7d0b345156d0678fbbbf885f991c6d83f23d70f1 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 24 Apr 2020 10:23:12 +0800 Subject: net: hns3: use mutex vport_lock instead of mutex umv_lock Currently, the driver use mutex umv_lock to protect the variable vport->share_umv_size. And there is already a mutex vport_lock being defined in the driver, which is designed to protect the resource of vport. So we can use vport_lock instead of umv_lock. Furthermore, there is a time window for protect share_umv_size between checking UMV space and doing MAC configuration in the lin function hclge_add_uc_addr_common(). It should be extended. This patch uses mutex vport_lock intead of spin lock umv_lock to protect share_umv_size, and adjusts the mutex's range. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 31 +++++++++++++--------- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 1 - 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 71ff0fa64f46..177ef5e93d06 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -7250,7 +7250,6 @@ static int hclge_init_umv_space(struct hclge_dev *hdev) "failed to alloc umv space, want %u, get %u\n", hdev->wanted_umv_size, allocated_size); - mutex_init(&hdev->umv_mutex); hdev->max_umv_size = allocated_size; hdev->priv_umv_size = hdev->max_umv_size / (hdev->num_alloc_vport + 1); hdev->share_umv_size = hdev->priv_umv_size + @@ -7269,21 +7268,25 @@ static void hclge_reset_umv_space(struct hclge_dev *hdev) vport->used_umv_num = 0; } - mutex_lock(&hdev->umv_mutex); + mutex_lock(&hdev->vport_lock); hdev->share_umv_size = hdev->priv_umv_size + hdev->max_umv_size % (hdev->num_alloc_vport + 1); - mutex_unlock(&hdev->umv_mutex); + mutex_unlock(&hdev->vport_lock); } -static bool hclge_is_umv_space_full(struct hclge_vport *vport) +static bool hclge_is_umv_space_full(struct hclge_vport *vport, bool need_lock) { struct hclge_dev *hdev = vport->back; bool is_full; - mutex_lock(&hdev->umv_mutex); + if (need_lock) + mutex_lock(&hdev->vport_lock); + is_full = (vport->used_umv_num >= hdev->priv_umv_size && hdev->share_umv_size == 0); - mutex_unlock(&hdev->umv_mutex); + + if (need_lock) + mutex_unlock(&hdev->vport_lock); return is_full; } @@ -7292,7 +7295,6 @@ static void hclge_update_umv_space(struct hclge_vport *vport, bool is_free) { struct hclge_dev *hdev = vport->back; - mutex_lock(&hdev->umv_mutex); if (is_free) { if (vport->used_umv_num > hdev->priv_umv_size) hdev->share_umv_size++; @@ -7305,7 +7307,6 @@ static void hclge_update_umv_space(struct hclge_vport *vport, bool is_free) hdev->share_umv_size--; vport->used_umv_num++; } - mutex_unlock(&hdev->umv_mutex); } static struct hclge_mac_node *hclge_find_mac_node(struct list_head *list, @@ -7446,12 +7447,15 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport, */ ret = hclge_lookup_mac_vlan_tbl(vport, &req, &desc, false); if (ret == -ENOENT) { - if (!hclge_is_umv_space_full(vport)) { + mutex_lock(&hdev->vport_lock); + if (!hclge_is_umv_space_full(vport, false)) { ret = hclge_add_mac_vlan_tbl(vport, &req, NULL); if (!ret) hclge_update_umv_space(vport, false); + mutex_unlock(&hdev->vport_lock); return ret; } + mutex_unlock(&hdev->vport_lock); if (!(vport->overflow_promisc_flags & HNAE3_OVERFLOW_UPE)) dev_err(&hdev->pdev->dev, "UC MAC table full(%u)\n", @@ -7503,10 +7507,13 @@ int hclge_rm_uc_addr_common(struct hclge_vport *vport, hnae3_set_bit(req.entry_type, HCLGE_MAC_VLAN_BIT0_EN_B, 0); hclge_prepare_mac_addr(&req, addr, false); ret = hclge_remove_mac_vlan_tbl(vport, &req); - if (!ret) + if (!ret) { + mutex_lock(&hdev->vport_lock); hclge_update_umv_space(vport, true); - else if (ret == -ENOENT) + mutex_unlock(&hdev->vport_lock); + } else if (ret == -ENOENT) { ret = 0; + } return ret; } @@ -10163,7 +10170,7 @@ static int hclge_set_vf_spoofchk(struct hnae3_handle *handle, int vf, dev_warn(&hdev->pdev->dev, "vf %d vlan table is full, enable spoof check may cause its packet send fail\n", vf); - else if (enable && hclge_is_umv_space_full(vport)) + else if (enable && hclge_is_umv_space_full(vport, true)) dev_warn(&hdev->pdev->dev, "vf %d mac table is full, enable spoof check may cause its packet send fail\n", vf); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 85180f4a9b4d..8e69651f8bf1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -831,7 +831,6 @@ struct hclge_dev { u16 priv_umv_size; /* unicast mac vlan space shared by PF and its VFs */ u16 share_umv_size; - struct mutex umv_mutex; /* protect share_umv_size */ DECLARE_KFIFO(mac_tnl_log, struct hclge_mac_tnl_stats, HCLGE_MAC_TNL_LOG_SIZE); -- cgit v1.2.3-59-g8ed1b From 039ba863e8d71c52b1f5faf26b0f458eec33d5e7 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 24 Apr 2020 10:23:13 +0800 Subject: net: hns3: optimize the filter table entries handling when resetting Currently, the PF driver removes all (including its VFs') MAC/VLAN flow director table entries when resetting, and restores them after reset completed. In fact, the hardware will clear all table entries only in IMP reset and global reset. So driver only needs to restore the table entries in these cases, and needs do nothing when PF reset, FLR or other function level reset. This patch optimizes it by removing unnecessary table entries clear and restoring handling in the reset flow, and doing the restoring after reset completed. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h | 5 ++ drivers/net/ethernet/hisilicon/hns3/hnae3.h | 5 -- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 33 -------- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 9 --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 89 +++++++++++----------- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 1 + .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 28 ++++++- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 29 ++++--- 8 files changed, 94 insertions(+), 105 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h index 948e67ef30fd..21a736174fda 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h +++ b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h @@ -45,6 +45,7 @@ enum HCLGE_MBX_OPCODE { HCLGE_MBX_GET_MEDIA_TYPE, /* (VF -> PF) get media type */ HCLGE_MBX_PUSH_PROMISC_INFO, /* (PF -> VF) push vf promisc info */ HCLGE_MBX_VF_UNINIT, /* (VF -> PF) vf is unintializing */ + HCLGE_MBX_HANDLE_VF_TBL, /* (VF -> PF) store/clear hw table */ HCLGE_MBX_GET_VF_FLR_STATUS = 200, /* (M7 -> PF) get vf flr status */ HCLGE_MBX_PUSH_LINK_STATUS, /* (M7 -> PF) get port link status */ @@ -70,6 +71,10 @@ enum hclge_mbx_vlan_cfg_subcode { HCLGE_MBX_GET_PORT_BASE_VLAN_STATE, /* get port based vlan state */ }; +enum hclge_mbx_tbl_cfg_subcode { + HCLGE_MBX_VPORT_LIST_CLEAR, +}; + #define HCLGE_MBX_MAX_MSG_SIZE 14 #define HCLGE_MBX_MAX_RESP_DATA_SIZE 8U #define HCLGE_MBX_MAX_RING_CHAIN_PARAM_NUM 4 diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index a56f8d623349..6291aa9f06b0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -233,7 +233,6 @@ struct hnae3_ae_dev { struct list_head node; u32 flag; unsigned long hw_err_reset_req; - enum hnae3_reset_type reset_type; void *priv; }; @@ -356,8 +355,6 @@ struct hnae3_ae_dev { * Set vlan filter config of Ports * set_vf_vlan_filter() * Set vlan filter config of vf - * restore_vlan_table() - * Restore vlan filter entries after reset * enable_hw_strip_rxvtag() * Enable/disable hardware strip vlan tag of packets received * set_gro_en @@ -528,7 +525,6 @@ struct hnae3_ae_ops { struct ethtool_rxnfc *cmd); int (*get_fd_all_rules)(struct hnae3_handle *handle, struct ethtool_rxnfc *cmd, u32 *rule_locs); - int (*restore_fd_rules)(struct hnae3_handle *handle); void (*enable_fd)(struct hnae3_handle *handle, bool enable); int (*add_arfs_entry)(struct hnae3_handle *handle, u16 queue_id, u16 flow_id, struct flow_keys *fkeys); @@ -542,7 +538,6 @@ struct hnae3_ae_ops { void (*set_timer_task)(struct hnae3_handle *handle, bool enable); int (*mac_connect_phy)(struct hnae3_handle *handle); void (*mac_disconnect_phy)(struct hnae3_handle *handle); - void (*restore_vlan_table)(struct hnae3_handle *handle); int (*get_vf_config)(struct hnae3_handle *handle, int vf, struct ifla_vf_info *ivf); int (*set_vf_link_state)(struct hnae3_handle *handle, int vf, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 6b9535c17647..c79d6a391105 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2102,7 +2102,6 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ae_dev->pdev = pdev; ae_dev->flag = ent->driver_data; - ae_dev->reset_type = HNAE3_NONE_RESET; hns3_get_dev_capability(pdev, ae_dev); pci_set_drvdata(pdev, ae_dev); @@ -3936,17 +3935,6 @@ static void hns3_uninit_phy(struct net_device *netdev) h->ae_algo->ops->mac_disconnect_phy(h); } -static int hns3_restore_fd_rules(struct net_device *netdev) -{ - struct hnae3_handle *h = hns3_get_handle(netdev); - int ret = 0; - - if (h->ae_algo->ops->restore_fd_rules) - ret = h->ae_algo->ops->restore_fd_rules(h); - - return ret; -} - static void hns3_del_all_fd_rules(struct net_device *netdev, bool clear_list) { struct hnae3_handle *h = hns3_get_handle(netdev); @@ -4346,7 +4334,6 @@ static void hns3_restore_coal(struct hns3_nic_priv *priv) static int hns3_reset_notify_down_enet(struct hnae3_handle *handle) { - struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); struct hnae3_knic_private_info *kinfo = &handle->kinfo; struct net_device *ndev = kinfo->netdev; struct hns3_nic_priv *priv = netdev_priv(ndev); @@ -4354,13 +4341,6 @@ static int hns3_reset_notify_down_enet(struct hnae3_handle *handle) if (test_and_set_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) return 0; - /* it is cumbersome for hardware to pick-and-choose entries for deletion - * from table space. Hence, for function reset software intervention is - * required to delete the entries - */ - if (hns3_dev_ongoing_func_reset(ae_dev)) - hns3_del_all_fd_rules(ndev, false); - if (!netif_running(ndev)) return 0; @@ -4455,16 +4435,6 @@ err_put_ring: return ret; } -static int hns3_reset_notify_restore_enet(struct hnae3_handle *handle) -{ - struct net_device *netdev = handle->kinfo.netdev; - - if (handle->ae_algo->ops->restore_vlan_table) - handle->ae_algo->ops->restore_vlan_table(handle); - - return hns3_restore_fd_rules(netdev); -} - static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) { struct net_device *netdev = handle->kinfo.netdev; @@ -4514,9 +4484,6 @@ static int hns3_reset_notify(struct hnae3_handle *handle, case HNAE3_UNINIT_CLIENT: ret = hns3_reset_notify_uninit_enet(handle); break; - case HNAE3_RESTORE_CLIENT: - ret = hns3_reset_notify_restore_enet(handle); - break; default: break; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 53bc0edf9b6f..240ba06cd0eb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -576,15 +576,6 @@ static inline void hns3_write_reg(void __iomem *base, u32 reg, u32 value) writel(value, reg_addr + reg); } -static inline bool hns3_dev_ongoing_func_reset(struct hnae3_ae_dev *ae_dev) -{ - return (ae_dev && (ae_dev->reset_type == HNAE3_FUNC_RESET || - ae_dev->reset_type == HNAE3_FLR_RESET || - ae_dev->reset_type == HNAE3_VF_FUNC_RESET || - ae_dev->reset_type == HNAE3_VF_FULL_RESET || - ae_dev->reset_type == HNAE3_VF_PF_FUNC_RESET)); -} - #define hns3_read_dev(a, reg) \ hns3_read_reg((a)->io_base, (reg)) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 177ef5e93d06..c74990a59e10 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -69,6 +69,7 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev, static int hclge_set_default_loopback(struct hclge_dev *hdev); static void hclge_sync_mac_table(struct hclge_dev *hdev); +static void hclge_restore_hw_table(struct hclge_dev *hdev); static void hclge_sync_promisc_mode(struct hclge_dev *hdev); static struct hnae3_ae_algo ae_algo; @@ -3731,22 +3732,13 @@ static int hclge_reset_stack(struct hclge_dev *hdev) if (ret) return ret; - ret = hclge_notify_client(hdev, HNAE3_INIT_CLIENT); - if (ret) - return ret; - - return hclge_notify_client(hdev, HNAE3_RESTORE_CLIENT); + return hclge_notify_client(hdev, HNAE3_INIT_CLIENT); } static int hclge_reset_prepare(struct hclge_dev *hdev) { - struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); int ret; - /* Initialize ae_dev reset status as well, in case enet layer wants to - * know if device is undergoing reset - */ - ae_dev->reset_type = hdev->reset_type; hdev->rst_stats.reset_cnt++; /* perform reset of the stack & ae device for a client */ ret = hclge_notify_roce_client(hdev, HNAE3_DOWN_CLIENT); @@ -3808,7 +3800,6 @@ static int hclge_reset_rebuild(struct hclge_dev *hdev) hdev->last_reset_time = jiffies; hdev->rst_stats.reset_fail_cnt = 0; hdev->rst_stats.reset_done_cnt++; - ae_dev->reset_type = HNAE3_NONE_RESET; clear_bit(HCLGE_STATE_RST_FAIL, &hdev->state); /* if default_reset_request has a higher level reset request, @@ -6942,8 +6933,14 @@ int hclge_vport_start(struct hclge_vport *vport) set_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); vport->last_active_jiffies = jiffies; - if (test_bit(vport->vport_id, hdev->vport_config_block)) - hclge_restore_mac_table_common(vport); + if (test_bit(vport->vport_id, hdev->vport_config_block)) { + if (vport->vport_id) { + hclge_restore_mac_table_common(vport); + hclge_restore_vport_vlan_table(vport); + } else { + hclge_restore_hw_table(hdev); + } + } clear_bit(vport->vport_id, hdev->vport_config_block); @@ -8789,39 +8786,34 @@ void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev) } } -static void hclge_restore_vlan_table(struct hnae3_handle *handle) +void hclge_restore_vport_vlan_table(struct hclge_vport *vport) { - struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_vport_vlan_cfg *vlan, *tmp; struct hclge_dev *hdev = vport->back; u16 vlan_proto; - u16 state, vlan_id; - int i; + u16 vlan_id; + u16 state; + int ret; - for (i = 0; i < hdev->num_alloc_vport; i++) { - vport = &hdev->vport[i]; - vlan_proto = vport->port_base_vlan_cfg.vlan_info.vlan_proto; - vlan_id = vport->port_base_vlan_cfg.vlan_info.vlan_tag; - state = vport->port_base_vlan_cfg.state; + vlan_proto = vport->port_base_vlan_cfg.vlan_info.vlan_proto; + vlan_id = vport->port_base_vlan_cfg.vlan_info.vlan_tag; + state = vport->port_base_vlan_cfg.state; - if (state != HNAE3_PORT_BASE_VLAN_DISABLE) { - hclge_set_vlan_filter_hw(hdev, htons(vlan_proto), - vport->vport_id, vlan_id, - false); - continue; - } - - list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { - int ret; + if (state != HNAE3_PORT_BASE_VLAN_DISABLE) { + clear_bit(vport->vport_id, hdev->vlan_table[vlan_id]); + hclge_set_vlan_filter_hw(hdev, htons(vlan_proto), + vport->vport_id, vlan_id, + false); + return; + } - if (!vlan->hd_tbl_status) - continue; - ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), - vport->vport_id, - vlan->vlan_id, false); - if (ret) - break; - } + list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { + ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), + vport->vport_id, + vlan->vlan_id, false); + if (ret) + break; + vlan->hd_tbl_status = true; } } @@ -8856,6 +8848,18 @@ void hclge_restore_mac_table_common(struct hclge_vport *vport) spin_unlock_bh(&vport->mac_list_lock); } +static void hclge_restore_hw_table(struct hclge_dev *hdev) +{ + struct hclge_vport *vport = &hdev->vport[0]; + struct hnae3_handle *handle = &vport->nic; + + hclge_restore_mac_table_common(vport); + hclge_restore_vport_vlan_table(vport); + set_bit(HCLGE_STATE_PROMISC_CHANGED, &hdev->state); + + hclge_restore_fd_entries(handle); +} + int hclge_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable) { struct hclge_vport *vport = hclge_get_vport(handle); @@ -10352,13 +10356,12 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev) */ if (hdev->reset_type == HNAE3_IMP_RESET || hdev->reset_type == HNAE3_GLOBAL_RESET) { + memset(hdev->vlan_table, 0, sizeof(hdev->vlan_table)); + memset(hdev->vf_vlan_full, 0, sizeof(hdev->vf_vlan_full)); bitmap_set(hdev->vport_config_block, 0, hdev->num_alloc_vport); hclge_reset_umv_space(hdev); } - memset(hdev->vlan_table, 0, sizeof(hdev->vlan_table)); - memset(hdev->vf_vlan_full, 0, sizeof(hdev->vf_vlan_full)); - ret = hclge_cmd_init(hdev); if (ret) { dev_err(&pdev->dev, "Cmd queue init failed\n"); @@ -11191,7 +11194,6 @@ static const struct hnae3_ae_ops hclge_ops = { .get_fd_rule_cnt = hclge_get_fd_rule_cnt, .get_fd_rule_info = hclge_get_fd_rule_info, .get_fd_all_rules = hclge_get_all_rules, - .restore_fd_rules = hclge_restore_fd_entries, .enable_fd = hclge_enable_fd, .add_arfs_entry = hclge_add_fd_entry_by_arfs, .dbg_run_cmd = hclge_dbg_run_cmd, @@ -11204,7 +11206,6 @@ static const struct hnae3_ae_ops hclge_ops = { .set_timer_task = hclge_set_timer_task, .mac_connect_phy = hclge_mac_connect_phy, .mac_disconnect_phy = hclge_mac_disconnect_phy, - .restore_vlan_table = hclge_restore_vlan_table, .get_vf_config = hclge_get_vf_config, .set_vf_link_state = hclge_set_vf_link_state, .set_vf_spoofchk = hclge_set_vf_spoofchk, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 8e69651f8bf1..913c4f677404 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -1001,6 +1001,7 @@ void hclge_rm_vport_all_mac_table(struct hclge_vport *vport, bool is_del_list, void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list); void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev); void hclge_restore_mac_table_common(struct hclge_vport *vport); +void hclge_restore_vport_vlan_table(struct hclge_vport *vport); int hclge_update_port_base_vlan_cfg(struct hclge_vport *vport, u16 state, struct hclge_vlan_info *vlan_info); int hclge_push_vf_port_base_vlan_info(struct hclge_vport *vport, u8 vfid, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 0efc04562ba6..ac70fafd15d5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -629,6 +629,23 @@ static void hclge_handle_ncsi_error(struct hclge_dev *hdev) ae_dev->ops->reset_event(hdev->pdev, NULL); } +static void hclge_handle_vf_tbl(struct hclge_vport *vport, + struct hclge_mbx_vf_to_pf_cmd *mbx_req) +{ + struct hclge_dev *hdev = vport->back; + struct hclge_vf_vlan_cfg *msg_cmd; + + msg_cmd = (struct hclge_vf_vlan_cfg *)&mbx_req->msg; + if (msg_cmd->subcode == HCLGE_MBX_VPORT_LIST_CLEAR) { + hclge_rm_vport_all_mac_table(vport, true, HCLGE_MAC_ADDR_UC); + hclge_rm_vport_all_mac_table(vport, true, HCLGE_MAC_ADDR_MC); + hclge_rm_vport_all_vlan_table(vport, true); + } else { + dev_warn(&hdev->pdev->dev, "Invalid cmd(%u)\n", + msg_cmd->subcode); + } +} + void hclge_mbx_handler(struct hclge_dev *hdev) { struct hclge_cmq_ring *crq = &hdev->hw.cmq.crq; @@ -636,6 +653,7 @@ void hclge_mbx_handler(struct hclge_dev *hdev) struct hclge_mbx_vf_to_pf_cmd *req; struct hclge_vport *vport; struct hclge_desc *desc; + bool is_del = false; unsigned int flag; int ret = 0; @@ -753,11 +771,12 @@ void hclge_mbx_handler(struct hclge_dev *hdev) break; case HCLGE_MBX_GET_VF_FLR_STATUS: case HCLGE_MBX_VF_UNINIT: - hclge_rm_vport_all_mac_table(vport, true, + is_del = req->msg.code == HCLGE_MBX_VF_UNINIT; + hclge_rm_vport_all_mac_table(vport, is_del, HCLGE_MAC_ADDR_UC); - hclge_rm_vport_all_mac_table(vport, true, + hclge_rm_vport_all_mac_table(vport, is_del, HCLGE_MAC_ADDR_MC); - hclge_rm_vport_all_vlan_table(vport, true); + hclge_rm_vport_all_vlan_table(vport, is_del); break; case HCLGE_MBX_GET_MEDIA_TYPE: hclge_get_vf_media_type(vport, &resp_msg); @@ -771,6 +790,9 @@ void hclge_mbx_handler(struct hclge_dev *hdev) case HCLGE_MBX_NCSI_ERROR: hclge_handle_ncsi_error(hdev); break; + case HCLGE_MBX_HANDLE_VF_TBL: + hclge_handle_vf_tbl(vport, req); + break; default: dev_err(&hdev->pdev->dev, "un-supported mailbox message, code = %u\n", diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index fea197fd77cb..32341dcaa6c1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1777,10 +1777,6 @@ static int hclgevf_reset_stack(struct hclgevf_dev *hdev) if (ret) return ret; - ret = hclgevf_notify_client(hdev, HNAE3_RESTORE_CLIENT); - if (ret) - return ret; - /* clear handshake status with IMP */ hclgevf_reset_handshake(hdev, false); @@ -1860,13 +1856,8 @@ static void hclgevf_reset_err_handle(struct hclgevf_dev *hdev) static int hclgevf_reset_prepare(struct hclgevf_dev *hdev) { - struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); int ret; - /* Initialize ae_dev reset status as well, in case enet layer wants to - * know if device is undergoing reset - */ - ae_dev->reset_type = hdev->reset_type; hdev->rst_stats.rst_cnt++; rtnl_lock(); @@ -1881,7 +1872,6 @@ static int hclgevf_reset_prepare(struct hclgevf_dev *hdev) static int hclgevf_reset_rebuild(struct hclgevf_dev *hdev) { - struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); int ret; hdev->rst_stats.hw_rst_done_cnt++; @@ -1896,7 +1886,6 @@ static int hclgevf_reset_rebuild(struct hclgevf_dev *hdev) } hdev->last_reset_time = jiffies; - ae_dev->reset_type = HNAE3_NONE_RESET; hdev->rst_stats.rst_done_cnt++; hdev->rst_stats.rst_fail_cnt = 0; clear_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state); @@ -2974,6 +2963,15 @@ static int hclgevf_pci_reset(struct hclgevf_dev *hdev) return ret; } +static int hclgevf_clear_vport_list(struct hclgevf_dev *hdev) +{ + struct hclge_vf_to_pf_msg send_msg; + + hclgevf_build_send_msg(&send_msg, HCLGE_MBX_HANDLE_VF_TBL, + HCLGE_MBX_VPORT_LIST_CLEAR); + return hclgevf_send_mbx_msg(hdev, &send_msg, false, NULL, 0); +} + static int hclgevf_reset_hdev(struct hclgevf_dev *hdev) { struct pci_dev *pdev = hdev->pdev; @@ -3083,6 +3081,15 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) goto err_config; } + /* ensure vf tbl list as empty before init*/ + ret = hclgevf_clear_vport_list(hdev); + if (ret) { + dev_err(&pdev->dev, + "failed to clear tbl list configuration, ret = %d.\n", + ret); + goto err_config; + } + ret = hclgevf_init_vlan_config(hdev); if (ret) { dev_err(&hdev->pdev->dev, -- cgit v1.2.3-59-g8ed1b From dce38b74b2b57c6aeb9eafa9fb08451a7bb022dc Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Fri, 24 Apr 2020 17:08:50 +0800 Subject: net: phy: dp83867: Remove unneeded semicolon Fixes coccicheck warning: drivers/net/phy/dp83867.c:368:2-3: Unneeded semicolon drivers/net/phy/dp83867.c:403:2-3: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: Zheng Bin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index b55e3c0403ed..4017ae1692d8 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -365,7 +365,7 @@ static int dp83867_get_downshift(struct phy_device *phydev, u8 *data) break; default: return -EINVAL; - }; + } *data = enable ? count : DOWNSHIFT_DEV_DISABLE; @@ -400,7 +400,7 @@ static int dp83867_set_downshift(struct phy_device *phydev, u8 cnt) phydev_err(phydev, "Downshift count must be 1, 2, 4 or 8\n"); return -EINVAL; - }; + } val = DP83867_DOWNSHIFT_EN; val |= FIELD_PREP(DP83867_DOWNSHIFT_ATTEMPT_MASK, count); -- cgit v1.2.3-59-g8ed1b From ae23aae229b8a253a3d5d51acf3bd6103a519d85 Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Fri, 24 Apr 2020 17:13:35 +0800 Subject: octeontx2-pf: Remove unneeded semicolon Fixes coccicheck warning: drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h:312:2-3: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: Zheng Bin Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 018c283a0ac4..0b1c653b3449 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -309,7 +309,7 @@ static inline void __iomem *otx2_get_regaddr(struct otx2_nic *nic, u64 offset) default: blkaddr = BLKADDR_RVUM; break; - }; + } offset &= ~(RVU_FUNC_BLKADDR_MASK << RVU_FUNC_BLKADDR_SHIFT); offset |= (blkaddr << RVU_FUNC_BLKADDR_SHIFT); -- cgit v1.2.3-59-g8ed1b From 460fd830dd9d68e07c4d15363fd764944090e1f8 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Fri, 24 Apr 2020 12:33:18 +0300 Subject: dpaa2-eth: add channel stat to debugfs Compute the average number of frames processed for each CDAN (Channel Data Availability Notification) and export it to debugfs detailed channel stats. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c | 9 ++++++--- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 1 + drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 2 ++ drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c index a9afe46b837f..80291afff3ea 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c @@ -127,16 +127,19 @@ static int dpaa2_dbg_ch_show(struct seq_file *file, void *offset) int i; seq_printf(file, "Channel stats for %s:\n", priv->net_dev->name); - seq_printf(file, "%s%16s%16s%16s%16s\n", - "CHID", "CPU", "Deq busy", "CDANs", "Buf count"); + seq_printf(file, "%s%16s%16s%16s%16s%16s%16s\n", + "CHID", "CPU", "Deq busy", "Frames", "CDANs", + "Avg Frm/CDAN", "Buf count"); for (i = 0; i < priv->num_channels; i++) { ch = priv->channel[i]; - seq_printf(file, "%4d%16d%16llu%16llu%16d\n", + seq_printf(file, "%4d%16d%16llu%16llu%16llu%16llu%16d\n", ch->ch_id, ch->nctx.desired_cpu, ch->stats.dequeue_portal_busy, + ch->stats.frames, ch->stats.cdan, + ch->stats.frames / ch->stats.cdan, ch->buf_count); } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index d271c016229d..8ec435ba7d27 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -493,6 +493,7 @@ static int consume_frames(struct dpaa2_eth_channel *ch, return 0; fq->stats.frames += cleaned; + ch->stats.frames += cleaned; /* A dequeue operation only pulls frames from a single queue * into the store. Return the frame queue as an out param. diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 289053099974..43cd8409f2e9 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -288,6 +288,8 @@ struct dpaa2_eth_ch_stats { __u64 xdp_tx; __u64 xdp_tx_err; __u64 xdp_redirect; + /* Must be last, does not show up in ethtool stats */ + __u64 frames; }; /* Maximum number of queues associated with a DPNI */ diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c index 94347c695233..bd13ee48d623 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c @@ -277,7 +277,7 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device *net_dev, /* Per-channel stats */ for (k = 0; k < priv->num_channels; k++) { ch_stats = &priv->channel[k]->stats; - for (j = 0; j < sizeof(*ch_stats) / sizeof(__u64); j++) + for (j = 0; j < sizeof(*ch_stats) / sizeof(__u64) - 1; j++) *((__u64 *)data + i + j) += *((__u64 *)ch_stats + j); } i += j; -- cgit v1.2.3-59-g8ed1b From 071c8ed6e88d2ac0a5f26948fb9c288fd4dd6e40 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Apr 2020 12:31:50 +0200 Subject: tcp: mptcp: use mptcp receive buffer space to select rcv window In MPTCP, the receive window is shared across all subflows, because it refers to the mptcp-level sequence space. MPTCP receivers already place incoming packets on the mptcp socket receive queue and will charge it to the mptcp socket rcvbuf until userspace consumes the data. Update __tcp_select_window to use the occupancy of the parent/mptcp socket instead of the subflow socket in case the tcp socket is part of a logical mptcp connection. This commit doesn't change choice of initial window for passive or active connections. While it would be possible to change those as well, this adds complexity (especially when handling MP_JOIN requests). Furthermore, the MPTCP RFC specifically says that a MPTCP sender 'MUST NOT use the RCV.WND field of a TCP segment at the connection level if it does not also carry a DSS option with a Data ACK field.' SYN/SYNACK packets do not carry a DSS option with a Data ACK field. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/mptcp.h | 3 +++ net/ipv4/tcp_output.c | 8 ++++++-- net/mptcp/subflow.c | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0e7c5471010b..5288fba56e55 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -68,6 +68,8 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return tcp_rsk(req)->is_mptcp; } +void mptcp_space(const struct sock *ssk, int *space, int *full_space); + void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, @@ -197,6 +199,7 @@ static inline bool mptcp_sk_is_subflow(const struct sock *sk) return false; } +static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } #endif /* CONFIG_MPTCP */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2f45cde168c4..ba4482130f08 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2772,8 +2772,12 @@ u32 __tcp_select_window(struct sock *sk) int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); - int full_space = min_t(int, tp->window_clamp, allowed_space); - int window; + int full_space, window; + + if (sk_is_mptcp(sk)) + mptcp_space(sk, &free_space, &allowed_space); + + full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fabd06f2ff45..87c094702d63 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -821,6 +821,24 @@ bool mptcp_subflow_data_available(struct sock *sk) return subflow->data_avail; } +/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, + * not the ssk one. + * + * In mptcp, rwin is about the mptcp-level connection data. + * + * Data that is still on the ssk rx queue can thus be ignored, + * as far as mptcp peer is concerened that data is still inflight. + * DSS ACK is updated when skb is moved to the mptcp rx queue. + */ +void mptcp_space(const struct sock *ssk, int *space, int *full_space) +{ + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + const struct sock *sk = subflow->conn; + + *space = tcp_space(sk); + *full_space = tcp_full_space(sk); +} + static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); -- cgit v1.2.3-59-g8ed1b From f30e472071c827e2c2f191c8a011b7e371e17af5 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 24 Apr 2020 12:43:09 +0000 Subject: hsr: remove unnecessary code in hsr_dev_change_mtu() In the hsr_dev_change_mtu(), the 'dev' and 'master->dev' pointer are same. So, the 'master' variable and some code are unnecessary. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index fc7027314ad8..cd99f548e440 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -125,13 +125,11 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { struct hsr_priv *hsr; - struct hsr_port *master; hsr = netdev_priv(dev); - master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (new_mtu > hsr_get_max_mtu(hsr)) { - netdev_info(master->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", + netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", HSR_HLEN); return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From 3e14462f1bee6d74eea6425d7965357d55151f80 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 24 Apr 2020 20:52:26 +0800 Subject: ptp: clockmatrix: remove unnecessary comparison The type of loaddr is u8 which is always '<=' 0xff, so the loaddr <= 0xff is always true, we can remove this comparison. Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Reviewed-by: Vincent Cheng Signed-off-by: David S. Miller --- drivers/ptp/ptp_clockmatrix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c index 032e112c3dd9..a3f608832660 100644 --- a/drivers/ptp/ptp_clockmatrix.c +++ b/drivers/ptp/ptp_clockmatrix.c @@ -780,7 +780,7 @@ static int idtcm_load_firmware(struct idtcm *idtcm, /* Page size 128, last 4 bytes of page skipped */ if (((loaddr > 0x7b) && (loaddr <= 0x7f)) - || ((loaddr > 0xfb) && (loaddr <= 0xff))) + || loaddr > 0xfb) continue; err = idtcm_write(idtcm, regaddr, 0, &val, sizeof(val)); -- cgit v1.2.3-59-g8ed1b From 308de89fedf4240d2270d00a36b90a1394fef6a7 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 21:11:34 +0800 Subject: liquidio: remove unused inline functions commit b6334be64d6f ("net/liquidio: Delete driver version assignment") left behind this, remove it. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/octeon_device.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h index 3d01d3602d8f..fb380b4f3e02 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h @@ -712,18 +712,6 @@ struct octeon_device *lio_get_device(u32 octeon_id); */ int lio_get_device_id(void *dev); -static inline u16 OCTEON_MAJOR_REV(struct octeon_device *oct) -{ - u16 rev = (oct->rev_id & 0xC) >> 2; - - return (rev == 0) ? 1 : rev; -} - -static inline u16 OCTEON_MINOR_REV(struct octeon_device *oct) -{ - return oct->rev_id & 0x3; -} - /** Read windowed register. * @param oct - pointer to the Octeon device. * @param addr - Address of the register to read. -- cgit v1.2.3-59-g8ed1b From 163749ad8436b3206709172406d68869c40bc176 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 21:12:56 +0800 Subject: qlcnic: remove unused inline function qlcnic_hw_write_wx_2M There's no callers in-tree anymore. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h index 134611aa2c9a..d838774af5a6 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h @@ -1880,12 +1880,6 @@ static inline void qlcnic_write_crb(struct qlcnic_adapter *adapter, char *buf, adapter->ahw->hw_ops->write_crb(adapter, buf, offset, size); } -static inline int qlcnic_hw_write_wx_2M(struct qlcnic_adapter *adapter, - ulong off, u32 data) -{ - return adapter->ahw->hw_ops->write_reg(adapter, off, data); -} - static inline int qlcnic_get_mac_address(struct qlcnic_adapter *adapter, u8 *mac, u8 function) { -- cgit v1.2.3-59-g8ed1b From df346f1aac6cef551b69d788b022a942270dc17b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 21:13:34 +0800 Subject: dccp: remove unused inline function dccp_set_seqno There's no callers in-tree since commit 792b48780e8b ("dccp: Implement both feature-local and feature-remote Sequence Window feature") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/dccp/dccp.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 9c3b27c257bb..7dce4f6c7025 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -108,11 +108,6 @@ extern int sysctl_dccp_sync_ratelimit; #define ADD48(a, b) (((a) + (b)) & UINT48_MAX) #define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) -static inline void dccp_set_seqno(u64 *seqno, u64 value) -{ - *seqno = value & UINT48_MAX; -} - static inline void dccp_inc_seqno(u64 *seqno) { *seqno = ADD48(*seqno, 1); -- cgit v1.2.3-59-g8ed1b From c90af587a9eee697e2d89683113707cada70116a Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Fri, 24 Apr 2020 21:53:14 +0800 Subject: net/mlx4_core: Add missing iounmap() in error path This fixes the following coccicheck warning: drivers/net/ethernet/mellanox/mlx4/crdump.c:200:2-8: ERROR: missing iounmap; ioremap on line 190 and execution via conditional on line 198 Fixes: 7ef19d3b1d5e ("devlink: report error once U32_MAX snapshot ids have been used") Reported-by: Hulk Robot Signed-off-by: Zou Wei Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/crdump.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/crdump.c b/drivers/net/ethernet/mellanox/mlx4/crdump.c index 73eae80e1cb7..ac5468b77488 100644 --- a/drivers/net/ethernet/mellanox/mlx4/crdump.c +++ b/drivers/net/ethernet/mellanox/mlx4/crdump.c @@ -197,6 +197,7 @@ int mlx4_crdump_collect(struct mlx4_dev *dev) err = devlink_region_snapshot_id_get(devlink, &id); if (err) { mlx4_err(dev, "crdump: devlink get snapshot id err %d\n", err); + iounmap(cr_space); return err; } -- cgit v1.2.3-59-g8ed1b From a425b6e1c69ba907b72b737a4d44f8cfbc43ce3c Mon Sep 17 00:00:00 2001 From: Luo bin Date: Sat, 25 Apr 2020 01:21:09 +0000 Subject: hinic: add mailbox function support virtual function and physical function can communicate with each other through mailbox channel supported by hw Signed-off-by: Luo bin Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/Makefile | 2 +- drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h | 2 +- drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c | 25 +- drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h | 2 + drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h | 3 +- drivers/net/ethernet/huawei/hinic/hinic_hw_if.c | 46 +- drivers/net/ethernet/huawei/hinic/hinic_hw_if.h | 18 + drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c | 1213 +++++++++++++++++++++ drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.h | 154 +++ drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h | 1 + 10 files changed, 1456 insertions(+), 10 deletions(-) create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.h diff --git a/drivers/net/ethernet/huawei/hinic/Makefile b/drivers/net/ethernet/huawei/hinic/Makefile index fe88ab88cacc..a73862a64690 100644 --- a/drivers/net/ethernet/huawei/hinic/Makefile +++ b/drivers/net/ethernet/huawei/hinic/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_HINIC) += hinic.o hinic-y := hinic_main.o hinic_tx.o hinic_rx.o hinic_port.o hinic_hw_dev.o \ hinic_hw_io.o hinic_hw_qp.o hinic_hw_cmdq.o hinic_hw_wq.o \ hinic_hw_mgmt.o hinic_hw_api_cmd.o hinic_hw_eqs.o hinic_hw_if.o \ - hinic_common.o hinic_ethtool.o + hinic_common.o hinic_ethtool.o hinic_hw_mbox.o diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h index cdec1d0a3962..7e84e4e33fff 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h @@ -10,7 +10,7 @@ /* HW interface registers */ #define HINIC_CSR_FUNC_ATTR0_ADDR 0x0 #define HINIC_CSR_FUNC_ATTR1_ADDR 0x4 - +#define HINIC_CSR_FUNC_ATTR2_ADDR 0x8 #define HINIC_CSR_FUNC_ATTR4_ADDR 0x10 #define HINIC_CSR_FUNC_ATTR5_ADDR 0x14 diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c index c7c75b772a86..f2cf6f7ffc34 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c @@ -676,10 +676,23 @@ static int init_pfhwdev(struct hinic_pfhwdev *pfhwdev) return err; } - hinic_register_mgmt_msg_cb(&pfhwdev->pf_to_mgmt, HINIC_MOD_L2NIC, - pfhwdev, nic_mgmt_msg_handler); + err = hinic_func_to_func_init(hwdev); + if (err) { + dev_err(&hwif->pdev->dev, "Failed to init mailbox\n"); + hinic_pf_to_mgmt_free(&pfhwdev->pf_to_mgmt); + return err; + } + + if (!HINIC_IS_VF(hwif)) + hinic_register_mgmt_msg_cb(&pfhwdev->pf_to_mgmt, + HINIC_MOD_L2NIC, pfhwdev, + nic_mgmt_msg_handler); + else + hinic_register_vf_mbox_cb(hwdev, HINIC_MOD_L2NIC, + nic_mgmt_msg_handler); hinic_set_pf_action(hwif, HINIC_PF_MGMT_ACTIVE); + return 0; } @@ -693,7 +706,13 @@ static void free_pfhwdev(struct hinic_pfhwdev *pfhwdev) hinic_set_pf_action(hwdev->hwif, HINIC_PF_MGMT_INIT); - hinic_unregister_mgmt_msg_cb(&pfhwdev->pf_to_mgmt, HINIC_MOD_L2NIC); + if (!HINIC_IS_VF(hwdev->hwif)) + hinic_unregister_mgmt_msg_cb(&pfhwdev->pf_to_mgmt, + HINIC_MOD_L2NIC); + else + hinic_unregister_vf_mbox_cb(hwdev, HINIC_MOD_L2NIC); + + hinic_func_to_func_free(hwdev); hinic_pf_to_mgmt_free(&pfhwdev->pf_to_mgmt); } diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h index 66fd2340d447..2574086aa314 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h @@ -16,6 +16,7 @@ #include "hinic_hw_mgmt.h" #include "hinic_hw_qp.h" #include "hinic_hw_io.h" +#include "hinic_hw_mbox.h" #define HINIC_MAX_QPS 32 @@ -225,6 +226,7 @@ struct hinic_hwdev { struct hinic_aeqs aeqs; struct hinic_func_to_io func_to_io; + struct hinic_mbox_func_to_func *func_to_func; struct hinic_cap nic_cap; }; diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h index d35f2068ee0c..d73256da4b80 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h @@ -143,8 +143,9 @@ enum hinic_eq_type { }; enum hinic_aeq_type { + HINIC_MBX_FROM_FUNC = 1, HINIC_MSG_FROM_MGMT_CPU = 2, - + HINIC_MBX_SEND_RSLT = 5, HINIC_MAX_AEQ_EVENTS, }; diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c index 07bbfbf68577..3fbd2eb80582 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c @@ -115,8 +115,12 @@ int hinic_msix_attr_cnt_clear(struct hinic_hwif *hwif, u16 msix_index) **/ void hinic_set_pf_action(struct hinic_hwif *hwif, enum hinic_pf_action action) { - u32 attr5 = hinic_hwif_read_reg(hwif, HINIC_CSR_FUNC_ATTR5_ADDR); + u32 attr5; + if (HINIC_IS_VF(hwif)) + return; + + attr5 = hinic_hwif_read_reg(hwif, HINIC_CSR_FUNC_ATTR5_ADDR); attr5 = HINIC_FA5_CLEAR(attr5, PF_ACTION); attr5 |= HINIC_FA5_SET(action, PF_ACTION); @@ -203,7 +207,8 @@ static int hwif_ready(struct hinic_hwif *hwif) * @attr0: the first attribute that was read from the hw * @attr1: the second attribute that was read from the hw **/ -static void set_hwif_attr(struct hinic_hwif *hwif, u32 attr0, u32 attr1) +static void set_hwif_attr(struct hinic_hwif *hwif, u32 attr0, u32 attr1, + u32 attr2) { hwif->attr.func_idx = HINIC_FA0_GET(attr0, FUNC_IDX); hwif->attr.pf_idx = HINIC_FA0_GET(attr0, PF_IDX); @@ -214,6 +219,8 @@ static void set_hwif_attr(struct hinic_hwif *hwif, u32 attr0, u32 attr1) hwif->attr.num_ceqs = BIT(HINIC_FA1_GET(attr1, CEQS_PER_FUNC)); hwif->attr.num_irqs = BIT(HINIC_FA1_GET(attr1, IRQS_PER_FUNC)); hwif->attr.num_dma_attr = BIT(HINIC_FA1_GET(attr1, DMA_ATTR_PER_FUNC)); + hwif->attr.global_vf_id_of_pf = HINIC_FA2_GET(attr2, + GLOBAL_VF_ID_OF_PF); } /** @@ -222,7 +229,7 @@ static void set_hwif_attr(struct hinic_hwif *hwif, u32 attr0, u32 attr1) **/ static void read_hwif_attr(struct hinic_hwif *hwif) { - u32 addr, attr0, attr1; + u32 addr, attr0, attr1, attr2; addr = HINIC_CSR_FUNC_ATTR0_ADDR; attr0 = hinic_hwif_read_reg(hwif, addr); @@ -230,7 +237,10 @@ static void read_hwif_attr(struct hinic_hwif *hwif) addr = HINIC_CSR_FUNC_ATTR1_ADDR; attr1 = hinic_hwif_read_reg(hwif, addr); - set_hwif_attr(hwif, attr0, attr1); + addr = HINIC_CSR_FUNC_ATTR2_ADDR; + attr2 = hinic_hwif_read_reg(hwif, addr); + + set_hwif_attr(hwif, attr0, attr1, attr2); } /** @@ -309,6 +319,34 @@ static void dma_attr_init(struct hinic_hwif *hwif) HINIC_PCIE_SNOOP, HINIC_PCIE_TPH_DISABLE); } +u16 hinic_glb_pf_vf_offset(struct hinic_hwif *hwif) +{ + if (!hwif) + return 0; + + return hwif->attr.global_vf_id_of_pf; +} + +u16 hinic_global_func_id_hw(struct hinic_hwif *hwif) +{ + u32 addr, attr0; + + addr = HINIC_CSR_FUNC_ATTR0_ADDR; + attr0 = hinic_hwif_read_reg(hwif, addr); + + return HINIC_FA0_GET(attr0, FUNC_IDX); +} + +u16 hinic_pf_id_of_vf_hw(struct hinic_hwif *hwif) +{ + u32 addr, attr0; + + addr = HINIC_CSR_FUNC_ATTR0_ADDR; + attr0 = hinic_hwif_read_reg(hwif, addr); + + return HINIC_FA0_GET(attr0, PF_IDX); +} + /** * hinic_init_hwif - initialize the hw interface * @hwif: the HW interface of a pci function device diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h index c7bb9ceca72c..53bb89c1dd26 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h @@ -35,6 +35,7 @@ #define HINIC_FA0_FUNC_IDX_SHIFT 0 #define HINIC_FA0_PF_IDX_SHIFT 10 #define HINIC_FA0_PCI_INTF_IDX_SHIFT 14 +#define HINIC_FA0_VF_IN_PF_SHIFT 16 /* reserved members - off 16 */ #define HINIC_FA0_FUNC_TYPE_SHIFT 24 @@ -42,6 +43,7 @@ #define HINIC_FA0_PF_IDX_MASK 0xF #define HINIC_FA0_PCI_INTF_IDX_MASK 0x3 #define HINIC_FA0_FUNC_TYPE_MASK 0x1 +#define HINIC_FA0_VF_IN_PF_MASK 0xFF #define HINIC_FA0_GET(val, member) \ (((val) >> HINIC_FA0_##member##_SHIFT) & HINIC_FA0_##member##_MASK) @@ -64,6 +66,12 @@ #define HINIC_FA1_GET(val, member) \ (((val) >> HINIC_FA1_##member##_SHIFT) & HINIC_FA1_##member##_MASK) +#define HINIC_FA2_GLOBAL_VF_ID_OF_PF_SHIFT 16 +#define HINIC_FA2_GLOBAL_VF_ID_OF_PF_MASK 0x3FF + +#define HINIC_FA2_GET(val, member) \ + (((val) >> HINIC_FA2_##member##_SHIFT) & HINIC_FA2_##member##_MASK) + #define HINIC_FA4_OUTBOUND_STATE_SHIFT 0 #define HINIC_FA4_DB_STATE_SHIFT 1 @@ -140,6 +148,7 @@ #define HINIC_HWIF_PPF_IDX(hwif) ((hwif)->attr.ppf_idx) #define HINIC_FUNC_TYPE(hwif) ((hwif)->attr.func_type) +#define HINIC_IS_VF(hwif) (HINIC_FUNC_TYPE(hwif) == HINIC_VF) #define HINIC_IS_PF(hwif) (HINIC_FUNC_TYPE(hwif) == HINIC_PF) #define HINIC_IS_PPF(hwif) (HINIC_FUNC_TYPE(hwif) == HINIC_PPF) @@ -173,6 +182,7 @@ enum hinic_pcie_tph { enum hinic_func_type { HINIC_PF = 0, + HINIC_VF = 1, HINIC_PPF = 2, }; @@ -223,6 +233,8 @@ struct hinic_func_attr { u8 num_ceqs; u8 num_dma_attr; + + u16 global_vf_id_of_pf; }; struct hinic_hwif { @@ -271,6 +283,12 @@ enum hinic_db_state hinic_db_state_get(struct hinic_hwif *hwif); void hinic_db_state_set(struct hinic_hwif *hwif, enum hinic_db_state db_state); +u16 hinic_glb_pf_vf_offset(struct hinic_hwif *hwif); + +u16 hinic_global_func_id_hw(struct hinic_hwif *hwif); + +u16 hinic_pf_id_of_vf_hw(struct hinic_hwif *hwif); + int hinic_init_hwif(struct hinic_hwif *hwif, struct pci_dev *pdev); void hinic_free_hwif(struct hinic_hwif *hwif); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c new file mode 100644 index 000000000000..f8626dfd192e --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c @@ -0,0 +1,1213 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + */ +#include +#include +#include +#include +#include +#include +#include + +#include "hinic_hw_if.h" +#include "hinic_hw_mgmt.h" +#include "hinic_hw_csr.h" +#include "hinic_hw_dev.h" +#include "hinic_hw_mbox.h" + +#define HINIC_MBOX_INT_DST_FUNC_SHIFT 0 +#define HINIC_MBOX_INT_DST_AEQN_SHIFT 10 +#define HINIC_MBOX_INT_SRC_RESP_AEQN_SHIFT 12 +#define HINIC_MBOX_INT_STAT_DMA_SHIFT 14 +/* The size of data to be sended (unit of 4 bytes) */ +#define HINIC_MBOX_INT_TX_SIZE_SHIFT 20 +/* SO_RO(strong order, relax order) */ +#define HINIC_MBOX_INT_STAT_DMA_SO_RO_SHIFT 25 +#define HINIC_MBOX_INT_WB_EN_SHIFT 28 + +#define HINIC_MBOX_INT_DST_FUNC_MASK 0x3FF +#define HINIC_MBOX_INT_DST_AEQN_MASK 0x3 +#define HINIC_MBOX_INT_SRC_RESP_AEQN_MASK 0x3 +#define HINIC_MBOX_INT_STAT_DMA_MASK 0x3F +#define HINIC_MBOX_INT_TX_SIZE_MASK 0x1F +#define HINIC_MBOX_INT_STAT_DMA_SO_RO_MASK 0x3 +#define HINIC_MBOX_INT_WB_EN_MASK 0x1 + +#define HINIC_MBOX_INT_SET(val, field) \ + (((val) & HINIC_MBOX_INT_##field##_MASK) << \ + HINIC_MBOX_INT_##field##_SHIFT) + +enum hinic_mbox_tx_status { + TX_NOT_DONE = 1, +}; + +#define HINIC_MBOX_CTRL_TRIGGER_AEQE_SHIFT 0 + +/* specifies the issue request for the message data. + * 0 - Tx request is done; + * 1 - Tx request is in process. + */ +#define HINIC_MBOX_CTRL_TX_STATUS_SHIFT 1 + +#define HINIC_MBOX_CTRL_TRIGGER_AEQE_MASK 0x1 +#define HINIC_MBOX_CTRL_TX_STATUS_MASK 0x1 + +#define HINIC_MBOX_CTRL_SET(val, field) \ + (((val) & HINIC_MBOX_CTRL_##field##_MASK) << \ + HINIC_MBOX_CTRL_##field##_SHIFT) + +#define HINIC_MBOX_HEADER_MSG_LEN_SHIFT 0 +#define HINIC_MBOX_HEADER_MODULE_SHIFT 11 +#define HINIC_MBOX_HEADER_SEG_LEN_SHIFT 16 +#define HINIC_MBOX_HEADER_NO_ACK_SHIFT 22 +#define HINIC_MBOX_HEADER_SEQID_SHIFT 24 +#define HINIC_MBOX_HEADER_LAST_SHIFT 30 + +/* specifies the mailbox message direction + * 0 - send + * 1 - receive + */ +#define HINIC_MBOX_HEADER_DIRECTION_SHIFT 31 +#define HINIC_MBOX_HEADER_CMD_SHIFT 32 +#define HINIC_MBOX_HEADER_MSG_ID_SHIFT 40 +#define HINIC_MBOX_HEADER_STATUS_SHIFT 48 +#define HINIC_MBOX_HEADER_SRC_GLB_FUNC_IDX_SHIFT 54 + +#define HINIC_MBOX_HEADER_MSG_LEN_MASK 0x7FF +#define HINIC_MBOX_HEADER_MODULE_MASK 0x1F +#define HINIC_MBOX_HEADER_SEG_LEN_MASK 0x3F +#define HINIC_MBOX_HEADER_NO_ACK_MASK 0x1 +#define HINIC_MBOX_HEADER_SEQID_MASK 0x3F +#define HINIC_MBOX_HEADER_LAST_MASK 0x1 +#define HINIC_MBOX_HEADER_DIRECTION_MASK 0x1 +#define HINIC_MBOX_HEADER_CMD_MASK 0xFF +#define HINIC_MBOX_HEADER_MSG_ID_MASK 0xFF +#define HINIC_MBOX_HEADER_STATUS_MASK 0x3F +#define HINIC_MBOX_HEADER_SRC_GLB_FUNC_IDX_MASK 0x3FF + +#define HINIC_MBOX_HEADER_GET(val, field) \ + (((val) >> HINIC_MBOX_HEADER_##field##_SHIFT) & \ + HINIC_MBOX_HEADER_##field##_MASK) +#define HINIC_MBOX_HEADER_SET(val, field) \ + ((u64)((val) & HINIC_MBOX_HEADER_##field##_MASK) << \ + HINIC_MBOX_HEADER_##field##_SHIFT) + +#define MBOX_SEGLEN_MASK \ + HINIC_MBOX_HEADER_SET(HINIC_MBOX_HEADER_SEG_LEN_MASK, SEG_LEN) + +#define HINIC_MBOX_SEG_LEN 48 +#define HINIC_MBOX_COMP_TIME 8000U +#define MBOX_MSG_POLLING_TIMEOUT 8000 + +#define HINIC_MBOX_DATA_SIZE 2040 + +#define MBOX_MAX_BUF_SZ 2048UL +#define MBOX_HEADER_SZ 8 + +#define MBOX_INFO_SZ 4 + +/* MBOX size is 64B, 8B for mbox_header, 4B reserved */ +#define MBOX_SEG_LEN 48 +#define MBOX_SEG_LEN_ALIGN 4 +#define MBOX_WB_STATUS_LEN 16UL + +/* mbox write back status is 16B, only first 4B is used */ +#define MBOX_WB_STATUS_ERRCODE_MASK 0xFFFF +#define MBOX_WB_STATUS_MASK 0xFF +#define MBOX_WB_ERROR_CODE_MASK 0xFF00 +#define MBOX_WB_STATUS_FINISHED_SUCCESS 0xFF +#define MBOX_WB_STATUS_FINISHED_WITH_ERR 0xFE +#define MBOX_WB_STATUS_NOT_FINISHED 0x00 + +#define MBOX_STATUS_FINISHED(wb) \ + (((wb) & MBOX_WB_STATUS_MASK) != MBOX_WB_STATUS_NOT_FINISHED) +#define MBOX_STATUS_SUCCESS(wb) \ + (((wb) & MBOX_WB_STATUS_MASK) == MBOX_WB_STATUS_FINISHED_SUCCESS) +#define MBOX_STATUS_ERRCODE(wb) \ + ((wb) & MBOX_WB_ERROR_CODE_MASK) + +#define SEQ_ID_START_VAL 0 +#define SEQ_ID_MAX_VAL 42 + +#define DST_AEQ_IDX_DEFAULT_VAL 0 +#define SRC_AEQ_IDX_DEFAULT_VAL 0 +#define NO_DMA_ATTRIBUTE_VAL 0 + +#define HINIC_MGMT_RSP_AEQN 0 +#define HINIC_MBOX_RSP_AEQN 2 +#define HINIC_MBOX_RECV_AEQN 0 + +#define MBOX_MSG_NO_DATA_LEN 1 + +#define MBOX_BODY_FROM_HDR(header) ((u8 *)(header) + MBOX_HEADER_SZ) +#define MBOX_AREA(hwif) \ + ((hwif)->cfg_regs_bar + HINIC_FUNC_CSR_MAILBOX_DATA_OFF) + +#define IS_PF_OR_PPF_SRC(src_func_idx) ((src_func_idx) < HINIC_MAX_PF_FUNCS) + +#define MBOX_RESPONSE_ERROR 0x1 +#define MBOX_MSG_ID_MASK 0xFF +#define MBOX_MSG_ID(func_to_func) ((func_to_func)->send_msg_id) +#define MBOX_MSG_ID_INC(func_to_func_mbox) (MBOX_MSG_ID(func_to_func_mbox) = \ + (MBOX_MSG_ID(func_to_func_mbox) + 1) & MBOX_MSG_ID_MASK) + +#define FUNC_ID_OFF_SET_8B 8 +#define FUNC_ID_OFF_SET_10B 10 + +/* max message counter wait to process for one function */ +#define HINIC_MAX_MSG_CNT_TO_PROCESS 10 + +#define HINIC_QUEUE_MIN_DEPTH 6 +#define HINIC_QUEUE_MAX_DEPTH 12 +#define HINIC_MAX_RX_BUFFER_SIZE 15 + +enum hinic_hwif_direction_type { + HINIC_HWIF_DIRECT_SEND = 0, + HINIC_HWIF_RESPONSE = 1, +}; + +enum mbox_send_mod { + MBOX_SEND_MSG_INT, +}; + +enum mbox_seg_type { + NOT_LAST_SEG, + LAST_SEG, +}; + +enum mbox_ordering_type { + STRONG_ORDER, +}; + +enum mbox_write_back_type { + WRITE_BACK = 1, +}; + +enum mbox_aeq_trig_type { + NOT_TRIGGER, + TRIGGER, +}; + +/** + * hinic_register_pf_mbox_cb - register mbox callback for pf + * @hwdev: the pointer to hw device + * @mod: specific mod that the callback will handle + * @callback: callback function + * Return: 0 - success, negative - failure + */ +int hinic_register_pf_mbox_cb(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod, + hinic_pf_mbox_cb callback) +{ + struct hinic_mbox_func_to_func *func_to_func = hwdev->func_to_func; + + if (mod >= HINIC_MOD_MAX) + return -EFAULT; + + func_to_func->pf_mbox_cb[mod] = callback; + + set_bit(HINIC_PF_MBOX_CB_REG, &func_to_func->pf_mbox_cb_state[mod]); + + return 0; +} + +/** + * hinic_register_vf_mbox_cb - register mbox callback for vf + * @hwdev: the pointer to hw device + * @mod: specific mod that the callback will handle + * @callback: callback function + * Return: 0 - success, negative - failure + */ +int hinic_register_vf_mbox_cb(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod, + hinic_vf_mbox_cb callback) +{ + struct hinic_mbox_func_to_func *func_to_func = hwdev->func_to_func; + + if (mod >= HINIC_MOD_MAX) + return -EFAULT; + + func_to_func->vf_mbox_cb[mod] = callback; + + set_bit(HINIC_VF_MBOX_CB_REG, &func_to_func->vf_mbox_cb_state[mod]); + + return 0; +} + +/** + * hinic_unregister_pf_mbox_cb - unregister the mbox callback for pf + * @hwdev: the pointer to hw device + * @mod: specific mod that the callback will handle + */ +void hinic_unregister_pf_mbox_cb(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod) +{ + struct hinic_mbox_func_to_func *func_to_func = hwdev->func_to_func; + + clear_bit(HINIC_PF_MBOX_CB_REG, &func_to_func->pf_mbox_cb_state[mod]); + + while (test_bit(HINIC_PF_MBOX_CB_RUNNING, + &func_to_func->pf_mbox_cb_state[mod])) + usleep_range(900, 1000); + + func_to_func->pf_mbox_cb[mod] = NULL; +} + +/** + * hinic_unregister_vf_mbox_cb - unregister the mbox callback for vf + * @hwdev: the pointer to hw device + * @mod: specific mod that the callback will handle + */ +void hinic_unregister_vf_mbox_cb(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod) +{ + struct hinic_mbox_func_to_func *func_to_func = hwdev->func_to_func; + + clear_bit(HINIC_VF_MBOX_CB_REG, &func_to_func->vf_mbox_cb_state[mod]); + + while (test_bit(HINIC_VF_MBOX_CB_RUNNING, + &func_to_func->vf_mbox_cb_state[mod])) + usleep_range(900, 1000); + + func_to_func->vf_mbox_cb[mod] = NULL; +} + +static int recv_vf_mbox_handler(struct hinic_mbox_func_to_func *func_to_func, + struct hinic_recv_mbox *recv_mbox, + void *buf_out, u16 *out_size) +{ + hinic_vf_mbox_cb cb; + int ret = 0; + + if (recv_mbox->mod >= HINIC_MOD_MAX) { + dev_err(&func_to_func->hwif->pdev->dev, "Receive illegal mbox message, mod = %d\n", + recv_mbox->mod); + return -EINVAL; + } + + set_bit(HINIC_VF_MBOX_CB_RUNNING, + &func_to_func->vf_mbox_cb_state[recv_mbox->mod]); + + cb = func_to_func->vf_mbox_cb[recv_mbox->mod]; + if (cb && test_bit(HINIC_VF_MBOX_CB_REG, + &func_to_func->vf_mbox_cb_state[recv_mbox->mod])) { + cb(func_to_func->hwdev, recv_mbox->cmd, recv_mbox->mbox, + recv_mbox->mbox_len, buf_out, out_size); + } else { + dev_err(&func_to_func->hwif->pdev->dev, "VF mbox cb is not registered\n"); + ret = -EINVAL; + } + + clear_bit(HINIC_VF_MBOX_CB_RUNNING, + &func_to_func->vf_mbox_cb_state[recv_mbox->mod]); + + return ret; +} + +static int +recv_pf_from_vf_mbox_handler(struct hinic_mbox_func_to_func *func_to_func, + struct hinic_recv_mbox *recv_mbox, + u16 src_func_idx, void *buf_out, + u16 *out_size) +{ + hinic_pf_mbox_cb cb; + u16 vf_id = 0; + int ret; + + if (recv_mbox->mod >= HINIC_MOD_MAX) { + dev_err(&func_to_func->hwif->pdev->dev, "Receive illegal mbox message, mod = %d\n", + recv_mbox->mod); + return -EINVAL; + } + + set_bit(HINIC_PF_MBOX_CB_RUNNING, + &func_to_func->pf_mbox_cb_state[recv_mbox->mod]); + + cb = func_to_func->pf_mbox_cb[recv_mbox->mod]; + if (cb && test_bit(HINIC_PF_MBOX_CB_REG, + &func_to_func->pf_mbox_cb_state[recv_mbox->mod])) { + vf_id = src_func_idx - + hinic_glb_pf_vf_offset(func_to_func->hwif); + ret = cb(func_to_func->hwdev, vf_id, recv_mbox->cmd, + recv_mbox->mbox, recv_mbox->mbox_len, + buf_out, out_size); + } else { + dev_err(&func_to_func->hwif->pdev->dev, "PF mbox mod(0x%x) cb is not registered\n", + recv_mbox->mod); + ret = -EINVAL; + } + + clear_bit(HINIC_PF_MBOX_CB_RUNNING, + &func_to_func->pf_mbox_cb_state[recv_mbox->mod]); + + return ret; +} + +static bool check_mbox_seq_id_and_seg_len(struct hinic_recv_mbox *recv_mbox, + u8 seq_id, u8 seg_len) +{ + if (seq_id > SEQ_ID_MAX_VAL || seg_len > MBOX_SEG_LEN) + return false; + + if (seq_id == 0) { + recv_mbox->seq_id = seq_id; + } else { + if (seq_id != recv_mbox->seq_id + 1) + return false; + + recv_mbox->seq_id = seq_id; + } + + return true; +} + +static void resp_mbox_handler(struct hinic_mbox_func_to_func *func_to_func, + struct hinic_recv_mbox *recv_mbox) +{ + spin_lock(&func_to_func->mbox_lock); + if (recv_mbox->msg_info.msg_id == func_to_func->send_msg_id && + func_to_func->event_flag == EVENT_START) + complete(&recv_mbox->recv_done); + else + dev_err(&func_to_func->hwif->pdev->dev, + "Mbox response timeout, current send msg id(0x%x), recv msg id(0x%x), status(0x%x)\n", + func_to_func->send_msg_id, recv_mbox->msg_info.msg_id, + recv_mbox->msg_info.status); + spin_unlock(&func_to_func->mbox_lock); +} + +static void recv_func_mbox_handler(struct hinic_mbox_func_to_func *func_to_func, + struct hinic_recv_mbox *recv_mbox, + u16 src_func_idx); + +static void recv_func_mbox_work_handler(struct work_struct *work) +{ + struct hinic_mbox_work *mbox_work = + container_of(work, struct hinic_mbox_work, work); + struct hinic_recv_mbox *recv_mbox; + + recv_func_mbox_handler(mbox_work->func_to_func, mbox_work->recv_mbox, + mbox_work->src_func_idx); + + recv_mbox = + &mbox_work->func_to_func->mbox_send[mbox_work->src_func_idx]; + + atomic_dec(&recv_mbox->msg_cnt); + + kfree(mbox_work); +} + +static void recv_mbox_handler(struct hinic_mbox_func_to_func *func_to_func, + void *header, struct hinic_recv_mbox *recv_mbox) +{ + void *mbox_body = MBOX_BODY_FROM_HDR(header); + struct hinic_recv_mbox *rcv_mbox_temp = NULL; + u64 mbox_header = *((u64 *)header); + struct hinic_mbox_work *mbox_work; + u8 seq_id, seg_len; + u16 src_func_idx; + int pos; + + seq_id = HINIC_MBOX_HEADER_GET(mbox_header, SEQID); + seg_len = HINIC_MBOX_HEADER_GET(mbox_header, SEG_LEN); + src_func_idx = HINIC_MBOX_HEADER_GET(mbox_header, SRC_GLB_FUNC_IDX); + + if (!check_mbox_seq_id_and_seg_len(recv_mbox, seq_id, seg_len)) { + dev_err(&func_to_func->hwif->pdev->dev, + "Mailbox sequence and segment check fail, src func id: 0x%x, front id: 0x%x, current id: 0x%x, seg len: 0x%x\n", + src_func_idx, recv_mbox->seq_id, seq_id, seg_len); + recv_mbox->seq_id = SEQ_ID_MAX_VAL; + return; + } + + pos = seq_id * MBOX_SEG_LEN; + memcpy((u8 *)recv_mbox->mbox + pos, mbox_body, + HINIC_MBOX_HEADER_GET(mbox_header, SEG_LEN)); + + if (!HINIC_MBOX_HEADER_GET(mbox_header, LAST)) + return; + + recv_mbox->cmd = HINIC_MBOX_HEADER_GET(mbox_header, CMD); + recv_mbox->mod = HINIC_MBOX_HEADER_GET(mbox_header, MODULE); + recv_mbox->mbox_len = HINIC_MBOX_HEADER_GET(mbox_header, MSG_LEN); + recv_mbox->ack_type = HINIC_MBOX_HEADER_GET(mbox_header, NO_ACK); + recv_mbox->msg_info.msg_id = HINIC_MBOX_HEADER_GET(mbox_header, MSG_ID); + recv_mbox->msg_info.status = HINIC_MBOX_HEADER_GET(mbox_header, STATUS); + recv_mbox->seq_id = SEQ_ID_MAX_VAL; + + if (HINIC_MBOX_HEADER_GET(mbox_header, DIRECTION) == + HINIC_HWIF_RESPONSE) { + resp_mbox_handler(func_to_func, recv_mbox); + return; + } + + if (atomic_read(&recv_mbox->msg_cnt) > HINIC_MAX_MSG_CNT_TO_PROCESS) { + dev_warn(&func_to_func->hwif->pdev->dev, + "This function(%u) have %d message wait to process,can't add to work queue\n", + src_func_idx, atomic_read(&recv_mbox->msg_cnt)); + return; + } + + rcv_mbox_temp = kzalloc(sizeof(*rcv_mbox_temp), GFP_KERNEL); + if (!rcv_mbox_temp) + return; + + memcpy(rcv_mbox_temp, recv_mbox, sizeof(*rcv_mbox_temp)); + + rcv_mbox_temp->mbox = kzalloc(MBOX_MAX_BUF_SZ, GFP_KERNEL); + if (!rcv_mbox_temp->mbox) + goto err_alloc_rcv_mbox_msg; + + memcpy(rcv_mbox_temp->mbox, recv_mbox->mbox, MBOX_MAX_BUF_SZ); + + rcv_mbox_temp->buf_out = kzalloc(MBOX_MAX_BUF_SZ, GFP_KERNEL); + if (!rcv_mbox_temp->buf_out) + goto err_alloc_rcv_mbox_buf; + + mbox_work = kzalloc(sizeof(*mbox_work), GFP_KERNEL); + if (!mbox_work) + goto err_alloc_mbox_work; + + mbox_work->func_to_func = func_to_func; + mbox_work->recv_mbox = rcv_mbox_temp; + mbox_work->src_func_idx = src_func_idx; + + atomic_inc(&recv_mbox->msg_cnt); + INIT_WORK(&mbox_work->work, recv_func_mbox_work_handler); + queue_work(func_to_func->workq, &mbox_work->work); + + return; + +err_alloc_mbox_work: + kfree(rcv_mbox_temp->buf_out); + +err_alloc_rcv_mbox_buf: + kfree(rcv_mbox_temp->mbox); + +err_alloc_rcv_mbox_msg: + kfree(rcv_mbox_temp); +} + +void hinic_mbox_func_aeqe_handler(void *handle, void *header, u8 size) +{ + struct hinic_mbox_func_to_func *func_to_func; + u64 mbox_header = *((u64 *)header); + struct hinic_recv_mbox *recv_mbox; + u64 src, dir; + + func_to_func = ((struct hinic_hwdev *)handle)->func_to_func; + + dir = HINIC_MBOX_HEADER_GET(mbox_header, DIRECTION); + src = HINIC_MBOX_HEADER_GET(mbox_header, SRC_GLB_FUNC_IDX); + + if (src >= HINIC_MAX_FUNCTIONS) { + dev_err(&func_to_func->hwif->pdev->dev, + "Mailbox source function id:%u is invalid\n", (u32)src); + return; + } + + recv_mbox = (dir == HINIC_HWIF_DIRECT_SEND) ? + &func_to_func->mbox_send[src] : + &func_to_func->mbox_resp[src]; + + recv_mbox_handler(func_to_func, (u64 *)header, recv_mbox); +} + +void hinic_mbox_self_aeqe_handler(void *handle, void *header, u8 size) +{ + struct hinic_mbox_func_to_func *func_to_func; + struct hinic_send_mbox *send_mbox; + + func_to_func = ((struct hinic_hwdev *)handle)->func_to_func; + send_mbox = &func_to_func->send_mbox; + + complete(&send_mbox->send_done); +} + +static void clear_mbox_status(struct hinic_send_mbox *mbox) +{ + *mbox->wb_status = 0; + + /* clear mailbox write back status */ + wmb(); +} + +static void mbox_copy_header(struct hinic_hwdev *hwdev, + struct hinic_send_mbox *mbox, u64 *header) +{ + u32 i, idx_max = MBOX_HEADER_SZ / sizeof(u32); + u32 *data = (u32 *)header; + + for (i = 0; i < idx_max; i++) + __raw_writel(*(data + i), mbox->data + i * sizeof(u32)); +} + +static void mbox_copy_send_data(struct hinic_hwdev *hwdev, + struct hinic_send_mbox *mbox, void *seg, + u16 seg_len) +{ + u8 mbox_max_buf[MBOX_SEG_LEN] = {0}; + u32 data_len, chk_sz = sizeof(u32); + u32 *data = seg; + u32 i, idx_max; + + /* The mbox message should be aligned in 4 bytes. */ + if (seg_len % chk_sz) { + memcpy(mbox_max_buf, seg, seg_len); + data = (u32 *)mbox_max_buf; + } + + data_len = seg_len; + idx_max = ALIGN(data_len, chk_sz) / chk_sz; + + for (i = 0; i < idx_max; i++) + __raw_writel(*(data + i), + mbox->data + MBOX_HEADER_SZ + i * sizeof(u32)); +} + +static void write_mbox_msg_attr(struct hinic_mbox_func_to_func *func_to_func, + u16 dst_func, u16 dst_aeqn, u16 seg_len, + int poll) +{ + u16 rsp_aeq = (dst_aeqn == 0) ? 0 : HINIC_MBOX_RSP_AEQN; + u32 mbox_int, mbox_ctrl; + + mbox_int = HINIC_MBOX_INT_SET(dst_func, DST_FUNC) | + HINIC_MBOX_INT_SET(dst_aeqn, DST_AEQN) | + HINIC_MBOX_INT_SET(rsp_aeq, SRC_RESP_AEQN) | + HINIC_MBOX_INT_SET(NO_DMA_ATTRIBUTE_VAL, STAT_DMA) | + HINIC_MBOX_INT_SET(ALIGN(MBOX_SEG_LEN + MBOX_HEADER_SZ + + MBOX_INFO_SZ, MBOX_SEG_LEN_ALIGN) >> 2, + TX_SIZE) | + HINIC_MBOX_INT_SET(STRONG_ORDER, STAT_DMA_SO_RO) | + HINIC_MBOX_INT_SET(WRITE_BACK, WB_EN); + + hinic_hwif_write_reg(func_to_func->hwif, + HINIC_FUNC_CSR_MAILBOX_INT_OFFSET_OFF, mbox_int); + + wmb(); /* writing the mbox int attributes */ + mbox_ctrl = HINIC_MBOX_CTRL_SET(TX_NOT_DONE, TX_STATUS); + + if (poll) + mbox_ctrl |= HINIC_MBOX_CTRL_SET(NOT_TRIGGER, TRIGGER_AEQE); + else + mbox_ctrl |= HINIC_MBOX_CTRL_SET(TRIGGER, TRIGGER_AEQE); + + hinic_hwif_write_reg(func_to_func->hwif, + HINIC_FUNC_CSR_MAILBOX_CONTROL_OFF, mbox_ctrl); +} + +void dump_mox_reg(struct hinic_hwdev *hwdev) +{ + u32 val; + + val = hinic_hwif_read_reg(hwdev->hwif, + HINIC_FUNC_CSR_MAILBOX_CONTROL_OFF); + dev_err(&hwdev->hwif->pdev->dev, "Mailbox control reg: 0x%x\n", val); + + val = hinic_hwif_read_reg(hwdev->hwif, + HINIC_FUNC_CSR_MAILBOX_INT_OFFSET_OFF); + dev_err(&hwdev->hwif->pdev->dev, "Mailbox interrupt offset: 0x%x\n", + val); +} + +static u16 get_mbox_status(struct hinic_send_mbox *mbox) +{ + /* write back is 16B, but only use first 4B */ + u64 wb_val = be64_to_cpu(*mbox->wb_status); + + rmb(); /* verify reading before check */ + + return (u16)(wb_val & MBOX_WB_STATUS_ERRCODE_MASK); +} + +static int +wait_for_mbox_seg_completion(struct hinic_mbox_func_to_func *func_to_func, + int poll, u16 *wb_status) +{ + struct hinic_send_mbox *send_mbox = &func_to_func->send_mbox; + struct hinic_hwdev *hwdev = func_to_func->hwdev; + struct completion *done = &send_mbox->send_done; + u32 cnt = 0; + ulong jif; + + if (poll) { + while (cnt < MBOX_MSG_POLLING_TIMEOUT) { + *wb_status = get_mbox_status(send_mbox); + if (MBOX_STATUS_FINISHED(*wb_status)) + break; + + usleep_range(900, 1000); + cnt++; + } + + if (cnt == MBOX_MSG_POLLING_TIMEOUT) { + dev_err(&hwdev->hwif->pdev->dev, "Send mailbox segment timeout, wb status: 0x%x\n", + *wb_status); + dump_mox_reg(hwdev); + return -ETIMEDOUT; + } + } else { + jif = msecs_to_jiffies(HINIC_MBOX_COMP_TIME); + if (!wait_for_completion_timeout(done, jif)) { + dev_err(&hwdev->hwif->pdev->dev, "Send mailbox segment timeout\n"); + dump_mox_reg(hwdev); + return -ETIMEDOUT; + } + + *wb_status = get_mbox_status(send_mbox); + } + + return 0; +} + +static int send_mbox_seg(struct hinic_mbox_func_to_func *func_to_func, + u64 header, u16 dst_func, void *seg, u16 seg_len, + int poll, void *msg_info) +{ + struct hinic_send_mbox *send_mbox = &func_to_func->send_mbox; + u16 seq_dir = HINIC_MBOX_HEADER_GET(header, DIRECTION); + struct hinic_hwdev *hwdev = func_to_func->hwdev; + struct completion *done = &send_mbox->send_done; + u8 num_aeqs = hwdev->hwif->attr.num_aeqs; + u16 dst_aeqn, wb_status = 0, errcode; + + if (num_aeqs >= 4) + dst_aeqn = (seq_dir == HINIC_HWIF_DIRECT_SEND) ? + HINIC_MBOX_RECV_AEQN : HINIC_MBOX_RSP_AEQN; + else + dst_aeqn = 0; + + if (!poll) + init_completion(done); + + clear_mbox_status(send_mbox); + + mbox_copy_header(hwdev, send_mbox, &header); + + mbox_copy_send_data(hwdev, send_mbox, seg, seg_len); + + write_mbox_msg_attr(func_to_func, dst_func, dst_aeqn, seg_len, poll); + + wmb(); /* writing the mbox msg attributes */ + + if (wait_for_mbox_seg_completion(func_to_func, poll, &wb_status)) + return -ETIMEDOUT; + + if (!MBOX_STATUS_SUCCESS(wb_status)) { + dev_err(&hwdev->hwif->pdev->dev, "Send mailbox segment to function %d error, wb status: 0x%x\n", + dst_func, wb_status); + errcode = MBOX_STATUS_ERRCODE(wb_status); + return errcode ? errcode : -EFAULT; + } + + return 0; +} + +static int send_mbox_to_func(struct hinic_mbox_func_to_func *func_to_func, + enum hinic_mod_type mod, u16 cmd, void *msg, + u16 msg_len, u16 dst_func, + enum hinic_hwif_direction_type direction, + enum hinic_mbox_ack_type ack_type, + struct mbox_msg_info *msg_info) +{ + struct hinic_hwdev *hwdev = func_to_func->hwdev; + u16 seg_len = MBOX_SEG_LEN; + u8 *msg_seg = (u8 *)msg; + u16 left = msg_len; + u32 seq_id = 0; + u64 header = 0; + int err = 0; + + down(&func_to_func->msg_send_sem); + + header = HINIC_MBOX_HEADER_SET(msg_len, MSG_LEN) | + HINIC_MBOX_HEADER_SET(mod, MODULE) | + HINIC_MBOX_HEADER_SET(seg_len, SEG_LEN) | + HINIC_MBOX_HEADER_SET(ack_type, NO_ACK) | + HINIC_MBOX_HEADER_SET(SEQ_ID_START_VAL, SEQID) | + HINIC_MBOX_HEADER_SET(NOT_LAST_SEG, LAST) | + HINIC_MBOX_HEADER_SET(direction, DIRECTION) | + HINIC_MBOX_HEADER_SET(cmd, CMD) | + /* The vf's offset to it's associated pf */ + HINIC_MBOX_HEADER_SET(msg_info->msg_id, MSG_ID) | + HINIC_MBOX_HEADER_SET(msg_info->status, STATUS) | + HINIC_MBOX_HEADER_SET(hinic_global_func_id_hw(hwdev->hwif), + SRC_GLB_FUNC_IDX); + + while (!(HINIC_MBOX_HEADER_GET(header, LAST))) { + if (left <= HINIC_MBOX_SEG_LEN) { + header &= ~MBOX_SEGLEN_MASK; + header |= HINIC_MBOX_HEADER_SET(left, SEG_LEN); + header |= HINIC_MBOX_HEADER_SET(LAST_SEG, LAST); + + seg_len = left; + } + + err = send_mbox_seg(func_to_func, header, dst_func, msg_seg, + seg_len, MBOX_SEND_MSG_INT, msg_info); + if (err) { + dev_err(&hwdev->hwif->pdev->dev, "Failed to send mbox seg, seq_id=0x%llx\n", + HINIC_MBOX_HEADER_GET(header, SEQID)); + goto err_send_mbox_seg; + } + + left -= HINIC_MBOX_SEG_LEN; + msg_seg += HINIC_MBOX_SEG_LEN; + + seq_id++; + header &= ~(HINIC_MBOX_HEADER_SET(HINIC_MBOX_HEADER_SEQID_MASK, + SEQID)); + header |= HINIC_MBOX_HEADER_SET(seq_id, SEQID); + } + +err_send_mbox_seg: + up(&func_to_func->msg_send_sem); + + return err; +} + +static void +response_for_recv_func_mbox(struct hinic_mbox_func_to_func *func_to_func, + struct hinic_recv_mbox *recv_mbox, int err, + u16 out_size, u16 src_func_idx) +{ + struct mbox_msg_info msg_info = {0}; + + if (recv_mbox->ack_type == MBOX_ACK) { + msg_info.msg_id = recv_mbox->msg_info.msg_id; + if (err == HINIC_MBOX_PF_BUSY_ACTIVE_FW) + msg_info.status = HINIC_MBOX_PF_BUSY_ACTIVE_FW; + else if (err == HINIC_MBOX_VF_CMD_ERROR) + msg_info.status = HINIC_MBOX_VF_CMD_ERROR; + else if (err) + msg_info.status = HINIC_MBOX_PF_SEND_ERR; + + /* if no data needs to response, set out_size to 1 */ + if (!out_size || err) + out_size = MBOX_MSG_NO_DATA_LEN; + + send_mbox_to_func(func_to_func, recv_mbox->mod, recv_mbox->cmd, + recv_mbox->buf_out, out_size, src_func_idx, + HINIC_HWIF_RESPONSE, MBOX_ACK, + &msg_info); + } +} + +static void recv_func_mbox_handler(struct hinic_mbox_func_to_func *func_to_func, + struct hinic_recv_mbox *recv_mbox, + u16 src_func_idx) +{ + void *buf_out = recv_mbox->buf_out; + u16 out_size = MBOX_MAX_BUF_SZ; + int err = 0; + + if (HINIC_IS_VF(func_to_func->hwif)) { + err = recv_vf_mbox_handler(func_to_func, recv_mbox, buf_out, + &out_size); + } else { + if (IS_PF_OR_PPF_SRC(src_func_idx)) + dev_warn(&func_to_func->hwif->pdev->dev, + "Unsupported pf2pf mbox msg\n"); + else + err = recv_pf_from_vf_mbox_handler(func_to_func, + recv_mbox, + src_func_idx, + buf_out, &out_size); + } + + response_for_recv_func_mbox(func_to_func, recv_mbox, err, out_size, + src_func_idx); + kfree(recv_mbox->buf_out); + kfree(recv_mbox->mbox); + kfree(recv_mbox); +} + +static void set_mbox_to_func_event(struct hinic_mbox_func_to_func *func_to_func, + enum mbox_event_state event_flag) +{ + spin_lock(&func_to_func->mbox_lock); + func_to_func->event_flag = event_flag; + spin_unlock(&func_to_func->mbox_lock); +} + +static int mbox_resp_info_handler(struct hinic_mbox_func_to_func *func_to_func, + struct hinic_recv_mbox *mbox_for_resp, + enum hinic_mod_type mod, u16 cmd, + void *buf_out, u16 *out_size) +{ + int err; + + if (mbox_for_resp->msg_info.status) { + err = mbox_for_resp->msg_info.status; + if (err != HINIC_MBOX_PF_BUSY_ACTIVE_FW) + dev_err(&func_to_func->hwif->pdev->dev, "Mbox response error(0x%x)\n", + mbox_for_resp->msg_info.status); + return err; + } + + if (buf_out && out_size) { + if (*out_size < mbox_for_resp->mbox_len) { + dev_err(&func_to_func->hwif->pdev->dev, + "Invalid response mbox message length: %d for mod %d cmd %d, should less than: %d\n", + mbox_for_resp->mbox_len, mod, cmd, *out_size); + return -EFAULT; + } + + if (mbox_for_resp->mbox_len) + memcpy(buf_out, mbox_for_resp->mbox, + mbox_for_resp->mbox_len); + + *out_size = mbox_for_resp->mbox_len; + } + + return 0; +} + +int hinic_mbox_to_func(struct hinic_mbox_func_to_func *func_to_func, + enum hinic_mod_type mod, u16 cmd, u16 dst_func, + void *buf_in, u16 in_size, void *buf_out, + u16 *out_size, u32 timeout) +{ + struct hinic_recv_mbox *mbox_for_resp; + struct mbox_msg_info msg_info = {0}; + ulong timeo; + int err; + + mbox_for_resp = &func_to_func->mbox_resp[dst_func]; + + down(&func_to_func->mbox_send_sem); + + init_completion(&mbox_for_resp->recv_done); + + msg_info.msg_id = MBOX_MSG_ID_INC(func_to_func); + + set_mbox_to_func_event(func_to_func, EVENT_START); + + err = send_mbox_to_func(func_to_func, mod, cmd, buf_in, in_size, + dst_func, HINIC_HWIF_DIRECT_SEND, MBOX_ACK, + &msg_info); + if (err) { + dev_err(&func_to_func->hwif->pdev->dev, "Send mailbox failed, msg_id: %d\n", + msg_info.msg_id); + set_mbox_to_func_event(func_to_func, EVENT_FAIL); + goto err_send_mbox; + } + + timeo = msecs_to_jiffies(timeout ? timeout : HINIC_MBOX_COMP_TIME); + if (!wait_for_completion_timeout(&mbox_for_resp->recv_done, timeo)) { + set_mbox_to_func_event(func_to_func, EVENT_TIMEOUT); + dev_err(&func_to_func->hwif->pdev->dev, + "Send mbox msg timeout, msg_id: %d\n", msg_info.msg_id); + err = -ETIMEDOUT; + goto err_send_mbox; + } + + set_mbox_to_func_event(func_to_func, EVENT_END); + + err = mbox_resp_info_handler(func_to_func, mbox_for_resp, mod, cmd, + buf_out, out_size); + +err_send_mbox: + up(&func_to_func->mbox_send_sem); + + return err; +} + +static int mbox_func_params_valid(struct hinic_mbox_func_to_func *func_to_func, + void *buf_in, u16 in_size) +{ + if (in_size > HINIC_MBOX_DATA_SIZE) { + dev_err(&func_to_func->hwif->pdev->dev, + "Mbox msg len(%d) exceed limit(%d)\n", + in_size, HINIC_MBOX_DATA_SIZE); + return -EINVAL; + } + + return 0; +} + +int hinic_mbox_to_pf(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod, u8 cmd, void *buf_in, + u16 in_size, void *buf_out, u16 *out_size, u32 timeout) +{ + struct hinic_mbox_func_to_func *func_to_func = hwdev->func_to_func; + int err = mbox_func_params_valid(func_to_func, buf_in, in_size); + + if (err) + return err; + + if (!HINIC_IS_VF(hwdev->hwif)) { + dev_err(&hwdev->hwif->pdev->dev, "Params error, func_type: %d\n", + HINIC_FUNC_TYPE(hwdev->hwif)); + return -EINVAL; + } + + return hinic_mbox_to_func(func_to_func, mod, cmd, + hinic_pf_id_of_vf_hw(hwdev->hwif), buf_in, + in_size, buf_out, out_size, timeout); +} + +int hinic_mbox_to_vf(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod, u16 vf_id, u8 cmd, void *buf_in, + u16 in_size, void *buf_out, u16 *out_size, u32 timeout) +{ + struct hinic_mbox_func_to_func *func_to_func; + u16 dst_func_idx; + int err; + + if (!hwdev) + return -EINVAL; + + func_to_func = hwdev->func_to_func; + err = mbox_func_params_valid(func_to_func, buf_in, in_size); + if (err) + return err; + + if (HINIC_IS_VF(hwdev->hwif)) { + dev_err(&hwdev->hwif->pdev->dev, "Params error, func_type: %d\n", + HINIC_FUNC_TYPE(hwdev->hwif)); + return -EINVAL; + } + + if (!vf_id) { + dev_err(&hwdev->hwif->pdev->dev, + "VF id(%d) error!\n", vf_id); + return -EINVAL; + } + + /* vf_offset_to_pf + vf_id is the vf's global function id of vf in + * this pf + */ + dst_func_idx = hinic_glb_pf_vf_offset(hwdev->hwif) + vf_id; + + return hinic_mbox_to_func(func_to_func, mod, cmd, dst_func_idx, buf_in, + in_size, buf_out, out_size, timeout); +} + +static int init_mbox_info(struct hinic_recv_mbox *mbox_info) +{ + int err; + + mbox_info->seq_id = SEQ_ID_MAX_VAL; + + mbox_info->mbox = kzalloc(MBOX_MAX_BUF_SZ, GFP_KERNEL); + if (!mbox_info->mbox) + return -ENOMEM; + + mbox_info->buf_out = kzalloc(MBOX_MAX_BUF_SZ, GFP_KERNEL); + if (!mbox_info->buf_out) { + err = -ENOMEM; + goto err_alloc_buf_out; + } + + atomic_set(&mbox_info->msg_cnt, 0); + + return 0; + +err_alloc_buf_out: + kfree(mbox_info->mbox); + + return err; +} + +static void clean_mbox_info(struct hinic_recv_mbox *mbox_info) +{ + kfree(mbox_info->buf_out); + kfree(mbox_info->mbox); +} + +static int alloc_mbox_info(struct hinic_hwdev *hwdev, + struct hinic_recv_mbox *mbox_info) +{ + u16 func_idx, i; + int err; + + for (func_idx = 0; func_idx < HINIC_MAX_FUNCTIONS; func_idx++) { + err = init_mbox_info(&mbox_info[func_idx]); + if (err) { + dev_err(&hwdev->hwif->pdev->dev, "Failed to init function %d mbox info\n", + func_idx); + goto err_init_mbox_info; + } + } + + return 0; + +err_init_mbox_info: + for (i = 0; i < func_idx; i++) + clean_mbox_info(&mbox_info[i]); + + return err; +} + +static void free_mbox_info(struct hinic_recv_mbox *mbox_info) +{ + u16 func_idx; + + for (func_idx = 0; func_idx < HINIC_MAX_FUNCTIONS; func_idx++) + clean_mbox_info(&mbox_info[func_idx]); +} + +static void prepare_send_mbox(struct hinic_mbox_func_to_func *func_to_func) +{ + struct hinic_send_mbox *send_mbox = &func_to_func->send_mbox; + + send_mbox->data = MBOX_AREA(func_to_func->hwif); +} + +static int alloc_mbox_wb_status(struct hinic_mbox_func_to_func *func_to_func) +{ + struct hinic_send_mbox *send_mbox = &func_to_func->send_mbox; + struct hinic_hwdev *hwdev = func_to_func->hwdev; + u32 addr_h, addr_l; + + send_mbox->wb_vaddr = dma_alloc_coherent(&hwdev->hwif->pdev->dev, + MBOX_WB_STATUS_LEN, + &send_mbox->wb_paddr, + GFP_KERNEL); + if (!send_mbox->wb_vaddr) + return -ENOMEM; + + send_mbox->wb_status = send_mbox->wb_vaddr; + + addr_h = upper_32_bits(send_mbox->wb_paddr); + addr_l = lower_32_bits(send_mbox->wb_paddr); + + hinic_hwif_write_reg(hwdev->hwif, HINIC_FUNC_CSR_MAILBOX_RESULT_H_OFF, + addr_h); + hinic_hwif_write_reg(hwdev->hwif, HINIC_FUNC_CSR_MAILBOX_RESULT_L_OFF, + addr_l); + + return 0; +} + +static void free_mbox_wb_status(struct hinic_mbox_func_to_func *func_to_func) +{ + struct hinic_send_mbox *send_mbox = &func_to_func->send_mbox; + struct hinic_hwdev *hwdev = func_to_func->hwdev; + + hinic_hwif_write_reg(hwdev->hwif, HINIC_FUNC_CSR_MAILBOX_RESULT_H_OFF, + 0); + hinic_hwif_write_reg(hwdev->hwif, HINIC_FUNC_CSR_MAILBOX_RESULT_L_OFF, + 0); + + dma_free_coherent(&hwdev->hwif->pdev->dev, MBOX_WB_STATUS_LEN, + send_mbox->wb_vaddr, + send_mbox->wb_paddr); +} + +static int comm_pf_mbox_handler(void *handle, u16 vf_id, u8 cmd, void *buf_in, + u16 in_size, void *buf_out, u16 *out_size) +{ + struct hinic_hwdev *hwdev = handle; + struct hinic_pfhwdev *pfhwdev; + int err = 0; + + pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev); + + if (cmd == HINIC_COMM_CMD_START_FLR) { + *out_size = 0; + } else { + err = hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt, HINIC_MOD_COMM, + cmd, buf_in, in_size, buf_out, out_size, + HINIC_MGMT_MSG_SYNC); + if (err && err != HINIC_MBOX_PF_BUSY_ACTIVE_FW) + dev_err(&hwdev->hwif->pdev->dev, + "PF mbox common callback handler err: %d\n", + err); + } + + return err; +} + +int hinic_func_to_func_init(struct hinic_hwdev *hwdev) +{ + struct hinic_mbox_func_to_func *func_to_func; + struct hinic_pfhwdev *pfhwdev; + int err; + + pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev); + func_to_func = kzalloc(sizeof(*func_to_func), GFP_KERNEL); + if (!func_to_func) + return -ENOMEM; + + hwdev->func_to_func = func_to_func; + func_to_func->hwdev = hwdev; + func_to_func->hwif = hwdev->hwif; + sema_init(&func_to_func->mbox_send_sem, 1); + sema_init(&func_to_func->msg_send_sem, 1); + spin_lock_init(&func_to_func->mbox_lock); + func_to_func->workq = create_singlethread_workqueue(HINIC_MBOX_WQ_NAME); + if (!func_to_func->workq) { + dev_err(&hwdev->hwif->pdev->dev, "Failed to initialize MBOX workqueue\n"); + err = -ENOMEM; + goto err_create_mbox_workq; + } + + err = alloc_mbox_info(hwdev, func_to_func->mbox_send); + if (err) { + dev_err(&hwdev->hwif->pdev->dev, "Failed to alloc mem for mbox_active\n"); + goto err_alloc_mbox_for_send; + } + + err = alloc_mbox_info(hwdev, func_to_func->mbox_resp); + if (err) { + dev_err(&hwdev->hwif->pdev->dev, "Failed to alloc mem for mbox_passive\n"); + goto err_alloc_mbox_for_resp; + } + + err = alloc_mbox_wb_status(func_to_func); + if (err) { + dev_err(&hwdev->hwif->pdev->dev, "Failed to alloc mbox write back status\n"); + goto err_alloc_wb_status; + } + + prepare_send_mbox(func_to_func); + + hinic_aeq_register_hw_cb(&hwdev->aeqs, HINIC_MBX_FROM_FUNC, + &pfhwdev->hwdev, hinic_mbox_func_aeqe_handler); + hinic_aeq_register_hw_cb(&hwdev->aeqs, HINIC_MBX_SEND_RSLT, + &pfhwdev->hwdev, hinic_mbox_self_aeqe_handler); + + if (!HINIC_IS_VF(hwdev->hwif)) + hinic_register_pf_mbox_cb(hwdev, HINIC_MOD_COMM, + comm_pf_mbox_handler); + + return 0; + +err_alloc_wb_status: + free_mbox_info(func_to_func->mbox_resp); + +err_alloc_mbox_for_resp: + free_mbox_info(func_to_func->mbox_send); + +err_alloc_mbox_for_send: + destroy_workqueue(func_to_func->workq); + +err_create_mbox_workq: + kfree(func_to_func); + + return err; +} + +void hinic_func_to_func_free(struct hinic_hwdev *hwdev) +{ + struct hinic_mbox_func_to_func *func_to_func = hwdev->func_to_func; + + hinic_aeq_unregister_hw_cb(&hwdev->aeqs, HINIC_MBX_FROM_FUNC); + hinic_aeq_unregister_hw_cb(&hwdev->aeqs, HINIC_MBX_SEND_RSLT); + + hinic_unregister_pf_mbox_cb(hwdev, HINIC_MOD_COMM); + /* destroy workqueue before free related mbox resources in case of + * illegal resource access + */ + destroy_workqueue(func_to_func->workq); + + free_mbox_wb_status(func_to_func); + free_mbox_info(func_to_func->mbox_resp); + free_mbox_info(func_to_func->mbox_send); + + kfree(func_to_func); +} diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.h new file mode 100644 index 000000000000..7b18559bfe80 --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + */ + +#ifndef HINIC_MBOX_H_ +#define HINIC_MBOX_H_ + +#define HINIC_MBOX_PF_SEND_ERR 0x1 +#define HINIC_MBOX_PF_BUSY_ACTIVE_FW 0x2 +#define HINIC_MBOX_VF_CMD_ERROR 0x3 + +#define HINIC_MAX_FUNCTIONS 512 + +#define HINIC_MAX_PF_FUNCS 16 + +#define HINIC_MBOX_WQ_NAME "hinic_mbox" + +#define HINIC_FUNC_CSR_MAILBOX_DATA_OFF 0x80 +#define HINIC_FUNC_CSR_MAILBOX_CONTROL_OFF 0x0100 +#define HINIC_FUNC_CSR_MAILBOX_INT_OFFSET_OFF 0x0104 +#define HINIC_FUNC_CSR_MAILBOX_RESULT_H_OFF 0x0108 +#define HINIC_FUNC_CSR_MAILBOX_RESULT_L_OFF 0x010C + +enum hinic_mbox_ack_type { + MBOX_ACK, + MBOX_NO_ACK, +}; + +struct mbox_msg_info { + u8 msg_id; + u8 status; +}; + +struct hinic_recv_mbox { + struct completion recv_done; + void *mbox; + u8 cmd; + enum hinic_mod_type mod; + u16 mbox_len; + void *buf_out; + enum hinic_mbox_ack_type ack_type; + struct mbox_msg_info msg_info; + u8 seq_id; + atomic_t msg_cnt; +}; + +struct hinic_send_mbox { + struct completion send_done; + u8 *data; + + u64 *wb_status; + void *wb_vaddr; + dma_addr_t wb_paddr; +}; + +typedef void (*hinic_vf_mbox_cb)(void *handle, u8 cmd, void *buf_in, + u16 in_size, void *buf_out, u16 *out_size); +typedef int (*hinic_pf_mbox_cb)(void *handle, u16 vf_id, u8 cmd, void *buf_in, + u16 in_size, void *buf_out, u16 *out_size); + +enum mbox_event_state { + EVENT_START = 0, + EVENT_FAIL, + EVENT_TIMEOUT, + EVENT_END, +}; + +enum hinic_mbox_cb_state { + HINIC_VF_MBOX_CB_REG = 0, + HINIC_VF_MBOX_CB_RUNNING, + HINIC_PF_MBOX_CB_REG, + HINIC_PF_MBOX_CB_RUNNING, + HINIC_PPF_MBOX_CB_REG, + HINIC_PPF_MBOX_CB_RUNNING, + HINIC_PPF_TO_PF_MBOX_CB_REG, + HINIC_PPF_TO_PF_MBOX_CB_RUNNIG, +}; + +struct hinic_mbox_func_to_func { + struct hinic_hwdev *hwdev; + struct hinic_hwif *hwif; + + struct semaphore mbox_send_sem; + struct semaphore msg_send_sem; + struct hinic_send_mbox send_mbox; + + struct workqueue_struct *workq; + + struct hinic_recv_mbox mbox_resp[HINIC_MAX_FUNCTIONS]; + struct hinic_recv_mbox mbox_send[HINIC_MAX_FUNCTIONS]; + + hinic_vf_mbox_cb vf_mbox_cb[HINIC_MOD_MAX]; + hinic_pf_mbox_cb pf_mbox_cb[HINIC_MOD_MAX]; + unsigned long pf_mbox_cb_state[HINIC_MOD_MAX]; + unsigned long vf_mbox_cb_state[HINIC_MOD_MAX]; + + u8 send_msg_id; + enum mbox_event_state event_flag; + + /* lock for mbox event flag */ + spinlock_t mbox_lock; +}; + +struct hinic_mbox_work { + struct work_struct work; + u16 src_func_idx; + struct hinic_mbox_func_to_func *func_to_func; + struct hinic_recv_mbox *recv_mbox; +}; + +struct vf_cmd_msg_handle { + u8 cmd; + int (*cmd_msg_handler)(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size); +}; + +int hinic_register_pf_mbox_cb(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod, + hinic_pf_mbox_cb callback); + +int hinic_register_vf_mbox_cb(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod, + hinic_vf_mbox_cb callback); + +void hinic_unregister_pf_mbox_cb(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod); + +void hinic_unregister_vf_mbox_cb(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod); + +void hinic_mbox_func_aeqe_handler(void *handle, void *header, u8 size); + +void hinic_mbox_self_aeqe_handler(void *handle, void *header, u8 size); + +int hinic_func_to_func_init(struct hinic_hwdev *hwdev); + +void hinic_func_to_func_free(struct hinic_hwdev *hwdev); + +int hinic_mbox_to_pf(struct hinic_hwdev *hwdev, enum hinic_mod_type mod, + u8 cmd, void *buf_in, u16 in_size, void *buf_out, + u16 *out_size, u32 timeout); + +int hinic_mbox_to_func(struct hinic_mbox_func_to_func *func_to_func, + enum hinic_mod_type mod, u16 cmd, u16 dst_func, + void *buf_in, u16 in_size, void *buf_out, + u16 *out_size, u32 timeout); + +int hinic_mbox_to_vf(struct hinic_hwdev *hwdev, + enum hinic_mod_type mod, u16 vf_id, u8 cmd, void *buf_in, + u16 in_size, void *buf_out, u16 *out_size, u32 timeout); + +#endif diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h index 182fba17b643..a5ab044f98cc 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h @@ -60,6 +60,7 @@ enum hinic_cfg_cmd { }; enum hinic_comm_cmd { + HINIC_COMM_CMD_START_FLR = 0x1, HINIC_COMM_CMD_IO_STATUS_GET = 0x3, HINIC_COMM_CMD_CMDQ_CTXT_SET = 0x10, -- cgit v1.2.3-59-g8ed1b From 7dd29ee128654702bd493ecec0bb22c2c5f0f395 Mon Sep 17 00:00:00 2001 From: Luo bin Date: Sat, 25 Apr 2020 01:21:10 +0000 Subject: hinic: add sriov feature support adds support of basic sriov feature including initialization and tx/rx capabilities of virtual function Signed-off-by: Luo bin Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/Makefile | 2 +- drivers/net/ethernet/huawei/hinic/hinic_dev.h | 3 + drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c | 18 +- drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h | 2 +- drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c | 123 ++-- drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h | 46 ++ drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c | 98 ++- drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h | 4 +- drivers/net/ethernet/huawei/hinic/hinic_hw_io.c | 49 ++ drivers/net/ethernet/huawei/hinic/hinic_hw_io.h | 23 +- drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c | 17 +- drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h | 11 +- drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c | 7 +- drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h | 4 +- drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c | 9 +- drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h | 6 +- drivers/net/ethernet/huawei/hinic/hinic_main.c | 77 ++- drivers/net/ethernet/huawei/hinic/hinic_port.c | 76 +-- drivers/net/ethernet/huawei/hinic/hinic_port.h | 4 +- drivers/net/ethernet/huawei/hinic/hinic_rx.c | 15 +- drivers/net/ethernet/huawei/hinic/hinic_sriov.c | 698 ++++++++++++++++++++++ drivers/net/ethernet/huawei/hinic/hinic_sriov.h | 79 +++ drivers/net/ethernet/huawei/hinic/hinic_tx.c | 17 +- 23 files changed, 1195 insertions(+), 193 deletions(-) create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_sriov.c create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_sriov.h diff --git a/drivers/net/ethernet/huawei/hinic/Makefile b/drivers/net/ethernet/huawei/hinic/Makefile index a73862a64690..32a011ca44c3 100644 --- a/drivers/net/ethernet/huawei/hinic/Makefile +++ b/drivers/net/ethernet/huawei/hinic/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_HINIC) += hinic.o hinic-y := hinic_main.o hinic_tx.o hinic_rx.o hinic_port.o hinic_hw_dev.o \ hinic_hw_io.o hinic_hw_qp.o hinic_hw_cmdq.o hinic_hw_wq.o \ hinic_hw_mgmt.o hinic_hw_api_cmd.o hinic_hw_eqs.o hinic_hw_if.o \ - hinic_common.o hinic_ethtool.o hinic_hw_mbox.o + hinic_common.o hinic_ethtool.o hinic_hw_mbox.o hinic_sriov.o diff --git a/drivers/net/ethernet/huawei/hinic/hinic_dev.h b/drivers/net/ethernet/huawei/hinic/hinic_dev.h index a209b14160cc..a621ebbf7610 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_dev.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_dev.h @@ -16,6 +16,7 @@ #include "hinic_hw_dev.h" #include "hinic_tx.h" #include "hinic_rx.h" +#include "hinic_sriov.h" #define HINIC_DRV_NAME "hinic" @@ -23,6 +24,7 @@ enum hinic_flags { HINIC_LINK_UP = BIT(0), HINIC_INTF_UP = BIT(1), HINIC_RSS_ENABLE = BIT(2), + HINIC_LINK_DOWN = BIT(3), }; struct hinic_rx_mode_work { @@ -78,6 +80,7 @@ struct hinic_dev { struct hinic_rss_type rss_type; u8 *rss_hkey_user; s32 *rss_indir_user; + struct hinic_sriov_info sriov_info; }; #endif diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c index 5f2d57d1b2d3..33c5333657c1 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c @@ -64,7 +64,7 @@ #define CMDQ_WQE_SIZE 64 #define CMDQ_DEPTH SZ_4K -#define CMDQ_WQ_PAGE_SIZE SZ_4K +#define CMDQ_WQ_PAGE_SIZE SZ_256K #define WQE_LCMD_SIZE 64 #define WQE_SCMD_SIZE 64 @@ -705,7 +705,7 @@ static void cmdq_init_queue_ctxt(struct hinic_cmdq_ctxt *cmdq_ctxt, /* The data in the HW is in Big Endian Format */ wq_first_page_paddr = be64_to_cpu(*wq->block_vaddr); - pfn = CMDQ_PFN(wq_first_page_paddr, wq->wq_page_size); + pfn = CMDQ_PFN(wq_first_page_paddr, SZ_4K); ctxt_info->curr_wqe_page_pfn = HINIC_CMDQ_CTXT_PAGE_INFO_SET(pfn, CURR_WQE_PAGE_PFN) | @@ -714,16 +714,19 @@ static void cmdq_init_queue_ctxt(struct hinic_cmdq_ctxt *cmdq_ctxt, HINIC_CMDQ_CTXT_PAGE_INFO_SET(1, CEQ_EN) | HINIC_CMDQ_CTXT_PAGE_INFO_SET(cmdq->wrapped, WRAPPED); - /* block PFN - Read Modify Write */ - cmdq_first_block_paddr = cmdq_pages->page_paddr; + if (wq->num_q_pages != 1) { + /* block PFN - Read Modify Write */ + cmdq_first_block_paddr = cmdq_pages->page_paddr; - pfn = CMDQ_PFN(cmdq_first_block_paddr, wq->wq_page_size); + pfn = CMDQ_PFN(cmdq_first_block_paddr, wq->wq_page_size); + } ctxt_info->wq_block_pfn = HINIC_CMDQ_CTXT_BLOCK_INFO_SET(pfn, WQ_BLOCK_PFN) | HINIC_CMDQ_CTXT_BLOCK_INFO_SET(atomic_read(&wq->cons_idx), CI); cmdq_ctxt->func_idx = HINIC_HWIF_FUNC_IDX(cmdqs->hwif); + cmdq_ctxt->ppf_idx = HINIC_HWIF_PPF_IDX(cmdqs->hwif); cmdq_ctxt->cmdq_type = cmdq->cmdq_type; } @@ -795,11 +798,6 @@ static int init_cmdqs_ctxt(struct hinic_hwdev *hwdev, size_t cmdq_ctxts_size; int err; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "Unsupported PCI function type\n"); - return -EINVAL; - } - cmdq_ctxts_size = HINIC_MAX_CMDQ_TYPES * sizeof(*cmdq_ctxts); cmdq_ctxts = devm_kzalloc(&pdev->dev, cmdq_ctxts_size, GFP_KERNEL); if (!cmdq_ctxts) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h index 7a434b653faa..3e4b0aef9fe6 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h @@ -122,7 +122,7 @@ struct hinic_cmdq_ctxt { u16 func_idx; u8 cmdq_type; - u8 rsvd1[1]; + u8 ppf_idx; u8 rsvd2[4]; diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c index f2cf6f7ffc34..e5cab58e4ddd 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c @@ -15,7 +15,9 @@ #include #include #include +#include +#include "hinic_sriov.h" #include "hinic_hw_if.h" #include "hinic_hw_eqs.h" #include "hinic_hw_mgmt.h" @@ -46,20 +48,6 @@ enum hw_ioctxt_set_cmdq_depth { HW_IOCTXT_SET_CMDQ_DEPTH_DEFAULT, }; -/* HW struct */ -struct hinic_dev_cap { - u8 status; - u8 version; - u8 rsvd0[6]; - - u8 rsvd1[5]; - u8 intr_type; - u8 rsvd2[66]; - u16 max_sqs; - u16 max_rqs; - u8 rsvd3[208]; -}; - /** * get_capability - convert device capabilities to NIC capabilities * @hwdev: the HW device to set and convert device capabilities for @@ -67,16 +55,13 @@ struct hinic_dev_cap { * * Return 0 - Success, negative - Failure **/ -static int get_capability(struct hinic_hwdev *hwdev, - struct hinic_dev_cap *dev_cap) +static int parse_capability(struct hinic_hwdev *hwdev, + struct hinic_dev_cap *dev_cap) { struct hinic_cap *nic_cap = &hwdev->nic_cap; int num_aeqs, num_ceqs, num_irqs; - if (!HINIC_IS_PF(hwdev->hwif) && !HINIC_IS_PPF(hwdev->hwif)) - return -EINVAL; - - if (dev_cap->intr_type != INTR_MSIX_TYPE) + if (!HINIC_IS_VF(hwdev->hwif) && dev_cap->intr_type != INTR_MSIX_TYPE) return -EFAULT; num_aeqs = HINIC_HWIF_NUM_AEQS(hwdev->hwif); @@ -89,13 +74,19 @@ static int get_capability(struct hinic_hwdev *hwdev, if (nic_cap->num_qps > HINIC_Q_CTXT_MAX) nic_cap->num_qps = HINIC_Q_CTXT_MAX; - nic_cap->max_qps = dev_cap->max_sqs + 1; - if (nic_cap->max_qps != (dev_cap->max_rqs + 1)) - return -EFAULT; + if (!HINIC_IS_VF(hwdev->hwif)) + nic_cap->max_qps = dev_cap->max_sqs + 1; + else + nic_cap->max_qps = dev_cap->max_sqs; if (nic_cap->num_qps > nic_cap->max_qps) nic_cap->num_qps = nic_cap->max_qps; + if (!HINIC_IS_VF(hwdev->hwif)) { + nic_cap->max_vf = dev_cap->max_vf; + nic_cap->max_vf_qps = dev_cap->max_vf_sqs + 1; + } + return 0; } @@ -105,27 +96,26 @@ static int get_capability(struct hinic_hwdev *hwdev, * * Return 0 - Success, negative - Failure **/ -static int get_cap_from_fw(struct hinic_pfhwdev *pfhwdev) +static int get_capability(struct hinic_pfhwdev *pfhwdev) { struct hinic_hwdev *hwdev = &pfhwdev->hwdev; struct hinic_hwif *hwif = hwdev->hwif; struct pci_dev *pdev = hwif->pdev; struct hinic_dev_cap dev_cap; - u16 in_len, out_len; + u16 out_len; int err; - in_len = 0; out_len = sizeof(dev_cap); err = hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt, HINIC_MOD_CFGM, - HINIC_CFG_NIC_CAP, &dev_cap, in_len, &dev_cap, - &out_len, HINIC_MGMT_MSG_SYNC); + HINIC_CFG_NIC_CAP, &dev_cap, sizeof(dev_cap), + &dev_cap, &out_len, HINIC_MGMT_MSG_SYNC); if (err) { dev_err(&pdev->dev, "Failed to get capability from FW\n"); return err; } - return get_capability(hwdev, &dev_cap); + return parse_capability(hwdev, &dev_cap); } /** @@ -144,15 +134,14 @@ static int get_dev_cap(struct hinic_hwdev *hwdev) switch (HINIC_FUNC_TYPE(hwif)) { case HINIC_PPF: case HINIC_PF: + case HINIC_VF: pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev); - - err = get_cap_from_fw(pfhwdev); + err = get_capability(pfhwdev); if (err) { - dev_err(&pdev->dev, "Failed to get capability from FW\n"); + dev_err(&pdev->dev, "Failed to get capability\n"); return err; } break; - default: dev_err(&pdev->dev, "Unsupported PCI Function type\n"); return -EINVAL; @@ -225,15 +214,8 @@ static void disable_msix(struct hinic_hwdev *hwdev) int hinic_port_msg_cmd(struct hinic_hwdev *hwdev, enum hinic_port_cmd cmd, void *buf_in, u16 in_size, void *buf_out, u16 *out_size) { - struct hinic_hwif *hwif = hwdev->hwif; - struct pci_dev *pdev = hwif->pdev; struct hinic_pfhwdev *pfhwdev; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "unsupported PCI Function type\n"); - return -EINVAL; - } - pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev); return hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt, HINIC_MOD_L2NIC, cmd, @@ -252,14 +234,9 @@ static int init_fw_ctxt(struct hinic_hwdev *hwdev) struct hinic_hwif *hwif = hwdev->hwif; struct pci_dev *pdev = hwif->pdev; struct hinic_cmd_fw_ctxt fw_ctxt; - u16 out_size; + u16 out_size = sizeof(fw_ctxt); int err; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "Unsupported PCI Function type\n"); - return -EINVAL; - } - fw_ctxt.func_idx = HINIC_HWIF_FUNC_IDX(hwif); fw_ctxt.rx_buf_sz = HINIC_RX_BUF_SZ; @@ -288,14 +265,8 @@ static int set_hw_ioctxt(struct hinic_hwdev *hwdev, unsigned int rq_depth, { struct hinic_hwif *hwif = hwdev->hwif; struct hinic_cmd_hw_ioctxt hw_ioctxt; - struct pci_dev *pdev = hwif->pdev; struct hinic_pfhwdev *pfhwdev; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "Unsupported PCI Function type\n"); - return -EINVAL; - } - hw_ioctxt.func_idx = HINIC_HWIF_FUNC_IDX(hwif); hw_ioctxt.ppf_idx = HINIC_HWIF_PPF_IDX(hwif); @@ -374,11 +345,6 @@ static int clear_io_resources(struct hinic_hwdev *hwdev) struct hinic_pfhwdev *pfhwdev; int err; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "Unsupported PCI Function type\n"); - return -EINVAL; - } - /* sleep 100ms to wait for firmware stopping I/O */ msleep(100); @@ -410,14 +376,8 @@ static int set_resources_state(struct hinic_hwdev *hwdev, { struct hinic_cmd_set_res_state res_state; struct hinic_hwif *hwif = hwdev->hwif; - struct pci_dev *pdev = hwif->pdev; struct hinic_pfhwdev *pfhwdev; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "Unsupported PCI Function type\n"); - return -EINVAL; - } - res_state.func_idx = HINIC_HWIF_FUNC_IDX(hwif); res_state.state = state; @@ -441,8 +401,8 @@ static int get_base_qpn(struct hinic_hwdev *hwdev, u16 *base_qpn) { struct hinic_cmd_base_qpn cmd_base_qpn; struct hinic_hwif *hwif = hwdev->hwif; + u16 out_size = sizeof(cmd_base_qpn); struct pci_dev *pdev = hwif->pdev; - u16 out_size; int err; cmd_base_qpn.func_idx = HINIC_HWIF_FUNC_IDX(hwif); @@ -488,7 +448,7 @@ int hinic_hwdev_ifup(struct hinic_hwdev *hwdev) num_ceqs = HINIC_HWIF_NUM_CEQS(hwif); ceq_msix_entries = &hwdev->msix_entries[num_aeqs]; - + func_to_io->hwdev = hwdev; err = hinic_io_init(func_to_io, hwif, nic_cap->max_qps, num_ceqs, ceq_msix_entries); if (err) { @@ -558,17 +518,10 @@ void hinic_hwdev_cb_register(struct hinic_hwdev *hwdev, u16 in_size, void *buf_out, u16 *out_size)) { - struct hinic_hwif *hwif = hwdev->hwif; - struct pci_dev *pdev = hwif->pdev; struct hinic_pfhwdev *pfhwdev; struct hinic_nic_cb *nic_cb; u8 cmd_cb; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "unsupported PCI Function type\n"); - return; - } - pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev); cmd_cb = cmd - HINIC_MGMT_MSG_CMD_BASE; @@ -588,15 +541,12 @@ void hinic_hwdev_cb_unregister(struct hinic_hwdev *hwdev, enum hinic_mgmt_msg_cmd cmd) { struct hinic_hwif *hwif = hwdev->hwif; - struct pci_dev *pdev = hwif->pdev; struct hinic_pfhwdev *pfhwdev; struct hinic_nic_cb *nic_cb; u8 cmd_cb; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "unsupported PCI Function type\n"); + if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) return; - } pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev); @@ -742,12 +692,6 @@ struct hinic_hwdev *hinic_init_hwdev(struct pci_dev *pdev) return ERR_PTR(err); } - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "Unsupported PCI Function type\n"); - err = -EFAULT; - goto err_func_type; - } - pfhwdev = devm_kzalloc(&pdev->dev, sizeof(*pfhwdev), GFP_KERNEL); if (!pfhwdev) { err = -ENOMEM; @@ -791,6 +735,12 @@ struct hinic_hwdev *hinic_init_hwdev(struct pci_dev *pdev) goto err_dev_cap; } + err = hinic_vf_func_init(hwdev); + if (err) { + dev_err(&pdev->dev, "Failed to init nic mbox\n"); + goto err_vf_func_init; + } + err = init_fw_ctxt(hwdev); if (err) { dev_err(&pdev->dev, "Failed to init function table\n"); @@ -807,6 +757,8 @@ struct hinic_hwdev *hinic_init_hwdev(struct pci_dev *pdev) err_resources_state: err_init_fw_ctxt: + hinic_vf_func_free(hwdev); +err_vf_func_init: err_dev_cap: free_pfhwdev(pfhwdev); @@ -818,7 +770,6 @@ err_aeqs_init: err_init_msix: err_pfhwdev_alloc: -err_func_type: hinic_free_hwif(hwif); return ERR_PTR(err); } @@ -949,15 +900,9 @@ int hinic_hwdev_hw_ci_addr_set(struct hinic_hwdev *hwdev, struct hinic_sq *sq, { struct hinic_qp *qp = container_of(sq, struct hinic_qp, sq); struct hinic_hwif *hwif = hwdev->hwif; - struct pci_dev *pdev = hwif->pdev; struct hinic_pfhwdev *pfhwdev; struct hinic_cmd_hw_ci hw_ci; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "Unsupported PCI Function type\n"); - return -EINVAL; - } - hw_ci.dma_attr_off = 0; hw_ci.pending_limit = pending_limit; hw_ci.coalesc_timer = coalesc_timer; diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h index 2574086aa314..531d1072e0df 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h @@ -23,12 +23,20 @@ #define HINIC_MGMT_NUM_MSG_CMD (HINIC_MGMT_MSG_CMD_MAX - \ HINIC_MGMT_MSG_CMD_BASE) +#define HINIC_PF_SET_VF_ALREADY 0x4 +#define HINIC_MGMT_STATUS_EXIST 0x6 + struct hinic_cap { u16 max_qps; u16 num_qps; + u8 max_vf; + u16 max_vf_qps; }; enum hinic_port_cmd { + HINIC_PORT_CMD_VF_REGISTER = 0x0, + HINIC_PORT_CMD_VF_UNREGISTER = 0x1, + HINIC_PORT_CMD_CHANGE_MTU = 2, HINIC_PORT_CMD_ADD_VLAN = 3, @@ -84,10 +92,18 @@ enum hinic_port_cmd { HINIC_PORT_CMD_GET_GLOBAL_QPN = 102, + HINIC_PORT_CMD_SET_VF_VLAN = 106, + + HINIC_PORT_CMD_CLR_VF_VLAN, + HINIC_PORT_CMD_SET_TSO = 112, HINIC_PORT_CMD_SET_RQ_IQ_MAP = 115, + HINIC_PORT_CMD_LINK_STATUS_REPORT = 160, + + HINIC_PORT_CMD_UPDATE_MAC = 164, + HINIC_PORT_CMD_GET_CAP = 170, HINIC_PORT_CMD_SET_LRO_TIMER = 244, @@ -192,6 +208,17 @@ struct hinic_cmd_set_res_state { u32 rsvd2; }; +struct hinic_ceq_ctrl_reg { + u8 status; + u8 version; + u8 rsvd0[6]; + + u16 func_id; + u16 q_id; + u32 ctrl0; + u32 ctrl1; +}; + struct hinic_cmd_base_qpn { u8 status; u8 version; @@ -248,6 +275,25 @@ struct hinic_pfhwdev { struct hinic_nic_cb nic_cb[HINIC_MGMT_NUM_MSG_CMD]; }; +struct hinic_dev_cap { + u8 status; + u8 version; + u8 rsvd0[6]; + + u8 rsvd1[5]; + u8 intr_type; + u8 max_cos_id; + u8 er_id; + u8 port_id; + u8 max_vf; + u8 rsvd2[62]; + u16 max_sqs; + u16 max_rqs; + u16 max_vf_sqs; + u16 max_vf_rqs; + u8 rsvd3[204]; +}; + void hinic_hwdev_cb_register(struct hinic_hwdev *hwdev, enum hinic_mgmt_msg_cmd cmd, void *handle, void (*handler)(void *handle, void *buf_in, diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c index c0b6bcb067cd..397936cac304 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c @@ -17,6 +17,7 @@ #include #include +#include "hinic_hw_dev.h" #include "hinic_hw_csr.h" #include "hinic_hw_if.h" #include "hinic_hw_eqs.h" @@ -416,11 +417,11 @@ static irqreturn_t ceq_interrupt(int irq, void *data) return IRQ_HANDLED; } -static void set_ctrl0(struct hinic_eq *eq) +static u32 get_ctrl0_val(struct hinic_eq *eq, u32 addr) { struct msix_entry *msix_entry = &eq->msix_entry; enum hinic_eq_type type = eq->type; - u32 addr, val, ctrl0; + u32 val, ctrl0; if (type == HINIC_AEQ) { /* RMW Ctrl0 */ @@ -440,9 +441,7 @@ static void set_ctrl0(struct hinic_eq *eq) HINIC_AEQ_CTRL_0_SET(EQ_INT_MODE_ARMED, INT_MODE); val |= ctrl0; - - hinic_hwif_write_reg(eq->hwif, addr, val); - } else if (type == HINIC_CEQ) { + } else { /* RMW Ctrl0 */ addr = HINIC_CSR_CEQ_CTRL_0_ADDR(eq->q_id); @@ -462,16 +461,28 @@ static void set_ctrl0(struct hinic_eq *eq) HINIC_CEQ_CTRL_0_SET(EQ_INT_MODE_ARMED, INTR_MODE); val |= ctrl0; - - hinic_hwif_write_reg(eq->hwif, addr, val); } + return val; } -static void set_ctrl1(struct hinic_eq *eq) +static void set_ctrl0(struct hinic_eq *eq) { + u32 val, addr; + + if (eq->type == HINIC_AEQ) + addr = HINIC_CSR_AEQ_CTRL_0_ADDR(eq->q_id); + else + addr = HINIC_CSR_CEQ_CTRL_0_ADDR(eq->q_id); + + val = get_ctrl0_val(eq, addr); + + hinic_hwif_write_reg(eq->hwif, addr, val); +} + +static u32 get_ctrl1_val(struct hinic_eq *eq, u32 addr) +{ + u32 page_size_val, elem_size, val, ctrl1; enum hinic_eq_type type = eq->type; - u32 page_size_val, elem_size; - u32 addr, val, ctrl1; if (type == HINIC_AEQ) { /* RMW Ctrl1 */ @@ -491,9 +502,7 @@ static void set_ctrl1(struct hinic_eq *eq) HINIC_AEQ_CTRL_1_SET(page_size_val, PAGE_SIZE); val |= ctrl1; - - hinic_hwif_write_reg(eq->hwif, addr, val); - } else if (type == HINIC_CEQ) { + } else { /* RMW Ctrl1 */ addr = HINIC_CSR_CEQ_CTRL_1_ADDR(eq->q_id); @@ -508,19 +517,70 @@ static void set_ctrl1(struct hinic_eq *eq) HINIC_CEQ_CTRL_1_SET(page_size_val, PAGE_SIZE); val |= ctrl1; + } + return val; +} - hinic_hwif_write_reg(eq->hwif, addr, val); +static void set_ctrl1(struct hinic_eq *eq) +{ + u32 addr, val; + + if (eq->type == HINIC_AEQ) + addr = HINIC_CSR_AEQ_CTRL_1_ADDR(eq->q_id); + else + addr = HINIC_CSR_CEQ_CTRL_1_ADDR(eq->q_id); + + val = get_ctrl1_val(eq, addr); + + hinic_hwif_write_reg(eq->hwif, addr, val); +} + +static int set_ceq_ctrl_reg(struct hinic_eq *eq) +{ + struct hinic_ceq_ctrl_reg ceq_ctrl = {0}; + struct hinic_hwdev *hwdev = eq->hwdev; + u16 out_size = sizeof(ceq_ctrl); + u16 in_size = sizeof(ceq_ctrl); + struct hinic_pfhwdev *pfhwdev; + u32 addr; + int err; + + pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev); + + addr = HINIC_CSR_CEQ_CTRL_0_ADDR(eq->q_id); + ceq_ctrl.ctrl0 = get_ctrl0_val(eq, addr); + addr = HINIC_CSR_CEQ_CTRL_1_ADDR(eq->q_id); + ceq_ctrl.ctrl1 = get_ctrl1_val(eq, addr); + + ceq_ctrl.func_id = HINIC_HWIF_FUNC_IDX(hwdev->hwif); + ceq_ctrl.q_id = eq->q_id; + + err = hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt, HINIC_MOD_COMM, + HINIC_COMM_CMD_CEQ_CTRL_REG_WR_BY_UP, + &ceq_ctrl, in_size, + &ceq_ctrl, &out_size, HINIC_MGMT_MSG_SYNC); + if (err || !out_size || ceq_ctrl.status) { + dev_err(&hwdev->hwif->pdev->dev, + "Failed to set ceq %d ctrl reg, err: %d status: 0x%x, out_size: 0x%x\n", + eq->q_id, err, ceq_ctrl.status, out_size); + return -EFAULT; } + + return 0; } /** * set_eq_ctrls - setting eq's ctrl registers * @eq: the Event Queue for setting **/ -static void set_eq_ctrls(struct hinic_eq *eq) +static int set_eq_ctrls(struct hinic_eq *eq) { + if (HINIC_IS_VF(eq->hwif) && eq->type == HINIC_CEQ) + return set_ceq_ctrl_reg(eq); + set_ctrl0(eq); set_ctrl1(eq); + return 0; } /** @@ -703,7 +763,12 @@ static int init_eq(struct hinic_eq *eq, struct hinic_hwif *hwif, return -EINVAL; } - set_eq_ctrls(eq); + err = set_eq_ctrls(eq); + if (err) { + dev_err(&pdev->dev, "Failed to set eq ctrls\n"); + return err; + } + eq_update_ci(eq, EQ_ARMED); err = alloc_eq_pages(eq); @@ -859,6 +924,7 @@ int hinic_ceqs_init(struct hinic_ceqs *ceqs, struct hinic_hwif *hwif, ceqs->num_ceqs = num_ceqs; for (q_id = 0; q_id < num_ceqs; q_id++) { + ceqs->ceq[q_id].hwdev = ceqs->hwdev; err = init_eq(&ceqs->ceq[q_id], hwif, HINIC_CEQ, q_id, q_len, page_size, msix_entries[q_id]); if (err) { diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h index d73256da4b80..74b9ff90640c 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h @@ -172,7 +172,7 @@ struct hinic_eq_work { struct hinic_eq { struct hinic_hwif *hwif; - + struct hinic_hwdev *hwdev; enum hinic_eq_type type; int q_id; u32 q_len; @@ -220,7 +220,7 @@ struct hinic_ceq_cb { struct hinic_ceqs { struct hinic_hwif *hwif; - + struct hinic_hwdev *hwdev; struct hinic_eq ceq[HINIC_MAX_CEQS]; int num_ceqs; diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c index d66f86fa3f46..a4581c988a63 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c @@ -15,6 +15,7 @@ #include #include +#include "hinic_hw_dev.h" #include "hinic_hw_if.h" #include "hinic_hw_eqs.h" #include "hinic_hw_wqe.h" @@ -34,6 +35,8 @@ #define DB_IDX(db, db_base) \ (((unsigned long)(db) - (unsigned long)(db_base)) / HINIC_DB_PAGE_SIZE) +#define HINIC_PAGE_SIZE_HW(pg_size) ((u8)ilog2((u32)((pg_size) >> 12))) + enum io_cmd { IO_CMD_MODIFY_QUEUE_CTXT = 0, IO_CMD_CLEAN_QUEUE_CTXT, @@ -484,6 +487,33 @@ void hinic_io_destroy_qps(struct hinic_func_to_io *func_to_io, int num_qps) devm_kfree(&pdev->dev, func_to_io->qps); } +int hinic_set_wq_page_size(struct hinic_hwdev *hwdev, u16 func_idx, + u32 page_size) +{ + struct hinic_wq_page_size page_size_info = {0}; + u16 out_size = sizeof(page_size_info); + struct hinic_pfhwdev *pfhwdev; + int err; + + pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev); + + page_size_info.func_idx = func_idx; + page_size_info.ppf_idx = HINIC_HWIF_PPF_IDX(hwdev->hwif); + page_size_info.page_size = HINIC_PAGE_SIZE_HW(page_size); + + err = hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt, HINIC_MOD_COMM, + HINIC_COMM_CMD_PAGESIZE_SET, &page_size_info, + sizeof(page_size_info), &page_size_info, + &out_size, HINIC_MGMT_MSG_SYNC); + if (err || !out_size || page_size_info.status) { + dev_err(&hwdev->hwif->pdev->dev, "Failed to set wq page size, err: %d, status: 0x%x, out_size: 0x%0x\n", + err, page_size_info.status, out_size); + return -EFAULT; + } + + return 0; +} + /** * hinic_io_init - Initialize the IO components * @func_to_io: func to io channel that holds the IO components @@ -506,6 +536,7 @@ int hinic_io_init(struct hinic_func_to_io *func_to_io, func_to_io->hwif = hwif; func_to_io->qps = NULL; func_to_io->max_qps = max_qps; + func_to_io->ceqs.hwdev = func_to_io->hwdev; err = hinic_ceqs_init(&func_to_io->ceqs, hwif, num_ceqs, HINIC_DEFAULT_CEQ_LEN, HINIC_EQ_PAGE_SIZE, @@ -541,6 +572,14 @@ int hinic_io_init(struct hinic_func_to_io *func_to_io, func_to_io->cmdq_db_area[cmdq] = db_area; } + err = hinic_set_wq_page_size(func_to_io->hwdev, + HINIC_HWIF_FUNC_IDX(hwif), + HINIC_DEFAULT_WQ_PAGE_SIZE); + if (err) { + dev_err(&func_to_io->hwif->pdev->dev, "Failed to set wq page size\n"); + goto init_wq_pg_size_err; + } + err = hinic_init_cmdqs(&func_to_io->cmdqs, hwif, func_to_io->cmdq_db_area); if (err) { @@ -551,6 +590,11 @@ int hinic_io_init(struct hinic_func_to_io *func_to_io, return 0; err_init_cmdqs: + if (!HINIC_IS_VF(func_to_io->hwif)) + hinic_set_wq_page_size(func_to_io->hwdev, + HINIC_HWIF_FUNC_IDX(hwif), + HINIC_HW_WQ_PAGE_SIZE); +init_wq_pg_size_err: err_db_area: for (type = HINIC_CMDQ_SYNC; type < cmdq; type++) return_db_area(func_to_io, func_to_io->cmdq_db_area[type]); @@ -575,6 +619,11 @@ void hinic_io_free(struct hinic_func_to_io *func_to_io) hinic_free_cmdqs(&func_to_io->cmdqs); + if (!HINIC_IS_VF(func_to_io->hwif)) + hinic_set_wq_page_size(func_to_io->hwdev, + HINIC_HWIF_FUNC_IDX(func_to_io->hwif), + HINIC_HW_WQ_PAGE_SIZE); + for (cmdq = HINIC_CMDQ_SYNC; cmdq < HINIC_MAX_CMDQ_TYPES; cmdq++) return_db_area(func_to_io, func_to_io->cmdq_db_area[cmdq]); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_io.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_io.h index cac2b722e7dc..28c0594f636d 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_io.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_io.h @@ -20,6 +20,8 @@ #define HINIC_DB_PAGE_SIZE SZ_4K #define HINIC_DB_SIZE SZ_4M +#define HINIC_HW_WQ_PAGE_SIZE SZ_4K +#define HINIC_DEFAULT_WQ_PAGE_SIZE SZ_256K #define HINIC_DB_MAX_AREAS (HINIC_DB_SIZE / HINIC_DB_PAGE_SIZE) @@ -47,7 +49,7 @@ struct hinic_free_db_area { struct hinic_func_to_io { struct hinic_hwif *hwif; - + struct hinic_hwdev *hwdev; struct hinic_ceqs ceqs; struct hinic_wqs wqs; @@ -69,8 +71,27 @@ struct hinic_func_to_io { void __iomem *cmdq_db_area[HINIC_MAX_CMDQ_TYPES]; struct hinic_cmdqs cmdqs; + + u16 max_vfs; + struct vf_data_storage *vf_infos; + u8 link_status; }; +struct hinic_wq_page_size { + u8 status; + u8 version; + u8 rsvd0[6]; + + u16 func_idx; + u8 ppf_idx; + u8 page_size; + + u32 rsvd1; +}; + +int hinic_set_wq_page_size(struct hinic_hwdev *hwdev, u16 func_idx, + u32 page_size); + int hinic_io_create_qps(struct hinic_func_to_io *func_to_io, u16 base_qpn, int num_qps, struct msix_entry *sq_msix_entries, diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c index 8995e32dd1c0..eef855f11a01 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c @@ -353,7 +353,11 @@ int hinic_msg_to_mgmt(struct hinic_pf_to_mgmt *pf_to_mgmt, return -EINVAL; } - return msg_to_mgmt_sync(pf_to_mgmt, mod, cmd, buf_in, in_size, + if (HINIC_IS_VF(hwif)) + return hinic_mbox_to_pf(pf_to_mgmt->hwdev, mod, cmd, buf_in, + in_size, buf_out, out_size, 0); + else + return msg_to_mgmt_sync(pf_to_mgmt, mod, cmd, buf_in, in_size, buf_out, out_size, MGMT_DIRECT_SEND, MSG_NOT_RESP); } @@ -390,8 +394,8 @@ static void mgmt_recv_msg_handler(struct hinic_pf_to_mgmt *pf_to_mgmt, recv_msg->msg, recv_msg->msg_len, buf_out, &out_size); else - dev_err(&pdev->dev, "No MGMT msg handler, mod = %d\n", - recv_msg->mod); + dev_err(&pdev->dev, "No MGMT msg handler, mod: %d, cmd: %d\n", + recv_msg->mod, recv_msg->cmd); mgmt_cb->state &= ~HINIC_MGMT_CB_RUNNING; @@ -553,6 +557,10 @@ int hinic_pf_to_mgmt_init(struct hinic_pf_to_mgmt *pf_to_mgmt, int err; pf_to_mgmt->hwif = hwif; + pf_to_mgmt->hwdev = hwdev; + + if (HINIC_IS_VF(hwif)) + return 0; sema_init(&pf_to_mgmt->sync_msg_lock, 1); pf_to_mgmt->sync_msg_id = 0; @@ -584,6 +592,9 @@ void hinic_pf_to_mgmt_free(struct hinic_pf_to_mgmt *pf_to_mgmt) struct hinic_pfhwdev *pfhwdev = mgmt_to_pfhwdev(pf_to_mgmt); struct hinic_hwdev *hwdev = &pfhwdev->hwdev; + if (HINIC_IS_VF(hwdev->hwif)) + return; + hinic_aeq_unregister_hw_cb(&hwdev->aeqs, HINIC_MSG_FROM_MGMT_CPU); hinic_api_cmd_free(pf_to_mgmt->cmd_chain); } diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h index a5ab044f98cc..c2b142c08b0e 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h @@ -62,6 +62,7 @@ enum hinic_cfg_cmd { enum hinic_comm_cmd { HINIC_COMM_CMD_START_FLR = 0x1, HINIC_COMM_CMD_IO_STATUS_GET = 0x3, + HINIC_COMM_CMD_DMA_ATTR_SET = 0x4, HINIC_COMM_CMD_CMDQ_CTXT_SET = 0x10, HINIC_COMM_CMD_CMDQ_CTXT_GET = 0x11, @@ -75,7 +76,13 @@ enum hinic_comm_cmd { HINIC_COMM_CMD_IO_RES_CLEAR = 0x29, - HINIC_COMM_CMD_MAX = 0x32, + HINIC_COMM_CMD_CEQ_CTRL_REG_WR_BY_UP = 0x33, + + HINIC_COMM_CMD_L2NIC_RESET = 0x4b, + + HINIC_COMM_CMD_PAGESIZE_SET = 0x50, + + HINIC_COMM_CMD_MAX = 0x51, }; enum hinic_mgmt_cb_state { @@ -108,7 +115,7 @@ struct hinic_mgmt_cb { struct hinic_pf_to_mgmt { struct hinic_hwif *hwif; - + struct hinic_hwdev *hwdev; struct semaphore sync_msg_lock; u16 sync_msg_id; u8 *sync_msg_buf; diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c index be364b7a7019..20c5c8ea452e 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c @@ -108,7 +108,12 @@ void hinic_sq_prepare_ctxt(struct hinic_sq_ctxt *sq_ctxt, wq_page_pfn_hi = upper_32_bits(wq_page_pfn); wq_page_pfn_lo = lower_32_bits(wq_page_pfn); - wq_block_pfn = HINIC_WQ_BLOCK_PFN(wq->block_paddr); + /* If only one page, use 0-level CLA */ + if (wq->num_q_pages == 1) + wq_block_pfn = HINIC_WQ_BLOCK_PFN(wq_page_addr); + else + wq_block_pfn = HINIC_WQ_BLOCK_PFN(wq->block_paddr); + wq_block_pfn_hi = upper_32_bits(wq_block_pfn); wq_block_pfn_lo = lower_32_bits(wq_block_pfn); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h index 79091e131418..c30d092e48d5 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h @@ -38,8 +38,8 @@ #define HINIC_SQ_WQEBB_SIZE 64 #define HINIC_RQ_WQEBB_SIZE 32 -#define HINIC_SQ_PAGE_SIZE SZ_4K -#define HINIC_RQ_PAGE_SIZE SZ_4K +#define HINIC_SQ_PAGE_SIZE SZ_256K +#define HINIC_RQ_PAGE_SIZE SZ_256K #define HINIC_SQ_DEPTH SZ_4K #define HINIC_RQ_DEPTH SZ_4K diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c index 03363216ff59..5dc3743f8091 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c @@ -503,7 +503,7 @@ err_alloc_wq_pages: * Return 0 - Success, negative - Failure **/ int hinic_wq_allocate(struct hinic_wqs *wqs, struct hinic_wq *wq, - u16 wqebb_size, u16 wq_page_size, u16 q_depth, + u16 wqebb_size, u32 wq_page_size, u16 q_depth, u16 max_wqe_size) { struct hinic_hwif *hwif = wqs->hwif; @@ -600,7 +600,7 @@ void hinic_wq_free(struct hinic_wqs *wqs, struct hinic_wq *wq) **/ int hinic_wqs_cmdq_alloc(struct hinic_cmdq_pages *cmdq_pages, struct hinic_wq *wq, struct hinic_hwif *hwif, - int cmdq_blocks, u16 wqebb_size, u16 wq_page_size, + int cmdq_blocks, u16 wqebb_size, u32 wq_page_size, u16 q_depth, u16 max_wqe_size) { struct pci_dev *pdev = hwif->pdev; @@ -768,7 +768,10 @@ struct hinic_hw_wqe *hinic_get_wqe(struct hinic_wq *wq, unsigned int wqe_size, *prod_idx = curr_prod_idx; - if (curr_pg != end_pg) { + /* If we only have one page, still need to get shadown wqe when + * wqe rolling-over page + */ + if (curr_pg != end_pg || MASKED_WQE_IDX(wq, end_prod_idx) < *prod_idx) { void *shadow_addr = &wq->shadow_wqe[curr_pg * wq->max_wqe_size]; copy_wqe_to_shadow(wq, shadow_addr, num_wqebbs, *prod_idx); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h index 811eef744140..b06f8c0255de 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h @@ -26,7 +26,7 @@ struct hinic_wq { int block_idx; u16 wqebb_size; - u16 wq_page_size; + u32 wq_page_size; u16 q_depth; u16 max_wqe_size; u16 num_wqebbs_per_page; @@ -76,7 +76,7 @@ struct hinic_cmdq_pages { int hinic_wqs_cmdq_alloc(struct hinic_cmdq_pages *cmdq_pages, struct hinic_wq *wq, struct hinic_hwif *hwif, - int cmdq_blocks, u16 wqebb_size, u16 wq_page_size, + int cmdq_blocks, u16 wqebb_size, u32 wq_page_size, u16 q_depth, u16 max_wqe_size); void hinic_wqs_cmdq_free(struct hinic_cmdq_pages *cmdq_pages, @@ -88,7 +88,7 @@ int hinic_wqs_alloc(struct hinic_wqs *wqs, int num_wqs, void hinic_wqs_free(struct hinic_wqs *wqs); int hinic_wq_allocate(struct hinic_wqs *wqs, struct hinic_wq *wq, - u16 wqebb_size, u16 wq_page_size, u16 q_depth, + u16 wqebb_size, u32 wq_page_size, u16 q_depth, u16 max_wqe_size); void hinic_wq_free(struct hinic_wqs *wqs, struct hinic_wq *wq); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 13560975c103..cd71249f9b1c 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -29,6 +29,7 @@ #include "hinic_tx.h" #include "hinic_rx.h" #include "hinic_dev.h" +#include "hinic_sriov.h" MODULE_AUTHOR("Huawei Technologies CO., Ltd"); MODULE_DESCRIPTION("Huawei Intelligent NIC driver"); @@ -46,6 +47,7 @@ MODULE_PARM_DESC(rx_weight, "Number Rx packets for NAPI budget (default=64)"); #define HINIC_DEV_ID_DUAL_PORT_100GE 0x0200 #define HINIC_DEV_ID_DUAL_PORT_100GE_MEZZ 0x0205 #define HINIC_DEV_ID_QUAD_PORT_25GE_MEZZ 0x0210 +#define HINIC_DEV_ID_VF 0x375e #define HINIC_WQ_NAME "hinic_dev" @@ -65,6 +67,8 @@ MODULE_PARM_DESC(rx_weight, "Number Rx packets for NAPI budget (default=64)"); #define rx_mode_work_to_nic_dev(rx_mode_work) \ container_of(rx_mode_work, struct hinic_dev, rx_mode_work) +#define HINIC_WAIT_SRIOV_CFG_TIMEOUT 15000 + static int change_mac_addr(struct net_device *netdev, const u8 *addr); static int set_features(struct hinic_dev *nic_dev, @@ -423,8 +427,9 @@ static int hinic_open(struct net_device *netdev) goto err_func_port_state; } - /* Wait up to 3 sec between port enable to link state */ - msleep(3000); + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) + /* Wait up to 3 sec between port enable to link state */ + msleep(3000); down(&nic_dev->mgmt_lock); @@ -434,6 +439,9 @@ static int hinic_open(struct net_device *netdev) goto err_port_link; } + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) + hinic_notify_all_vfs_link_changed(nic_dev->hwdev, link_state); + if (link_state == HINIC_LINK_STATE_UP) nic_dev->flags |= HINIC_LINK_UP; @@ -497,6 +505,9 @@ static int hinic_close(struct net_device *netdev) up(&nic_dev->mgmt_lock); + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) + hinic_notify_all_vfs_link_changed(nic_dev->hwdev, 0); + err = hinic_port_set_func_state(nic_dev, HINIC_FUNC_PORT_DISABLE); if (err) { netif_err(nic_dev, drv, netdev, @@ -685,7 +696,7 @@ static int hinic_vlan_rx_add_vid(struct net_device *netdev, } err = hinic_port_add_mac(nic_dev, netdev->dev_addr, vid); - if (err) { + if (err && err != HINIC_PF_SET_VF_ALREADY) { netif_err(nic_dev, drv, netdev, "Failed to set mac\n"); goto err_add_mac; } @@ -737,8 +748,6 @@ static void set_rx_mode(struct work_struct *work) struct hinic_rx_mode_work *rx_mode_work = work_to_rx_mode_work(work); struct hinic_dev *nic_dev = rx_mode_work_to_nic_dev(rx_mode_work); - netif_info(nic_dev, drv, nic_dev->netdev, "set rx mode work\n"); - hinic_port_set_rx_mode(nic_dev, rx_mode_work->rx_mode); __dev_uc_sync(nic_dev->netdev, add_mac_addr, remove_mac_addr); @@ -896,6 +905,10 @@ static void link_status_event_handler(void *handle, void *buf_in, u16 in_size, netif_info(nic_dev, drv, nic_dev->netdev, "HINIC_Link is DOWN\n"); } + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) + hinic_notify_all_vfs_link_changed(nic_dev->hwdev, + link_status->link); + ret_link_status = buf_out; ret_link_status->status = 0; @@ -969,7 +982,9 @@ static int nic_dev_init(struct pci_dev *pdev) } hinic_set_ethtool_ops(netdev); + netdev->netdev_ops = &hinic_netdev_ops; + netdev->max_mtu = ETH_MAX_MTU; nic_dev = netdev_priv(netdev); @@ -981,6 +996,8 @@ static int nic_dev_init(struct pci_dev *pdev) nic_dev->rxqs = NULL; nic_dev->tx_weight = tx_weight; nic_dev->rx_weight = rx_weight; + nic_dev->sriov_info.hwdev = hwdev; + nic_dev->sriov_info.pdev = pdev; sema_init(&nic_dev->mgmt_lock, 1); @@ -1007,11 +1024,25 @@ static int nic_dev_init(struct pci_dev *pdev) pci_set_drvdata(pdev, netdev); err = hinic_port_get_mac(nic_dev, netdev->dev_addr); - if (err) - dev_warn(&pdev->dev, "Failed to get mac address\n"); + if (err) { + dev_err(&pdev->dev, "Failed to get mac address\n"); + goto err_get_mac; + } + + if (!is_valid_ether_addr(netdev->dev_addr)) { + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) { + dev_err(&pdev->dev, "Invalid MAC address\n"); + err = -EIO; + goto err_add_mac; + } + + dev_info(&pdev->dev, "Invalid MAC address %pM, using random\n", + netdev->dev_addr); + eth_hw_addr_random(netdev); + } err = hinic_port_add_mac(nic_dev, netdev->dev_addr, 0); - if (err) { + if (err && err != HINIC_PF_SET_VF_ALREADY) { dev_err(&pdev->dev, "Failed to add mac\n"); goto err_add_mac; } @@ -1053,6 +1084,7 @@ err_set_features: cancel_work_sync(&rx_mode_work->work); err_set_mtu: +err_get_mac: err_add_mac: pci_set_drvdata(pdev, NULL); destroy_workqueue(nic_dev->workq); @@ -1126,12 +1158,37 @@ err_pci_regions: return err; } +#define HINIC_WAIT_SRIOV_CFG_TIMEOUT 15000 + +static void wait_sriov_cfg_complete(struct hinic_dev *nic_dev) +{ + struct hinic_sriov_info *sriov_info = &nic_dev->sriov_info; + u32 loop_cnt = 0; + + set_bit(HINIC_FUNC_REMOVE, &sriov_info->state); + usleep_range(9900, 10000); + + while (loop_cnt < HINIC_WAIT_SRIOV_CFG_TIMEOUT) { + if (!test_bit(HINIC_SRIOV_ENABLE, &sriov_info->state) && + !test_bit(HINIC_SRIOV_DISABLE, &sriov_info->state)) + return; + + usleep_range(9900, 10000); + loop_cnt++; + } +} + static void hinic_remove(struct pci_dev *pdev) { struct net_device *netdev = pci_get_drvdata(pdev); struct hinic_dev *nic_dev = netdev_priv(netdev); struct hinic_rx_mode_work *rx_mode_work; + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) { + wait_sriov_cfg_complete(nic_dev); + hinic_pci_sriov_disable(pdev); + } + unregister_netdev(netdev); hinic_hwdev_cb_unregister(nic_dev->hwdev, @@ -1144,6 +1201,8 @@ static void hinic_remove(struct pci_dev *pdev) destroy_workqueue(nic_dev->workq); + hinic_vf_func_free(nic_dev->hwdev); + hinic_free_hwdev(nic_dev->hwdev); free_netdev(netdev); @@ -1164,6 +1223,7 @@ static const struct pci_device_id hinic_pci_table[] = { { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_DUAL_PORT_100GE), 0}, { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_DUAL_PORT_100GE_MEZZ), 0}, { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_QUAD_PORT_25GE_MEZZ), 0}, + { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_VF), 0}, { 0, 0} }; MODULE_DEVICE_TABLE(pci, hinic_pci_table); @@ -1174,6 +1234,7 @@ static struct pci_driver hinic_driver = { .probe = hinic_probe, .remove = hinic_remove, .shutdown = hinic_shutdown, + .sriov_configure = hinic_pci_sriov_configure, }; module_pci_driver(hinic_driver); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_port.c b/drivers/net/ethernet/huawei/hinic/hinic_port.c index 1e389a004e50..b7fe0adcc29a 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_port.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_port.c @@ -37,20 +37,14 @@ enum mac_op { static int change_mac(struct hinic_dev *nic_dev, const u8 *addr, u16 vlan_id, enum mac_op op) { - struct net_device *netdev = nic_dev->netdev; struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_port_mac_cmd port_mac_cmd; struct hinic_hwif *hwif = hwdev->hwif; + u16 out_size = sizeof(port_mac_cmd); struct pci_dev *pdev = hwif->pdev; enum hinic_port_cmd cmd; - u16 out_size; int err; - if (vlan_id >= VLAN_N_VID) { - netif_err(nic_dev, drv, netdev, "Invalid VLAN number\n"); - return -EINVAL; - } - if (op == MAC_SET) cmd = HINIC_PORT_CMD_SET_MAC; else @@ -63,12 +57,25 @@ static int change_mac(struct hinic_dev *nic_dev, const u8 *addr, err = hinic_port_msg_cmd(hwdev, cmd, &port_mac_cmd, sizeof(port_mac_cmd), &port_mac_cmd, &out_size); - if (err || (out_size != sizeof(port_mac_cmd)) || port_mac_cmd.status) { + if (err || out_size != sizeof(port_mac_cmd) || + (port_mac_cmd.status && + port_mac_cmd.status != HINIC_PF_SET_VF_ALREADY && + port_mac_cmd.status != HINIC_MGMT_STATUS_EXIST)) { dev_err(&pdev->dev, "Failed to change MAC, ret = %d\n", port_mac_cmd.status); return -EFAULT; } + if (cmd == HINIC_PORT_CMD_SET_MAC && port_mac_cmd.status == + HINIC_PF_SET_VF_ALREADY) { + dev_warn(&pdev->dev, "PF has already set VF mac, Ignore set operation\n"); + return HINIC_PF_SET_VF_ALREADY; + } + + if (cmd == HINIC_PORT_CMD_SET_MAC && port_mac_cmd.status == + HINIC_MGMT_STATUS_EXIST) + dev_warn(&pdev->dev, "MAC is repeated. Ignore set operation\n"); + return 0; } @@ -112,8 +119,8 @@ int hinic_port_get_mac(struct hinic_dev *nic_dev, u8 *addr) struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_port_mac_cmd port_mac_cmd; struct hinic_hwif *hwif = hwdev->hwif; + u16 out_size = sizeof(port_mac_cmd); struct pci_dev *pdev = hwif->pdev; - u16 out_size; int err; port_mac_cmd.func_idx = HINIC_HWIF_FUNC_IDX(hwif); @@ -144,9 +151,9 @@ int hinic_port_set_mtu(struct hinic_dev *nic_dev, int new_mtu) struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_port_mtu_cmd port_mtu_cmd; struct hinic_hwif *hwif = hwdev->hwif; + u16 out_size = sizeof(port_mtu_cmd); struct pci_dev *pdev = hwif->pdev; int err, max_frame; - u16 out_size; if (new_mtu < HINIC_MIN_MTU_SIZE) { netif_err(nic_dev, drv, netdev, "mtu < MIN MTU size"); @@ -248,14 +255,9 @@ int hinic_port_link_state(struct hinic_dev *nic_dev, struct hinic_hwif *hwif = hwdev->hwif; struct hinic_port_link_cmd link_cmd; struct pci_dev *pdev = hwif->pdev; - u16 out_size; + u16 out_size = sizeof(link_cmd); int err; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "unsupported PCI Function type\n"); - return -EINVAL; - } - link_cmd.func_idx = HINIC_HWIF_FUNC_IDX(hwif); err = hinic_port_msg_cmd(hwdev, HINIC_PORT_CMD_GET_LINK_STATE, @@ -284,13 +286,11 @@ int hinic_port_set_state(struct hinic_dev *nic_dev, enum hinic_port_state state) struct hinic_port_state_cmd port_state; struct hinic_hwif *hwif = hwdev->hwif; struct pci_dev *pdev = hwif->pdev; - u16 out_size; + u16 out_size = sizeof(port_state); int err; - if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) { - dev_err(&pdev->dev, "unsupported PCI Function type\n"); - return -EINVAL; - } + if (HINIC_IS_VF(hwdev->hwif)) + return 0; port_state.state = state; @@ -320,7 +320,7 @@ int hinic_port_set_func_state(struct hinic_dev *nic_dev, struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_hwif *hwif = hwdev->hwif; struct pci_dev *pdev = hwif->pdev; - u16 out_size; + u16 out_size = sizeof(func_state); int err; func_state.func_idx = HINIC_HWIF_FUNC_IDX(hwif); @@ -351,7 +351,7 @@ int hinic_port_get_cap(struct hinic_dev *nic_dev, struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_hwif *hwif = hwdev->hwif; struct pci_dev *pdev = hwif->pdev; - u16 out_size; + u16 out_size = sizeof(*port_cap); int err; port_cap->func_idx = HINIC_HWIF_FUNC_IDX(hwif); @@ -382,7 +382,7 @@ int hinic_port_set_tso(struct hinic_dev *nic_dev, enum hinic_tso_state state) struct hinic_hwif *hwif = hwdev->hwif; struct hinic_tso_config tso_cfg = {0}; struct pci_dev *pdev = hwif->pdev; - u16 out_size; + u16 out_size = sizeof(tso_cfg); int err; tso_cfg.func_id = HINIC_HWIF_FUNC_IDX(hwif); @@ -405,9 +405,9 @@ int hinic_set_rx_csum_offload(struct hinic_dev *nic_dev, u32 en) { struct hinic_checksum_offload rx_csum_cfg = {0}; struct hinic_hwdev *hwdev = nic_dev->hwdev; + u16 out_size = sizeof(rx_csum_cfg); struct hinic_hwif *hwif; struct pci_dev *pdev; - u16 out_size; int err; if (!hwdev) @@ -443,6 +443,7 @@ int hinic_set_rx_vlan_offload(struct hinic_dev *nic_dev, u8 en) if (!hwdev) return -EINVAL; + out_size = sizeof(vlan_cfg); hwif = hwdev->hwif; pdev = hwif->pdev; vlan_cfg.func_id = HINIC_HWIF_FUNC_IDX(hwif); @@ -465,8 +466,8 @@ int hinic_set_max_qnum(struct hinic_dev *nic_dev, u8 num_rqs) { struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_hwif *hwif = hwdev->hwif; - struct pci_dev *pdev = hwif->pdev; struct hinic_rq_num rq_num = { 0 }; + struct pci_dev *pdev = hwif->pdev; u16 out_size = sizeof(rq_num); int err; @@ -491,8 +492,8 @@ static int hinic_set_rx_lro(struct hinic_dev *nic_dev, u8 ipv4_en, u8 ipv6_en, u8 max_wqe_num) { struct hinic_hwdev *hwdev = nic_dev->hwdev; - struct hinic_hwif *hwif = hwdev->hwif; struct hinic_lro_config lro_cfg = { 0 }; + struct hinic_hwif *hwif = hwdev->hwif; struct pci_dev *pdev = hwif->pdev; u16 out_size = sizeof(lro_cfg); int err; @@ -568,6 +569,9 @@ int hinic_set_rx_lro_state(struct hinic_dev *nic_dev, u8 lro_en, if (err) return err; + if (HINIC_IS_VF(nic_dev->hwdev->hwif)) + return 0; + err = hinic_set_rx_lro_timer(nic_dev, lro_timer); if (err) return err; @@ -741,9 +745,9 @@ int hinic_get_rss_type(struct hinic_dev *nic_dev, u32 tmpl_idx, { struct hinic_rss_context_table ctx_tbl = { 0 }; struct hinic_hwdev *hwdev = nic_dev->hwdev; + u16 out_size = sizeof(ctx_tbl); struct hinic_hwif *hwif; struct pci_dev *pdev; - u16 out_size = sizeof(ctx_tbl); int err; if (!hwdev || !rss_type) @@ -784,7 +788,7 @@ int hinic_rss_set_template_tbl(struct hinic_dev *nic_dev, u32 template_id, struct hinic_hwif *hwif = hwdev->hwif; struct hinic_rss_key rss_key = { 0 }; struct pci_dev *pdev = hwif->pdev; - u16 out_size; + u16 out_size = sizeof(rss_key); int err; rss_key.func_id = HINIC_HWIF_FUNC_IDX(hwif); @@ -809,9 +813,9 @@ int hinic_rss_get_template_tbl(struct hinic_dev *nic_dev, u32 tmpl_idx, { struct hinic_rss_template_key temp_key = { 0 }; struct hinic_hwdev *hwdev = nic_dev->hwdev; + u16 out_size = sizeof(temp_key); struct hinic_hwif *hwif; struct pci_dev *pdev; - u16 out_size = sizeof(temp_key); int err; if (!hwdev || !temp) @@ -844,7 +848,7 @@ int hinic_rss_set_hash_engine(struct hinic_dev *nic_dev, u8 template_id, struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_hwif *hwif = hwdev->hwif; struct pci_dev *pdev = hwif->pdev; - u16 out_size; + u16 out_size = sizeof(rss_engine); int err; rss_engine.func_id = HINIC_HWIF_FUNC_IDX(hwif); @@ -868,9 +872,9 @@ int hinic_rss_get_hash_engine(struct hinic_dev *nic_dev, u8 tmpl_idx, u8 *type) { struct hinic_rss_engine_type hash_type = { 0 }; struct hinic_hwdev *hwdev = nic_dev->hwdev; + u16 out_size = sizeof(hash_type); struct hinic_hwif *hwif; struct pci_dev *pdev; - u16 out_size = sizeof(hash_type); int err; if (!hwdev || !type) @@ -901,7 +905,7 @@ int hinic_rss_cfg(struct hinic_dev *nic_dev, u8 rss_en, u8 template_id) struct hinic_rss_config rss_cfg = { 0 }; struct hinic_hwif *hwif = hwdev->hwif; struct pci_dev *pdev = hwif->pdev; - u16 out_size; + u16 out_size = sizeof(rss_cfg); int err; rss_cfg.func_id = HINIC_HWIF_FUNC_IDX(hwif); @@ -927,8 +931,8 @@ int hinic_rss_template_alloc(struct hinic_dev *nic_dev, u8 *tmpl_idx) struct hinic_rss_template_mgmt template_mgmt = { 0 }; struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_hwif *hwif = hwdev->hwif; + u16 out_size = sizeof(template_mgmt); struct pci_dev *pdev = hwif->pdev; - u16 out_size; int err; template_mgmt.func_id = HINIC_HWIF_FUNC_IDX(hwif); @@ -953,8 +957,8 @@ int hinic_rss_template_free(struct hinic_dev *nic_dev, u8 tmpl_idx) struct hinic_rss_template_mgmt template_mgmt = { 0 }; struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_hwif *hwif = hwdev->hwif; + u16 out_size = sizeof(template_mgmt); struct pci_dev *pdev = hwif->pdev; - u16 out_size; int err; template_mgmt.func_id = HINIC_HWIF_FUNC_IDX(hwif); @@ -1043,9 +1047,9 @@ int hinic_get_mgmt_version(struct hinic_dev *nic_dev, u8 *mgmt_ver) { struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_version_info up_ver = {0}; + u16 out_size = sizeof(up_ver); struct hinic_hwif *hwif; struct pci_dev *pdev; - u16 out_size; int err; if (!hwdev) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_port.h b/drivers/net/ethernet/huawei/hinic/hinic_port.h index 44772fd47fc1..5ad04fb6722a 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_port.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_port.h @@ -148,9 +148,9 @@ struct hinic_port_link_status { u8 version; u8 rsvd0[6]; - u16 rsvd1; + u16 func_id; u8 link; - u8 rsvd2; + u8 port_id; }; struct hinic_port_func_state_cmd { diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c index 815649e37cb1..af20d0dd6de7 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c @@ -432,9 +432,11 @@ static int rx_poll(struct napi_struct *napi, int budget) return budget; napi_complete(napi); - hinic_hwdev_set_msix_state(nic_dev->hwdev, - rq->msix_entry, - HINIC_MSIX_ENABLE); + + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) + hinic_hwdev_set_msix_state(nic_dev->hwdev, + rq->msix_entry, + HINIC_MSIX_ENABLE); return pkts; } @@ -461,9 +463,10 @@ static irqreturn_t rx_irq(int irq, void *data) /* Disable the interrupt until napi will be completed */ nic_dev = netdev_priv(rxq->netdev); - hinic_hwdev_set_msix_state(nic_dev->hwdev, - rq->msix_entry, - HINIC_MSIX_DISABLE); + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) + hinic_hwdev_set_msix_state(nic_dev->hwdev, + rq->msix_entry, + HINIC_MSIX_DISABLE); nic_dev = netdev_priv(rxq->netdev); hinic_hwdev_msix_cnt_set(nic_dev->hwdev, rq->msix_entry); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c new file mode 100644 index 000000000000..d1c4e1428b38 --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c @@ -0,0 +1,698 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + */ + +#include +#include +#include +#include +#include + +#include "hinic_hw_dev.h" +#include "hinic_dev.h" +#include "hinic_hw_mbox.h" +#include "hinic_hw_cmdq.h" +#include "hinic_port.h" +#include "hinic_sriov.h" + +static unsigned char set_vf_link_state; +module_param(set_vf_link_state, byte, 0444); +MODULE_PARM_DESC(set_vf_link_state, "Set vf link state, 0 represents link auto, 1 represents link always up, 2 represents link always down. - default is 0."); + +#define HINIC_VLAN_PRIORITY_SHIFT 13 +#define HINIC_ADD_VLAN_IN_MAC 0x8000 + +int hinic_set_mac(struct hinic_hwdev *hwdev, const u8 *mac_addr, u16 vlan_id, + u16 func_id) +{ + struct hinic_port_mac_cmd mac_info = {0}; + u16 out_size = sizeof(mac_info); + int err; + + mac_info.func_idx = func_id; + mac_info.vlan_id = vlan_id; + memcpy(mac_info.mac, mac_addr, ETH_ALEN); + + err = hinic_port_msg_cmd(hwdev, HINIC_PORT_CMD_SET_MAC, &mac_info, + sizeof(mac_info), &mac_info, &out_size); + if (err || out_size != sizeof(mac_info) || + (mac_info.status && mac_info.status != HINIC_PF_SET_VF_ALREADY && + mac_info.status != HINIC_MGMT_STATUS_EXIST)) { + dev_err(&hwdev->func_to_io.hwif->pdev->dev, "Failed to change MAC, ret = %d\n", + mac_info.status); + return -EFAULT; + } + + return 0; +} + +static void hinic_notify_vf_link_status(struct hinic_hwdev *hwdev, u16 vf_id, + u8 link_status) +{ + struct vf_data_storage *vf_infos = hwdev->func_to_io.vf_infos; + struct hinic_port_link_status link = {0}; + u16 out_size = sizeof(link); + int err; + + if (vf_infos[HW_VF_ID_TO_OS(vf_id)].registered) { + link.link = link_status; + link.func_id = hinic_glb_pf_vf_offset(hwdev->hwif) + vf_id; + err = hinic_mbox_to_vf(hwdev, HINIC_MOD_L2NIC, + vf_id, HINIC_PORT_CMD_LINK_STATUS_REPORT, + &link, sizeof(link), + &link, &out_size, 0); + if (err || !out_size || link.status) + dev_err(&hwdev->hwif->pdev->dev, + "Send link change event to VF %d failed, err: %d, status: 0x%x, out_size: 0x%x\n", + HW_VF_ID_TO_OS(vf_id), err, + link.status, out_size); + } +} + +/* send link change event mbox msg to active vfs under the pf */ +void hinic_notify_all_vfs_link_changed(struct hinic_hwdev *hwdev, + u8 link_status) +{ + struct hinic_func_to_io *nic_io = &hwdev->func_to_io; + u16 i; + + nic_io->link_status = link_status; + for (i = 1; i <= nic_io->max_vfs; i++) { + if (!nic_io->vf_infos[HW_VF_ID_TO_OS(i)].link_forced) + hinic_notify_vf_link_status(hwdev, i, link_status); + } +} + +u16 hinic_vf_info_vlanprio(struct hinic_hwdev *hwdev, int vf_id) +{ + struct hinic_func_to_io *nic_io = &hwdev->func_to_io; + u16 pf_vlan, vlanprio; + u8 pf_qos; + + pf_vlan = nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].pf_vlan; + pf_qos = nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].pf_qos; + vlanprio = pf_vlan | pf_qos << HINIC_VLAN_PRIORITY_SHIFT; + + return vlanprio; +} + +int hinic_set_vf_vlan(struct hinic_hwdev *hwdev, bool add, u16 vid, + u8 qos, int vf_id) +{ + struct hinic_vf_vlan_config vf_vlan = {0}; + u16 out_size = sizeof(vf_vlan); + int err; + u8 cmd; + + /* VLAN 0 is a special case, don't allow it to be removed */ + if (!vid && !add) + return 0; + + vf_vlan.func_id = hinic_glb_pf_vf_offset(hwdev->hwif) + vf_id; + vf_vlan.vlan_id = vid; + vf_vlan.qos = qos; + + if (add) + cmd = HINIC_PORT_CMD_SET_VF_VLAN; + else + cmd = HINIC_PORT_CMD_CLR_VF_VLAN; + + err = hinic_port_msg_cmd(hwdev, cmd, &vf_vlan, + sizeof(vf_vlan), &vf_vlan, &out_size); + if (err || !out_size || vf_vlan.status) { + dev_err(&hwdev->hwif->pdev->dev, "Failed to set VF %d vlan, err: %d, status: 0x%x, out size: 0x%x\n", + HW_VF_ID_TO_OS(vf_id), err, vf_vlan.status, out_size); + return -EFAULT; + } + + return 0; +} + +static int hinic_init_vf_config(struct hinic_hwdev *hwdev, u16 vf_id) +{ + struct vf_data_storage *vf_info; + u16 func_id, vlan_id; + int err = 0; + + vf_info = hwdev->func_to_io.vf_infos + HW_VF_ID_TO_OS(vf_id); + if (vf_info->pf_set_mac) { + func_id = hinic_glb_pf_vf_offset(hwdev->hwif) + vf_id; + + vlan_id = 0; + + err = hinic_set_mac(hwdev, vf_info->vf_mac_addr, vlan_id, + func_id); + if (err) { + dev_err(&hwdev->func_to_io.hwif->pdev->dev, "Failed to set VF %d MAC\n", + HW_VF_ID_TO_OS(vf_id)); + return err; + } + } + + if (hinic_vf_info_vlanprio(hwdev, vf_id)) { + err = hinic_set_vf_vlan(hwdev, true, vf_info->pf_vlan, + vf_info->pf_qos, vf_id); + if (err) { + dev_err(&hwdev->hwif->pdev->dev, "Failed to add VF %d VLAN_QOS\n", + HW_VF_ID_TO_OS(vf_id)); + return err; + } + } + + return 0; +} + +int hinic_register_vf_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) +{ + struct hinic_register_vf *register_info = buf_out; + struct hinic_hwdev *hw_dev = hwdev; + struct hinic_func_to_io *nic_io; + int err; + + nic_io = &hw_dev->func_to_io; + if (vf_id > nic_io->max_vfs) { + dev_err(&hw_dev->hwif->pdev->dev, "Register VF id %d exceed limit[0-%d]\n", + HW_VF_ID_TO_OS(vf_id), HW_VF_ID_TO_OS(nic_io->max_vfs)); + register_info->status = EFAULT; + return -EFAULT; + } + + *out_size = sizeof(*register_info); + err = hinic_init_vf_config(hw_dev, vf_id); + if (err) { + register_info->status = EFAULT; + return err; + } + + nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].registered = true; + + return 0; +} + +int hinic_unregister_vf_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) +{ + struct hinic_hwdev *hw_dev = hwdev; + struct hinic_func_to_io *nic_io; + + nic_io = &hw_dev->func_to_io; + *out_size = 0; + if (vf_id > nic_io->max_vfs) + return 0; + + nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].registered = false; + + return 0; +} + +int hinic_change_vf_mtu_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) +{ + struct hinic_hwdev *hw_dev = hwdev; + int err; + + err = hinic_port_msg_cmd(hwdev, HINIC_PORT_CMD_CHANGE_MTU, buf_in, + in_size, buf_out, out_size); + if (err) { + dev_err(&hw_dev->hwif->pdev->dev, "Failed to set VF %u mtu\n", + vf_id); + return err; + } + + return 0; +} + +int hinic_get_vf_mac_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) +{ + struct hinic_port_mac_cmd *mac_info = buf_out; + struct hinic_hwdev *dev = hwdev; + struct hinic_func_to_io *nic_io; + struct vf_data_storage *vf_info; + + nic_io = &dev->func_to_io; + vf_info = nic_io->vf_infos + HW_VF_ID_TO_OS(vf_id); + + memcpy(mac_info->mac, vf_info->vf_mac_addr, ETH_ALEN); + mac_info->status = 0; + *out_size = sizeof(*mac_info); + + return 0; +} + +int hinic_set_vf_mac_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) +{ + struct hinic_port_mac_cmd *mac_out = buf_out; + struct hinic_port_mac_cmd *mac_in = buf_in; + struct hinic_hwdev *hw_dev = hwdev; + struct hinic_func_to_io *nic_io; + struct vf_data_storage *vf_info; + int err; + + nic_io = &hw_dev->func_to_io; + vf_info = nic_io->vf_infos + HW_VF_ID_TO_OS(vf_id); + if (vf_info->pf_set_mac && !(vf_info->trust) && + is_valid_ether_addr(mac_in->mac)) { + dev_warn(&hw_dev->hwif->pdev->dev, "PF has already set VF %d MAC address\n", + HW_VF_ID_TO_OS(vf_id)); + mac_out->status = HINIC_PF_SET_VF_ALREADY; + *out_size = sizeof(*mac_out); + return 0; + } + + err = hinic_port_msg_cmd(hw_dev, HINIC_PORT_CMD_SET_MAC, buf_in, + in_size, buf_out, out_size); + if ((err && err != HINIC_MBOX_PF_BUSY_ACTIVE_FW) || !(*out_size)) { + dev_err(&hw_dev->hwif->pdev->dev, + "Failed to set VF %d MAC address, err: %d, status: 0x%x, out size: 0x%x\n", + HW_VF_ID_TO_OS(vf_id), err, mac_out->status, *out_size); + return -EFAULT; + } + + return err; +} + +int hinic_del_vf_mac_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) +{ + struct hinic_port_mac_cmd *mac_out = buf_out; + struct hinic_port_mac_cmd *mac_in = buf_in; + struct hinic_hwdev *hw_dev = hwdev; + struct hinic_func_to_io *nic_io; + struct vf_data_storage *vf_info; + int err; + + nic_io = &hw_dev->func_to_io; + vf_info = nic_io->vf_infos + HW_VF_ID_TO_OS(vf_id); + if (vf_info->pf_set_mac && is_valid_ether_addr(mac_in->mac) && + !memcmp(vf_info->vf_mac_addr, mac_in->mac, ETH_ALEN)) { + dev_warn(&hw_dev->hwif->pdev->dev, "PF has already set VF mac.\n"); + mac_out->status = HINIC_PF_SET_VF_ALREADY; + *out_size = sizeof(*mac_out); + return 0; + } + + err = hinic_port_msg_cmd(hw_dev, HINIC_PORT_CMD_DEL_MAC, buf_in, + in_size, buf_out, out_size); + if ((err && err != HINIC_MBOX_PF_BUSY_ACTIVE_FW) || !(*out_size)) { + dev_err(&hw_dev->hwif->pdev->dev, "Failed to delete VF %d MAC, err: %d, status: 0x%x, out size: 0x%x\n", + HW_VF_ID_TO_OS(vf_id), err, mac_out->status, *out_size); + return -EFAULT; + } + + return err; +} + +int hinic_get_vf_link_status_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) +{ + struct hinic_port_link_cmd *get_link = buf_out; + struct hinic_hwdev *hw_dev = hwdev; + struct vf_data_storage *vf_infos; + struct hinic_func_to_io *nic_io; + bool link_forced, link_up; + + nic_io = &hw_dev->func_to_io; + vf_infos = nic_io->vf_infos; + link_forced = vf_infos[HW_VF_ID_TO_OS(vf_id)].link_forced; + link_up = vf_infos[HW_VF_ID_TO_OS(vf_id)].link_up; + + if (link_forced) + get_link->state = link_up ? + HINIC_LINK_STATE_UP : HINIC_LINK_STATE_DOWN; + else + get_link->state = nic_io->link_status; + + get_link->status = 0; + *out_size = sizeof(*get_link); + + return 0; +} + +struct vf_cmd_msg_handle nic_vf_cmd_msg_handler[] = { + {HINIC_PORT_CMD_VF_REGISTER, hinic_register_vf_msg_handler}, + {HINIC_PORT_CMD_VF_UNREGISTER, hinic_unregister_vf_msg_handler}, + {HINIC_PORT_CMD_CHANGE_MTU, hinic_change_vf_mtu_msg_handler}, + {HINIC_PORT_CMD_GET_MAC, hinic_get_vf_mac_msg_handler}, + {HINIC_PORT_CMD_SET_MAC, hinic_set_vf_mac_msg_handler}, + {HINIC_PORT_CMD_DEL_MAC, hinic_del_vf_mac_msg_handler}, + {HINIC_PORT_CMD_GET_LINK_STATE, hinic_get_vf_link_status_msg_handler}, +}; + +#define CHECK_IPSU_15BIT 0X8000 + +struct hinic_sriov_info *hinic_get_sriov_info_by_pcidev(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct hinic_dev *nic_dev = netdev_priv(netdev); + + return &nic_dev->sriov_info; +} + +int hinic_kill_vf_vlan(struct hinic_hwdev *hwdev, int vf_id) +{ + struct hinic_func_to_io *nic_io = &hwdev->func_to_io; + int err; + + err = hinic_set_vf_vlan(hwdev, false, + nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].pf_vlan, + nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].pf_qos, + vf_id); + if (err) + return err; + + dev_info(&hwdev->hwif->pdev->dev, "Remove VLAN %d on VF %d\n", + nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].pf_vlan, + HW_VF_ID_TO_OS(vf_id)); + + nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].pf_vlan = 0; + nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].pf_qos = 0; + + return 0; +} + +/* pf receive message from vf */ +int nic_pf_mbox_handler(void *hwdev, u16 vf_id, u8 cmd, void *buf_in, + u16 in_size, void *buf_out, u16 *out_size) +{ + struct vf_cmd_msg_handle *vf_msg_handle; + struct hinic_hwdev *dev = hwdev; + struct hinic_func_to_io *nic_io; + struct hinic_pfhwdev *pfhwdev; + u32 i, cmd_number; + int err = 0; + + if (!hwdev) + return -EFAULT; + + cmd_number = sizeof(nic_vf_cmd_msg_handler) / + sizeof(struct vf_cmd_msg_handle); + pfhwdev = container_of(dev, struct hinic_pfhwdev, hwdev); + nic_io = &dev->func_to_io; + for (i = 0; i < cmd_number; i++) { + vf_msg_handle = &nic_vf_cmd_msg_handler[i]; + if (cmd == vf_msg_handle->cmd && + vf_msg_handle->cmd_msg_handler) { + err = vf_msg_handle->cmd_msg_handler(hwdev, vf_id, + buf_in, in_size, + buf_out, + out_size); + break; + } + } + if (i == cmd_number) + err = hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt, HINIC_MOD_L2NIC, + cmd, buf_in, in_size, buf_out, + out_size, HINIC_MGMT_MSG_SYNC); + + if (err && err != HINIC_MBOX_PF_BUSY_ACTIVE_FW) + dev_err(&nic_io->hwif->pdev->dev, "PF receive VF L2NIC cmd: %d process error, err:%d\n", + cmd, err); + return err; +} + +static int cfg_mbx_pf_proc_vf_msg(void *hwdev, u16 vf_id, u8 cmd, void *buf_in, + u16 in_size, void *buf_out, u16 *out_size) +{ + struct hinic_dev_cap *dev_cap = buf_out; + struct hinic_hwdev *dev = hwdev; + struct hinic_cap *cap; + + cap = &dev->nic_cap; + memset(dev_cap, 0, sizeof(*dev_cap)); + + dev_cap->max_vf = cap->max_vf; + dev_cap->max_sqs = cap->max_vf_qps; + dev_cap->max_rqs = cap->max_vf_qps; + + *out_size = sizeof(*dev_cap); + + return 0; +} + +static int hinic_init_vf_infos(struct hinic_func_to_io *nic_io, u16 vf_id) +{ + struct vf_data_storage *vf_infos = nic_io->vf_infos; + + if (set_vf_link_state > HINIC_IFLA_VF_LINK_STATE_DISABLE) { + dev_warn(&nic_io->hwif->pdev->dev, "Module Parameter set_vf_link_state value %d is out of range, resetting to %d\n", + set_vf_link_state, HINIC_IFLA_VF_LINK_STATE_AUTO); + set_vf_link_state = HINIC_IFLA_VF_LINK_STATE_AUTO; + } + + switch (set_vf_link_state) { + case HINIC_IFLA_VF_LINK_STATE_AUTO: + vf_infos[vf_id].link_forced = false; + break; + case HINIC_IFLA_VF_LINK_STATE_ENABLE: + vf_infos[vf_id].link_forced = true; + vf_infos[vf_id].link_up = true; + break; + case HINIC_IFLA_VF_LINK_STATE_DISABLE: + vf_infos[vf_id].link_forced = true; + vf_infos[vf_id].link_up = false; + break; + default: + dev_err(&nic_io->hwif->pdev->dev, "Invalid input parameter set_vf_link_state: %d\n", + set_vf_link_state); + return -EINVAL; + } + + return 0; +} + +void hinic_clear_vf_infos(struct hinic_dev *nic_dev, u16 vf_id) +{ + struct vf_data_storage *vf_infos; + u16 func_id; + + func_id = hinic_glb_pf_vf_offset(nic_dev->hwdev->hwif) + vf_id; + vf_infos = nic_dev->hwdev->func_to_io.vf_infos + HW_VF_ID_TO_OS(vf_id); + if (vf_infos->pf_set_mac) + hinic_port_del_mac(nic_dev, vf_infos->vf_mac_addr, 0); + + if (hinic_vf_info_vlanprio(nic_dev->hwdev, vf_id)) + hinic_kill_vf_vlan(nic_dev->hwdev, vf_id); + + memset(vf_infos, 0, sizeof(*vf_infos)); + /* set vf_infos to default */ + hinic_init_vf_infos(&nic_dev->hwdev->func_to_io, HW_VF_ID_TO_OS(vf_id)); +} + +int hinic_deinit_vf_hw(struct hinic_sriov_info *sriov_info, u16 start_vf_id, + u16 end_vf_id) +{ + struct hinic_dev *nic_dev; + u16 func_idx, idx; + + nic_dev = container_of(sriov_info, struct hinic_dev, sriov_info); + + for (idx = start_vf_id; idx <= end_vf_id; idx++) { + func_idx = hinic_glb_pf_vf_offset(nic_dev->hwdev->hwif) + idx; + hinic_set_wq_page_size(nic_dev->hwdev, func_idx, + HINIC_HW_WQ_PAGE_SIZE); + hinic_clear_vf_infos(nic_dev, idx); + } + + return 0; +} + +int hinic_vf_func_init(struct hinic_hwdev *hwdev) +{ + struct hinic_register_vf register_info = {0}; + u16 out_size = sizeof(register_info); + struct hinic_func_to_io *nic_io; + int err = 0; + u32 size, i; + + nic_io = &hwdev->func_to_io; + + if (HINIC_IS_VF(hwdev->hwif)) { + err = hinic_mbox_to_pf(hwdev, HINIC_MOD_L2NIC, + HINIC_PORT_CMD_VF_REGISTER, + ®ister_info, sizeof(register_info), + ®ister_info, &out_size, 0); + if (err || register_info.status || !out_size) { + dev_err(&hwdev->hwif->pdev->dev, + "Failed to register VF, err: %d, status: 0x%x, out size: 0x%x\n", + err, register_info.status, out_size); + hinic_unregister_vf_mbox_cb(hwdev, HINIC_MOD_L2NIC); + return -EIO; + } + } else { + err = hinic_register_pf_mbox_cb(hwdev, HINIC_MOD_CFGM, + cfg_mbx_pf_proc_vf_msg); + if (err) { + dev_err(&hwdev->hwif->pdev->dev, + "Register PF mailbox callback failed\n"); + return err; + } + nic_io->max_vfs = hwdev->nic_cap.max_vf; + size = sizeof(*nic_io->vf_infos) * nic_io->max_vfs; + if (size != 0) { + nic_io->vf_infos = kzalloc(size, GFP_KERNEL); + if (!nic_io->vf_infos) { + err = -ENOMEM; + goto out_free_nic_io; + } + + for (i = 0; i < nic_io->max_vfs; i++) { + err = hinic_init_vf_infos(nic_io, i); + if (err) + goto err_init_vf_infos; + } + + err = hinic_register_pf_mbox_cb(hwdev, HINIC_MOD_L2NIC, + nic_pf_mbox_handler); + if (err) + goto err_register_pf_mbox_cb; + } + } + + return 0; + +err_register_pf_mbox_cb: +err_init_vf_infos: + kfree(nic_io->vf_infos); +out_free_nic_io: + return err; +} + +void hinic_vf_func_free(struct hinic_hwdev *hwdev) +{ + struct hinic_register_vf unregister = {0}; + u16 out_size = sizeof(unregister); + int err; + + if (HINIC_IS_VF(hwdev->hwif)) { + err = hinic_mbox_to_pf(hwdev, HINIC_MOD_L2NIC, + HINIC_PORT_CMD_VF_UNREGISTER, + &unregister, sizeof(unregister), + &unregister, &out_size, 0); + if (err || !out_size || unregister.status) + dev_err(&hwdev->hwif->pdev->dev, "Failed to unregister VF, err: %d, status: 0x%x, out_size: 0x%x\n", + err, unregister.status, out_size); + } else { + if (hwdev->func_to_io.vf_infos) { + hinic_unregister_pf_mbox_cb(hwdev, HINIC_MOD_L2NIC); + kfree(hwdev->func_to_io.vf_infos); + } + } +} + +int hinic_init_vf_hw(struct hinic_hwdev *hwdev, u16 start_vf_id, u16 end_vf_id) +{ + u16 i, func_idx; + int err; + + /* vf use 256K as default wq page size, and can't change it */ + for (i = start_vf_id; i <= end_vf_id; i++) { + func_idx = hinic_glb_pf_vf_offset(hwdev->hwif) + i; + err = hinic_set_wq_page_size(hwdev, func_idx, + HINIC_DEFAULT_WQ_PAGE_SIZE); + if (err) + return err; + } + + return 0; +} + +int hinic_pci_sriov_disable(struct pci_dev *pdev) +{ + struct hinic_sriov_info *sriov_info; + u16 tmp_vfs; + + sriov_info = hinic_get_sriov_info_by_pcidev(pdev); + /* if SR-IOV is already disabled then nothing will be done */ + if (!sriov_info->sriov_enabled) + return 0; + + set_bit(HINIC_SRIOV_DISABLE, &sriov_info->state); + + /* If our VFs are assigned we cannot shut down SR-IOV + * without causing issues, so just leave the hardware + * available but disabled + */ + if (pci_vfs_assigned(sriov_info->pdev)) { + clear_bit(HINIC_SRIOV_DISABLE, &sriov_info->state); + dev_warn(&pdev->dev, "Unloading driver while VFs are assigned - VFs will not be deallocated\n"); + return -EPERM; + } + sriov_info->sriov_enabled = false; + + /* disable iov and allow time for transactions to clear */ + pci_disable_sriov(sriov_info->pdev); + + tmp_vfs = (u16)sriov_info->num_vfs; + sriov_info->num_vfs = 0; + hinic_deinit_vf_hw(sriov_info, OS_VF_ID_TO_HW(0), + OS_VF_ID_TO_HW(tmp_vfs - 1)); + + clear_bit(HINIC_SRIOV_DISABLE, &sriov_info->state); + + return 0; +} + +int hinic_pci_sriov_enable(struct pci_dev *pdev, int num_vfs) +{ + struct hinic_sriov_info *sriov_info; + int err; + + sriov_info = hinic_get_sriov_info_by_pcidev(pdev); + + if (test_and_set_bit(HINIC_SRIOV_ENABLE, &sriov_info->state)) { + dev_err(&pdev->dev, + "SR-IOV enable in process, please wait, num_vfs %d\n", + num_vfs); + return -EPERM; + } + + err = hinic_init_vf_hw(sriov_info->hwdev, OS_VF_ID_TO_HW(0), + OS_VF_ID_TO_HW((u16)num_vfs - 1)); + if (err) { + dev_err(&sriov_info->pdev->dev, + "Failed to init vf in hardware before enable sriov, error %d\n", + err); + clear_bit(HINIC_SRIOV_ENABLE, &sriov_info->state); + return err; + } + + err = pci_enable_sriov(sriov_info->pdev, num_vfs); + if (err) { + dev_err(&pdev->dev, + "Failed to enable SR-IOV, error %d\n", err); + clear_bit(HINIC_SRIOV_ENABLE, &sriov_info->state); + return err; + } + + sriov_info->sriov_enabled = true; + sriov_info->num_vfs = num_vfs; + clear_bit(HINIC_SRIOV_ENABLE, &sriov_info->state); + + return num_vfs; +} + +int hinic_pci_sriov_configure(struct pci_dev *dev, int num_vfs) +{ + struct hinic_sriov_info *sriov_info; + + sriov_info = hinic_get_sriov_info_by_pcidev(dev); + + if (test_bit(HINIC_FUNC_REMOVE, &sriov_info->state)) + return -EBUSY; + + if (!num_vfs) + return hinic_pci_sriov_disable(dev); + else + return hinic_pci_sriov_enable(dev, num_vfs); +} diff --git a/drivers/net/ethernet/huawei/hinic/hinic_sriov.h b/drivers/net/ethernet/huawei/hinic/hinic_sriov.h new file mode 100644 index 000000000000..4889eabe7b7c --- /dev/null +++ b/drivers/net/ethernet/huawei/hinic/hinic_sriov.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Huawei HiNIC PCI Express Linux driver + * Copyright(c) 2017 Huawei Technologies Co., Ltd + */ + +#ifndef HINIC_SRIOV_H +#define HINIC_SRIOV_H + +#include "hinic_hw_dev.h" + +#define OS_VF_ID_TO_HW(os_vf_id) ((os_vf_id) + 1) +#define HW_VF_ID_TO_OS(hw_vf_id) ((hw_vf_id) - 1) + +enum hinic_sriov_state { + HINIC_SRIOV_DISABLE, + HINIC_SRIOV_ENABLE, + HINIC_FUNC_REMOVE, +}; + +enum { + HINIC_IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ + HINIC_IFLA_VF_LINK_STATE_ENABLE, /* link always up */ + HINIC_IFLA_VF_LINK_STATE_DISABLE, /* link always down */ +}; + +struct hinic_sriov_info { + struct pci_dev *pdev; + struct hinic_hwdev *hwdev; + bool sriov_enabled; + unsigned int num_vfs; + unsigned long state; +}; + +struct vf_data_storage { + u8 vf_mac_addr[ETH_ALEN]; + bool registered; + bool pf_set_mac; + u16 pf_vlan; + u8 pf_qos; + u32 max_rate; + u32 min_rate; + + bool link_forced; + bool link_up; /* only valid if VF link is forced */ + bool spoofchk; + bool trust; +}; + +struct hinic_register_vf { + u8 status; + u8 version; + u8 rsvd0[6]; +}; + +struct hinic_vf_vlan_config { + u8 status; + u8 version; + u8 rsvd0[6]; + + u16 func_id; + u16 vlan_id; + u8 qos; + u8 rsvd1[7]; +}; + +void hinic_notify_all_vfs_link_changed(struct hinic_hwdev *hwdev, + u8 link_status); + +int hinic_pci_sriov_disable(struct pci_dev *dev); + +int hinic_pci_sriov_enable(struct pci_dev *dev, int num_vfs); + +int hinic_vf_func_init(struct hinic_hwdev *hwdev); + +void hinic_vf_func_free(struct hinic_hwdev *hwdev); + +int hinic_pci_sriov_configure(struct pci_dev *dev, int num_vfs); + +#endif diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c index 365016450bdb..4c66a0bc1b28 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c @@ -673,9 +673,11 @@ static int free_tx_poll(struct napi_struct *napi, int budget) if (pkts < budget) { napi_complete(napi); - hinic_hwdev_set_msix_state(nic_dev->hwdev, - sq->msix_entry, - HINIC_MSIX_ENABLE); + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) + hinic_hwdev_set_msix_state(nic_dev->hwdev, + sq->msix_entry, + HINIC_MSIX_ENABLE); + return pkts; } @@ -701,10 +703,11 @@ static irqreturn_t tx_irq(int irq, void *data) nic_dev = netdev_priv(txq->netdev); - /* Disable the interrupt until napi will be completed */ - hinic_hwdev_set_msix_state(nic_dev->hwdev, - txq->sq->msix_entry, - HINIC_MSIX_DISABLE); + if (!HINIC_IS_VF(nic_dev->hwdev->hwif)) + /* Disable the interrupt until napi will be completed */ + hinic_hwdev_set_msix_state(nic_dev->hwdev, + txq->sq->msix_entry, + HINIC_MSIX_DISABLE); hinic_hwdev_msix_cnt_set(nic_dev->hwdev, txq->sq->msix_entry); -- cgit v1.2.3-59-g8ed1b From 1f62cfa19a619f82c098468660b7950477101d45 Mon Sep 17 00:00:00 2001 From: Luo bin Date: Sat, 25 Apr 2020 01:21:11 +0000 Subject: hinic: add net_device_ops associated with vf adds ndo_set_vf_mac/ndo_set_vf_vlan/ndo_get_vf_config and ndo_set_vf_trust to configure netdev of virtual function Signed-off-by: Luo bin Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_main.c | 43 +++- drivers/net/ethernet/huawei/hinic/hinic_sriov.c | 318 ++++++++++++++++++++++++ drivers/net/ethernet/huawei/hinic/hinic_sriov.h | 23 ++ 3 files changed, 383 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index cd71249f9b1c..b66bb86cff96 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -779,8 +779,26 @@ static void hinic_set_rx_mode(struct net_device *netdev) static void hinic_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct hinic_dev *nic_dev = netdev_priv(netdev); + u16 sw_pi, hw_ci, sw_ci; + struct hinic_sq *sq; + u16 num_sqs, q_id; + + num_sqs = hinic_hwdev_num_qps(nic_dev->hwdev); netif_err(nic_dev, drv, netdev, "Tx timeout\n"); + + for (q_id = 0; q_id < num_sqs; q_id++) { + if (!netif_xmit_stopped(netdev_get_tx_queue(netdev, q_id))) + continue; + + sq = hinic_hwdev_get_sq(nic_dev->hwdev, q_id); + sw_pi = atomic_read(&sq->wq->prod_idx) & sq->wq->mask; + hw_ci = be16_to_cpu(*(u16 *)(sq->hw_ci_addr)) & sq->wq->mask; + sw_ci = atomic_read(&sq->wq->cons_idx) & sq->wq->mask; + netif_err(nic_dev, drv, netdev, "Txq%d: sw_pi: %d, hw_ci: %d, sw_ci: %d, napi->state: 0x%lx\n", + q_id, sw_pi, hw_ci, sw_ci, + nic_dev->txqs[q_id].napi.state); + } } static void hinic_get_stats64(struct net_device *netdev, @@ -846,6 +864,26 @@ static const struct net_device_ops hinic_netdev_ops = { .ndo_get_stats64 = hinic_get_stats64, .ndo_fix_features = hinic_fix_features, .ndo_set_features = hinic_set_features, + .ndo_set_vf_mac = hinic_ndo_set_vf_mac, + .ndo_set_vf_vlan = hinic_ndo_set_vf_vlan, + .ndo_get_vf_config = hinic_ndo_get_vf_config, + .ndo_set_vf_trust = hinic_ndo_set_vf_trust, +}; + +static const struct net_device_ops hinicvf_netdev_ops = { + .ndo_open = hinic_open, + .ndo_stop = hinic_close, + .ndo_change_mtu = hinic_change_mtu, + .ndo_set_mac_address = hinic_set_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_vlan_rx_add_vid = hinic_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = hinic_vlan_rx_kill_vid, + .ndo_set_rx_mode = hinic_set_rx_mode, + .ndo_start_xmit = hinic_xmit_frame, + .ndo_tx_timeout = hinic_tx_timeout, + .ndo_get_stats64 = hinic_get_stats64, + .ndo_fix_features = hinic_fix_features, + .ndo_set_features = hinic_set_features, }; static void netdev_features_init(struct net_device *netdev) @@ -983,7 +1021,10 @@ static int nic_dev_init(struct pci_dev *pdev) hinic_set_ethtool_ops(netdev); - netdev->netdev_ops = &hinic_netdev_ops; + if (!HINIC_IS_VF(hwdev->hwif)) + netdev->netdev_ops = &hinic_netdev_ops; + else + netdev->netdev_ops = &hinicvf_netdev_ops; netdev->max_mtu = ETH_MAX_MTU; diff --git a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c index d1c4e1428b38..b24788e9733c 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c @@ -359,6 +359,168 @@ struct hinic_sriov_info *hinic_get_sriov_info_by_pcidev(struct pci_dev *pdev) return &nic_dev->sriov_info; } +static int hinic_check_mac_info(u8 status, u16 vlan_id) +{ + if ((status && status != HINIC_MGMT_STATUS_EXIST && + status != HINIC_PF_SET_VF_ALREADY) || + (vlan_id & CHECK_IPSU_15BIT && + status == HINIC_MGMT_STATUS_EXIST)) + return -EINVAL; + + return 0; +} + +#define HINIC_VLAN_ID_MASK 0x7FFF + +int hinic_update_mac(struct hinic_hwdev *hwdev, u8 *old_mac, u8 *new_mac, + u16 vlan_id, u16 func_id) +{ + struct hinic_port_mac_update mac_info = {0}; + u16 out_size = sizeof(mac_info); + int err; + + if (!hwdev || !old_mac || !new_mac) + return -EINVAL; + + if ((vlan_id & HINIC_VLAN_ID_MASK) >= VLAN_N_VID) { + dev_err(&hwdev->hwif->pdev->dev, "Invalid VLAN number: %d\n", + (vlan_id & HINIC_VLAN_ID_MASK)); + return -EINVAL; + } + + mac_info.func_id = func_id; + mac_info.vlan_id = vlan_id; + memcpy(mac_info.old_mac, old_mac, ETH_ALEN); + memcpy(mac_info.new_mac, new_mac, ETH_ALEN); + + err = hinic_port_msg_cmd(hwdev, HINIC_PORT_CMD_UPDATE_MAC, &mac_info, + sizeof(mac_info), &mac_info, &out_size); + + if (err || !out_size || + hinic_check_mac_info(mac_info.status, mac_info.vlan_id)) { + dev_err(&hwdev->hwif->pdev->dev, + "Failed to update MAC, err: %d, status: 0x%x, out size: 0x%x\n", + err, mac_info.status, out_size); + return -EINVAL; + } + + if (mac_info.status == HINIC_PF_SET_VF_ALREADY) { + dev_warn(&hwdev->hwif->pdev->dev, + "PF has already set VF MAC. Ignore update operation\n"); + return HINIC_PF_SET_VF_ALREADY; + } + + if (mac_info.status == HINIC_MGMT_STATUS_EXIST) + dev_warn(&hwdev->hwif->pdev->dev, "MAC is repeated. Ignore update operation\n"); + + return 0; +} + +void hinic_get_vf_config(struct hinic_hwdev *hwdev, u16 vf_id, + struct ifla_vf_info *ivi) +{ + struct vf_data_storage *vfinfo; + + vfinfo = hwdev->func_to_io.vf_infos + HW_VF_ID_TO_OS(vf_id); + + ivi->vf = HW_VF_ID_TO_OS(vf_id); + memcpy(ivi->mac, vfinfo->vf_mac_addr, ETH_ALEN); + ivi->vlan = vfinfo->pf_vlan; + ivi->qos = vfinfo->pf_qos; + ivi->spoofchk = vfinfo->spoofchk; + ivi->trusted = vfinfo->trust; + ivi->max_tx_rate = vfinfo->max_rate; + ivi->min_tx_rate = vfinfo->min_rate; + + if (!vfinfo->link_forced) + ivi->linkstate = IFLA_VF_LINK_STATE_AUTO; + else if (vfinfo->link_up) + ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; + else + ivi->linkstate = IFLA_VF_LINK_STATE_DISABLE; +} + +int hinic_ndo_get_vf_config(struct net_device *netdev, + int vf, struct ifla_vf_info *ivi) +{ + struct hinic_dev *nic_dev = netdev_priv(netdev); + struct hinic_sriov_info *sriov_info; + + sriov_info = &nic_dev->sriov_info; + if (vf >= sriov_info->num_vfs) + return -EINVAL; + + hinic_get_vf_config(sriov_info->hwdev, OS_VF_ID_TO_HW(vf), ivi); + + return 0; +} + +int hinic_set_vf_mac(struct hinic_hwdev *hwdev, int vf, unsigned char *mac_addr) +{ + struct hinic_func_to_io *nic_io = &hwdev->func_to_io; + struct vf_data_storage *vf_info; + u16 func_id; + int err; + + vf_info = nic_io->vf_infos + HW_VF_ID_TO_OS(vf); + + /* duplicate request, so just return success */ + if (vf_info->pf_set_mac && + !memcmp(vf_info->vf_mac_addr, mac_addr, ETH_ALEN)) + return 0; + + vf_info->pf_set_mac = true; + + func_id = hinic_glb_pf_vf_offset(hwdev->hwif) + vf; + err = hinic_update_mac(hwdev, vf_info->vf_mac_addr, + mac_addr, 0, func_id); + if (err) { + vf_info->pf_set_mac = false; + return err; + } + + memcpy(vf_info->vf_mac_addr, mac_addr, ETH_ALEN); + + return 0; +} + +int hinic_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) +{ + struct hinic_dev *nic_dev = netdev_priv(netdev); + struct hinic_sriov_info *sriov_info; + int err; + + sriov_info = &nic_dev->sriov_info; + if (!is_valid_ether_addr(mac) || vf >= sriov_info->num_vfs) + return -EINVAL; + + err = hinic_set_vf_mac(sriov_info->hwdev, OS_VF_ID_TO_HW(vf), mac); + if (err) + return err; + + netif_info(nic_dev, drv, netdev, "Setting MAC %pM on VF %d\n", mac, vf); + netif_info(nic_dev, drv, netdev, "Reload the VF driver to make this change effective."); + + return 0; +} + +int hinic_add_vf_vlan(struct hinic_hwdev *hwdev, int vf_id, u16 vlan, u8 qos) +{ + struct hinic_func_to_io *nic_io = &hwdev->func_to_io; + int err; + + err = hinic_set_vf_vlan(hwdev, true, vlan, qos, vf_id); + if (err) + return err; + + nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].pf_vlan = vlan; + nic_io->vf_infos[HW_VF_ID_TO_OS(vf_id)].pf_qos = qos; + + dev_info(&hwdev->hwif->pdev->dev, "Setting VLAN %d, QOS 0x%x on VF %d\n", + vlan, qos, HW_VF_ID_TO_OS(vf_id)); + return 0; +} + int hinic_kill_vf_vlan(struct hinic_hwdev *hwdev, int vf_id) { struct hinic_func_to_io *nic_io = &hwdev->func_to_io; @@ -381,6 +543,159 @@ int hinic_kill_vf_vlan(struct hinic_hwdev *hwdev, int vf_id) return 0; } +int hinic_update_mac_vlan(struct hinic_dev *nic_dev, u16 old_vlan, u16 new_vlan, + int vf_id) +{ + struct vf_data_storage *vf_info; + u16 vlan_id; + int err; + + if (!nic_dev || old_vlan >= VLAN_N_VID || new_vlan >= VLAN_N_VID) + return -EINVAL; + + vf_info = nic_dev->hwdev->func_to_io.vf_infos + HW_VF_ID_TO_OS(vf_id); + if (!vf_info->pf_set_mac) + return 0; + + vlan_id = old_vlan; + if (vlan_id) + vlan_id |= HINIC_ADD_VLAN_IN_MAC; + + err = hinic_port_del_mac(nic_dev, vf_info->vf_mac_addr, vlan_id); + if (err) { + dev_err(&nic_dev->hwdev->hwif->pdev->dev, "Failed to delete VF %d MAC %pM vlan %d\n", + HW_VF_ID_TO_OS(vf_id), vf_info->vf_mac_addr, old_vlan); + return err; + } + + vlan_id = new_vlan; + if (vlan_id) + vlan_id |= HINIC_ADD_VLAN_IN_MAC; + + err = hinic_port_add_mac(nic_dev, vf_info->vf_mac_addr, vlan_id); + if (err) { + dev_err(&nic_dev->hwdev->hwif->pdev->dev, "Failed to add VF %d MAC %pM vlan %d\n", + HW_VF_ID_TO_OS(vf_id), vf_info->vf_mac_addr, new_vlan); + goto out; + } + + return 0; + +out: + vlan_id = old_vlan; + if (vlan_id) + vlan_id |= HINIC_ADD_VLAN_IN_MAC; + hinic_port_add_mac(nic_dev, vf_info->vf_mac_addr, vlan_id); + + return err; +} + +static int set_hw_vf_vlan(struct hinic_dev *nic_dev, + u16 cur_vlanprio, int vf, u16 vlan, u8 qos) +{ + u16 old_vlan = cur_vlanprio & VLAN_VID_MASK; + int err = 0; + + if (vlan || qos) { + if (cur_vlanprio) { + err = hinic_kill_vf_vlan(nic_dev->hwdev, + OS_VF_ID_TO_HW(vf)); + if (err) { + dev_err(&nic_dev->sriov_info.pdev->dev, "Failed to delete vf %d old vlan %d\n", + vf, old_vlan); + goto out; + } + } + err = hinic_add_vf_vlan(nic_dev->hwdev, + OS_VF_ID_TO_HW(vf), vlan, qos); + if (err) { + dev_err(&nic_dev->sriov_info.pdev->dev, "Failed to add vf %d new vlan %d\n", + vf, vlan); + goto out; + } + } else { + err = hinic_kill_vf_vlan(nic_dev->hwdev, OS_VF_ID_TO_HW(vf)); + if (err) { + dev_err(&nic_dev->sriov_info.pdev->dev, "Failed to delete vf %d vlan %d\n", + vf, old_vlan); + goto out; + } + } + + err = hinic_update_mac_vlan(nic_dev, old_vlan, vlan, + OS_VF_ID_TO_HW(vf)); + +out: + return err; +} + +int hinic_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) +{ + struct hinic_dev *nic_dev = netdev_priv(netdev); + struct hinic_sriov_info *sriov_info; + u16 vlanprio, cur_vlanprio; + + sriov_info = &nic_dev->sriov_info; + if (vf >= sriov_info->num_vfs || vlan > 4095 || qos > 7) + return -EINVAL; + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + vlanprio = vlan | qos << HINIC_VLAN_PRIORITY_SHIFT; + cur_vlanprio = hinic_vf_info_vlanprio(nic_dev->hwdev, + OS_VF_ID_TO_HW(vf)); + /* duplicate request, so just return success */ + if (vlanprio == cur_vlanprio) + return 0; + + return set_hw_vf_vlan(nic_dev, cur_vlanprio, vf, vlan, qos); +} + +int hinic_set_vf_trust(struct hinic_hwdev *hwdev, u16 vf_id, bool trust) +{ + struct vf_data_storage *vf_infos; + struct hinic_func_to_io *nic_io; + + if (!hwdev) + return -EINVAL; + + nic_io = &hwdev->func_to_io; + vf_infos = nic_io->vf_infos; + vf_infos[vf_id].trust = trust; + + return 0; +} + +int hinic_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting) +{ + struct hinic_dev *adapter = netdev_priv(netdev); + struct hinic_sriov_info *sriov_info; + struct hinic_func_to_io *nic_io; + bool cur_trust; + int err; + + sriov_info = &adapter->sriov_info; + nic_io = &adapter->hwdev->func_to_io; + + if (vf >= sriov_info->num_vfs) + return -EINVAL; + + cur_trust = nic_io->vf_infos[vf].trust; + /* same request, so just return success */ + if ((setting && cur_trust) || (!setting && !cur_trust)) + return 0; + + err = hinic_set_vf_trust(adapter->hwdev, vf, setting); + if (!err) + dev_info(&sriov_info->pdev->dev, "Set VF %d trusted %s succeed\n", + vf, setting ? "on" : "off"); + else + dev_err(&sriov_info->pdev->dev, "Failed set VF %d trusted %s\n", + vf, setting ? "on" : "off"); + + return err; +} + /* pf receive message from vf */ int nic_pf_mbox_handler(void *hwdev, u16 vf_id, u8 cmd, void *buf_in, u16 in_size, void *buf_out, u16 *out_size) @@ -484,6 +799,9 @@ void hinic_clear_vf_infos(struct hinic_dev *nic_dev, u16 vf_id) if (hinic_vf_info_vlanprio(nic_dev->hwdev, vf_id)) hinic_kill_vf_vlan(nic_dev->hwdev, vf_id); + if (vf_infos->trust) + hinic_set_vf_trust(nic_dev->hwdev, vf_id, false); + memset(vf_infos, 0, sizeof(*vf_infos)); /* set vf_infos to default */ hinic_init_vf_infos(&nic_dev->hwdev->func_to_io, HW_VF_ID_TO_OS(vf_id)); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_sriov.h b/drivers/net/ethernet/huawei/hinic/hinic_sriov.h index 4889eabe7b7c..64affc7474b5 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_sriov.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_sriov.h @@ -52,6 +52,19 @@ struct hinic_register_vf { u8 rsvd0[6]; }; +struct hinic_port_mac_update { + u8 status; + u8 version; + u8 rsvd0[6]; + + u16 func_id; + u16 vlan_id; + u16 rsvd1; + u8 old_mac[ETH_ALEN]; + u16 rsvd2; + u8 new_mac[ETH_ALEN]; +}; + struct hinic_vf_vlan_config { u8 status; u8 version; @@ -63,6 +76,16 @@ struct hinic_vf_vlan_config { u8 rsvd1[7]; }; +int hinic_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac); + +int hinic_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto); + +int hinic_ndo_get_vf_config(struct net_device *netdev, + int vf, struct ifla_vf_info *ivi); + +int hinic_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting); + void hinic_notify_all_vfs_link_changed(struct hinic_hwdev *hwdev, u8 link_status); -- cgit v1.2.3-59-g8ed1b From 4b36a0dff794a00989a50581aed2f94c88b57107 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 25 Apr 2020 11:39:47 +0800 Subject: net: openvswitch: suitable access to the dp_meters To fix the following sparse warning: | net/openvswitch/meter.c:109:38: sparse: sparse: incorrect type | in assignment (different address spaces) ... | net/openvswitch/meter.c:720:45: sparse: sparse: incorrect type | in argument 1 (different address spaces) ... Reported-by: kbuild test robot Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 915f31123f23..612ad5586ce9 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -107,8 +107,8 @@ dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) return -ENOMEM; for (i = 0; i < n_meters; i++) - new_ti->dp_meters[i] = - rcu_dereference_ovsl(ti->dp_meters[i]); + if (rcu_dereference_ovsl(ti->dp_meters[i])) + new_ti->dp_meters[i] = ti->dp_meters[i]; rcu_assign_pointer(tbl->ti, new_ti); call_rcu(&ti->rcu, dp_meter_instance_free_rcu); @@ -752,7 +752,7 @@ void ovs_meters_exit(struct datapath *dp) int i; for (i = 0; i < ti->n_meters; i++) - ovs_meter_free(ti->dp_meters[i]); + ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); dp_meter_instance_free(ti); } -- cgit v1.2.3-59-g8ed1b From 659d4587fe7233bfdff303744b20d6f41ad04362 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 25 Apr 2020 11:39:48 +0800 Subject: net: openvswitch: use div_u64() for 64-by-32 divisions Compile the kernel for arm 32 platform, the build warning found. To fix that, should use div_u64() for divisions. | net/openvswitch/meter.c:396: undefined reference to `__udivdi3' [add more commit msg, change reported tag, and use div_u64 instead of do_div by Tonghao] Fixes: e57358873bb5d6ca ("net: openvswitch: use u64 for meter bucket") Reported-by: kbuild test robot Signed-off-by: Tonghao Zhang Tested-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 612ad5586ce9..3d3d8e094546 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -393,7 +393,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * Start with a full bucket. */ band->bucket = (band->burst_size + band->rate) * 1000ULL; - band_max_delta_t = band->bucket / band->rate; + band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; band++; -- cgit v1.2.3-59-g8ed1b From 3fd8dc269ff0647819589c21b2ce60af6fc0a455 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Sun, 26 Apr 2020 10:13:48 +0800 Subject: net: hns3: remove an unnecessary check in hclge_set_umv_space() Since hclge_set_umv_space() is only called by hclge_init_umv_space(), parameter 'allocated_size' will not be NULL. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c74990a59e10..e2fec832fdf0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -7227,8 +7227,7 @@ static int hclge_set_umv_space(struct hclge_dev *hdev, u16 space_size, return ret; } - if (allocated_size) - *allocated_size = le32_to_cpu(desc.data[1]); + *allocated_size = le32_to_cpu(desc.data[1]); return 0; } -- cgit v1.2.3-59-g8ed1b From b0b3fb6759220d4fa359e9ac486859c9d422c204 Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Sat, 18 Apr 2020 09:37:35 +0800 Subject: bpf: Remove set but not used variable 'dst_known' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes gcc '-Wunused-but-set-variable' warning: kernel/bpf/verifier.c:5603:18: warning: variable ‘dst_known’ set but not used [-Wunused-but-set-variable], delete this variable. Signed-off-by: Mao Wenan Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200418013735.67882-1-maowenan@huawei.com --- kernel/bpf/verifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fa1d8245b925..15ba8bf92ca9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5609,7 +5609,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, { struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); - bool src_known, dst_known; + bool src_known; s64 smin_val, smax_val; u64 umin_val, umax_val; s32 s32_min_val, s32_max_val; @@ -5631,7 +5631,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (alu32) { src_known = tnum_subreg_is_const(src_reg.var_off); - dst_known = tnum_subreg_is_const(dst_reg->var_off); if ((src_known && (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || s32_min_val > s32_max_val || u32_min_val > u32_max_val) { @@ -5643,7 +5642,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, } } else { src_known = tnum_is_const(src_reg.var_off); - dst_known = tnum_is_const(dst_reg->var_off); if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || smin_val > smax_val || umin_val > umax_val) { -- cgit v1.2.3-59-g8ed1b From 93e516894752e8b2ae3c2e7671e3ea33e27e3898 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Sun, 19 Apr 2020 11:09:17 +0530 Subject: tools/bpf/bpftool: Remove duplicate headers Code cleanup: Remove duplicate headers which are included twice. Signed-off-by: Jagadeesh Pagadala Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/1587274757-14101-1-git-send-email-jagdsh.linux@gmail.com --- tools/bpf/bpftool/btf.c | 1 - tools/bpf/bpftool/gen.c | 1 - tools/bpf/bpftool/jit_disasm.c | 1 - 3 files changed, 3 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index bcaf55b59498..41a1346934a1 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "json_writer.h" #include "main.h" diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index f8113b3646f5..0e5f0236cc76 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "bpf/libbpf_internal.h" diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c index f7f5885aa3ba..e7e7eee9f172 100644 --- a/tools/bpf/bpftool/jit_disasm.c +++ b/tools/bpf/bpftool/jit_disasm.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 0456ea170cd665ddbb9503be92e39f96055dd5fa Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 20 Apr 2020 10:46:10 -0700 Subject: bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT} Currently the following prog types don't fall back to bpf_base_func_proto() (instead they have cgroup_base_func_proto which has a limited set of helpers from bpf_base_func_proto): * BPF_PROG_TYPE_CGROUP_DEVICE * BPF_PROG_TYPE_CGROUP_SYSCTL * BPF_PROG_TYPE_CGROUP_SOCKOPT I don't see any specific reason why we shouldn't use bpf_base_func_proto(), every other type of program (except bpf-lirc and, understandably, tracing) use it, so let's fall back to bpf_base_func_proto for those prog types as well. This basically boils down to adding access to the following helpers: * BPF_FUNC_get_prandom_u32 * BPF_FUNC_get_smp_processor_id * BPF_FUNC_get_numa_node_id * BPF_FUNC_tail_call * BPF_FUNC_ktime_get_ns * BPF_FUNC_spin_lock (CAP_SYS_ADMIN) * BPF_FUNC_spin_unlock (CAP_SYS_ADMIN) * BPF_FUNC_jiffies64 (CAP_SYS_ADMIN) I've also added bpf_perf_event_output() because it's really handy for logging and debugging. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200420174610.77494-1-sdf@google.com --- include/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 20 +++--------------- net/core/filter.c | 2 +- .../testing/selftests/bpf/verifier/event_output.c | 24 ++++++++++++++++++++++ 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd2b2322412d..25da6ff2a880 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1523,6 +1523,7 @@ extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; +extern const struct bpf_func_proto bpf_event_output_data_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..4d748c5785bc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1060,30 +1060,16 @@ static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } diff --git a/net/core/filter.c b/net/core/filter.c index 7d6ceaa54d21..a943df3ad8b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4214,7 +4214,7 @@ BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } -static const struct bpf_func_proto bpf_event_output_data_proto = { +const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c index 130553e19eca..99f8f582c02b 100644 --- a/tools/testing/selftests/bpf/verifier/event_output.c +++ b/tools/testing/selftests/bpf/verifier/event_output.c @@ -92,3 +92,27 @@ .result = ACCEPT, .retval = 1, }, +{ + "perfevent for cgroup dev", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sysctl", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sockopt", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, -- cgit v1.2.3-59-g8ed1b From ae460c022453337850bdc36a36bf7596a6cfcf99 Mon Sep 17 00:00:00 2001 From: Yoshiki Komachi Date: Tue, 21 Apr 2020 09:05:27 +0900 Subject: bpf_helpers.h: Add note for building with vmlinux.h or linux/types.h The following error was shown when a bpf program was compiled without vmlinux.h auto-generated from BTF: # clang -I./linux/tools/lib/ -I/lib/modules/$(uname -r)/build/include/ \ -O2 -Wall -target bpf -emit-llvm -c bpf_prog.c -o bpf_prog.bc ... In file included from linux/tools/lib/bpf/bpf_helpers.h:5: linux/tools/lib/bpf/bpf_helper_defs.h:56:82: error: unknown type name '__u64' ... It seems that bpf programs are intended for being built together with the vmlinux.h (which will have all the __u64 and other typedefs). But users may mistakenly think "include " is missing because the vmlinux.h is not common for non-bpf developers. IMO, an explicit comment therefore should be added to bpf_helpers.h as this patch shows. Signed-off-by: Yoshiki Komachi Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1587427527-29399-1-git-send-email-komachi.yoshiki@gmail.com --- tools/lib/bpf/bpf_helpers.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index f69cc208778a..60aad054eea1 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -2,6 +2,12 @@ #ifndef __BPF_HELPERS__ #define __BPF_HELPERS__ +/* + * Note that bpf programs need to include either + * vmlinux.h (auto-generated from BTF) or linux/types.h + * in advance since bpf_helper_defs.h uses such types + * as __u64. + */ #include "bpf_helper_defs.h" #define __uint(name, val) int (*name)[val] -- cgit v1.2.3-59-g8ed1b From 745abfaa9eafa597d31fdf24a3249e5206a98768 Mon Sep 17 00:00:00 2001 From: Luke Nelson Date: Mon, 20 Apr 2020 17:28:04 -0700 Subject: bpf, riscv: Fix tail call count off by one in RV32 BPF JIT This patch fixes an off by one error in the RV32 JIT handling for BPF tail call. Currently, the code decrements TCC before checking if it is less than zero. This limits the maximum number of tail calls to 32 instead of 33 as in other JITs. The fix is to instead check the old value of TCC before decrementing. Fixes: 5f316b65e99f ("riscv, bpf: Add RV32G eBPF JIT") Signed-off-by: Luke Nelson Signed-off-by: Alexei Starovoitov Acked-by: Xi Wang Link: https://lore.kernel.org/bpf/20200421002804.5118-1-luke.r.nels@gmail.com --- arch/riscv/net/bpf_jit_comp32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index 302934177760..11083d4d5f2d 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -770,12 +770,13 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) emit_bcc(BPF_JGE, lo(idx_reg), RV_REG_T1, off, ctx); /* - * if ((temp_tcc = tcc - 1) < 0) + * temp_tcc = tcc - 1; + * if (tcc < 0) * goto out; */ emit(rv_addi(RV_REG_T1, RV_REG_TCC, -1), ctx); off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2; - emit_bcc(BPF_JSLT, RV_REG_T1, RV_REG_ZERO, off, ctx); + emit_bcc(BPF_JSLT, RV_REG_TCC, RV_REG_ZERO, off, ctx); /* * prog = array->ptrs[index]; -- cgit v1.2.3-59-g8ed1b From 6890896bd765b0504761c61901c9804fca23bfb2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 24 Apr 2020 16:59:41 -0700 Subject: bpf: Fix missing bpf_base_func_proto in cgroup_base_func_proto for CGROUP_NET=n linux-next build bot reported compile issue [1] with one of its configs. It looks like when we have CONFIG_NET=n and CONFIG_BPF{,_SYSCALL}=y, we are missing the bpf_base_func_proto definition (from net/core/filter.c) in cgroup_base_func_proto. I'm reshuffling the code a bit to make it work. The common helpers are moved into kernel/bpf/helpers.c and the bpf_base_func_proto is exported from there. Also, bpf_get_raw_cpu_id goes into kernel/bpf/core.c akin to existing bpf_user_rnd_u32. [1] https://lore.kernel.org/linux-next/CAKH8qBsBvKHswiX1nx40LgO+BGeTmb1NX8tiTttt_0uu6T3dCA@mail.gmail.com/T/#mff8b0c083314c68c2e2ef0211cb11bc20dc13c72 Fixes: 0456ea170cd6 ("bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT}") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200424235941.58382-1-sdf@google.com --- include/linux/bpf.h | 8 ++++++ include/linux/filter.h | 2 -- kernel/bpf/core.c | 5 ++++ kernel/bpf/helpers.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 78 +------------------------------------------------- 5 files changed, 87 insertions(+), 79 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 25da6ff2a880..5147e11e53ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1215,6 +1215,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, struct bpf_prog *bpf_prog_by_id(u32 id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1365,6 +1366,12 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) { return ERR_PTR(-ENOTSUPP); } + +static inline const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + return NULL; +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -1531,6 +1538,7 @@ const struct bpf_func_proto *bpf_tracing_func_proto( /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) bool bpf_sock_common_is_valid_access(int off, int size, diff --git a/include/linux/filter.h b/include/linux/filter.h index 9b5aa5c483cc..af37318bb1c5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -863,8 +863,6 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..0cc91805069a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32) return res; } +BPF_CALL_0(bpf_get_raw_cpu_id) +{ + return raw_smp_processor_id(); +} + /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bafc53ddd350..dbba4f41d508 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -562,3 +562,76 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; + +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = bpf_get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + default: + return NULL; + } +} diff --git a/net/core/filter.c b/net/core/filter.c index a943df3ad8b0..a605626142b6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -256,17 +256,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, offset); } -BPF_CALL_0(bpf_get_raw_cpu_id) -{ - return raw_smp_processor_id(); -} - -static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = bpf_get_raw_cpu_id, - .gpl_only = false, - .ret_type = RET_INTEGER, -}; - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -4205,26 +4194,6 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, - void *, data, u64, size) -{ - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; - - return bpf_event_output(map, flags, data, size, NULL, 0, NULL); -} - -const struct bpf_func_proto bpf_event_output_data_proto = { - .func = bpf_event_output_data, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE_OR_ZERO, -}; - BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5983,52 +5952,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_raw_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - default: - break; - } - - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - switch (func_id) { - case BPF_FUNC_spin_lock: - return &bpf_spin_lock_proto; - case BPF_FUNC_spin_unlock: - return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - default: - return NULL; - } -} +const struct bpf_func_proto bpf_event_output_data_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -- cgit v1.2.3-59-g8ed1b From 6f3f65d80dac8f2bafce2213005821fccdce194c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Mon, 20 Apr 2020 11:34:08 -0700 Subject: net: bpf: Allow TC programs to call BPF_FUNC_skb_change_head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows TC eBPF programs to modify and forward (redirect) packets from interfaces without ethernet headers (for example cellular) to interfaces with (for example ethernet/wifi). The lack of this appears to simply be an oversight. Tested: in active use in Android R on 4.14+ devices for ipv6 cellular to wifi tethering offload. Signed-off-by: Lorenzo Colitti Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index a605626142b6..da3b7a72c37c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6137,6 +6137,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: -- cgit v1.2.3-59-g8ed1b From 082b57e3eb09810d357083cca5ee2df02c16aec9 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Mon, 20 Apr 2020 11:47:50 -0700 Subject: net: bpf: Make bpf_ktime_get_ns() available to non GPL programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The entire implementation is in kernel/bpf/helpers.c: BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; and this was presumably marked GPL due to kernel/time/timekeeping.c: EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); and while that may make sense for kernel modules (although even that is doubtful), there is currently AFAICT no other source of time available to ebpf. Furthermore this is really just equivalent to clock_gettime(CLOCK_MONOTONIC) which is exposed to userspace (via vdso even to make it performant)... As such, I see no reason to keep the GPL restriction. (In the future I'd like to have access to time from Apache licensed ebpf code) Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index dbba4f41d508..9a6b23387d02 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -151,7 +151,7 @@ BPF_CALL_0(bpf_ktime_get_ns) const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, - .gpl_only = true, + .gpl_only = false, .ret_type = RET_INTEGER, }; -- cgit v1.2.3-59-g8ed1b From 0a05861f80fe7d4dcfdabcc98d9854947573e072 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 22 Apr 2020 01:29:27 +0200 Subject: xsk: Fix typo in xsk_umem_consume_tx and xsk_generic_xmit comments s/backpreassure/backpressure/ Signed-off-by: Tobias Klauser Signed-off-by: Alexei Starovoitov Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20200421232927.21082-1-tklauser@distanz.ch --- net/xdp/xsk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c350108aa38d..f6e6609f70a3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -322,7 +322,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; - /* This is the backpreassure mechanism for the Tx path. + /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. @@ -406,7 +406,7 @@ static int xsk_generic_xmit(struct sock *sk) addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - /* This is the backpreassure mechanism for the Tx path. + /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. -- cgit v1.2.3-59-g8ed1b From 71d19214776e61b33da48f7c1b46e522c7f78221 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sun, 26 Apr 2020 09:15:25 -0700 Subject: bpf: add bpf_ktime_get_boot_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a device like a cellphone which is constantly suspending and resuming CLOCK_MONOTONIC is not particularly useful for keeping track of or reacting to external network events. Instead you want to use CLOCK_BOOTTIME. Hence add bpf_ktime_get_boot_ns() as a mirror of bpf_ktime_get_ns() based around CLOCK_BOOTTIME instead of CLOCK_MONOTONIC. Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- drivers/media/rc/bpf-lirc.c | 2 ++ include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 13 ++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 13 ++++++++++++- 7 files changed, 44 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 0f3417d161b8..069c42f22a8c 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -103,6 +103,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_prandom_u32: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5147e11e53ff..10960cfabea4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1509,6 +1509,7 @@ extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bbf1b65be10..4a6c47f3febe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -652,6 +652,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -3025,6 +3027,14 @@ union bpf_attr { * * **-EOPNOTSUPP** Unsupported operation, for example a * call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3151,7 +3161,8 @@ union bpf_attr { FN(xdp_output), \ FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), + FN(sk_assign), \ + FN(ktime_get_boot_ns), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0cc91805069a..6aa11de67315 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9a6b23387d02..5c0290e0696e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -155,6 +155,18 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_boot_ns) +{ + /* NMI safe access to clock boottime */ + return ktime_get_boot_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { + .func = bpf_ktime_get_boot_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -615,6 +627,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; default: break; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..e875c95d3ced 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -797,6 +797,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7bbf1b65be10..4a6c47f3febe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -652,6 +652,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -3025,6 +3027,14 @@ union bpf_attr { * * **-EOPNOTSUPP** Unsupported operation, for example a * call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3151,7 +3161,8 @@ union bpf_attr { FN(xdp_output), \ FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), + FN(sk_assign), \ + FN(ktime_get_boot_ns), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3-59-g8ed1b From 6f8a57ccf8511724e6f48d732cb2940889789ab2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 23 Apr 2020 12:58:50 -0700 Subject: bpf: Make verifier log more relevant by default To make BPF verifier verbose log more releavant and easier to use to debug verification failures, "pop" parts of log that were successfully verified. This has effect of leaving only verifier logs that correspond to code branches that lead to verification failure, which in practice should result in much shorter and more relevant verifier log dumps. This behavior is made the default behavior and can be overriden to do exhaustive logging by specifying BPF_LOG_LEVEL2 log level. Using BPF_LOG_LEVEL2 to disable this behavior is not ideal, because in some cases it's good to have BPF_LOG_LEVEL2 per-instruction register dump verbosity, but still have only relevant verifier branches logged. But for this patch, I didn't want to add any new flags. It might be worth-while to just rethink how BPF verifier logging is performed and requested and streamline it a bit. But this trimming of successfully verified branches seems to be useful and a good default behavior. To test this, I modified runqslower slightly to introduce read of uninitialized stack variable. Log (**truncated in the middle** to save many lines out of this commit message) BEFORE this change: ; int handle__sched_switch(u64 *ctx) 0: (bf) r6 = r1 ; struct task_struct *prev = (struct task_struct *)ctx[1]; 1: (79) r1 = *(u64 *)(r6 +8) func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct' 2: (b7) r2 = 0 ; struct event event = {}; 3: (7b) *(u64 *)(r10 -24) = r2 last_idx 3 first_idx 0 regs=4 stack=0 before 2: (b7) r2 = 0 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 ; if (prev->state == TASK_RUNNING) [ ... instruction dump from insn #7 through #50 are cut out ... ] 51: (b7) r2 = 16 52: (85) call bpf_get_current_comm#16 last_idx 52 first_idx 42 regs=4 stack=0 before 51: (b7) r2 = 16 ; bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, 53: (bf) r1 = r6 54: (18) r2 = 0xffff8881f3868800 56: (18) r3 = 0xffffffff 58: (bf) r4 = r7 59: (b7) r5 = 32 60: (85) call bpf_perf_event_output#25 last_idx 60 first_idx 53 regs=20 stack=0 before 59: (b7) r5 = 32 61: (bf) r2 = r10 ; event.pid = pid; 62: (07) r2 += -16 ; bpf_map_delete_elem(&start, &pid); 63: (18) r1 = 0xffff8881f3868000 65: (85) call bpf_map_delete_elem#3 ; } 66: (b7) r0 = 0 67: (95) exit from 44 to 66: safe from 34 to 66: safe from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; bpf_map_update_elem(&start, &pid, &ts, 0); 28: (bf) r2 = r10 ; 29: (07) r2 += -16 ; tsp = bpf_map_lookup_elem(&start, &pid); 30: (18) r1 = 0xffff8881f3868000 32: (85) call bpf_map_lookup_elem#1 invalid indirect read from stack off -16+0 size 4 processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4 Notice how there is a successful code path from instruction 0 through 67, few successfully verified jumps (44->66, 34->66), and only after that 11->28 jump plus error on instruction #32. AFTER this change (full verifier log, **no truncation**): ; int handle__sched_switch(u64 *ctx) 0: (bf) r6 = r1 ; struct task_struct *prev = (struct task_struct *)ctx[1]; 1: (79) r1 = *(u64 *)(r6 +8) func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct' 2: (b7) r2 = 0 ; struct event event = {}; 3: (7b) *(u64 *)(r10 -24) = r2 last_idx 3 first_idx 0 regs=4 stack=0 before 2: (b7) r2 = 0 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 ; if (prev->state == TASK_RUNNING) 7: (79) r2 = *(u64 *)(r1 +16) ; if (prev->state == TASK_RUNNING) 8: (55) if r2 != 0x0 goto pc+19 R1_w=ptr_task_struct(id=0,off=0,imm=0) R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; trace_enqueue(prev->tgid, prev->pid); 9: (61) r1 = *(u32 *)(r1 +1184) 10: (63) *(u32 *)(r10 -4) = r1 ; if (!pid || (targ_pid && targ_pid != pid)) 11: (15) if r1 == 0x0 goto pc+16 from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; bpf_map_update_elem(&start, &pid, &ts, 0); 28: (bf) r2 = r10 ; 29: (07) r2 += -16 ; tsp = bpf_map_lookup_elem(&start, &pid); 30: (18) r1 = 0xffff8881db3ce800 32: (85) call bpf_map_lookup_elem#1 invalid indirect read from stack off -16+0 size 4 processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4 Notice how in this case, there are 0-11 instructions + jump from 11 to 28 is recorded + 28-32 instructions with error on insn #32. test_verifier test runner was updated to specify BPF_LOG_LEVEL2 for VERBOSE_ACCEPT expected result due to potentially "incomplete" success verbose log at BPF_LOG_LEVEL1. On success, verbose log will only have a summary of number of processed instructions, etc, but no branch tracing log. Having just a last succesful branch tracing seemed weird and confusing. Having small and clean summary log in success case seems quite logical and nice, though. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200423195850.1259827-1-andriin@fb.com --- kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++---- tools/testing/selftests/bpf/test_verifier.c | 7 ++++++- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15ba8bf92ca9..91728e0f27eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,6 +168,8 @@ struct bpf_verifier_stack_elem { int insn_idx; int prev_insn_idx; struct bpf_verifier_stack_elem *next; + /* length of verifier log at the time this state was pushed on stack */ + u32 log_pos; }; #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 @@ -283,6 +285,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, log->ubuf = NULL; } +static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) +{ + char zero = 0; + + if (!bpf_verifier_log_needed(log)) + return; + + log->len_used = new_pos; + if (put_user(zero, log->ubuf + new_pos)) + log->ubuf = NULL; +} + /* log_level controls verbosity level of eBPF verifier. * bpf_verifier_log_write() is used to dump the verification trace to the log, * so the user can figure out what's wrong with the program @@ -846,7 +860,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, - int *insn_idx) + int *insn_idx, bool pop_log) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem, *head = env->head; @@ -860,6 +874,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (err) return err; } + if (pop_log) + bpf_vlog_reset(&env->log, head->log_pos); if (insn_idx) *insn_idx = head->insn_idx; if (prev_insn_idx) @@ -887,6 +903,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; + elem->log_pos = env->log.len_used; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); @@ -915,7 +932,7 @@ err: free_verifier_state(env->cur_state, true); env->cur_state = NULL; /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); return NULL; } @@ -8407,6 +8424,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; @@ -8683,7 +8701,7 @@ static int do_check(struct bpf_verifier_env *env) process_bpf_exit: update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, - &env->insn_idx); + &env->insn_idx, pop_log); if (err < 0) { if (err != -ENOENT) return err; @@ -10206,6 +10224,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) static int do_check_common(struct bpf_verifier_env *env, int subprog) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -10268,7 +10287,9 @@ out: free_verifier_state(env->cur_state, true); env->cur_state = NULL; } - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); + if (!ret && pop_log) + bpf_vlog_reset(&env->log, 0); free_states(env); if (ret) /* clean aux data in case subprog was rejected */ diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 87eaa49609a0..ad6939c67c5e 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -943,7 +943,12 @@ static void do_test_single(struct bpf_test *test, bool unpriv, attr.insns = prog; attr.insns_cnt = prog_len; attr.license = "GPL"; - attr.log_level = verbose || expected_ret == VERBOSE_ACCEPT ? 1 : 4; + if (verbose) + attr.log_level = 1; + else if (expected_ret == VERBOSE_ACCEPT) + attr.log_level = 2; + else + attr.log_level = 4; attr.prog_flags = pflags; fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog)); -- cgit v1.2.3-59-g8ed1b From 234589012ba0e5bf448e3fdbbac0f4c265dbdd7b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 24 Apr 2020 19:55:55 +0100 Subject: selftests/bpf: Add cls_redirect classifier cls_redirect is a TC clsact based replacement for the glb-redirect iptables module available at [1]. It enables what GitHub calls "second chance" flows [2], similarly proposed by the Beamer paper [3]. In contrast to glb-redirect, it also supports migrating UDP flows as long as connected sockets are used. cls_redirect is in production at Cloudflare, as part of our own L4 load balancer. We have modified the encapsulation format slightly from glb-redirect: glbgue_chained_routing.private_data_type has been repurposed to form a version field and several flags. Both have been arranged in a way that a private_data_type value of zero matches the current glb-redirect behaviour. This means that cls_redirect will understand packets in glb-redirect format, but not vice versa. The test suite only covers basic features. For example, cls_redirect will correctly forward path MTU discovery packets, but this is not exercised. It is also possible to switch the encapsulation format to GRE on the last hop, which is also not tested. There are two major distinctions from glb-redirect: first, cls_redirect relies on receiving encapsulated packets directly from a router. This is because we don't have access to the neighbour tables from BPF, yet. See forward_to_next_hop for details. Second, cls_redirect performs decapsulation instead of using separate ipip and sit tunnel devices. This avoids issues with the sit tunnel [4] and makes deploying the classifier easier: decapsulated packets appear on the same interface, so existing firewall rules continue to work as expected. The code base started it's life on v4.19, so there are most likely still hold overs from old workarounds. In no particular order: - The function buf_off is required to defeat a clang optimization that leads to the verifier rejecting the program due to pointer arithmetic in the wrong order. - The function pkt_parse_ipv6 is force inlined, because it would otherwise be rejected due to returning a pointer to stack memory. - The functions fill_tuple and classify_tcp contain kludges, because we've run out of function arguments. - The logic in general is rather nested, due to verifier restrictions. I think this is either because the verifier loses track of constants on the stack, or because it can't track enum like variables. 1: https://github.com/github/glb-director/tree/master/src/glb-redirect 2: https://github.com/github/glb-director/blob/master/docs/development/second-chance-design.md 3: https://www.usenix.org/conference/nsdi18/presentation/olteanu 4: https://github.com/github/glb-director/issues/64 Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200424185556.7358-2-lmb@cloudflare.com --- .../selftests/bpf/prog_tests/cls_redirect.c | 456 +++++++++ .../selftests/bpf/progs/test_cls_redirect.c | 1058 ++++++++++++++++++++ .../selftests/bpf/progs/test_cls_redirect.h | 54 + tools/testing/selftests/bpf/test_progs.h | 7 + 4 files changed, 1575 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/cls_redirect.c create mode 100644 tools/testing/selftests/bpf/progs/test_cls_redirect.c create mode 100644 tools/testing/selftests/bpf/progs/test_cls_redirect.h diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c new file mode 100644 index 000000000000..f259085cca6a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2020 Cloudflare + +#define _GNU_SOURCE + +#include +#include + +#include + +#include + +#include "progs/test_cls_redirect.h" +#include "test_cls_redirect.skel.h" + +#define ENCAP_IP INADDR_LOOPBACK +#define ENCAP_PORT (1234) + +struct addr_port { + in_port_t port; + union { + struct in_addr in_addr; + struct in6_addr in6_addr; + }; +}; + +struct tuple { + int family; + struct addr_port src; + struct addr_port dst; +}; + +static int start_server(const struct sockaddr *addr, socklen_t len, int type) +{ + int fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + return -1; + if (CHECK_FAIL(bind(fd, addr, len) == -1)) + goto err; + if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) + goto err; + + return fd; + +err: + close(fd); + return -1; +} + +static int connect_to_server(const struct sockaddr *addr, socklen_t len, + int type) +{ + int fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + return -1; + if (CHECK_FAIL(connect(fd, addr, len))) + goto err; + + return fd; + +err: + close(fd); + return -1; +} + +static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) +{ + const struct sockaddr_in6 *in6; + const struct sockaddr_in *in; + + switch (sa->sa_family) { + case AF_INET: + in = (const struct sockaddr_in *)sa; + ap->in_addr = in->sin_addr; + ap->port = in->sin_port; + return true; + + case AF_INET6: + in6 = (const struct sockaddr_in6 *)sa; + ap->in6_addr = in6->sin6_addr; + ap->port = in6->sin6_port; + return true; + + default: + return false; + } +} + +static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, + int *server, int *conn, struct tuple *tuple) +{ + struct sockaddr_storage ss; + socklen_t slen = sizeof(ss); + struct sockaddr *sa = (struct sockaddr *)&ss; + + *server = start_server(addr, len, type); + if (*server < 0) + return false; + + if (CHECK_FAIL(getsockname(*server, sa, &slen))) + goto close_server; + + *conn = connect_to_server(sa, slen, type); + if (*conn < 0) + goto close_server; + + /* We want to simulate packets arriving at conn, so we have to + * swap src and dst. + */ + slen = sizeof(ss); + if (CHECK_FAIL(getsockname(*conn, sa, &slen))) + goto close_conn; + + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst))) + goto close_conn; + + slen = sizeof(ss); + if (CHECK_FAIL(getpeername(*conn, sa, &slen))) + goto close_conn; + + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src))) + goto close_conn; + + tuple->family = ss.ss_family; + return true; + +close_conn: + close(*conn); + *conn = -1; +close_server: + close(*server); + *server = -1; + return false; +} + +static socklen_t prepare_addr(struct sockaddr_storage *addr, int family) +{ + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = family; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + return sizeof(*addr4); + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = family; + addr6->sin6_addr = in6addr_loopback; + return sizeof(*addr6); + default: + fprintf(stderr, "Invalid family %d", family); + return 0; + } +} + +static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr) +{ + return tattr->data_size_out < tattr->data_size_in; +} + +enum type { + UDP, + TCP, + __NR_KIND, +}; + +enum hops { + NO_HOPS, + ONE_HOP, +}; + +enum flags { + NONE, + SYN, + ACK, +}; + +enum conn { + KNOWN_CONN, + UNKNOWN_CONN, +}; + +enum result { + ACCEPT, + FORWARD, +}; + +struct test_cfg { + enum type type; + enum result result; + enum conn conn; + enum hops hops; + enum flags flags; +}; + +static int test_str(void *buf, size_t len, const struct test_cfg *test, + int family) +{ + const char *family_str, *type, *conn, *hops, *result, *flags; + + family_str = "IPv4"; + if (family == AF_INET6) + family_str = "IPv6"; + + type = "TCP"; + if (test->type == UDP) + type = "UDP"; + + conn = "known"; + if (test->conn == UNKNOWN_CONN) + conn = "unknown"; + + hops = "no hops"; + if (test->hops == ONE_HOP) + hops = "one hop"; + + result = "accept"; + if (test->result == FORWARD) + result = "forward"; + + flags = "none"; + if (test->flags == SYN) + flags = "SYN"; + else if (test->flags == ACK) + flags = "ACK"; + + return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)", family_str, + type, result, conn, hops, flags); +} + +static struct test_cfg tests[] = { + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN }, + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK }, + { TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK }, + { TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK }, + { UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE }, + { UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE }, + { UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE }, +}; + +static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto) +{ + const uint8_t hlen = + (sizeof(struct guehdr) / sizeof(uint32_t)) + hop_count; + *encap = (encap_headers_t){ + .eth = { .h_proto = htons(ETH_P_IP) }, + .ip = { + .ihl = 5, + .version = 4, + .ttl = IPDEFTTL, + .protocol = IPPROTO_UDP, + .daddr = htonl(ENCAP_IP) + }, + .udp = { + .dest = htons(ENCAP_PORT), + }, + .gue = { + .hlen = hlen, + .proto_ctype = proto + }, + .unigue = { + .hop_count = hop_count + }, + }; +} + +static size_t build_input(const struct test_cfg *test, void *const buf, + const struct tuple *tuple) +{ + in_port_t sport = tuple->src.port; + encap_headers_t encap; + struct iphdr ip; + struct ipv6hdr ipv6; + struct tcphdr tcp; + struct udphdr udp; + struct in_addr next_hop; + uint8_t *p = buf; + int proto; + + proto = IPPROTO_IPIP; + if (tuple->family == AF_INET6) + proto = IPPROTO_IPV6; + + encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto); + p = mempcpy(p, &encap, sizeof(encap)); + + if (test->hops == ONE_HOP) { + next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) }; + p = mempcpy(p, &next_hop, sizeof(next_hop)); + } + + proto = IPPROTO_TCP; + if (test->type == UDP) + proto = IPPROTO_UDP; + + switch (tuple->family) { + case AF_INET: + ip = (struct iphdr){ + .ihl = 5, + .version = 4, + .ttl = IPDEFTTL, + .protocol = proto, + .saddr = tuple->src.in_addr.s_addr, + .daddr = tuple->dst.in_addr.s_addr, + }; + p = mempcpy(p, &ip, sizeof(ip)); + break; + case AF_INET6: + ipv6 = (struct ipv6hdr){ + .version = 6, + .hop_limit = IPDEFTTL, + .nexthdr = proto, + .saddr = tuple->src.in6_addr, + .daddr = tuple->dst.in6_addr, + }; + p = mempcpy(p, &ipv6, sizeof(ipv6)); + break; + default: + return 0; + } + + if (test->conn == UNKNOWN_CONN) + sport--; + + switch (test->type) { + case TCP: + tcp = (struct tcphdr){ + .source = sport, + .dest = tuple->dst.port, + }; + if (test->flags == SYN) + tcp.syn = true; + if (test->flags == ACK) + tcp.ack = true; + p = mempcpy(p, &tcp, sizeof(tcp)); + break; + case UDP: + udp = (struct udphdr){ + .source = sport, + .dest = tuple->dst.port, + }; + p = mempcpy(p, &udp, sizeof(udp)); + break; + default: + return 0; + } + + return (void *)p - buf; +} + +static void close_fds(int *fds, int n) +{ + int i; + + for (i = 0; i < n; i++) + if (fds[i] > 0) + close(fds[i]); +} + +void test_cls_redirect(void) +{ + struct test_cls_redirect *skel = NULL; + struct bpf_prog_test_run_attr tattr = {}; + int families[] = { AF_INET, AF_INET6 }; + struct sockaddr_storage ss; + struct sockaddr *addr; + socklen_t slen; + int i, j, err; + + int servers[__NR_KIND][ARRAY_SIZE(families)] = {}; + int conns[__NR_KIND][ARRAY_SIZE(families)] = {}; + struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)]; + + skel = test_cls_redirect__open(); + if (CHECK_FAIL(!skel)) + return; + + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); + + if (CHECK_FAIL(test_cls_redirect__load(skel))) + goto cleanup; + + addr = (struct sockaddr *)&ss; + for (i = 0; i < ARRAY_SIZE(families); i++) { + slen = prepare_addr(&ss, families[i]); + if (CHECK_FAIL(!slen)) + goto cleanup; + + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM, + &servers[UDP][i], &conns[UDP][i], + &tuples[UDP][i]))) + goto cleanup; + + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM, + &servers[TCP][i], &conns[TCP][i], + &tuples[TCP][i]))) + goto cleanup; + } + + tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + struct test_cfg *test = &tests[i]; + + for (j = 0; j < ARRAY_SIZE(families); j++) { + struct tuple *tuple = &tuples[test->type][j]; + char input[256]; + char tmp[256]; + + test_str(tmp, sizeof(tmp), test, tuple->family); + if (!test__start_subtest(tmp)) + continue; + + tattr.data_out = tmp; + tattr.data_size_out = sizeof(tmp); + + tattr.data_in = input; + tattr.data_size_in = build_input(test, input, tuple); + if (CHECK_FAIL(!tattr.data_size_in)) + continue; + + err = bpf_prog_test_run_xattr(&tattr); + if (CHECK_FAIL(err)) + continue; + + if (tattr.retval != TC_ACT_REDIRECT) { + PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n", + tattr.retval); + continue; + } + + switch (test->result) { + case ACCEPT: + if (CHECK_FAIL(!was_decapsulated(&tattr))) + continue; + break; + case FORWARD: + if (CHECK_FAIL(was_decapsulated(&tattr))) + continue; + break; + default: + PRINT_FAIL("unknown result %d\n", test->result); + continue; + } + } + } + +cleanup: + test_cls_redirect__destroy(skel); + close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0])); + close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0])); +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c new file mode 100644 index 000000000000..1668b993eb86 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -0,0 +1,1058 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2019, 2020 Cloudflare + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "test_cls_redirect.h" + +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) + +#define IP_OFFSET_MASK (0x1FFF) +#define IP_MF (0x2000) + +char _license[] SEC("license") = "Dual BSD/GPL"; + +/** + * Destination port and IP used for UDP encapsulation. + */ +static volatile const __be16 ENCAPSULATION_PORT; +static volatile const __be32 ENCAPSULATION_IP; + +typedef struct { + uint64_t processed_packets_total; + uint64_t l3_protocol_packets_total_ipv4; + uint64_t l3_protocol_packets_total_ipv6; + uint64_t l4_protocol_packets_total_tcp; + uint64_t l4_protocol_packets_total_udp; + uint64_t accepted_packets_total_syn; + uint64_t accepted_packets_total_syn_cookies; + uint64_t accepted_packets_total_last_hop; + uint64_t accepted_packets_total_icmp_echo_request; + uint64_t accepted_packets_total_established; + uint64_t forwarded_packets_total_gue; + uint64_t forwarded_packets_total_gre; + + uint64_t errors_total_unknown_l3_proto; + uint64_t errors_total_unknown_l4_proto; + uint64_t errors_total_malformed_ip; + uint64_t errors_total_fragmented_ip; + uint64_t errors_total_malformed_icmp; + uint64_t errors_total_unwanted_icmp; + uint64_t errors_total_malformed_icmp_pkt_too_big; + uint64_t errors_total_malformed_tcp; + uint64_t errors_total_malformed_udp; + uint64_t errors_total_icmp_echo_replies; + uint64_t errors_total_malformed_encapsulation; + uint64_t errors_total_encap_adjust_failed; + uint64_t errors_total_encap_buffer_too_small; + uint64_t errors_total_redirect_loop; +} metrics_t; + +typedef enum { + INVALID = 0, + UNKNOWN, + ECHO_REQUEST, + SYN, + SYN_COOKIE, + ESTABLISHED, +} verdict_t; + +typedef struct { + uint16_t src, dst; +} flow_ports_t; + +_Static_assert( + sizeof(flow_ports_t) != + offsetofend(struct bpf_sock_tuple, ipv4.dport) - + offsetof(struct bpf_sock_tuple, ipv4.sport) - 1, + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); +_Static_assert( + sizeof(flow_ports_t) != + offsetofend(struct bpf_sock_tuple, ipv6.dport) - + offsetof(struct bpf_sock_tuple, ipv6.sport) - 1, + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); + +typedef int ret_t; + +/* This is a bit of a hack. We need a return value which allows us to + * indicate that the regular flow of the program should continue, + * while allowing functions to use XDP_PASS and XDP_DROP, etc. + */ +static const ret_t CONTINUE_PROCESSING = -1; + +/* Convenience macro to call functions which return ret_t. + */ +#define MAYBE_RETURN(x) \ + do { \ + ret_t __ret = x; \ + if (__ret != CONTINUE_PROCESSING) \ + return __ret; \ + } while (0) + +/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes), + * or not aligned if the arch supports efficient unaligned access. + * + * Since the verifier ensures that eBPF packet accesses follow these rules, + * we can tell LLVM to emit code as if we always had a larger alignment. + * It will yell at us if we end up on a platform where this is not valid. + */ +typedef uint8_t *net_ptr __attribute__((align_value(8))); + +typedef struct buf { + struct __sk_buff *skb; + net_ptr head; + /* NB: tail musn't have alignment other than 1, otherwise + * LLVM will go and eliminate code, e.g. when checking packet lengths. + */ + uint8_t *const tail; +} buf_t; + +static size_t buf_off(const buf_t *buf) +{ + /* Clang seems to optimize constructs like + * a - b + c + * if c is known: + * r? = c + * r? -= b + * r? += a + * + * This is a problem if a and b are packet pointers, + * since the verifier allows subtracting two pointers to + * get a scalar, but not a scalar and a pointer. + * + * Use inline asm to break this optimization. + */ + size_t off = (size_t)buf->head; + asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data)); + return off; +} + +static bool buf_copy(buf_t *buf, void *dst, size_t len) +{ + if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) { + return false; + } + + buf->head += len; + return true; +} + +static bool buf_skip(buf_t *buf, const size_t len) +{ + /* Check whether off + len is valid in the non-linear part. */ + if (buf_off(buf) + len > buf->skb->len) { + return false; + } + + buf->head += len; + return true; +} + +/* Returns a pointer to the start of buf, or NULL if len is + * larger than the remaining data. Consumes len bytes on a successful + * call. + * + * If scratch is not NULL, the function will attempt to load non-linear + * data via bpf_skb_load_bytes. On success, scratch is returned. + */ +static void *buf_assign(buf_t *buf, const size_t len, void *scratch) +{ + if (buf->head + len > buf->tail) { + if (scratch == NULL) { + return NULL; + } + + return buf_copy(buf, scratch, len) ? scratch : NULL; + } + + void *ptr = buf->head; + buf->head += len; + return ptr; +} + +static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) +{ + if (ipv4->ihl <= 5) { + return true; + } + + return buf_skip(buf, (ipv4->ihl - 5) * 4); +} + +static bool ipv4_is_fragment(const struct iphdr *ip) +{ + uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); + return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; +} + +static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) +{ + struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch); + if (ipv4 == NULL) { + return NULL; + } + + if (ipv4->ihl < 5) { + return NULL; + } + + if (!pkt_skip_ipv4_options(pkt, ipv4)) { + return NULL; + } + + return ipv4; +} + +/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ +static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) +{ + if (!buf_copy(pkt, ports, sizeof(*ports))) { + return false; + } + + /* Ports in the L4 headers are reversed, since we are parsing an ICMP + * payload which is going towards the eyeball. + */ + uint16_t dst = ports->src; + ports->src = ports->dst; + ports->dst = dst; + return true; +} + +static uint16_t pkt_checksum_fold(uint32_t csum) +{ + /* The highest reasonable value for an IPv4 header + * checksum requires two folds, so we just do that always. + */ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return (uint16_t)~csum; +} + +static void pkt_ipv4_checksum(struct iphdr *iph) +{ + iph->check = 0; + + /* An IP header without options is 20 bytes. Two of those + * are the checksum, which we always set to zero. Hence, + * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7, + * which fits in 32 bit. + */ + _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes"); + uint32_t acc = 0; + uint16_t *ipw = (uint16_t *)iph; + +#pragma clang loop unroll(full) + for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) { + acc += ipw[i]; + } + + iph->check = pkt_checksum_fold(acc); +} + +static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, + const struct ipv6hdr *ipv6, + uint8_t *upper_proto, + bool *is_fragment) +{ + /* We understand five extension headers. + * https://tools.ietf.org/html/rfc8200#section-4.1 states that all + * headers should occur once, except Destination Options, which may + * occur twice. Hence we give up after 6 headers. + */ + struct { + uint8_t next; + uint8_t len; + } exthdr = { + .next = ipv6->nexthdr, + }; + *is_fragment = false; + +#pragma clang loop unroll(full) + for (int i = 0; i < 6; i++) { + switch (exthdr.next) { + case IPPROTO_FRAGMENT: + *is_fragment = true; + /* NB: We don't check that hdrlen == 0 as per spec. */ + /* fallthrough; */ + + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + case IPPROTO_MH: + if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) { + return false; + } + + /* hdrlen is in 8-octet units, and excludes the first 8 octets. */ + if (!buf_skip(pkt, + (exthdr.len + 1) * 8 - sizeof(exthdr))) { + return false; + } + + /* Decode next header */ + break; + + default: + /* The next header is not one of the known extension + * headers, treat it as the upper layer header. + * + * This handles IPPROTO_NONE. + * + * Encapsulating Security Payload (50) and Authentication + * Header (51) also end up here (and will trigger an + * unknown proto error later). They have a custom header + * format and seem too esoteric to care about. + */ + *upper_proto = exthdr.next; + return true; + } + } + + /* We never found an upper layer header. */ + return false; +} + +/* This function has to be inlined, because the verifier otherwise rejects it + * due to returning a pointer to the stack. This is technically correct, since + * scratch is allocated on the stack. However, this usage should be safe since + * it's the callers stack after all. + */ +static inline __attribute__((__always_inline__)) struct ipv6hdr * +pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, + bool *is_fragment) +{ + struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch); + if (ipv6 == NULL) { + return NULL; + } + + if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) { + return NULL; + } + + return ipv6; +} + +/* Global metrics, per CPU + */ +struct bpf_map_def metrics_map SEC("maps") = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(unsigned int), + .value_size = sizeof(metrics_t), + .max_entries = 1, +}; + +static metrics_t *get_global_metrics(void) +{ + uint64_t key = 0; + return bpf_map_lookup_elem(&metrics_map, &key); +} + +static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) +{ + const int payload_off = + sizeof(*encap) + + sizeof(struct in_addr) * encap->unigue.hop_count; + int32_t encap_overhead = payload_off - sizeof(struct ethhdr); + + // Changing the ethertype if the encapsulated packet is ipv6 + if (encap->gue.proto_ctype == IPPROTO_IPV6) { + encap->eth.h_proto = bpf_htons(ETH_P_IPV6); + } + + if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_FIXED_GSO)) { + return TC_ACT_SHOT; + } + + return bpf_redirect(skb->ifindex, BPF_F_INGRESS); +} + +static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) +{ + metrics->forwarded_packets_total_gre++; + + const int payload_off = + sizeof(*encap) + + sizeof(struct in_addr) * encap->unigue.hop_count; + int32_t encap_overhead = + payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); + int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; + uint16_t proto = ETH_P_IP; + + /* Loop protection: the inner packet's TTL is decremented as a safeguard + * against any forwarding loop. As the only interesting field is the TTL + * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes + * as they handle the split packets if needed (no need for the data to be + * in the linear section). + */ + if (encap->gue.proto_ctype == IPPROTO_IPV6) { + proto = ETH_P_IPV6; + uint8_t ttl; + int rc; + + rc = bpf_skb_load_bytes( + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), + &ttl, 1); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (ttl == 0) { + metrics->errors_total_redirect_loop++; + return TC_ACT_SHOT; + } + + ttl--; + rc = bpf_skb_store_bytes( + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), + &ttl, 1, 0); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + } else { + uint8_t ttl; + int rc; + + rc = bpf_skb_load_bytes( + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, + 1); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (ttl == 0) { + metrics->errors_total_redirect_loop++; + return TC_ACT_SHOT; + } + + /* IPv4 also has a checksum to patch. While the TTL is only one byte, + * this function only works for 2 and 4 bytes arguments (the result is + * the same). + */ + rc = bpf_l3_csum_replace( + skb, payload_off + offsetof(struct iphdr, check), ttl, + ttl - 1, 2); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + ttl--; + rc = bpf_skb_store_bytes( + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1, + 0); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + } + + if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, + BPF_F_ADJ_ROOM_FIXED_GSO)) { + metrics->errors_total_encap_adjust_failed++; + return TC_ACT_SHOT; + } + + if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) { + metrics->errors_total_encap_buffer_too_small++; + return TC_ACT_SHOT; + } + + buf_t pkt = { + .skb = skb, + .head = (uint8_t *)(long)skb->data, + .tail = (uint8_t *)(long)skb->data_end, + }; + + encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL); + if (encap_gre == NULL) { + metrics->errors_total_encap_buffer_too_small++; + return TC_ACT_SHOT; + } + + encap_gre->ip.protocol = IPPROTO_GRE; + encap_gre->ip.daddr = next_hop->s_addr; + encap_gre->ip.saddr = ENCAPSULATION_IP; + encap_gre->ip.tot_len = + bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta); + encap_gre->gre.flags = 0; + encap_gre->gre.protocol = bpf_htons(proto); + pkt_ipv4_checksum((void *)&encap_gre->ip); + + return bpf_redirect(skb->ifindex, 0); +} + +static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) +{ + /* swap L2 addresses */ + /* This assumes that packets are received from a router. + * So just swapping the MAC addresses here will make the packet go back to + * the router, which will send it to the appropriate machine. + */ + unsigned char temp[ETH_ALEN]; + memcpy(temp, encap->eth.h_dest, sizeof(temp)); + memcpy(encap->eth.h_dest, encap->eth.h_source, + sizeof(encap->eth.h_dest)); + memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source)); + + if (encap->unigue.next_hop == encap->unigue.hop_count - 1 && + encap->unigue.last_hop_gre) { + return forward_with_gre(skb, encap, next_hop, metrics); + } + + metrics->forwarded_packets_total_gue++; + uint32_t old_saddr = encap->ip.saddr; + encap->ip.saddr = encap->ip.daddr; + encap->ip.daddr = next_hop->s_addr; + if (encap->unigue.next_hop < encap->unigue.hop_count) { + encap->unigue.next_hop++; + } + + /* Remove ip->saddr, add next_hop->s_addr */ + const uint64_t off = offsetof(typeof(*encap), ip.check); + int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4); + if (ret < 0) { + return TC_ACT_SHOT; + } + + return bpf_redirect(skb->ifindex, 0); +} + +static ret_t skip_next_hops(buf_t *pkt, int n) +{ + switch (n) { + case 1: + if (!buf_skip(pkt, sizeof(struct in_addr))) + return TC_ACT_SHOT; + case 0: + return CONTINUE_PROCESSING; + + default: + return TC_ACT_SHOT; + } +} + +/* Get the next hop from the GLB header. + * + * Sets next_hop->s_addr to 0 if there are no more hops left. + * pkt is positioned just after the variable length GLB header + * iff the call is successful. + */ +static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, + struct in_addr *next_hop) +{ + if (encap->unigue.next_hop > encap->unigue.hop_count) { + return TC_ACT_SHOT; + } + + /* Skip "used" next hops. */ + MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop)); + + if (encap->unigue.next_hop == encap->unigue.hop_count) { + /* No more next hops, we are at the end of the GLB header. */ + next_hop->s_addr = 0; + return CONTINUE_PROCESSING; + } + + if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) { + return TC_ACT_SHOT; + } + + /* Skip the remainig next hops (may be zero). */ + return skip_next_hops(pkt, encap->unigue.hop_count - + encap->unigue.next_hop - 1); +} + +/* Fill a bpf_sock_tuple to be used with the socket lookup functions. + * This is a kludge that let's us work around verifier limitations: + * + * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321) + * + * clang will substitue a costant for sizeof, which allows the verifier + * to track it's value. Based on this, it can figure out the constant + * return value, and calling code works while still being "generic" to + * IPv4 and IPv6. + */ +static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, + uint64_t iphlen, uint16_t sport, uint16_t dport) +{ + switch (iphlen) { + case sizeof(struct iphdr): { + struct iphdr *ipv4 = (struct iphdr *)iph; + tuple->ipv4.daddr = ipv4->daddr; + tuple->ipv4.saddr = ipv4->saddr; + tuple->ipv4.sport = sport; + tuple->ipv4.dport = dport; + return sizeof(tuple->ipv4); + } + + case sizeof(struct ipv6hdr): { + struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph; + memcpy(&tuple->ipv6.daddr, &ipv6->daddr, + sizeof(tuple->ipv6.daddr)); + memcpy(&tuple->ipv6.saddr, &ipv6->saddr, + sizeof(tuple->ipv6.saddr)); + tuple->ipv6.sport = sport; + tuple->ipv6.dport = dport; + return sizeof(tuple->ipv6); + } + + default: + return 0; + } +} + +static verdict_t classify_tcp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + void *iph, struct tcphdr *tcp) +{ + struct bpf_sock *sk = + bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); + if (sk == NULL) { + return UNKNOWN; + } + + if (sk->state != BPF_TCP_LISTEN) { + bpf_sk_release(sk); + return ESTABLISHED; + } + + if (iph != NULL && tcp != NULL) { + /* Kludge: we've run out of arguments, but need the length of the ip header. */ + uint64_t iphlen = sizeof(struct iphdr); + if (tuplen == sizeof(tuple->ipv6)) { + iphlen = sizeof(struct ipv6hdr); + } + + if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp, + sizeof(*tcp)) == 0) { + bpf_sk_release(sk); + return SYN_COOKIE; + } + } + + bpf_sk_release(sk); + return UNKNOWN; +} + +static verdict_t classify_udp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen) +{ + struct bpf_sock *sk = + bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); + if (sk == NULL) { + return UNKNOWN; + } + + if (sk->state == BPF_TCP_ESTABLISHED) { + bpf_sk_release(sk); + return ESTABLISHED; + } + + bpf_sk_release(sk); + return UNKNOWN; +} + +static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + metrics_t *metrics) +{ + switch (proto) { + case IPPROTO_TCP: + return classify_tcp(skb, tuple, tuplen, NULL, NULL); + + case IPPROTO_UDP: + return classify_udp(skb, tuple, tuplen); + + default: + metrics->errors_total_malformed_icmp++; + return INVALID; + } +} + +static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) +{ + struct icmphdr icmp; + if (!buf_copy(pkt, &icmp, sizeof(icmp))) { + metrics->errors_total_malformed_icmp++; + return INVALID; + } + + /* We should never receive encapsulated echo replies. */ + if (icmp.type == ICMP_ECHOREPLY) { + metrics->errors_total_icmp_echo_replies++; + return INVALID; + } + + if (icmp.type == ICMP_ECHO) { + return ECHO_REQUEST; + } + + if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) { + metrics->errors_total_unwanted_icmp++; + return INVALID; + } + + struct iphdr _ip4; + const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); + if (ipv4 == NULL) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + /* The source address in the outer IP header is from the entity that + * originated the ICMP message. Use the original IP header to restore + * the correct flow tuple. + */ + struct bpf_sock_tuple tuple; + tuple.ipv4.saddr = ipv4->daddr; + tuple.ipv4.daddr = ipv4->saddr; + + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + return classify_icmp(pkt->skb, ipv4->protocol, &tuple, + sizeof(tuple.ipv4), metrics); +} + +static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) +{ + struct icmp6hdr icmp6; + if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) { + metrics->errors_total_malformed_icmp++; + return INVALID; + } + + /* We should never receive encapsulated echo replies. */ + if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) { + metrics->errors_total_icmp_echo_replies++; + return INVALID; + } + + if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) { + return ECHO_REQUEST; + } + + if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) { + metrics->errors_total_unwanted_icmp++; + return INVALID; + } + + bool is_fragment; + uint8_t l4_proto; + struct ipv6hdr _ipv6; + const struct ipv6hdr *ipv6 = + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); + if (ipv6 == NULL) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + if (is_fragment) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + /* Swap source and dest addresses. */ + struct bpf_sock_tuple tuple; + memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr)); + memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr)); + + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6), + metrics); +} + +static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) +{ + metrics->l4_protocol_packets_total_tcp++; + + struct tcphdr _tcp; + struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp); + if (tcp == NULL) { + metrics->errors_total_malformed_tcp++; + return INVALID; + } + + if (tcp->syn) { + return SYN; + } + + struct bpf_sock_tuple tuple; + uint64_t tuplen = + fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest); + return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp); +} + +static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) +{ + metrics->l4_protocol_packets_total_udp++; + + struct udphdr _udp; + struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp); + if (udph == NULL) { + metrics->errors_total_malformed_udp++; + return INVALID; + } + + struct bpf_sock_tuple tuple; + uint64_t tuplen = + fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest); + return classify_udp(pkt->skb, &tuple, tuplen); +} + +static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) +{ + metrics->l3_protocol_packets_total_ipv4++; + + struct iphdr _ip4; + struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); + if (ipv4 == NULL) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv4->version != 4) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv4_is_fragment(ipv4)) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + switch (ipv4->protocol) { + case IPPROTO_ICMP: + return process_icmpv4(pkt, metrics); + + case IPPROTO_TCP: + return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics); + + case IPPROTO_UDP: + return process_udp(pkt, ipv4, sizeof(*ipv4), metrics); + + default: + metrics->errors_total_unknown_l4_proto++; + return INVALID; + } +} + +static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) +{ + metrics->l3_protocol_packets_total_ipv6++; + + uint8_t l4_proto; + bool is_fragment; + struct ipv6hdr _ipv6; + struct ipv6hdr *ipv6 = + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); + if (ipv6 == NULL) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv6->version != 6) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (is_fragment) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + switch (l4_proto) { + case IPPROTO_ICMPV6: + return process_icmpv6(pkt, metrics); + + case IPPROTO_TCP: + return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics); + + case IPPROTO_UDP: + return process_udp(pkt, ipv6, sizeof(*ipv6), metrics); + + default: + metrics->errors_total_unknown_l4_proto++; + return INVALID; + } +} + +SEC("classifier/cls_redirect") +int cls_redirect(struct __sk_buff *skb) +{ + metrics_t *metrics = get_global_metrics(); + if (metrics == NULL) { + return TC_ACT_SHOT; + } + + metrics->processed_packets_total++; + + /* Pass bogus packets as long as we're not sure they're + * destined for us. + */ + if (skb->protocol != bpf_htons(ETH_P_IP)) { + return TC_ACT_OK; + } + + encap_headers_t *encap; + + /* Make sure that all encapsulation headers are available in + * the linear portion of the skb. This makes it easy to manipulate them. + */ + if (bpf_skb_pull_data(skb, sizeof(*encap))) { + return TC_ACT_OK; + } + + buf_t pkt = { + .skb = skb, + .head = (uint8_t *)(long)skb->data, + .tail = (uint8_t *)(long)skb->data_end, + }; + + encap = buf_assign(&pkt, sizeof(*encap), NULL); + if (encap == NULL) { + return TC_ACT_OK; + } + + if (encap->ip.ihl != 5) { + /* We never have any options. */ + return TC_ACT_OK; + } + + if (encap->ip.daddr != ENCAPSULATION_IP || + encap->ip.protocol != IPPROTO_UDP) { + return TC_ACT_OK; + } + + /* TODO Check UDP length? */ + if (encap->udp.dest != ENCAPSULATION_PORT) { + return TC_ACT_OK; + } + + /* We now know that the packet is destined to us, we can + * drop bogus ones. + */ + if (ipv4_is_fragment((void *)&encap->ip)) { + metrics->errors_total_fragmented_ip++; + return TC_ACT_SHOT; + } + + if (encap->gue.variant != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.control != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.flags != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.hlen != + sizeof(encap->unigue) / 4 + encap->unigue.hop_count) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->unigue.version != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->unigue.reserved != 0) { + return TC_ACT_SHOT; + } + + struct in_addr next_hop; + MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop)); + + if (next_hop.s_addr == 0) { + metrics->accepted_packets_total_last_hop++; + return accept_locally(skb, encap); + } + + verdict_t verdict; + switch (encap->gue.proto_ctype) { + case IPPROTO_IPIP: + verdict = process_ipv4(&pkt, metrics); + break; + + case IPPROTO_IPV6: + verdict = process_ipv6(&pkt, metrics); + break; + + default: + metrics->errors_total_unknown_l3_proto++; + return TC_ACT_SHOT; + } + + switch (verdict) { + case INVALID: + /* metrics have already been bumped */ + return TC_ACT_SHOT; + + case UNKNOWN: + return forward_to_next_hop(skb, encap, &next_hop, metrics); + + case ECHO_REQUEST: + metrics->accepted_packets_total_icmp_echo_request++; + break; + + case SYN: + if (encap->unigue.forward_syn) { + return forward_to_next_hop(skb, encap, &next_hop, + metrics); + } + + metrics->accepted_packets_total_syn++; + break; + + case SYN_COOKIE: + metrics->accepted_packets_total_syn_cookies++; + break; + + case ESTABLISHED: + metrics->accepted_packets_total_established++; + break; + } + + return accept_locally(skb, encap); +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.h b/tools/testing/selftests/bpf/progs/test_cls_redirect.h new file mode 100644 index 000000000000..76eab0aacba0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright 2019, 2020 Cloudflare */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct gre_base_hdr { + uint16_t flags; + uint16_t protocol; +} __attribute__((packed)); + +struct guehdr { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint8_t hlen : 5, control : 1, variant : 2; +#else + uint8_t variant : 2, control : 1, hlen : 5; +#endif + uint8_t proto_ctype; + uint16_t flags; +}; + +struct unigue { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint8_t _r : 2, last_hop_gre : 1, forward_syn : 1, version : 4; +#else + uint8_t version : 4, forward_syn : 1, last_hop_gre : 1, _r : 2; +#endif + uint8_t reserved; + uint8_t next_hop; + uint8_t hop_count; + // Next hops go here +} __attribute__((packed)); + +typedef struct { + struct ethhdr eth; + struct iphdr ip; + struct gre_base_hdr gre; +} __attribute__((packed)) encap_gre_t; + +typedef struct { + struct ethhdr eth; + struct iphdr ip; + struct udphdr udp; + struct guehdr gue; + struct unigue unigue; +} __attribute__((packed)) encap_headers_t; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index f4aff6b8284b..10188cc8e9e0 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -105,6 +105,13 @@ struct ipv6_packet { } __packed; extern struct ipv6_packet pkt_v6; +#define PRINT_FAIL(format...) \ + ({ \ + test__fail(); \ + fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__); \ + fprintf(stdout, ##format); \ + }) + #define _CHECK(condition, tag, duration, format...) ({ \ int __ret = !!(condition); \ int __save_errno = errno; \ -- cgit v1.2.3-59-g8ed1b From 74f99482eae03195ced512b440b31d62bdb6e943 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Tue, 21 Apr 2020 10:04:16 -0500 Subject: netfilter: nf_conntrack: add IPS_HW_OFFLOAD status bit This bit indicates that the conntrack entry is offloaded to hardware flow table. nf_conntrack entry will be tagged with [HW_OFFLOAD] if it's offload to hardware. cat /proc/net/nf_conntrack ipv4 2 tcp 6 \ src=1.1.1.17 dst=1.1.1.16 sport=56394 dport=5001 \ src=1.1.1.16 dst=1.1.1.17 sport=5001 dport=56394 [HW_OFFLOAD] \ mark=0 zone=0 use=3 Note that HW_OFFLOAD/OFFLOAD/ASSURED are mutually exclusive. Changelog: * V1->V2: - Remove check of lastused from stats. It was meant for cases such as removing driver module while traffic still running. Better to handle such cases from garbage collector. Signed-off-by: Bodong Wang Reviewed-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 8 ++++++-- net/netfilter/nf_conntrack_standalone.c | 4 +++- net/netfilter/nf_flow_table_offload.c | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index b6f0bb1dc799..4b3395082d15 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -114,15 +114,19 @@ enum ip_conntrack_status { IPS_OFFLOAD_BIT = 14, IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT), + /* Conntrack has been offloaded to hardware. */ + IPS_HW_OFFLOAD_BIT = 15, + IPS_HW_OFFLOAD = (1 << IPS_HW_OFFLOAD_BIT), + /* Be careful here, modifying these bits can make things messy, * so don't let users modify them directly. */ IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED | - IPS_OFFLOAD), + IPS_OFFLOAD | IPS_HW_OFFLOAD), - __IPS_MAX_BIT = 15, + __IPS_MAX_BIT = 16, }; /* Connection tracking event types */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..5a3e6c43ee68 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -348,7 +348,9 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) + seq_puts(s, "[HW_OFFLOAD] "); + else if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[OFFLOAD] "); else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e3b099c14eff..a2abb0feab7f 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -754,12 +754,15 @@ static void flow_offload_work_add(struct flow_offload_work *offload) err = flow_offload_rule_add(offload, flow_rule); if (err < 0) set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); + else + set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); nf_flow_offload_destroy(flow_rule); } static void flow_offload_work_del(struct flow_offload_work *offload) { + clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); -- cgit v1.2.3-59-g8ed1b From 5cb899dd5ba430bd2debf6e4ce7b5cece9e9025d Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Wed, 22 Apr 2020 16:16:18 +0530 Subject: ath11k: fix reo flush send we are sending the reo flush command for the deleted peer tid after the ageout period reaches 1 second. This handling causes reo ring get full when more than 128 clients are disconnected continuously. so added the count for flush list and reo flush command is triggered after the list count reaches the threshold value, it is configured as 64 (half of the reo ring). This will avoid the situation where reo ring get full. Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1587552378-4884-1-git-send-email-periyasa@codeaurora.org --- drivers/net/wireless/ath/ath11k/dp.c | 2 ++ drivers/net/wireless/ath/ath11k/dp.h | 9 ++++++++- drivers/net/wireless/ath/ath11k/dp_rx.c | 6 +++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/dp.c b/drivers/net/wireless/ath/ath11k/dp.c index 50350f77b309..8d6fb848f8c4 100644 --- a/drivers/net/wireless/ath/ath11k/dp.c +++ b/drivers/net/wireless/ath/ath11k/dp.c @@ -880,6 +880,8 @@ int ath11k_dp_alloc(struct ath11k_base *ab) INIT_LIST_HEAD(&dp->reo_cmd_cache_flush_list); spin_lock_init(&dp->reo_cmd_lock); + dp->reo_cmd_cache_flush_count = 0; + ret = ath11k_wbm_idle_ring_setup(ab, &n_link_desc); if (ret) { ath11k_warn(ab, "failed to setup wbm_idle_ring: %d\n", ret); diff --git a/drivers/net/wireless/ath/ath11k/dp.h b/drivers/net/wireless/ath/ath11k/dp.h index d4e19dc4bce1..222de10e4b93 100644 --- a/drivers/net/wireless/ath/ath11k/dp.h +++ b/drivers/net/wireless/ath/ath11k/dp.h @@ -36,6 +36,7 @@ struct dp_rx_tid { struct ath11k_base *ab; }; +#define DP_REO_DESC_FREE_THRESHOLD 64 #define DP_REO_DESC_FREE_TIMEOUT_MS 1000 struct dp_reo_cache_flush_elem { @@ -222,7 +223,13 @@ struct ath11k_dp { struct hal_wbm_idle_scatter_list scatter_list[DP_IDLE_SCATTER_BUFS_MAX]; struct list_head reo_cmd_list; struct list_head reo_cmd_cache_flush_list; - /* protects access to reo_cmd_list and reo_cmd_cache_flush_list */ + u32 reo_cmd_cache_flush_count; + /** + * protects access to below fields, + * - reo_cmd_list + * - reo_cmd_cache_flush_list + * - reo_cmd_cache_flush_count + */ spinlock_t reo_cmd_lock; }; diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c index bbd7da48518f..d3d2a335cc40 100644 --- a/drivers/net/wireless/ath/ath11k/dp_rx.c +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c @@ -565,6 +565,7 @@ void ath11k_dp_reo_cmd_list_cleanup(struct ath11k_base *ab) list_for_each_entry_safe(cmd_cache, tmp_cache, &dp->reo_cmd_cache_flush_list, list) { list_del(&cmd_cache->list); + dp->reo_cmd_cache_flush_count--; dma_unmap_single(ab->dev, cmd_cache->data.paddr, cmd_cache->data.size, DMA_BIDIRECTIONAL); kfree(cmd_cache->data.vaddr); @@ -651,15 +652,18 @@ static void ath11k_dp_rx_tid_del_func(struct ath11k_dp *dp, void *ctx, spin_lock_bh(&dp->reo_cmd_lock); list_add_tail(&elem->list, &dp->reo_cmd_cache_flush_list); + dp->reo_cmd_cache_flush_count++; spin_unlock_bh(&dp->reo_cmd_lock); /* Flush and invalidate aged REO desc from HW cache */ spin_lock_bh(&dp->reo_cmd_lock); list_for_each_entry_safe(elem, tmp, &dp->reo_cmd_cache_flush_list, list) { - if (time_after(jiffies, elem->ts + + if (dp->reo_cmd_cache_flush_count > DP_REO_DESC_FREE_THRESHOLD || + time_after(jiffies, elem->ts + msecs_to_jiffies(DP_REO_DESC_FREE_TIMEOUT_MS))) { list_del(&elem->list); + dp->reo_cmd_cache_flush_count--; spin_unlock_bh(&dp->reo_cmd_lock); ath11k_dp_reo_cache_flush(ab, &elem->data); -- cgit v1.2.3-59-g8ed1b From 4913e675630ec1a15c92651f426a63755c71b91b Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Thu, 23 Apr 2020 10:27:58 +0800 Subject: ath10k: enable rx duration report default for wmi tlv When run command "iw dev wlan0 station dump", the rx duration is 0. When firmware indicate WMI_UPDATE_STATS_EVENTID, extended flag of statsis not set by default, so firmware do not report rx duration. one sample: localhost # iw wlan0 station dump Station c4:04:15:5d:97:22 (on wlan0) inactive time: 48 ms rx bytes: 21670 rx packets: 147 tx bytes: 11529 tx packets: 100 tx retries: 88 tx failed: 36 beacon loss: 1 beacon rx: 31 rx drop misc: 47 signal: -72 [-74, -75] dBm signal avg: -71 [-74, -75] dBm beacon signal avg: -71 dBm tx bitrate: 54.0 MBit/s MCS 3 40MHz rx bitrate: 1.0 MBit/s rx duration: 0 us This patch enable firmware's extened flag of stats by setting flag WMI_TLV_STAT_PEER_EXTD of ar->fw_stats_req_mask which is set in ath10k_core_init_firmware_features via WMI_REQUEST_STATS_CMDID. After apply this patch, rx duration show value with the command: Station c4:04:15:5d:97:22 (on wlan0) inactive time: 883 ms rx bytes: 44289 rx packets: 265 tx bytes: 10838 tx packets: 93 tx retries: 899 tx failed: 103 beacon loss: 0 beacon rx: 78 rx drop misc: 46 signal: -71 [-74, -76] dBm signal avg: -70 [-74, -76] dBm beacon signal avg: -70 dBm tx bitrate: 54.0 MBit/s MCS 3 40MHz rx bitrate: 1.0 MBit/s rx duration: 358004 us This patch do not have side effect for all chips, because function ath10k_debug_fw_stats_request is already exported to debugfs "fw_stats" and WMI_REQUEST_STATS_CMDID is safely sent after condition checked by ath10k_peer_stats_enabled in ath10k_sta_statistics. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00042. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200423022758.5365-1-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/debug.c | 2 +- drivers/net/wireless/ath/ath10k/debug.h | 8 ++++++++ drivers/net/wireless/ath/ath10k/mac.c | 2 ++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c index 69139c2e6f82..e8250a665433 100644 --- a/drivers/net/wireless/ath/ath10k/debug.c +++ b/drivers/net/wireless/ath/ath10k/debug.c @@ -349,7 +349,7 @@ free: spin_unlock_bh(&ar->data_lock); } -static int ath10k_debug_fw_stats_request(struct ath10k *ar) +int ath10k_debug_fw_stats_request(struct ath10k *ar) { unsigned long timeout, time_left; int ret; diff --git a/drivers/net/wireless/ath/ath10k/debug.h b/drivers/net/wireless/ath/ath10k/debug.h index 82f7eb8583d9..4cbfd9279d6f 100644 --- a/drivers/net/wireless/ath/ath10k/debug.h +++ b/drivers/net/wireless/ath/ath10k/debug.h @@ -125,6 +125,9 @@ static inline int ath10k_debug_is_extd_tx_stats_enabled(struct ath10k *ar) { return ar->debug.enable_extd_tx_stats; } + +int ath10k_debug_fw_stats_request(struct ath10k *ar); + #else static inline int ath10k_debug_start(struct ath10k *ar) @@ -192,6 +195,11 @@ static inline int ath10k_debug_is_extd_tx_stats_enabled(struct ath10k *ar) return 0; } +static inline int ath10k_debug_fw_stats_request(struct ath10k *ar) +{ + return 0; +} + #define ATH10K_DFS_STAT_INC(ar, c) do { } while (0) #define ath10k_debug_get_et_strings NULL diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 5de7910c24e7..98065b97b982 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -8311,6 +8311,8 @@ static void ath10k_sta_statistics(struct ieee80211_hw *hw, if (!ath10k_peer_stats_enabled(ar)) return; + ath10k_debug_fw_stats_request(ar); + sinfo->rx_duration = arsta->rx_duration; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); -- cgit v1.2.3-59-g8ed1b From 59a022cc14cf84c6405efc1571045683c258a1f5 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Thu, 23 Apr 2020 10:41:34 +0800 Subject: ath10k: add statistics of tx retries and tx failed when tx complete disable When tx complete is disabled, all tx status will be set with status HTT_TX_COMPL_STATE_ACK and indicate to mac80211 by ieee80211_tx_status, then it does not have the statistics for retries and failed packets. count of tx retries and tx failed of command "iw wlan0 station dump" are both 0. If tx complete is not disabled, then firmware report the tx status and ath10k indicate the status to mac80211, then mac80211 save the statistics and command "iw wlan0 station dump" show them. for example: localhost ~ # iw dev wlan0 station dump Station 3c:28:6d:96:fd:69 (on wlan0) inactive time: 5 ms rx bytes: 1325012 rx packets: 6477 tx bytes: 85264 tx packets: 518 tx retries: 0 tx failed: 0 This patch only effect chips with tx complete disabled, e.g. SDIO. with this patch, output of command "iw dev wlan0 station dump": Station c4:04:15:5d:97:22 (on wlan0) inactive time: 608 ms rx bytes: 180366 rx packets: 991 tx bytes: 98765577 tx packets: 64624 tx retries: 14682 tx failed: 47086 Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00042. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200423024134.10601-1-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/core.h | 2 ++ drivers/net/wireless/ath/ath10k/htt_rx.c | 7 +++++++ drivers/net/wireless/ath/ath10k/mac.c | 7 +++++++ 3 files changed, 16 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index d6adcbaf9616..07935d39d6d6 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -500,6 +500,8 @@ struct ath10k_sta { u16 peer_id; struct rate_info txrate; struct ieee80211_tx_info tx_info; + u32 tx_retries; + u32 tx_failed; u32 last_tx_bitrate; struct work_struct update_wk; diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c index 816af1a8ad69..d787cbead56a 100644 --- a/drivers/net/wireless/ath/ath10k/htt_rx.c +++ b/drivers/net/wireless/ath/ath10k/htt_rx.c @@ -3574,6 +3574,13 @@ ath10k_update_per_peer_tx_stats(struct ath10k *ar, ieee80211_tx_rate_update(ar->hw, sta, &arsta->tx_info); } + if (ar->htt.disable_tx_comp) { + arsta->tx_retries += peer_stats->retry_pkts; + arsta->tx_failed += peer_stats->failed_pkts; + ath10k_dbg(ar, ATH10K_DBG_HTT, "htt tx retries %d tx failed %d\n", + arsta->tx_retries, arsta->tx_failed); + } + if (ath10k_debug_is_extd_tx_stats_enabled(ar)) ath10k_accumulate_per_peer_tx_stats(ar, arsta, peer_stats, rate_idx); diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 98065b97b982..a1147ccc09bf 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -8328,6 +8328,13 @@ static void ath10k_sta_statistics(struct ieee80211_hw *hw, } sinfo->txrate.flags = arsta->txrate.flags; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); + + if (ar->htt.disable_tx_comp) { + sinfo->tx_retries = arsta->tx_retries; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); + sinfo->tx_failed = arsta->tx_failed; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); + } } static const struct ieee80211_ops ath10k_ops = { -- cgit v1.2.3-59-g8ed1b From 26363af5643490a817272e1cc6f1d3f1d550a699 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:35 +0200 Subject: mm: remove watermark_boost_factor_sysctl_handler watermark_boost_factor_sysctl_handler is just a pointless wrapper for proc_dointvec_minmax, so remove it and use proc_dointvec_minmax directly. Signed-off-by: Christoph Hellwig Acked-by: David Rientjes Signed-off-by: Al Viro --- include/linux/mmzone.h | 2 -- kernel/sysctl.c | 2 +- mm/page_alloc.c | 12 ------------ 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1b9de7d220fb..f37bb8f187fc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -911,8 +911,6 @@ static inline int is_highmem(struct zone *zone) struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int watermark_boost_factor_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..99d27acf4646 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1491,7 +1491,7 @@ static struct ctl_table vm_table[] = { .data = &watermark_boost_factor, .maxlen = sizeof(watermark_boost_factor), .mode = 0644, - .proc_handler = watermark_boost_factor_sysctl_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 69827d4fa052..62c1550cd43e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7978,18 +7978,6 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } -int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int rc; - - rc = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (rc) - return rc; - - return 0; -} - int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { -- cgit v1.2.3-59-g8ed1b From 2374c09b1c8a883bb9b4b2fc3756703eeb618f4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:36 +0200 Subject: sysctl: remove all extern declaration from sysctl.c Extern declarations in .c files are a bad style and can lead to mismatches. Use existing definitions in headers where they exist, and otherwise move the external declarations to suitable header files. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/coredump.h | 4 ++++ include/linux/file.h | 2 ++ include/linux/mm.h | 2 ++ include/linux/mmzone.h | 2 ++ include/linux/pid.h | 3 +++ include/linux/sysctl.h | 8 ++++++++ kernel/sysctl.c | 45 +++------------------------------------------ 7 files changed, 24 insertions(+), 42 deletions(-) diff --git a/include/linux/coredump.h b/include/linux/coredump.h index abf4b4e65dbb..7a899e83835d 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -22,4 +22,8 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); static inline void do_coredump(const kernel_siginfo_t *siginfo) {} #endif +extern int core_uses_pid; +extern char core_pattern[]; +extern unsigned int core_pipe_limit; + #endif /* _LINUX_COREDUMP_H */ diff --git a/include/linux/file.h b/include/linux/file.h index 142d102f285e..122f80084a3e 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -94,4 +94,6 @@ extern void fd_install(unsigned int fd, struct file *file); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); +extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; + #endif /* __LINUX_FILE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a323422d783..9c4e7e76dedd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3140,5 +3140,7 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif +extern int sysctl_nr_trim_pages; + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f37bb8f187fc..b2af594ef0f7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -909,6 +909,7 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; + int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, @@ -925,6 +926,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, extern int numa_zonelist_order_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/include/linux/pid.h b/include/linux/pid.h index cc896f0fc4e3..93543cbc0e6b 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -108,6 +108,9 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new, struct pid_namespace; extern struct pid_namespace init_pid_ns; +extern int pid_max; +extern int pid_max_min, pid_max_max; + /* * look up a PID in the hash table. Must be called with the tasklist_lock * or rcu_read_lock() held. diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 02fa84493f23..36143ca40b56 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -207,7 +207,15 @@ void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern int pwrsw_enabled; +extern int unaligned_enabled; +extern int unaligned_dump_stack; +extern int no_unaligned_warning; + extern struct ctl_table sysctl_mount_point[]; +extern struct ctl_table random_table[]; +extern struct ctl_table firmware_config_table[]; +extern struct ctl_table epoll_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99d27acf4646..31b934865ebc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,9 @@ #include #include #include +#include +#include +#include #include "../lib/kstrtox.h" @@ -103,22 +106,6 @@ #if defined(CONFIG_SYSCTL) -/* External variables not in a header file. */ -extern int suid_dumpable; -#ifdef CONFIG_COREDUMP -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; -#endif -extern int pid_max; -extern int pid_max_min, pid_max_max; -extern int percpu_pagelist_fraction; -extern int latencytop_enabled; -extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; -#ifndef CONFIG_MMU -extern int sysctl_nr_trim_pages; -#endif - /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; @@ -160,24 +147,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_INOTIFY_USER #include #endif -#ifdef CONFIG_SPARC -#endif - -#ifdef CONFIG_PARISC -extern int pwrsw_enabled; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW -extern int unaligned_enabled; -#endif - -#ifdef CONFIG_IA64 -extern int unaligned_dump_stack; -#endif - -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN -extern int no_unaligned_warning; -#endif #ifdef CONFIG_PROC_SYSCTL @@ -243,14 +212,6 @@ static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; static struct ctl_table debug_table[]; static struct ctl_table dev_table[]; -extern struct ctl_table random_table[]; -#ifdef CONFIG_EPOLL -extern struct ctl_table epoll_table[]; -#endif - -#ifdef CONFIG_FW_LOADER_USER_HELPER -extern struct ctl_table firmware_config_table[]; -#endif #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -- cgit v1.2.3-59-g8ed1b From f461d2dcd511c020a26d4d791fae595c65ed09b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:37 +0200 Subject: sysctl: avoid forward declarations Move the sysctl tables to the end of the file to avoid lots of pointless forward declarations. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/sysctl.c | 3573 +++++++++++++++++++++++++++---------------------------- 1 file changed, 1768 insertions(+), 1805 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 31b934865ebc..3fafca3ced98 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -176,79 +176,13 @@ enum sysctl_writes_mode { }; static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; - -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_COMPACTION -static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos); -#endif -#endif - -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -#ifdef CONFIG_MAGIC_SYSRQ -static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif - -static struct ctl_table kern_table[]; -static struct ctl_table vm_table[]; -static struct ctl_table fs_table[]; -static struct ctl_table debug_table[]; -static struct ctl_table dev_table[]; +#endif /* CONFIG_PROC_SYSCTL */ #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif -/* The default sysctl tables: */ - -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; - #ifdef CONFIG_SCHED_DEBUG static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ @@ -265,1676 +199,12 @@ static int min_extfrag_threshold; static int max_extfrag_threshold = 1000; #endif -static struct ctl_table kern_table[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHED_DEBUG - { - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_latency_ns", - .data = &sysctl_sched_latency, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_wakeup_granularity_ns", - .data = &sysctl_sched_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, -#ifdef CONFIG_SMP - { - .procname = "sched_tunable_scaling", - .data = &sysctl_sched_tunable_scaling, - .maxlen = sizeof(enum sched_tunable_scaling), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_tunable_scaling, - .extra2 = &max_sched_tunable_scaling, - }, - { - .procname = "sched_migration_cost_ns", - .data = &sysctl_sched_migration_cost, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_nr_migrate", - .data = &sysctl_sched_nr_migrate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHEDSTATS - { - .procname = "sched_schedstats", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_schedstats, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SCHEDSTATS */ -#endif /* CONFIG_SMP */ -#ifdef CONFIG_NUMA_BALANCING - { - .procname = "numa_balancing_scan_delay_ms", - .data = &sysctl_numa_balancing_scan_delay, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_min_ms", - .data = &sysctl_numa_balancing_scan_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_period_max_ms", - .data = &sysctl_numa_balancing_scan_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_scan_size_mb", - .data = &sysctl_numa_balancing_scan_size, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "numa_balancing", - .data = NULL, /* filled in by handler */ - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_numa_balancing, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ - { - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rt_runtime_us", - .data = &sysctl_sched_rt_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rr_timeslice_ms", - .data = &sysctl_sched_rr_timeslice, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rr_handler, - }, -#ifdef CONFIG_UCLAMP_TASK - { - .procname = "sched_util_clamp_min", - .data = &sysctl_sched_uclamp_util_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_max", - .data = &sysctl_sched_uclamp_util_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, -#endif -#ifdef CONFIG_SCHED_AUTOGROUP - { - .procname = "sched_autogroup_enabled", - .data = &sysctl_sched_autogroup_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_CFS_BANDWIDTH - { - .procname = "sched_cfs_bandwidth_slice_us", - .data = &sysctl_sched_cfs_bandwidth_slice, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, -#endif -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - { - .procname = "sched_energy_aware", - .data = &sysctl_sched_energy_aware, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_energy_aware_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", - .data = &prove_locking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_LOCK_STAT - { - .procname = "lock_stat", - .data = &lock_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "panic", - .data = &panic_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_COREDUMP - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", - .maxlen = sizeof(long), - .mode = 0644, - .proc_handler = proc_taint, - }, - { - .procname = "sysctl_writes_strict", - .data = &sysctl_writes_strict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_LATENCYTOP - { - .procname = "latencytop", - .data = &latencytop_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_latencytop, - }, -#endif -#ifdef CONFIG_BLK_DEV_INITRD - { - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "print-fatal-signals", - .data = &print_fatal_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SPARC - { - .procname = "reboot-cmd", - .data = reboot_command, - .maxlen = 256, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "stop-a", - .data = &stop_a_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "scons-poweroff", - .data = &scons_pwroff, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SPARC64 - { - .procname = "tsb-ratio", - .data = &sysctl_tsb_ratio, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_PARISC - { - .procname = "soft-power", - .data = &pwrsw_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW - { - .procname = "unaligned-trap", - .data = &unaligned_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "ctrl-alt-del", - .data = &C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_FUNCTION_TRACER - { - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ftrace_enable_sysctl, - }, -#endif -#ifdef CONFIG_STACK_TRACER - { - .procname = "stack_tracer_enabled", - .data = &stack_tracer_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = stack_trace_sysctl, - }, -#endif -#ifdef CONFIG_TRACING - { - .procname = "ftrace_dump_on_oops", - .data = &ftrace_dump_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "traceoff_on_warning", - .data = &__disable_trace_on_warning, - .maxlen = sizeof(__disable_trace_on_warning), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tracepoint_printk", - .data = &tracepoint_printk, - .maxlen = sizeof(tracepoint_printk), - .mode = 0644, - .proc_handler = tracepoint_printk_sysctl, - }, -#endif -#ifdef CONFIG_KEXEC_CORE - { - .procname = "kexec_load_disabled", - .data = &kexec_load_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_MODULES - { - .procname = "modprobe", - .data = &modprobe_path, - .maxlen = KMOD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "modules_disabled", - .data = &modules_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_UEVENT_HELPER - { - .procname = "hotplug", - .data = &uevent_helper, - .maxlen = UEVENT_HELPER_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#endif -#ifdef CONFIG_CHR_DEV_SG - { - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_BSD_PROCESS_ACCT - { - .procname = "acct", - .data = &acct_parm, - .maxlen = 3*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MAGIC_SYSRQ - { - .procname = "sysrq", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = sysrq_sysctl_handler, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "cad_pid", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0600, - .proc_handler = proc_do_cad_pid, - }, -#endif - { - .procname = "threads-max", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_max_threads, - }, - { - .procname = "random", - .mode = 0555, - .child = random_table, - }, - { - .procname = "usermodehelper", - .mode = 0555, - .child = usermodehelper_table, - }, -#ifdef CONFIG_FW_LOADER_USER_HELPER - { - .procname = "firmware_config", - .mode = 0555, - .child = firmware_config_table, - }, -#endif - { - .procname = "overflowuid", - .data = &overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_S390 - { - .procname = "userprocess_debug", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "pid_max", - .data = &pid_max, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &pid_max_min, - .extra2 = &pid_max_max, - }, - { - .procname = "panic_on_oops", - .data = &panic_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_print", - .data = &panic_print, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#if defined CONFIG_PRINTK - { - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_delay", - .data = &printk_delay_msec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &ten_thousand, - }, - { - .procname = "printk_devkmsg", - .data = devkmsg_log_str, - .maxlen = DEVKMSG_STR_MAX_SIZE, - .mode = 0644, - .proc_handler = devkmsg_sysctl_set_loglvl, - }, - { - .procname = "dmesg_restrict", - .data = &dmesg_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "kptr_restrict", - .data = &kptr_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#endif - { - .procname = "ngroups_max", - .data = &ngroups_max, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "cap_last_cap", - .data = (void *)&cap_last_cap, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = &watchdog_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = &sixty, - }, - { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = &sysctl_hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_X86) - { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_DEBUG_STACKOVERFLOW - { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_version", - .data = &bootloader_version, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_MMU) - { - .procname = "randomize_va_space", - .data = &randomize_va_space, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", - .data = &spin_retry, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = "acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN - { - .procname = "ignore-unaligned-usertrap", - .data = &no_unaligned_warning, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_IA64 - { - .procname = "unaligned-dump-stack", - .data = &unaligned_dump_stack, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DETECT_HUNG_TASK - { - .procname = "hung_task_panic", - .data = &sysctl_hung_task_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_check_interval_secs", - .data = &sysctl_hung_task_check_interval_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, - }, -#endif -#ifdef CONFIG_RT_MUTEXES - { - .procname = "max_lock_depth", - .data = &max_lock_depth, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#ifdef CONFIG_KEYS - { - .procname = "keys", - .mode = 0555, - .child = key_sysctls, - }, -#endif -#ifdef CONFIG_PERF_EVENTS - /* - * User-space scripts rely on the existence of this file - * as a feature check for perf_events being enabled. - * - * So it's an ABI, do not remove! - */ - { - .procname = "perf_event_paranoid", - .data = &sysctl_perf_event_paranoid, - .maxlen = sizeof(sysctl_perf_event_paranoid), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_mlock_kb", - .data = &sysctl_perf_event_mlock, - .maxlen = sizeof(sysctl_perf_event_mlock), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_max_sample_rate", - .data = &sysctl_perf_event_sample_rate, - .maxlen = sizeof(sysctl_perf_event_sample_rate), - .mode = 0644, - .proc_handler = perf_proc_update_handler, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "perf_cpu_time_max_percent", - .data = &sysctl_perf_cpu_time_max_percent, - .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), - .mode = 0644, - .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "perf_event_max_stack", - .data = &sysctl_perf_event_max_stack, - .maxlen = sizeof(sysctl_perf_event_max_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &six_hundred_forty_kb, - }, - { - .procname = "perf_event_max_contexts_per_stack", - .data = &sysctl_perf_event_max_contexts_per_stack, - .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_thousand, - }, -#endif - { - .procname = "panic_on_warn", - .data = &panic_on_warn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = timer_migration_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_BPF_SYSCALL - { - .procname = "unprivileged_bpf_disabled", - .data = &sysctl_unprivileged_bpf_disabled, - .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "bpf_stats_enabled", - .data = &bpf_stats_enabled_key.key, - .maxlen = sizeof(bpf_stats_enabled_key), - .mode = 0644, - .proc_handler = proc_do_static_key, - }, -#endif -#if defined(CONFIG_TREE_RCU) - { - .procname = "panic_on_rcu_stall", - .data = &sysctl_panic_on_rcu_stall, - .maxlen = sizeof(sysctl_panic_on_rcu_stall), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE - { - .procname = "stack_erasing", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = stack_erasing_sysctl, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { } -}; - -static struct ctl_table vm_table[] = { - { - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = overcommit_ratio_handler, - }, - { - .procname = "overcommit_kbytes", - .data = &sysctl_overcommit_kbytes, - .maxlen = sizeof(sysctl_overcommit_kbytes), - .mode = 0644, - .proc_handler = overcommit_kbytes_handler, - }, - { - .procname = "page-cluster", - .data = &page_cluster, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = &one_ul, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = &dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, -#ifdef CONFIG_HUGETLB_PAGE - { - .procname = "nr_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_sysctl_handler, - }, -#ifdef CONFIG_NUMA - { - .procname = "nr_hugepages_mempolicy", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &hugetlb_mempolicy_sysctl_handler, - }, - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "hugetlb_shm_group", - .data = &sysctl_hugetlb_shm_group, - .maxlen = sizeof(gid_t), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "nr_overcommit_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_overcommit_handler, - }, -#endif - { - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = lowmem_reserve_ratio_sysctl_handler, - }, - { - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = drop_caches_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = &four, - }, -#ifdef CONFIG_COMPACTION - { - .procname = "compact_memory", - .data = &sysctl_compact_memory, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = sysctl_compaction_handler, - }, - { - .procname = "extfrag_threshold", - .data = &sysctl_extfrag_threshold, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_extfrag_threshold, - .extra2 = &max_extfrag_threshold, - }, - { - .procname = "compact_unevictable_allowed", - .data = &sysctl_compact_unevictable_allowed, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_warn_RT_change, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - -#endif /* CONFIG_COMPACTION */ - { - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_boost_factor", - .data = &watermark_boost_factor, - .maxlen = sizeof(watermark_boost_factor), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_scale_factor", - .data = &watermark_scale_factor, - .maxlen = sizeof(watermark_scale_factor), - .mode = 0644, - .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, - }, - { - .procname = "percpu_pagelist_fraction", - .data = &percpu_pagelist_fraction, - .maxlen = sizeof(percpu_pagelist_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, -#ifdef CONFIG_MMU - { - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#else - { - .procname = "nr_trim_pages", - .data = &sysctl_nr_trim_pages, - .maxlen = sizeof(sysctl_nr_trim_pages), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "block_dump", - .data = &block_dump, - .maxlen = sizeof(block_dump), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) - { - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "zone_reclaim_mode", - .data = &node_reclaim_mode, - .maxlen = sizeof(node_reclaim_mode), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, - { - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, - }, -#endif -#ifdef CONFIG_SMP - { - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "stat_refresh", - .data = NULL, - .maxlen = 0, - .mode = 0600, - .proc_handler = vmstat_refresh, - }, -#endif -#ifdef CONFIG_MMU - { - .procname = "mmap_min_addr", - .data = &dac_mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = mmap_min_addr_handler, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = numa_zonelist_order_handler, - }, -#endif -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) - { - .procname = "vdso_enabled", -#ifdef CONFIG_X86_32 - .data = &vdso32_enabled, - .maxlen = sizeof(vdso32_enabled), -#else - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), -#endif - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - { - .procname = "memory_failure_early_kill", - .data = &sysctl_memory_failure_early_kill, - .maxlen = sizeof(sysctl_memory_failure_early_kill), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "memory_failure_recovery", - .data = &sysctl_memory_failure_recovery, - .maxlen = sizeof(sysctl_memory_failure_recovery), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, - .maxlen = sizeof(sysctl_user_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "admin_reserve_kbytes", - .data = &sysctl_admin_reserve_kbytes, - .maxlen = sizeof(sysctl_admin_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS - { - .procname = "mmap_rnd_bits", - .data = &mmap_rnd_bits, - .maxlen = sizeof(mmap_rnd_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_bits_min, - .extra2 = (void *)&mmap_rnd_bits_max, - }, -#endif -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS - { - .procname = "mmap_rnd_compat_bits", - .data = &mmap_rnd_compat_bits, - .maxlen = sizeof(mmap_rnd_compat_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_compat_bits_min, - .extra2 = (void *)&mmap_rnd_compat_bits_max, - }, -#endif -#ifdef CONFIG_USERFAULTFD - { - .procname = "unprivileged_userfaultfd", - .data = &sysctl_unprivileged_userfaultfd, - .maxlen = sizeof(sysctl_unprivileged_userfaultfd), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { } -}; - -static struct ctl_table fs_table[] = { - { - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = sizeof(files_stat), - .mode = 0444, - .proc_handler = proc_nr_files, - }, - { - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(files_stat.max_files), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero_ul, - .extra2 = &long_max, - }, - { - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, - { - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_dentry, - }, - { - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_FILE_LOCKING - { - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DNOTIFY - { - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU -#ifdef CONFIG_FILE_LOCKING - { - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_AIO - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = sizeof(aio_nr), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif /* CONFIG_AIO */ -#ifdef CONFIG_INOTIFY_USER - { - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif -#endif - { - .procname = "protected_symlinks", - .data = &sysctl_protected_symlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_hardlinks", - .data = &sysctl_protected_hardlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_fifos", - .data = &sysctl_protected_fifos, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "protected_regular", - .data = &sysctl_protected_regular, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .procname = "binfmt_misc", - .mode = 0555, - .child = sysctl_mount_point, - }, -#endif - { - .procname = "pipe-max-size", - .data = &pipe_max_size, - .maxlen = sizeof(pipe_max_size), - .mode = 0644, - .proc_handler = proc_dopipe_max_size, - }, - { - .procname = "pipe-user-pages-hard", - .data = &pipe_user_pages_hard, - .maxlen = sizeof(pipe_user_pages_hard), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "pipe-user-pages-soft", - .data = &pipe_user_pages_soft, - .maxlen = sizeof(pipe_user_pages_soft), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "mount-max", - .data = &sysctl_mount_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { } -}; - -static struct ctl_table debug_table[] = { -#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE - { - .procname = "exception-trace", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif -#if defined(CONFIG_OPTPROBES) - { - .procname = "kprobes-optimization", - .data = &sysctl_kprobes_optimization, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_kprobes_optimization_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { } -}; - -static struct ctl_table dev_table[] = { - { } -}; - -int __init sysctl_init(void) -{ - struct ctl_table_header *hdr; - - hdr = register_sysctl_table(sysctl_base_table); - kmemleak_not_leak(hdr); - return 0; -} - -#endif /* CONFIG_SYSCTL */ - -/* - * /proc/sys support - */ - +#endif /* CONFIG_SYSCTL */ + +/* + * /proc/sys support + */ + #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, @@ -3301,101 +1571,1794 @@ int proc_dostring(struct ctl_table *table, int write, return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } -int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +#endif /* CONFIG_PROC_SYSCTL */ + +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - return -ENOSYS; + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&static_key_mutex); + val = static_key_enabled(key); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (val) + static_key_enable(key); + else + static_key_disable(key); + } + mutex_unlock(&static_key_mutex); + return ret; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table kern_table[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SCHED_DEBUG + { + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_wakeup_granularity_ns", + .data = &sysctl_sched_wakeup_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = &max_wakeup_granularity_ns, + }, +#ifdef CONFIG_SMP + { + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, + }, + { + .procname = "sched_migration_cost_ns", + .data = &sysctl_sched_migration_cost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_nr_migrate", + .data = &sysctl_sched_nr_migrate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "numa_balancing", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numa_balancing, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_COREDUMP + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_taint, + }, + { + .procname = "sysctl_writes_strict", + .data = &sysctl_writes_strict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_LATENCYTOP + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_latencytop, + }, +#endif +#ifdef CONFIG_BLK_DEV_INITRD + { + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SPARC + { + .procname = "reboot-cmd", + .data = reboot_command, + .maxlen = 256, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "stop-a", + .data = &stop_a_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "scons-poweroff", + .data = &scons_pwroff, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SPARC64 + { + .procname = "tsb-ratio", + .data = &sysctl_tsb_ratio, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_PARISC + { + .procname = "soft-power", + .data = &pwrsw_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW + { + .procname = "unaligned-trap", + .data = &unaligned_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_FUNCTION_TRACER + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, +#endif +#ifdef CONFIG_STACK_TRACER + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +#endif +#ifdef CONFIG_TRACING + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +#endif +#ifdef CONFIG_KEXEC_CORE + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_MODULES + { + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_UEVENT_HELPER + { + .procname = "hotplug", + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, +#endif +#ifdef CONFIG_CHR_DEV_SG + { + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT + { + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .procname = "sysrq", + .data = NULL, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = sysrq_sysctl_handler, + }, +#endif +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "cad_pid", + .data = NULL, + .maxlen = sizeof (int), + .mode = 0600, + .proc_handler = proc_do_cad_pid, + }, +#endif + { + .procname = "threads-max", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_max_threads, + }, + { + .procname = "random", + .mode = 0555, + .child = random_table, + }, + { + .procname = "usermodehelper", + .mode = 0555, + .child = usermodehelper_table, + }, +#ifdef CONFIG_FW_LOADER_USER_HELPER + { + .procname = "firmware_config", + .mode = 0555, + .child = firmware_config_table, + }, +#endif + { + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_S390 + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "pid_max", + .data = &pid_max, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, + { + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_print", + .data = &panic_print, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#if defined CONFIG_PRINTK + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &ten_thousand, + }, + { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, +#endif + { + .procname = "ngroups_max", + .data = &ngroups_max, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "cap_last_cap", + .data = (void *)&cap_last_cap, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#if defined(CONFIG_LOCKUP_DETECTOR) + { + .procname = "watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog_thresh, + .extra1 = SYSCTL_ZERO, + .extra2 = &sixty, + }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + { + .procname = "soft_watchdog", + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#endif + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_X86) + { + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_MMU) + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) + { + .procname = "acpi_video_flags", + .data = &acpi_realmode_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN + { + .procname = "ignore-unaligned-usertrap", + .data = &no_unaligned_warning, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_IA64 + { + .procname = "unaligned-dump-stack", + .data = &unaligned_dump_stack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + }, +#endif +#ifdef CONFIG_RT_MUTEXES + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, +#ifdef CONFIG_KEYS + { + .procname = "keys", + .mode = 0555, + .child = key_sysctls, + }, +#endif +#ifdef CONFIG_PERF_EVENTS + /* + * User-space scripts rely on the existence of this file + * as a feature check for perf_events being enabled. + * + * So it's an ABI, do not remove! + */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_proc_update_handler, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, +#endif + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_BPF_SYSCALL + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "bpf_stats_enabled", + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), + .mode = 0644, + .proc_handler = proc_do_static_key, + }, +#endif +#if defined(CONFIG_TREE_RCU) + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table vm_table[] = { + { + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "oom_kill_allocating_task", + .data = &sysctl_oom_kill_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, + }, + { + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = &one_ul, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = &dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirtytime_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, +#ifdef CONFIG_HUGETLB_PAGE + { + .procname = "nr_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + }, + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nr_overcommit_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_overcommit_handler, + }, +#endif + { + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, + { + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = drop_caches_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = &four, + }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + { + .procname = "compact_unevictable_allowed", + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_warn_RT_change, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + +#endif /* CONFIG_COMPACTION */ + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = &one_thousand, + }, + { + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, +#ifdef CONFIG_MMU + { + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#else + { + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "block_dump", + .data = &block_dump, + .maxlen = sizeof(block_dump), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) + { + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, +#endif +#ifdef CONFIG_SMP + { + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, +#endif +#ifdef CONFIG_MMU + { + .procname = "mmap_min_addr", + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = mmap_min_addr_handler, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, +#endif +#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ + (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) + { + .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), +#endif + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + { + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; -int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table fs_table[] = { + { + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "file-nr", + .data = &files_stat, + .maxlen = sizeof(files_stat), + .mode = 0444, + .proc_handler = proc_nr_files, + }, + { + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(files_stat.max_files), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &long_max, + }, + { + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, + }, + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_FILE_LOCKING + { + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DNOTIFY + { + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING + { + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_AIO + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif /* CONFIG_AIO */ +#ifdef CONFIG_INOTIFY_USER + { + .procname = "inotify", + .mode = 0555, + .child = inotify_table, + }, +#endif +#ifdef CONFIG_EPOLL + { + .procname = "epoll", + .mode = 0555, + .child = epoll_table, + }, +#endif +#endif + { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, + { + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_coredump, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) + { + .procname = "binfmt_misc", + .mode = 0555, + .child = sysctl_mount_point, + }, +#endif + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(pipe_max_size), + .mode = 0644, + .proc_handler = proc_dopipe_max_size, + }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { } +}; -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table debug_table[] = { +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE + { + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; -int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +static struct ctl_table dev_table[] = { + { } +}; -#endif /* CONFIG_PROC_SYSCTL */ +static struct ctl_table sysctl_base_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + { } +}; -#if defined(CONFIG_SYSCTL) -int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int __init sysctl_init(void) { - struct static_key *key = (struct static_key *)table->data; - static DEFINE_MUTEX(static_key_mutex); - int val, ret; - struct ctl_table tmp = { - .data = &val, - .maxlen = sizeof(val), - .mode = table->mode, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; + struct ctl_table_header *hdr; - mutex_lock(&static_key_mutex); - val = static_key_enabled(key); - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { - if (val) - static_key_enable(key); - else - static_key_disable(key); - } - mutex_unlock(&static_key_mutex); - return ret; + hdr = register_sysctl_table(sysctl_base_table); + kmemleak_not_leak(hdr); + return 0; } -#endif +#endif /* CONFIG_SYSCTL */ /* * No sense putting this after each symbol definition, twice, * exception granted :-) -- cgit v1.2.3-59-g8ed1b From 32927393dc1ccd60fb2bdc05b9e8e88753761469 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:38 +0200 Subject: sysctl: pass kernel pointers to ->proc_handler Instead of having all the sysctl handlers deal with user pointers, which is rather hairy in terms of the BPF interaction, copy the input to and from userspace in common code. This also means that the strings are always NUL-terminated by the common code, making the API a little bit safer. As most handler just pass through the data to one of the common handlers a lot of the changes are mechnical. Signed-off-by: Christoph Hellwig Acked-by: Andrey Ignatov Signed-off-by: Al Viro --- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/fpsimd.c | 3 +- arch/mips/lasat/sysctl.c | 13 +- arch/s390/appldata/appldata_base.c | 11 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/topology.c | 2 +- arch/s390/mm/cmm.c | 12 +- arch/x86/kernel/itmt.c | 3 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/random.c | 2 +- drivers/macintosh/mac_hid.c | 3 +- drivers/parport/procfs.c | 39 +++--- fs/dcache.c | 2 +- fs/drop_caches.c | 2 +- fs/file_table.c | 4 +- fs/fscache/main.c | 3 +- fs/inode.c | 2 +- fs/proc/proc_sysctl.c | 47 ++++--- fs/quota/dquot.c | 2 +- fs/xfs/xfs_sysctl.c | 4 +- include/linux/bpf-cgroup.h | 9 +- include/linux/compaction.h | 2 +- include/linux/fs.h | 6 +- include/linux/ftrace.h | 3 +- include/linux/hugetlb.h | 15 +- include/linux/kprobes.h | 2 +- include/linux/latencytop.h | 4 +- include/linux/mm.h | 12 +- include/linux/mmzone.h | 23 ++- include/linux/nmi.h | 15 +- include/linux/perf_event.h | 13 +- include/linux/printk.h | 2 +- include/linux/sched/sysctl.h | 44 ++---- include/linux/security.h | 2 +- include/linux/sysctl.h | 53 +++---- include/linux/timer.h | 3 +- include/linux/vmstat.h | 8 +- include/linux/writeback.h | 28 ++-- ipc/ipc_sysctl.c | 10 +- ipc/mq_sysctl.c | 4 +- kernel/bpf/cgroup.c | 35 ++--- kernel/events/callchain.c | 2 +- kernel/events/core.c | 6 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 4 +- kernel/pid_namespace.c | 2 +- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 9 +- kernel/sched/fair.c | 3 +- kernel/sched/rt.c | 10 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/sysctl.c | 239 ++++++++++++-------------------- kernel/time/timer.c | 3 +- kernel/trace/trace.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 2 +- kernel/watchdog.c | 12 +- mm/compaction.c | 2 +- mm/hugetlb.c | 9 +- mm/page-writeback.c | 16 +-- mm/page_alloc.c | 30 ++-- mm/util.c | 10 +- mm/vmstat.c | 4 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/neighbour.c | 28 ++-- net/core/sysctl_net_core.c | 27 ++-- net/decnet/dn_dev.c | 7 +- net/decnet/sysctl_net_decnet.c | 27 ++-- net/ipv4/devinet.c | 9 +- net/ipv4/route.c | 3 +- net/ipv4/sysctl_net_ipv4.c | 38 ++--- net/ipv6/addrconf.c | 33 ++--- net/ipv6/ndisc.c | 3 +- net/ipv6/route.c | 5 +- net/ipv6/sysctl_net_ipv6.c | 3 +- net/mpls/af_mpls.c | 5 +- net/netfilter/ipvs/ip_vs_ctl.c | 6 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nf_log.c | 2 +- net/phonet/sysctl.c | 3 +- net/rds/tcp.c | 6 +- net/sctp/sysctl.c | 32 ++--- net/sunrpc/sysctl.c | 29 ++-- net/sunrpc/xprtrdma/svc_rdma.c | 7 +- security/apparmor/lsm.c | 2 +- security/min_addr.c | 2 +- security/yama/yama_lsm.c | 2 +- 88 files changed, 458 insertions(+), 653 deletions(-) diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c19aa81ddc8c..7364de008bab 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -203,7 +203,7 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops) } static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 94289d126993..35cb5e66c504 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -341,8 +341,7 @@ static unsigned int find_supported_vector_length(unsigned int vl) #ifdef CONFIG_SYSCTL static int sve_proc_do_default_vl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int vl = sve_default_vl; diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index e666fe26c50d..2119541a5b8b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -95,16 +95,15 @@ int proc_lasat_ip(struct ctl_table *table, int write, len = 0; p = buffer; while (len < *lenp) { - if (get_user(c, p++)) - return -EFAULT; + c = *p; + p++; if (c == 0 || c == '\n') break; len++; } if (len >= sizeof(ipbuf)-1) len = sizeof(ipbuf) - 1; - if (copy_from_user(ipbuf, buffer, len)) - return -EFAULT; + memcpy(ipbuf, buffer, len); ipbuf[len] = 0; *ppos += *lenp; /* Now see if we can convert it to a valid IP */ @@ -122,11 +121,9 @@ int proc_lasat_ip(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, ipbuf, len)) - return -EFAULT; + memcpy(buffer, ipbuf, len); if (len < *lenp) { - if (put_user('\n', ((char *) buffer) + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; len++; } *lenp = len; diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index aa738cad1338..d74a4c7d5df6 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -51,10 +51,9 @@ static struct platform_device *appldata_pdev; */ static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table_header *appldata_sysctl_header; static struct ctl_table appldata_table[] = { @@ -217,7 +216,7 @@ static void __appldata_vtimer_setup(int cmd) */ static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int timer_active = appldata_timer_active; int rc; @@ -250,7 +249,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write, */ static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int interval = appldata_interval; int rc; @@ -280,7 +279,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write, */ static int appldata_generic_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct appldata_ops *ops = NULL, *tmp_ops; struct list_head *lh; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 6d321f5f101d..636446003a06 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -867,7 +867,7 @@ static int debug_active = 1; * if debug_active is already off */ static int s390dbf_procactive(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) return proc_dointvec(table, write, buffer, lenp, ppos); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 5f70cefc13e4..332b542548cd 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -594,7 +594,7 @@ static int __init topology_setup(char *str) early_param("topology", topology_setup); static int topology_ctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); int new_mode; diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index ae989b740376..36bce727897b 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -245,7 +245,7 @@ static int cmm_skip_blanks(char *cp, char **endp) } static int cmm_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); struct ctl_table ctl_entry = { @@ -264,7 +264,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_timed_pages(); @@ -284,7 +284,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timeout_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; @@ -297,8 +297,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, if (write) { len = min(*lenp, sizeof(buf)); - if (copy_from_user(buf, buffer, len)) - return -EFAULT; + memcpy(buf, buffer, len); buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); @@ -311,8 +310,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); *lenp = len; *ppos += len; } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1cb3ca9bba49..1afbdd1dd777 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable; unsigned int __read_mostly sysctl_sched_itmt_enabled; static int sched_itmt_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old_sysctl; int ret; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index faca0f346fff..e3bbe108eb54 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3631,7 +3631,7 @@ static void cdrom_update_settings(void) } static int cdrom_sysctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/drivers/char/random.c b/drivers/char/random.c index 0d10e31fd342..1e0db78b83ba 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2057,7 +2057,7 @@ static char sysctl_bootid[16]; * sysctl system call, as 16 bytes of binary data. */ static int proc_do_uuid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; unsigned char buf[64], tmp_uuid[16], *uuid; diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 7af0c536d568..28b8581b44dd 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -183,8 +183,7 @@ static void mac_hid_stop_emulation(void) } static int mac_hid_toggle_emumouse(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int old_val = *valp; diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 48804049d697..ee7b5daabfd4 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -34,7 +34,7 @@ #define PARPORT_MAX_SPINTIME_VALUE 1000 static int do_active_device(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[256]; @@ -65,13 +65,13 @@ static int do_active_device(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #ifdef CONFIG_PARPORT_1284 static int do_autoprobe(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; const char *str; @@ -108,13 +108,13 @@ static int do_autoprobe(struct ctl_table *table, int write, *ppos += len; - return copy_to_user (result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #endif /* IEEE1284.3 support. */ static int do_hardware_base_addr(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -136,13 +136,12 @@ static int do_hardware_base_addr(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_irq(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -164,13 +163,12 @@ static int do_hardware_irq(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_dma(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -192,13 +190,12 @@ static int do_hardware_dma(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_modes(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[40]; @@ -231,8 +228,8 @@ static int do_hardware_modes(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #define PARPORT_PORT_DIR(CHILD) { .procname = NULL, .mode = 0555, .child = CHILD } diff --git a/fs/dcache.c b/fs/dcache.c index b280e07e162b..8dd4d8d7bd0b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -165,7 +165,7 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, +int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index dc1a1d5d825b..f00fcc4a4f72 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -47,7 +47,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } int drop_caches_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..3b612535391f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -80,14 +80,14 @@ EXPORT_SYMBOL_GPL(get_max_files); */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 59c2494efda3..c1e6cc9091aa 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -51,8 +51,7 @@ static unsigned fscache_op_max_active = 2; static struct ctl_table_header *fscache_sysctl_header; static int fscache_max_active_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct workqueue_struct **wqp = table->extra1; unsigned int *datap = table->data; diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..cc6e701b7e5d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -108,7 +108,7 @@ long get_nr_dirty_inodes(void) */ #ifdef CONFIG_SYSCTL int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b6f5d459b087..df2143e05c57 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -539,13 +539,13 @@ out: return err; } -static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, +static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, size_t count, loff_t *ppos, int write) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; - void *new_buf = NULL; + void *kbuf; ssize_t error; if (IS_ERR(head)) @@ -564,27 +564,38 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, - ppos, &new_buf); + if (write) { + kbuf = memdup_user_nul(ubuf, count); + if (IS_ERR(kbuf)) { + error = PTR_ERR(kbuf); + goto out; + } + } else { + error = -ENOMEM; + kbuf = kzalloc(count, GFP_KERNEL); + if (!kbuf) + goto out; + } + + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, + ppos); if (error) - goto out; + goto out_free_buf; /* careful: calling conventions are nasty here */ - if (new_buf) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - error = table->proc_handler(table, write, (void __user *)new_buf, - &count, ppos); - set_fs(old_fs); - kfree(new_buf); - } else { - error = table->proc_handler(table, write, buf, &count, ppos); + error = table->proc_handler(table, write, kbuf, &count, ppos); + if (error) + goto out_free_buf; + + if (!write) { + error = -EFAULT; + if (copy_to_user(ubuf, kbuf, count)) + goto out_free_buf; } - if (!error) - error = count; + error = count; +out_free_buf: + kfree(kbuf); out: sysctl_head_finish(head); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b6a4f692d345..7b4bac91146b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2841,7 +2841,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = { EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 31b3bdbd2eba..021ef96d0542 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -13,7 +13,7 @@ STATIC int xfs_stats_clear_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { @@ -33,7 +33,7 @@ STATIC int xfs_panic_mask_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c11b413d5b1a..0b41fd5fc96b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -138,8 +138,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, @@ -302,12 +301,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - buf, count, pos, nbuf, \ + buf, count, pos, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -429,7 +428,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4b898cdbdf05..a0eabfbeb0e1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -86,7 +86,7 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..9b028d260649 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3536,11 +3536,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db95244a62d4..ddfc377de0d2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1005,8 +1005,7 @@ extern void disable_trace_on_warning(void); extern int __disable_trace_on_warning; int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 43a1cef8f0f1..92c21c5ccc58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,14 +105,13 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); -int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); - -#ifdef CONFIG_NUMA -int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -#endif +int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 04bdaf01112c..594265bfd390 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -312,7 +312,7 @@ DEFINE_INSN_CACHE_OPS(optinsn); #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *length, loff_t *ppos); #endif extern void wait_for_kprobe_optimizer(void); diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 9022f0c2e2e4..abe3d95f795b 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -38,8 +38,8 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_tsk_latency_tracing(struct task_struct *p); -extern int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c4e7e76dedd..a7b1ef8ed970 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,10 +201,10 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); -extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); +int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) @@ -2957,8 +2957,8 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #endif void drop_slab(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2af594ef0f7..93cf20f41e26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -910,22 +910,21 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -extern int numa_zonelist_order_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); +int numa_zonelist_order_handler(struct ctl_table *, int, + void *, size_t *, loff_t *); extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 9003e29cde46..750c7f395ca9 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -202,16 +202,11 @@ static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif struct ctl_table; -extern int proc_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_nmi_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_soft_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_thresh(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_cpumask(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *); #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9c3e7619c929..347ea379622a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1280,15 +1280,12 @@ extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -extern int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - +int perf_proc_update_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 diff --git a/include/linux/printk.h b/include/linux/printk.h index e061635e0409..fcde0772ec98 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -189,7 +189,7 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf, +devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos); extern void wake_up_klogd(void); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215ee03f..7b4d3a49b6c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -12,9 +12,8 @@ extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; @@ -43,8 +42,7 @@ extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); #endif /* @@ -72,33 +70,21 @@ extern unsigned int sysctl_sched_autogroup_enabled; extern int sysctl_sched_rr_timeslice; extern int sched_rr_timeslice; -extern int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -#ifdef CONFIG_UCLAMP_TASK -extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#endif - -extern int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; -extern int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_energy_aware_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/security.h b/include/linux/security.h index a8d9310472df..6aa229b252ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -211,7 +211,7 @@ struct request_sock; #ifdef CONFIG_MMU extern int mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif /* security_inode_init_security callback function to write xattrs */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 36143ca40b56..f2401e45a3c2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -44,35 +44,26 @@ struct ctl_dir; extern const int sysctl_vals[]; -typedef int proc_handler (struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -extern int proc_dostring(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int proc_dointvec_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, - void __user *, size_t *, loff_t *); -extern int proc_do_large_bitmap(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); + +int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *, + size_t *, loff_t *); +int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_do_static_key(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl_table @@ -246,7 +237,7 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, #endif /* CONFIG_SYSCTL */ -int sysctl_max_threads(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #endif /* _LINUX_SYSCTL_H */ diff --git a/include/linux/timer.h b/include/linux/timer.h index 0dc19a8c39c9..07910ae5ddd9 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -201,8 +201,7 @@ struct ctl_table; extern unsigned int sysctl_timer_migration; int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif unsigned long __round_jiffies(unsigned long j, int cpu); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 292485f3d24d..cb507151710f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -16,8 +16,8 @@ extern int sysctl_stat_interval; #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); -extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, - int write, void __user *buffer, size_t *length, loff_t *ppos); +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { @@ -274,8 +274,8 @@ void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; -int vmstat_refresh(struct ctl_table *, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, + loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a19d845dd7eb..f8a7e1a850fb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -362,24 +362,18 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; -extern int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int dirty_background_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_background_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -struct ctl_table; -int dirty_writeback_centisecs_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index affd66537e87..d1b8644bfb88 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -24,7 +24,7 @@ static void *get_ipc(struct ctl_table *table) #ifdef CONFIG_PROC_SYSCTL static int proc_ipc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -35,7 +35,7 @@ static int proc_ipc_dointvec(struct ctl_table *table, int write, } static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -46,7 +46,7 @@ static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = current->nsproxy->ipc_ns; int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -59,7 +59,7 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, } static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); @@ -70,7 +70,7 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 7c00f28923a8..72a92a08c848 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -19,7 +19,7 @@ static void *get_mq(struct ctl_table *table) } static int proc_mq_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); @@ -29,7 +29,7 @@ static int proc_mq_dointvec(struct ctl_table *table, int write, } static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..977bc69bb1c5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1137,16 +1137,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1154,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1169,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; + loff_t pos = 0; int ret; ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); - if (ctx.cur_val) { - mm_segment_t old_fs; - loff_t pos = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, - &ctx.cur_len, &pos)) { - /* Let BPF program decide how to proceed. */ - ctx.cur_len = 0; - } - set_fs(old_fs); - } else { + if (!ctx.cur_val || + table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */ ctx.cur_len = 0; } - if (write && buf && *pcount) { + if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); - if (!ctx.new_val || - copy_from_user(ctx.new_val, buf, ctx.new_len)) + if (ctx.new_val) { + memcpy(ctx.new_val, *buf, ctx.new_len); + } else { /* Let BPF program decide how to proceed. */ ctx.new_len = 0; + } } rcu_read_lock(); @@ -1213,7 +1201,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - *new_buf = ctx.new_val; + kfree(*buf); + *buf = ctx.new_val; *pcount = ctx.new_len; } else { kfree(ctx.new_val); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..bdb1533ada81 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -236,7 +236,7 @@ exit_put: * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index bc9b98a9af9a..f86d46f2c4d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..ffbe03a45c16 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void) static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 8d1c15832e55..166d7bf49666 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void) return 0; } -int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..3ccaba5f15c0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..471f649b5868 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5c589a2e4d19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; int old_min, old_max; @@ -2723,7 +2722,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2797,8 +2796,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..b6077fd5b32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..45da29de3ecc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2754,9 +2753,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..fa64b2ee9fe6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -209,7 +209,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..d653d8426de9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3fafca3ced98..e961286d0e14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -208,12 +208,10 @@ static int max_extfrag_threshold = 1000; #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) + char *buffer, size_t *lenp, loff_t *ppos) { size_t len; - char __user *p; - char c; + char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; @@ -238,8 +236,7 @@ static int _proc_do_string(char *data, int maxlen, int write, *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; + c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; @@ -261,11 +258,9 @@ static int _proc_do_string(char *data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; + memcpy(buffer, data, len); if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; + buffer[len] = '\n'; len++; } *lenp = len; @@ -326,13 +321,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * Returns 0 on success. */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); } static size_t proc_skip_spaces(char **buf) @@ -463,11 +458,10 @@ static int proc_get_long(char **buf, size_t *size, * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. + * In case of success @buf and @size are updated with the amount of bytes + * written. */ -static int proc_put_long(void __user **buf, size_t *size, unsigned long val, - bool neg) +static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) { int len; char tmp[TMPBUFLEN], *p = tmp; @@ -476,24 +470,22 @@ static int proc_put_long(void __user **buf, size_t *size, unsigned long val, len = strlen(tmp); if (len > *size) len = *size; - if (copy_to_user(*buf, tmp, len)) - return -EFAULT; + memcpy(*buf, tmp, len); *size -= len; *buf += len; - return 0; } #undef TMPBUFLEN -static int proc_put_char(void __user **buf, size_t *size, char c) +static void proc_put_char(void **buf, size_t *size, char c) { if (*size) { - char __user **buffer = (char __user **)buf; - if (put_user(c, *buffer)) - return -EFAULT; - (*size)--, (*buffer)++; + char **buffer = (char **)buf; + **buffer = c; + + (*size)--; + (*buffer)++; *buf = *buffer; } - return 0; } static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, @@ -541,7 +533,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -549,7 +541,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, { int *i, vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -569,9 +561,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first=0) { @@ -598,24 +588,17 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, break; } if (!first) - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - err = proc_put_long(&buffer, &left, lval, neg); - if (err) - break; + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, lval, neg); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err && left) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -623,7 +606,7 @@ out: } static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -634,7 +617,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -645,7 +628,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, int err = 0; size_t left; bool neg; - char *kbuf = NULL, *p; + char *p = buffer; left = *lenp; @@ -655,10 +638,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - left -= proc_skip_spaces(&p); if (!left) { err = -EINVAL; @@ -682,7 +661,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, left -= proc_skip_spaces(&p); out_free: - kfree(kbuf); if (err) return -EINVAL; @@ -694,7 +672,7 @@ bail_early: return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -712,11 +690,11 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, goto out; } - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) + proc_put_long(&buffer, &left, lval, false); + if (!left) goto out; - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; @@ -726,7 +704,7 @@ out: } static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -762,7 +740,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, } static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), @@ -785,16 +763,15 @@ static int do_proc_douintvec(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } #ifdef CONFIG_COMPACTION static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, old; @@ -826,8 +803,8 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, * * Returns 0 on success. */ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); @@ -838,7 +815,7 @@ int proc_douintvec(struct ctl_table *table, int write, * This means we can safely use a temporary. */ static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long tmptaint = get_taint(); @@ -870,7 +847,7 @@ static int proc_taint(struct ctl_table *table, int write, #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -936,7 +913,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, @@ -1005,7 +982,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, @@ -1036,7 +1013,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, } static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); @@ -1057,7 +1034,7 @@ static void validate_coredump_safety(void) } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) @@ -1067,7 +1044,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dostring(table, write, buffer, lenp, ppos); if (!error) @@ -1078,7 +1055,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int tmp, ret; @@ -1096,16 +1073,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, } #endif -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -1124,9 +1099,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first = 0) { @@ -1154,26 +1127,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -1181,10 +1146,8 @@ out: } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); @@ -1207,7 +1170,7 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } @@ -1230,8 +1193,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); @@ -1325,7 +1287,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success. */ int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); @@ -1347,7 +1309,7 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); @@ -1369,15 +1331,15 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct pid *new_pid; pid_t tmp; @@ -1416,7 +1378,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; bool first = 1; @@ -1432,7 +1394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - char *kbuf, *p; + char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { @@ -1441,15 +1403,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, skipped = *lenp - left; } - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); + if (!tmp_bitmap) return -ENOMEM; - } proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; @@ -1513,7 +1469,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, first = 0; proc_skip_char(&p, &left, '\n'); } - kfree(kbuf); left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -1525,27 +1480,17 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); } if (!err) { @@ -1566,68 +1511,67 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } @@ -1636,8 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #if defined(CONFIG_SYSCTL) int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..398e6eadb861 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -249,8 +249,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..167a74a15b1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..9788ed481a6a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? */ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..53ff2c81b084 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -661,7 +661,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. */ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..d8cfb7b99a83 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2463,7 +2463,7 @@ int sysctl_compact_memory; * /proc/sys/vm/compact_memory */ int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { if (write) compact_nodes(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd459155d28a..2277c5728b1f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3352,7 +3352,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; @@ -3375,7 +3375,7 @@ out: } int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, @@ -3384,7 +3384,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, #ifdef CONFIG_NUMA int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); @@ -3392,8 +3392,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, #endif /* CONFIG_NUMA */ int hugetlb_overcommit_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..d3ee4c4dafac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -512,8 +512,7 @@ bool node_dirty_ok(struct pglist_data *pgdat) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -524,8 +523,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, } int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -535,9 +533,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; @@ -551,8 +548,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, } int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; @@ -1972,7 +1968,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62c1550cd43e..0c43e9ae5004 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5546,21 +5546,11 @@ char numa_zonelist_order[] = "Node"; * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { - char *str; - int ret; - - if (!write) - return proc_dostring(table, write, buffer, length, ppos); - str = memdup_user_nul(buffer, 16); - if (IS_ERR(str)) - return PTR_ERR(str); - - ret = __parse_numa_zonelist_order(str); - kfree(str); - return ret; + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); } @@ -7963,7 +7953,7 @@ core_initcall(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7979,7 +7969,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, } int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8009,7 +7999,7 @@ static void setup_min_unmapped_ratio(void) int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8036,7 +8026,7 @@ static void setup_min_slab_ratio(void) } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8060,7 +8050,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); @@ -8082,7 +8072,7 @@ static void __zone_pcp_update(struct zone *zone) * pagelist can have before it gets flushed back to buddy allocator. */ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..8defc8ec141f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -717,9 +717,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -729,9 +728,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, return ret; } -int overcommit_kbytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..c03a8c914922 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -76,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -1751,7 +1751,7 @@ static void refresh_vm_stats(struct work_struct *work) } int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 59980ecfc962..04c3f9a82650 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1027,7 +1027,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 39d37d0ef575..3f2263e79e4b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3379,7 +3379,7 @@ EXPORT_SYMBOL(neigh_app_ns); static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; @@ -3443,8 +3443,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) } static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; @@ -3457,8 +3457,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3467,8 +3467,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, } EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3479,8 +3478,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); @@ -3489,8 +3488,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, } int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3500,8 +3498,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); @@ -3510,8 +3508,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, } static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 9f9e00ba3ad7..0ddb13a6282b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; @@ -115,8 +115,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; @@ -180,10 +179,7 @@ write_unlock: } if (len < *lenp) kbuf[len++] = '\n'; - if (copy_to_user(buffer, kbuf, len)) { - ret = -EFAULT; - goto done; - } + memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } @@ -194,8 +190,7 @@ done: } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; @@ -217,7 +212,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { @@ -236,7 +231,7 @@ static int set_default_qdisc(struct ctl_table *table, int write, #endif static int proc_do_dev_weight(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -251,7 +246,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, } static int proc_do_rss_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; @@ -264,7 +259,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; @@ -291,8 +286,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -303,8 +297,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index cca7ae712995..65abcf1b3210 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -160,8 +160,8 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +static int dn_forwarding_proc(struct ctl_table *, int, void *, size_t *, + loff_t *); static struct dn_dev_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table dn_dev_vars[5]; @@ -245,8 +245,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } static int dn_forwarding_proc(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { #ifdef CONFIG_DECNET_ROUTER struct net_device *dev = table->extra1; diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 55bf64a22b59..deae519bdeec 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -134,8 +134,7 @@ static int parse_addr(__le16 *addr, char *str) } static int dn_node_address_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char addr[DN_ASCBUF_LEN]; size_t len; @@ -148,10 +147,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, if (write) { len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); - - if (copy_from_user(addr, buffer, len)) - return -EFAULT; - + memcpy(addr, buffer, len); addr[len] = 0; strip_it(addr); @@ -173,11 +169,9 @@ static int dn_node_address_handler(struct ctl_table *table, int write, len = strlen(addr); addr[len++] = '\n'; - if (len > *lenp) len = *lenp; - - if (copy_to_user(buffer, addr, len)) - return -EFAULT; - + if (len > *lenp) + len = *lenp; + memcpy(buffer, addr, len); *lenp = len; *ppos += len; @@ -185,8 +179,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, } static int dn_def_dev_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { size_t len; struct net_device *dev; @@ -201,9 +194,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (*lenp > 16) return -E2BIG; - if (copy_from_user(devname, buffer, *lenp)) - return -EFAULT; - + memcpy(devname, buffer, *lenp); devname[*lenp] = 0; strip_it(devname); @@ -238,9 +229,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, devname, len)) - return -EFAULT; - + memcpy(buffer, devname, len); *lenp = len; *ppos += len; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 30fa42f5997d..a118978d222c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2361,8 +2361,7 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) } static int devinet_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2414,8 +2413,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, } static int devinet_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -2458,8 +2456,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, } static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 788c69d9bfe0..041f4dcac440 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3336,8 +3336,7 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..868e317cc324 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -71,8 +71,7 @@ static void set_local_port_range(struct net *net, int range[2]) /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.ip_local_ports.range); @@ -107,7 +106,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, /* Validate changes from /proc interface. */ static int ipv4_privileged_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); @@ -168,8 +167,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; @@ -204,8 +202,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, } static int ipv4_fwd_update_priority(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; @@ -221,7 +218,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, } static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); @@ -241,9 +238,8 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, } static int proc_tcp_available_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; @@ -258,9 +254,8 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl, } static int proc_allowed_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; @@ -296,8 +291,7 @@ static int sscanf_key(char *buf, __le32 *key) } static int proc_tcp_fastopen_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); @@ -399,7 +393,7 @@ static void proc_configure_early_demux(int enabled, int protocol) } static int proc_tcp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -415,7 +409,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, } static int proc_udp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -431,8 +425,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, } static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, @@ -447,8 +440,7 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, } static int proc_tcp_available_ulp(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; @@ -466,7 +458,7 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..9d0e89bccb90 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6108,9 +6108,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL -static -int addrconf_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6134,9 +6133,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct inet6_dev *idev = ctl->extra1; int min_mtu = IPV6_MIN_MTU; @@ -6206,9 +6204,8 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) return 0; } -static -int addrconf_sysctl_disable(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6232,9 +6229,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int ret; @@ -6275,7 +6271,7 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, } static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -6337,7 +6333,7 @@ out: } static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int err; @@ -6404,8 +6400,7 @@ out: static int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6505,10 +6500,8 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) return 0; } -static -int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..58f1255295d3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1835,7 +1835,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..acdb31e38412 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6088,9 +6088,8 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_SYSCTL -static -int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 63b657aa8d29..fac2135aa47b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -26,8 +26,7 @@ static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 4701edffb1f7..a42e4ed5ab0e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1362,8 +1362,7 @@ done: (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2594,7 +2593,7 @@ nolabels: } static int mpls_platform_labels(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8d14a1acbc37..412656c34f20 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1736,7 +1736,7 @@ static int three = 3; static int proc_do_defense_mode(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; @@ -1763,7 +1763,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, static int proc_do_sync_threshold(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val[2]; @@ -1788,7 +1788,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, static int proc_do_sync_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..31b027b12ff3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -517,7 +517,7 @@ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index bb25d4c794c7..6cb9f9474b05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -414,7 +414,7 @@ static struct ctl_table nf_log_sysctl_ftable[] = { }; static int nf_log_proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 251e750fd9aa..0d0bf41381c2 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -49,8 +49,7 @@ void phonet_get_local_port_range(int *min, int *max) } static int proc_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2] = {local_port_range[0], local_port_range[1]}; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 66121bc6f34e..46782fac4c16 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -62,8 +62,7 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos); + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -676,8 +675,7 @@ static void rds_tcp_sysctl_reset(struct net *net) } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos) + void *buffer, size_t *lenp, loff_t *fpos) { struct net *net = current->nsproxy->net_ns; int err; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 4740aa70e652..c16c80963e55 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -43,20 +43,15 @@ static unsigned long max_autoclose_max = ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -343,8 +338,7 @@ static struct ctl_table sctp_net_table[] = { }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; @@ -389,8 +383,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -418,8 +411,7 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -447,8 +439,7 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, } static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) pr_warn_once("Changing rto_alpha or rto_beta may lead to " @@ -458,8 +449,7 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, } static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index d75f17b56f0e..999eee1ed61c 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -60,7 +60,7 @@ rpc_unregister_sysctl(void) } static int proc_do_xprt(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; size_t len; @@ -70,15 +70,15 @@ static int proc_do_xprt(struct ctl_table *table, int write, return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + return memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); } static int -proc_dodebug(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, + loff_t *ppos) { - char tmpbuf[20], c, *s = NULL; - char __user *p; + char tmpbuf[20], *s = NULL; + char *p; unsigned int value; size_t left, len; @@ -90,18 +90,17 @@ proc_dodebug(struct ctl_table *table, int write, left = *lenp; if (write) { - if (!access_ok(buffer, left)) - return -EFAULT; p = buffer; - while (left && __get_user(c, p) >= 0 && isspace(c)) - left--, p++; + while (left && isspace(*p)) { + left--; + p++; + } if (!left) goto done; if (left > sizeof(tmpbuf) - 1) return -EINVAL; - if (copy_from_user(tmpbuf, p, left)) - return -EFAULT; + memcpy(tmpbuf, p, left); tmpbuf[left] = '\0'; value = simple_strtol(tmpbuf, &s, 0); @@ -121,11 +120,9 @@ proc_dodebug(struct ctl_table *table, int write, len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if (copy_to_user(buffer, tmpbuf, len)) - return -EFAULT; + memcpy(buffer, tmpbuf, len); if ((left -= len) > 0) { - if (put_user('\n', (char __user *)buffer + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; left--; } } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 97bca509a391..526da5d4710b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -80,8 +80,7 @@ atomic_t rdma_stat_sq_prod; * current value. */ static int read_reset_stat(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { atomic_t *stat = (atomic_t *)table->data; @@ -103,8 +102,8 @@ static int read_reset_stat(struct ctl_table *table, int write, len -= *ppos; if (len > *lenp) len = *lenp; - if (len && copy_to_user(buffer, str_buf, len)) - return -EFAULT; + if (len) + memcpy(buffer, str_buf, len); *lenp = len; *ppos += len; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index b621ad74f54a..27e371b44dad 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1696,7 +1696,7 @@ static int __init alloc_buffers(void) #ifdef CONFIG_SYSCTL static int apparmor_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!policy_admin_capable(NULL)) return -EPERM; diff --git a/security/min_addr.c b/security/min_addr.c index 94d2b0cf0e7b..88c9a6a21f47 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -30,7 +30,7 @@ static void update_mmap_min_addr(void) * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly */ int mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 94dc346370b1..536c99646f6a 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -430,7 +430,7 @@ static struct security_hook_list yama_hooks[] __lsm_ro_after_init = { #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; -- cgit v1.2.3-59-g8ed1b From e62905ae34eaf5fe2cfb254be5e0c097b3b1f798 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 23 Apr 2020 11:39:20 +0200 Subject: xfrm interface: don't take extra reference to netdev I don't see any reason to do this. Maybe needed before commit 56c5ee1a5823 ("xfrm interface: fix memory leak on creation"). Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 3361e3ac5714..eb9928c0a87c 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -145,7 +145,6 @@ static int xfrmi_create(struct net_device *dev) if (err < 0) goto out; - dev_hold(dev); xfrmi_link(xfrmn, xi); return 0; @@ -175,7 +174,6 @@ static void xfrmi_dev_uninit(struct net_device *dev) struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); xfrmi_unlink(xfrmn, xi); - dev_put(dev); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) -- cgit v1.2.3-59-g8ed1b From e411eb257b331bf44cbe8845b5351260c8222c6c Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Sun, 26 Apr 2020 14:36:35 +0800 Subject: libbpf: Return err if bpf_object__load failed bpf_object__load() has various return code, when it failed to load object, it must return err instead of -EINVAL. Signed-off-by: Mao Wenan Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200426063635.130680-3-maowenan@huawei.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8f480e29a6b0..8e1dc6980fac 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7006,7 +7006,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, err = bpf_object__load(obj); if (err) { bpf_object__close(obj); - return -EINVAL; + return err; } *pobj = obj; -- cgit v1.2.3-59-g8ed1b From 0767ec04289757c0edc2322957ba51d97446eaa4 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Apr 2020 06:59:14 +0200 Subject: net: ag71xx: extend link validation to support other SoCs Most (all?) QCA SoCs have two MAC with different supported link capabilities. Extend ag71xx_mac_validate() to properly validate this variants. Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/ag71xx.c | 43 +++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index 02b7705393ca..112edbd30823 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -871,13 +871,40 @@ static void ag71xx_mac_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state) { + struct ag71xx *ag = netdev_priv(to_net_dev(config->dev)); __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, }; - if (state->interface != PHY_INTERFACE_MODE_NA && - state->interface != PHY_INTERFACE_MODE_GMII && - state->interface != PHY_INTERFACE_MODE_MII) { - bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); - return; + switch (state->interface) { + case PHY_INTERFACE_MODE_NA: + break; + case PHY_INTERFACE_MODE_MII: + if ((ag71xx_is(ag, AR9330) && ag->mac_idx == 0) || + ag71xx_is(ag, AR9340) || + ag71xx_is(ag, QCA9530) || + (ag71xx_is(ag, QCA9550) && ag->mac_idx == 1)) + break; + goto unsupported; + case PHY_INTERFACE_MODE_GMII: + if ((ag71xx_is(ag, AR9330) && ag->mac_idx == 1) || + (ag71xx_is(ag, AR9340) && ag->mac_idx == 1) || + (ag71xx_is(ag, QCA9530) && ag->mac_idx == 1)) + break; + goto unsupported; + case PHY_INTERFACE_MODE_SGMII: + if (ag71xx_is(ag, QCA9550) && ag->mac_idx == 0) + break; + goto unsupported; + case PHY_INTERFACE_MODE_RMII: + if (ag71xx_is(ag, AR9340) && ag->mac_idx == 0) + break; + goto unsupported; + case PHY_INTERFACE_MODE_RGMII: + if ((ag71xx_is(ag, AR9340) && ag->mac_idx == 0) || + (ag71xx_is(ag, QCA9550) && ag->mac_idx == 1)) + break; + goto unsupported; + default: + goto unsupported; } phylink_set(mask, MII); @@ -889,6 +916,8 @@ static void ag71xx_mac_validate(struct phylink_config *config, phylink_set(mask, 100baseT_Full); if (state->interface == PHY_INTERFACE_MODE_NA || + state->interface == PHY_INTERFACE_MODE_SGMII || + state->interface == PHY_INTERFACE_MODE_RGMII || state->interface == PHY_INTERFACE_MODE_GMII) { phylink_set(mask, 1000baseT_Full); phylink_set(mask, 1000baseX_Full); @@ -898,6 +927,10 @@ static void ag71xx_mac_validate(struct phylink_config *config, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_and(state->advertising, state->advertising, mask, __ETHTOOL_LINK_MODE_MASK_NBITS); + + return; +unsupported: + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); } static void ag71xx_mac_pcs_get_state(struct phylink_config *config, -- cgit v1.2.3-59-g8ed1b From 3608a199749873edc992c74adf077c3a848121ad Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Apr 2020 07:21:16 +0200 Subject: dt-bindings: net: convert qca,ar71xx documentation to yaml Now that we have the DT validation in place, let's convert the device tree bindings for the Atheros AR71XX over to a YAML schemas. Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- .../devicetree/bindings/net/qca,ar71xx.txt | 45 ----- .../devicetree/bindings/net/qca,ar71xx.yaml | 216 +++++++++++++++++++++ 2 files changed, 216 insertions(+), 45 deletions(-) delete mode 100644 Documentation/devicetree/bindings/net/qca,ar71xx.txt create mode 100644 Documentation/devicetree/bindings/net/qca,ar71xx.yaml diff --git a/Documentation/devicetree/bindings/net/qca,ar71xx.txt b/Documentation/devicetree/bindings/net/qca,ar71xx.txt deleted file mode 100644 index 2a33e71ba72b..000000000000 --- a/Documentation/devicetree/bindings/net/qca,ar71xx.txt +++ /dev/null @@ -1,45 +0,0 @@ -Required properties: -- compatible: Should be "qca,-eth". Currently support compatibles are: - qca,ar7100-eth - Atheros AR7100 - qca,ar7240-eth - Atheros AR7240 - qca,ar7241-eth - Atheros AR7241 - qca,ar7242-eth - Atheros AR7242 - qca,ar9130-eth - Atheros AR9130 - qca,ar9330-eth - Atheros AR9330 - qca,ar9340-eth - Atheros AR9340 - qca,qca9530-eth - Qualcomm Atheros QCA9530 - qca,qca9550-eth - Qualcomm Atheros QCA9550 - qca,qca9560-eth - Qualcomm Atheros QCA9560 - -- reg : Address and length of the register set for the device -- interrupts : Should contain eth interrupt -- phy-mode : See ethernet.txt file in the same directory -- clocks: the clock used by the core -- clock-names: the names of the clock listed in the clocks property. These are - "eth" and "mdio". -- resets: Should contain phandles to the reset signals -- reset-names: Should contain the names of reset signal listed in the resets - property. These are "mac" and "mdio" - -Optional properties: -- phy-handle : phandle to the PHY device connected to this device. -- fixed-link : Assume a fixed link. See fixed-link.txt in the same directory. - Use instead of phy-handle. - -Optional subnodes: -- mdio : specifies the mdio bus, used as a container for phy nodes - according to phy.txt in the same directory - -Example: - -ethernet@1a000000 { - compatible = "qca,ar9330-eth"; - reg = <0x1a000000 0x200>; - interrupts = <5>; - resets = <&rst 13>, <&rst 23>; - reset-names = "mac", "mdio"; - clocks = <&pll ATH79_CLK_AHB>, <&pll ATH79_CLK_MDIO>; - clock-names = "eth", "mdio"; - - phy-mode = "gmii"; -}; diff --git a/Documentation/devicetree/bindings/net/qca,ar71xx.yaml b/Documentation/devicetree/bindings/net/qca,ar71xx.yaml new file mode 100644 index 000000000000..f99a5aabe923 --- /dev/null +++ b/Documentation/devicetree/bindings/net/qca,ar71xx.yaml @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/qca,ar71xx.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: QCA AR71XX MAC + +allOf: + - $ref: ethernet-controller.yaml# + +maintainers: + - Oleksij Rempel + +properties: + compatible: + oneOf: + - items: + - enum: + - qca,ar7100-eth # Atheros AR7100 + - qca,ar7240-eth # Atheros AR7240 + - qca,ar7241-eth # Atheros AR7241 + - qca,ar7242-eth # Atheros AR7242 + - qca,ar9130-eth # Atheros AR9130 + - qca,ar9330-eth # Atheros AR9330 + - qca,ar9340-eth # Atheros AR9340 + - qca,qca9530-eth # Qualcomm Atheros QCA9530 + - qca,qca9550-eth # Qualcomm Atheros QCA9550 + - qca,qca9560-eth # Qualcomm Atheros QCA9560 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + '#address-cells': + description: number of address cells for the MDIO bus + const: 1 + + '#size-cells': + description: number of size cells on the MDIO bus + const: 0 + + clocks: + items: + - description: MAC main clock + - description: MDIO clock + + clock-names: + items: + - const: eth + - const: mdio + + resets: + items: + - description: MAC reset + - description: MDIO reset + + reset-names: + items: + - const: mac + - const: mdio + +required: + - compatible + - reg + - interrupts + - phy-mode + - clocks + - clock-names + - resets + - reset-names + +examples: + # Lager board + - | + eth0: ethernet@19000000 { + compatible = "qca,ar9330-eth"; + reg = <0x19000000 0x200>; + interrupts = <4>; + resets = <&rst 9>, <&rst 22>; + reset-names = "mac", "mdio"; + clocks = <&pll 1>, <&pll 2>; + clock-names = "eth", "mdio"; + qca,ethcfg = <ðcfg>; + phy-mode = "mii"; + phy-handle = <&phy_port4>; + }; + + eth1: ethernet@1a000000 { + compatible = "qca,ar9330-eth"; + reg = <0x1a000000 0x200>; + interrupts = <5>; + resets = <&rst 13>, <&rst 23>; + reset-names = "mac", "mdio"; + clocks = <&pll 1>, <&pll 2>; + clock-names = "eth", "mdio"; + + phy-mode = "gmii"; + + status = "disabled"; + + fixed-link { + speed = <1000>; + full-duplex; + }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + switch10: switch@10 { + #address-cells = <1>; + #size-cells = <0>; + + compatible = "qca,ar9331-switch"; + reg = <0x10>; + resets = <&rst 8>; + reset-names = "switch"; + + interrupt-parent = <&miscintc>; + interrupts = <12>; + + interrupt-controller; + #interrupt-cells = <1>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + switch_port0: port@0 { + reg = <0x0>; + label = "cpu"; + ethernet = <ð1>; + + phy-mode = "gmii"; + + fixed-link { + speed = <1000>; + full-duplex; + }; + }; + + switch_port1: port@1 { + reg = <0x1>; + phy-handle = <&phy_port0>; + phy-mode = "internal"; + + status = "disabled"; + }; + + switch_port2: port@2 { + reg = <0x2>; + phy-handle = <&phy_port1>; + phy-mode = "internal"; + + status = "disabled"; + }; + + switch_port3: port@3 { + reg = <0x3>; + phy-handle = <&phy_port2>; + phy-mode = "internal"; + + status = "disabled"; + }; + + switch_port4: port@4 { + reg = <0x4>; + phy-handle = <&phy_port3>; + phy-mode = "internal"; + + status = "disabled"; + }; + }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + interrupt-parent = <&switch10>; + + phy_port0: phy@0 { + reg = <0x0>; + interrupts = <0>; + status = "disabled"; + }; + + phy_port1: phy@1 { + reg = <0x1>; + interrupts = <0>; + status = "disabled"; + }; + + phy_port2: phy@2 { + reg = <0x2>; + interrupts = <0>; + status = "disabled"; + }; + + phy_port3: phy@3 { + reg = <0x3>; + interrupts = <0>; + status = "disabled"; + }; + + phy_port4: phy@4 { + reg = <0x4>; + interrupts = <0>; + status = "disabled"; + }; + }; + }; + }; + }; -- cgit v1.2.3-59-g8ed1b From 7d3118016787b5c05da94b3bcdb96c9d6ff82c44 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 25 Apr 2020 12:28:14 +0100 Subject: net: rtnetlink: remove redundant assignment to variable err The variable err is being initializeed with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d6f4f4a9e8ba..2269199c5891 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3997,8 +3997,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; - int err = -EINVAL; __u8 *addr; + int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) -- cgit v1.2.3-59-g8ed1b From 4714d13791f831d253852c8b5d657270becb8b2a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:21:58 +0200 Subject: bridge: uapi: mrp: Add mrp attributes. Add new nested netlink attribute to configure the MRP. These attributes are used by the userspace to add/delete/configure MRP instances and by the kernel to notify the userspace when the MRP ring gets open/closed. MRP nested attribute has the following attributes: IFLA_BRIDGE_MRP_INSTANCE - the parameter type is br_mrp_instance which contains the instance id, and the ifindex of the two ports. The ports can't be part of multiple instances. This is used to create/delete MRP instances. IFLA_BRIDGE_MRP_PORT_STATE - the parameter type is u32. Which can be forwarding, blocking or disabled. IFLA_BRIDGE_MRP_PORT_ROLE - the parameter type is br_mrp_port_role which contains the instance id and the role. The role can be primary or secondary. IFLA_BRIDGE_MRP_RING_STATE - the parameter type is br_mrp_ring_state which contains the instance id and the state. The state can be open or closed. IFLA_BRIDGE_MRP_RING_ROLE - the parameter type is br_mrp_ring_role which contains the instance id and the ring role. The role can be MRM or MRC. IFLA_BRIDGE_MRP_START_TEST - the parameter type is br_mrp_start_test which contains the instance id, the interval at which to send the MRP_Test frames, how many test frames can be missed before declaring the ring open and the period which represent for how long to send the test frames. Also add the file include/uapi/linux/mrp_bridge.h which defines all the types used by MRP that are also needed by the userpace. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 42 +++++++++++++++++++++ include/uapi/linux/if_ether.h | 1 + include/uapi/linux/mrp_bridge.h | 84 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 include/uapi/linux/mrp_bridge.h diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index bfe621ea51b3..bd8c95488f16 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -120,6 +120,7 @@ enum { IFLA_BRIDGE_MODE, IFLA_BRIDGE_VLAN_INFO, IFLA_BRIDGE_VLAN_TUNNEL_INFO, + IFLA_BRIDGE_MRP, __IFLA_BRIDGE_MAX, }; #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1) @@ -157,6 +158,47 @@ struct bridge_vlan_xstats { __u32 pad2; }; +enum { + IFLA_BRIDGE_MRP_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE, + IFLA_BRIDGE_MRP_PORT_STATE, + IFLA_BRIDGE_MRP_PORT_ROLE, + IFLA_BRIDGE_MRP_RING_STATE, + IFLA_BRIDGE_MRP_RING_ROLE, + IFLA_BRIDGE_MRP_START_TEST, + __IFLA_BRIDGE_MRP_MAX, +}; + +struct br_mrp_instance { + __u32 ring_id; + __u32 p_ifindex; + __u32 s_ifindex; +}; + +struct br_mrp_port_role { + __u32 ring_id; + __u32 role; +}; + +struct br_mrp_ring_state { + __u32 ring_id; + __u32 ring_state; +}; + +struct br_mrp_ring_role { + __u32 ring_id; + __u32 ring_role; +}; + +struct br_mrp_start_test { + __u32 ring_id; + __u32 interval; + __u32 max_miss; + __u32 period; +}; + +#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) + struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index f6ceb2e63d1e..d6de2b167448 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -92,6 +92,7 @@ #define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ #define ETH_P_TIPC 0x88CA /* TIPC */ #define ETH_P_LLDP 0x88CC /* Link Layer Discovery Protocol */ +#define ETH_P_MRP 0x88E3 /* Media Redundancy Protocol */ #define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */ #define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */ #define ETH_P_MVRP 0x88F5 /* 802.1Q MVRP */ diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h new file mode 100644 index 000000000000..2600cdf5a284 --- /dev/null +++ b/include/uapi/linux/mrp_bridge.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_MRP_BRIDGE_H_ +#define _UAPI_LINUX_MRP_BRIDGE_H_ + +#include +#include + +#define MRP_MAX_FRAME_LENGTH 200 +#define MRP_DEFAULT_PRIO 0x8000 +#define MRP_DOMAIN_UUID_LENGTH 16 +#define MRP_VERSION 1 +#define MRP_FRAME_PRIO 7 + +enum br_mrp_ring_role_type { + BR_MRP_RING_ROLE_DISABLED, + BR_MRP_RING_ROLE_MRC, + BR_MRP_RING_ROLE_MRM, +}; + +enum br_mrp_ring_state_type { + BR_MRP_RING_STATE_OPEN, + BR_MRP_RING_STATE_CLOSED, +}; + +enum br_mrp_port_state_type { + BR_MRP_PORT_STATE_DISABLED, + BR_MRP_PORT_STATE_BLOCKED, + BR_MRP_PORT_STATE_FORWARDING, + BR_MRP_PORT_STATE_NOT_CONNECTED, +}; + +enum br_mrp_port_role_type { + BR_MRP_PORT_ROLE_PRIMARY, + BR_MRP_PORT_ROLE_SECONDARY, + BR_MRP_PORT_ROLE_NONE, +}; + +enum br_mrp_tlv_header_type { + BR_MRP_TLV_HEADER_END = 0x0, + BR_MRP_TLV_HEADER_COMMON = 0x1, + BR_MRP_TLV_HEADER_RING_TEST = 0x2, + BR_MRP_TLV_HEADER_RING_TOPO = 0x3, + BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, + BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, +}; + +struct br_mrp_tlv_hdr { + __u8 type; + __u8 length; +}; + +struct br_mrp_end_hdr { + struct br_mrp_tlv_hdr hdr; +}; + +struct br_mrp_common_hdr { + __u16 seq_id; + __u8 domain[MRP_DOMAIN_UUID_LENGTH]; +}; + +struct br_mrp_ring_test_hdr { + __u16 prio; + __u8 sa[ETH_ALEN]; + __u16 port_role; + __u16 state; + __u16 transitions; + __u32 timestamp; +}; + +struct br_mrp_ring_topo_hdr { + __u16 prio; + __u8 sa[ETH_ALEN]; + __u16 interval; +}; + +struct br_mrp_ring_link_hdr { + __u8 sa[ETH_ALEN]; + __u16 port_role; + __u16 interval; + __u16 blocked; +}; + +#endif -- cgit v1.2.3-59-g8ed1b From 2cc974f83fb505751b7fbcf8dee27bdcc7054a7e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:21:59 +0200 Subject: bridge: mrp: Update Kconfig Add the option BRIDGE_MRP to allow to build in or not MRP support. The default value is N. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index e4fb050e2078..51a6414145d2 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -61,3 +61,15 @@ config BRIDGE_VLAN_FILTERING Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config BRIDGE_MRP + bool "MRP protocol" + depends on BRIDGE + default n + help + If you say Y here, then the Ethernet bridge will be able to run MRP + protocol to detect loops + + Say N to exclude this support and reduce the binary size. + + If unsure, say N. -- cgit v1.2.3-59-g8ed1b From 4b8d7d4c599182393421c190bae3604b4db9629a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:00 +0200 Subject: bridge: mrp: Extend bridge interface To integrate MRP into the bridge, first the bridge needs to be aware of ports that are part of an MRP ring and which rings are on the bridge. Therefore extend bridge interface with the following: - add new flag(BR_MPP_AWARE) to the net bridge ports, this bit will be set when the port is added to an MRP instance. In this way it knows if the frame was received on MRP ring port - add new flag(BR_MRP_LOST_CONT) to the net bridge ports, this bit will be set when the port lost the continuity of MRP Test frames. - add a list of MRP instances Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 2 ++ net/bridge/br_private.h | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 9e57c4411734..b3a8d3054af0 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -47,6 +47,8 @@ struct br_ip_list { #define BR_BCAST_FLOOD BIT(14) #define BR_NEIGH_SUPPRESS BIT(15) #define BR_ISOLATED BIT(16) +#define BR_MRP_AWARE BIT(17) +#define BR_MRP_LOST_CONT BIT(18) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1f97703a52ff..835a70f8d3ea 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -428,6 +428,10 @@ struct net_bridge { int offload_fwd_mark; #endif struct hlist_head fdb_list; + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + struct list_head __rcu mrp_list; +#endif }; struct br_input_skb_cb { -- cgit v1.2.3-59-g8ed1b From 3e54442c93845316762b1b3c75e654463fd1b715 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:01 +0200 Subject: net: bridge: Add port attribute IFLA_BRPORT_MRP_RING_OPEN This patch adds a new port attribute, IFLA_BRPORT_MRP_RING_OPEN, which allows to notify the userspace when the port lost the continuite of MRP frames. This attribute is set by kernel whenever the SW or HW detects that the ring is being open or closed. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 3 +++ tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 5 insertions(+) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 127c704eeba9..a009365ad67b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 43dab4066f91..4084f1ef8641 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -151,6 +151,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + 0; } @@ -213,6 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & + BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index ca6665ea758a..cafedbbfefbe 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3-59-g8ed1b From 2f1a11ae11d222b3a3b41d09a85cb2bf8f83db49 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:02 +0200 Subject: bridge: mrp: Add MRP interface. Define the MRP interface. This interface is used by the netlink to update the MRP instances and by the MRP to make the calls to switchdev to offload it to HW. It defines an MRP instance 'struct br_mrp' which is a list of MRP instances. Which will be part of the 'struct net_bridge'. Each instance has 2 ring ports, a bridge and an ID. In case the HW can't generate MRP Test frames then the SW will generate those. br_mrp_add - adds a new MRP instance. br_mrp_del - deletes an existing MRP instance. Each instance has an ID(ring_id). br_mrp_set_port_state - changes the port state. The port can be in forwarding state, which means that the frames can pass through or in blocked state which means that the frames can't pass through except MRP frames. This will eventually call the switchdev API to notify the HW. This information is used also by the SW bridge to know how to forward frames in case the HW doesn't have this capability. br_mrp_set_port_role - a port role can be primary or secondary. This information is required to be pushed to HW in case the HW can generate MRP_Test frames. Because the MRP_Test frames contains a file with this information. Otherwise the HW will not be able to generate the frames correctly. br_mrp_set_ring_state - a ring can be in state open or closed. State open means that the mrp port stopped receiving MRP_Test frames, while closed means that the mrp port received MRP_Test frames. Similar with br_mrp_port_role, this information is pushed in HW because the MRP_Test frames contain this information. br_mrp_set_ring_role - a ring can have the following roles MRM or MRC. For the role MRM it is expected that the HW can terminate the MRP frames, notify the SW that it stopped receiving MRP_Test frames and trapp all the other MRP frames. While for MRC mode it is expected that the HW can forward the MRP frames only between the MRP ports and copy MRP_Topology frames to CPU. In case the HW doesn't support a role it needs to return an error code different than -EOPNOTSUPP. br_mrp_start_test - this starts/stops the generation of MRP_Test frames. To stop the generation of frames the interval needs to have a value of 0. In this case the userspace needs to know if the HW supports this or not. Not to have duplicate frames(generated by HW and SW). Because if the HW supports this then the SW will not generate anymore frames and will expect that the HW will notify when it stopped receiving MRP frames using the function br_mrp_port_open. br_mrp_port_open - this function is used by drivers to notify the userspace via a netlink callback that one of the ports stopped receiving MRP_Test frames. This function is called only when the node has the role MRM. It is not supposed to be called from userspace. br_mrp_port_switchdev_add - this corresponds to the function br_mrp_add, and will notify the HW that a MRP instance is added. The function gets as parameter the MRP instance. br_mrp_port_switchdev_del - this corresponds to the function br_mrp_del, and will notify the HW that a MRP instance is removed. The function gets as parameter the ID of the MRP instance that is removed. br_mrp_port_switchdev_set_state - this corresponds to the function br_mrp_set_port_state. It would notify the HW if it should block or not non-MRP frames. br_mrp_port_switchdev_set_port - this corresponds to the function br_mrp_set_port_role. It would set the port role, primary or secondary. br_mrp_switchdev_set_role - this corresponds to the function br_mrp_set_ring_role and would set one of the role MRM or MRC. br_mrp_switchdev_set_ring_state - this corresponds to the function br_mrp_set_ring_state and would set the ring to be open or closed. br_mrp_switchdev_send_ring_test - this corresponds to the function br_mrp_start_test. This will notify the HW to start or stop generating MRP_Test frames. Value 0 for the interval parameter means to stop generating the frames. br_mrp_port_open - this function is used to notify the userspace that the port lost the continuity of MRP Test frames. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_private_mrp.h | 63 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 net/bridge/br_private_mrp.h diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h new file mode 100644 index 000000000000..2921a4b59f8e --- /dev/null +++ b/net/bridge/br_private_mrp.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _BR_PRIVATE_MRP_H_ +#define _BR_PRIVATE_MRP_H_ + +#include "br_private.h" +#include + +struct br_mrp { + /* list of mrp instances */ + struct list_head __rcu list; + + struct net_bridge_port __rcu *p_port; + struct net_bridge_port __rcu *s_port; + + u32 ring_id; + + enum br_mrp_ring_role_type ring_role; + u8 ring_role_offloaded; + enum br_mrp_ring_state_type ring_state; + u32 ring_transitions; + + struct delayed_work test_work; + u32 test_interval; + unsigned long test_end; + u32 test_count_miss; + u32 test_max_miss; + + u32 seq_id; + + struct rcu_head rcu; +}; + +/* br_mrp.c */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_set_port_role(struct net_bridge_port *p, + struct br_mrp_port_role *role); +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state); +int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); +int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test); + +/* br_mrp_switchdev.c */ +int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_role_type role); +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_state_type state); +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period); +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role); + +/* br_mrp_netlink.c */ +int br_mrp_port_open(struct net_device *dev, u8 loc); + +#endif /* _BR_PRIVATE_MRP_H */ -- cgit v1.2.3-59-g8ed1b From c284b54590083017193a836362daa4489e782028 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:03 +0200 Subject: switchdev: mrp: Extend switchdev API to offload MRP Extend switchdev API to add support for MRP. The HW is notified in following cases: SWITCHDEV_OBJ_ID_MRP: This is used when a MRP instance is added/removed from the MRP ring. SWITCHDEV_OBJ_ID_RING_ROLE_MRP: This is used when the role of the node changes. The current supported roles are MRM and MRC. SWITCHDEV_OBJ_ID_RING_TEST_MRP: This is used when to start/stop sending MRP_Test frames on the mrp ring ports. This is called only on nodes that have the role MRM. In case this fails then the SW will generate the frames. SWITCHDEV_OBJ_ID_RING_STATE_STATE: This is used when the ring changes it states to open or closed. This is required to notify HW because the MRP_Test frame contains the field MRP_InState which contains this information. SWITCHDEV_ATTR_ID_MRP_PORT_STATE: This is used when the port's state is changed. It can be in blocking/forwarding mode. SWITCHDEV_ATTR_ID_MRP_PORT_ROLE: This is used when port's role changes. The roles of the port can be primary/secondary. This is required to notify HW because the MRP_Test frame contains the field MRP_PortRole that contains this information. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index aee86a189432..ae7aeb0d1f9c 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -40,6 +40,10 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + SWITCHDEV_ATTR_ID_MRP_PORT_STATE, + SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, +#endif }; struct switchdev_attr { @@ -55,6 +59,11 @@ struct switchdev_attr { clock_t ageing_time; /* BRIDGE_AGEING_TIME */ bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ bool mc_disabled; /* MC_DISABLED */ +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + u8 mrp_port_state; /* MRP_PORT_STATE */ + u8 mrp_port_role; /* MRP_PORT_ROLE */ + u8 mrp_ring_state; /* MRP_RING_STATE */ +#endif } u; }; @@ -63,6 +72,12 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_PORT_VLAN, SWITCHDEV_OBJ_ID_PORT_MDB, SWITCHDEV_OBJ_ID_HOST_MDB, +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + SWITCHDEV_OBJ_ID_MRP, + SWITCHDEV_OBJ_ID_RING_TEST_MRP, + SWITCHDEV_OBJ_ID_RING_ROLE_MRP, + SWITCHDEV_OBJ_ID_RING_STATE_MRP, +#endif }; struct switchdev_obj { @@ -94,6 +109,53 @@ struct switchdev_obj_port_mdb { #define SWITCHDEV_OBJ_PORT_MDB(OBJ) \ container_of((OBJ), struct switchdev_obj_port_mdb, obj) + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) +/* SWITCHDEV_OBJ_ID_MRP */ +struct switchdev_obj_mrp { + struct switchdev_obj obj; + struct net_device *p_port; + struct net_device *s_port; + u32 ring_id; +}; + +#define SWITCHDEV_OBJ_MRP(OBJ) \ + container_of((OBJ), struct switchdev_obj_mrp, obj) + +/* SWITCHDEV_OBJ_ID_RING_TEST_MRP */ +struct switchdev_obj_ring_test_mrp { + struct switchdev_obj obj; + /* The value is in us and a value of 0 represents to stop */ + u32 interval; + u8 max_miss; + u32 ring_id; + u32 period; +}; + +#define SWITCHDEV_OBJ_RING_TEST_MRP(OBJ) \ + container_of((OBJ), struct switchdev_obj_ring_test_mrp, obj) + +/* SWICHDEV_OBJ_ID_RING_ROLE_MRP */ +struct switchdev_obj_ring_role_mrp { + struct switchdev_obj obj; + u8 ring_role; + u32 ring_id; +}; + +#define SWITCHDEV_OBJ_RING_ROLE_MRP(OBJ) \ + container_of((OBJ), struct switchdev_obj_ring_role_mrp, obj) + +struct switchdev_obj_ring_state_mrp { + struct switchdev_obj obj; + u8 ring_state; + u32 ring_id; +}; + +#define SWITCHDEV_OBJ_RING_STATE_MRP(OBJ) \ + container_of((OBJ), struct switchdev_obj_ring_state_mrp, obj) + +#endif + typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj); enum switchdev_notifier_type { -- cgit v1.2.3-59-g8ed1b From fadd409136f0f21192d80816edd9529f27d88c17 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:04 +0200 Subject: bridge: switchdev: mrp: Implement MRP API for switchdev Implement the MRP api for switchdev. These functions will just eventually call the switchdev functions: switchdev_port_obj_add/del and switchdev_port_attr_set. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 + net/bridge/br_mrp_switchdev.c | 140 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 net/bridge/br_mrp_switchdev.c diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 49da7ae6f077..3cacf9dd78d5 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -25,3 +25,5 @@ bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_opt bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ + +bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c new file mode 100644 index 000000000000..51cb1d5a24b4 --- /dev/null +++ b/net/bridge/br_mrp_switchdev.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +#include "br_private_mrp.h" + +int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = rtnl_dereference(mrp->p_port)->dev, + .s_port = rtnl_dereference(mrp->s_port)->dev, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = NULL, + .s_port = NULL, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_del(br->dev, &mrp_obj.obj); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_role_type role) +{ + struct switchdev_obj_ring_role_mrp mrp_role = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_ROLE_MRP, + .ring_role = role, + .ring_id = mrp->ring_id, + }; + int err; + + if (role == BR_MRP_RING_ROLE_DISABLED) + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); + else + err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + + return err; +} + +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, + struct br_mrp *mrp, u32 interval, + u8 max_miss, u32 period) +{ + struct switchdev_obj_ring_test_mrp test = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_TEST_MRP, + .interval = interval, + .max_miss = max_miss, + .ring_id = mrp->ring_id, + .period = period, + }; + int err; + + if (interval == 0) + err = switchdev_port_obj_del(br->dev, &test.obj); + else + err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + + return err; +} + +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_state_type state) +{ + struct switchdev_obj_ring_state_mrp mrp_state = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_STATE_MRP, + .ring_state = state, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE, + .u.mrp_port_state = state, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + br_warn(p->br, "error setting offload MRP state on port %u(%s)\n", + (unsigned int)p->port_no, p->dev->name); + + return err; +} + +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, + .u.mrp_port_role = role, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} -- cgit v1.2.3-59-g8ed1b From 9a9f26e8f7ea300e8efffcae036dbef239be433a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:05 +0200 Subject: bridge: mrp: Connect MRP API with the switchdev API Implement the MRP API. In case the HW can't generate MRP Test frames then the SW will try to generate the frames. In case that also the SW will fail in generating the frames then a error is return to the userspace. The userspace is responsible to generate all the other MRP frames regardless if the test frames are generated by HW or SW. The forwarding/termination of MRP frames is happening in the kernel and is done by the MRP instance. The userspace application doesn't do the forwarding. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 +- net/bridge/br_mrp.c | 559 ++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_mrp_netlink.c | 29 +++ 3 files changed, 589 insertions(+), 1 deletion(-) create mode 100644 net/bridge/br_mrp.c create mode 100644 net/bridge/br_mrp_netlink.c diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 3cacf9dd78d5..ccb394236fbd 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -26,4 +26,4 @@ bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ -bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o +bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c new file mode 100644 index 000000000000..d7bc09de4c13 --- /dev/null +++ b/net/bridge/br_mrp.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include "br_private_mrp.h" + +static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; + +static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br, + u32 ifindex) +{ + struct net_bridge_port *res = NULL; + struct net_bridge_port *port; + + list_for_each_entry(port, &br->port_list, list) { + if (port->dev->ifindex == ifindex) { + res = port; + break; + } + } + + return res; +} + +static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (mrp->ring_id == ring_id) { + res = mrp; + break; + } + } + + return res; +} + +static struct br_mrp *br_mrp_find_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (rcu_access_pointer(mrp->p_port) == p || + rcu_access_pointer(mrp->s_port) == p) { + res = mrp; + break; + } + } + + return res; +} + +static int br_mrp_next_seq(struct br_mrp *mrp) +{ + mrp->seq_id++; + return mrp->seq_id; +} + +static struct sk_buff *br_mrp_skb_alloc(struct net_bridge_port *p, + const u8 *src, const u8 *dst) +{ + struct ethhdr *eth_hdr; + struct sk_buff *skb; + u16 *version; + + skb = dev_alloc_skb(MRP_MAX_FRAME_LENGTH); + if (!skb) + return NULL; + + skb->dev = p->dev; + skb->protocol = htons(ETH_P_MRP); + skb->priority = MRP_FRAME_PRIO; + skb_reserve(skb, sizeof(*eth_hdr)); + + eth_hdr = skb_push(skb, sizeof(*eth_hdr)); + ether_addr_copy(eth_hdr->h_dest, dst); + ether_addr_copy(eth_hdr->h_source, src); + eth_hdr->h_proto = htons(ETH_P_MRP); + + version = skb_put(skb, sizeof(*version)); + *version = cpu_to_be16(MRP_VERSION); + + return skb; +} + +static void br_mrp_skb_tlv(struct sk_buff *skb, + enum br_mrp_tlv_header_type type, + u8 length) +{ + struct br_mrp_tlv_hdr *hdr; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->type = type; + hdr->length = length; +} + +static void br_mrp_skb_common(struct sk_buff *skb, struct br_mrp *mrp) +{ + struct br_mrp_common_hdr *hdr; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_COMMON, sizeof(*hdr)); + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->seq_id = cpu_to_be16(br_mrp_next_seq(mrp)); + memset(hdr->domain, 0xff, MRP_DOMAIN_UUID_LENGTH); +} + +static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, + struct net_bridge_port *p, + enum br_mrp_port_role_type port_role) +{ + struct br_mrp_ring_test_hdr *hdr = NULL; + struct sk_buff *skb = NULL; + + if (!p) + return NULL; + + skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_test_dmac); + if (!skb) + return NULL; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); + hdr = skb_put(skb, sizeof(*hdr)); + + hdr->prio = cpu_to_be16(MRP_DEFAULT_PRIO); + ether_addr_copy(hdr->sa, p->br->dev->dev_addr); + hdr->port_role = cpu_to_be16(port_role); + hdr->state = cpu_to_be16(mrp->ring_state); + hdr->transitions = cpu_to_be16(mrp->ring_transitions); + hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); + + br_mrp_skb_common(skb, mrp); + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); + + return skb; +} + +static void br_mrp_test_work_expired(struct work_struct *work) +{ + struct delayed_work *del_work = to_delayed_work(work); + struct br_mrp *mrp = container_of(del_work, struct br_mrp, test_work); + struct net_bridge_port *p; + bool notify_open = false; + struct sk_buff *skb; + + if (time_before_eq(mrp->test_end, jiffies)) + return; + + if (mrp->test_count_miss < mrp->test_max_miss) { + mrp->test_count_miss++; + } else { + /* Notify that the ring is open only if the ring state is + * closed, otherwise it would continue to notify at every + * interval. + */ + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED) + notify_open = true; + } + + rcu_read_lock(); + + p = rcu_dereference(mrp->p_port); + if (p) { + skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + + p = rcu_dereference(mrp->s_port); + if (p) { + skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + +out: + rcu_read_unlock(); + + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(mrp->test_interval)); +} + +/* Deletes the MRP instance. + * note: called under rtnl_lock + */ +static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) +{ + struct net_bridge_port *p; + + /* Stop sending MRP_Test frames */ + cancel_delayed_work_sync(&mrp->test_work); + br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0); + + br_mrp_switchdev_del(br, mrp); + + /* Reset the ports */ + p = rtnl_dereference(mrp->p_port); + if (p) { + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + rcu_assign_pointer(mrp->p_port, NULL); + } + + p = rtnl_dereference(mrp->s_port); + if (p) { + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + rcu_assign_pointer(mrp->s_port, NULL); + } + + list_del_rcu(&mrp->list); + kfree_rcu(mrp, rcu); +} + +/* Adds a new MRP instance. + * note: called under rtnl_lock + */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct net_bridge_port *p; + struct br_mrp *mrp; + int err; + + /* If the ring exists, it is not possible to create another one with the + * same ring_id + */ + mrp = br_mrp_find_id(br, instance->ring_id); + if (mrp) + return -EINVAL; + + if (!br_mrp_get_port(br, instance->p_ifindex) || + !br_mrp_get_port(br, instance->s_ifindex)) + return -EINVAL; + + mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); + if (!mrp) + return -ENOMEM; + + mrp->ring_id = instance->ring_id; + + p = br_mrp_get_port(br, instance->p_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->p_port, p); + + p = br_mrp_get_port(br, instance->s_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->s_port, p); + + INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); + list_add_tail_rcu(&mrp->list, &br->mrp_list); + + err = br_mrp_switchdev_add(br, mrp); + if (err) + goto delete_mrp; + + return 0; + +delete_mrp: + br_mrp_del_impl(br, mrp); + + return err; +} + +/* Deletes the MRP instance from which the port is part of + * note: called under rtnl_lock + */ +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) +{ + struct br_mrp *mrp = br_mrp_find_port(br, p); + + /* If the port is not part of a MRP instance just bail out */ + if (!mrp) + return; + + br_mrp_del_impl(br, mrp); +} + +/* Deletes existing MRP instance based on ring_id + * note: called under rtnl_lock + */ +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct br_mrp *mrp = br_mrp_find_id(br, instance->ring_id); + + if (!mrp) + return -EINVAL; + + br_mrp_del_impl(br, mrp); + + return 0; +} + +/* Set port state, port state can be forwarding, blocked or disabled + * note: already called with rtnl_lock + */ +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + spin_lock_bh(&p->br->lock); + + if (state == BR_MRP_PORT_STATE_FORWARDING) + p->state = BR_STATE_FORWARDING; + else + p->state = BR_STATE_BLOCKING; + + spin_unlock_bh(&p->br->lock); + + br_mrp_port_switchdev_set_state(p, state); + + return 0; +} + +/* Set port role, port role can be primary or secondary + * note: already called with rtnl_lock + */ +int br_mrp_set_port_role(struct net_bridge_port *p, + struct br_mrp_port_role *role) +{ + struct br_mrp *mrp; + + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + mrp = br_mrp_find_id(p->br, role->ring_id); + + if (!mrp) + return -EINVAL; + + if (role->role == BR_MRP_PORT_ROLE_PRIMARY) + rcu_assign_pointer(mrp->p_port, p); + else + rcu_assign_pointer(mrp->s_port, p); + + br_mrp_port_switchdev_set_role(p, role->role); + + return 0; +} + +/* Set ring state, ring state can be only Open or Closed + * note: already called with rtnl_lock + */ +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state) +{ + struct br_mrp *mrp = br_mrp_find_id(br, state->ring_id); + + if (!mrp) + return -EINVAL; + + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED && + state->ring_state != BR_MRP_RING_STATE_CLOSED) + mrp->ring_transitions++; + + mrp->ring_state = state->ring_state; + + br_mrp_switchdev_set_ring_state(br, mrp, state->ring_state); + + return 0; +} + +/* Set ring role, ring role can be only MRM(Media Redundancy Manager) or + * MRC(Media Redundancy Client). + * note: already called with rtnl_lock + */ +int br_mrp_set_ring_role(struct net_bridge *br, + struct br_mrp_ring_role *role) +{ + struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); + int err; + + if (!mrp) + return -EINVAL; + + mrp->ring_role = role->ring_role; + + /* If there is an error just bailed out */ + err = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); + if (err && err != -EOPNOTSUPP) + return err; + + /* Now detect if the HW actually applied the role or not. If the HW + * applied the role it means that the SW will not to do those operations + * anymore. For example if the role ir MRM then the HW will notify the + * SW when ring is open, but if the is not pushed to the HW the SW will + * need to detect when the ring is open + */ + mrp->ring_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + + return 0; +} + +/* Start to generate MRP test frames, the frames are generated by HW and if it + * fails, they are generated by the SW. + * note: already called with rtnl_lock + */ +int br_mrp_start_test(struct net_bridge *br, + struct br_mrp_start_test *test) +{ + struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id); + + if (!mrp) + return -EINVAL; + + /* Try to push it to the HW and if it fails then continue to generate in + * SW and if that also fails then return error + */ + if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, + test->max_miss, test->period)) + return 0; + + mrp->test_interval = test->interval; + mrp->test_end = jiffies + usecs_to_jiffies(test->period); + mrp->test_max_miss = test->max_miss; + mrp->test_count_miss = 0; + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(test->interval)); + + return 0; +} + +/* Process only MRP Test frame. All the other MRP frames are processed by + * userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + mrp->test_count_miss = 0; + + /* Notify the userspace that the ring is closed only when the ring is + * not closed + */ + if (mrp->ring_state != BR_MRP_RING_STATE_CLOSED) + br_mrp_port_open(port->dev, false); +} + +/* This will just forward the frame to the other mrp ring port(MRC role) or will + * not do anything. + * note: already called with rcu_read_lock + */ +static int br_mrp_rcv(struct net_bridge_port *p, + struct sk_buff *skb, struct net_device *dev) +{ + struct net_device *s_dev, *p_dev, *d_dev; + struct net_bridge_port *p_port, *s_port; + struct net_bridge *br; + struct sk_buff *nskb; + struct br_mrp *mrp; + + /* If port is disabled don't accept any frames */ + if (p->state == BR_STATE_DISABLED) + return 0; + + br = p->br; + mrp = br_mrp_find_port(br, p); + if (unlikely(!mrp)) + return 0; + + p_port = rcu_dereference(mrp->p_port); + if (!p_port) + return 0; + + s_port = rcu_dereference(mrp->s_port); + if (!s_port) + return 0; + + /* If the role is MRM then don't forward the frames */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + /* Clone the frame and forward it on the other MRP port */ + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return 0; + + p_dev = p_port->dev; + s_dev = s_port->dev; + + if (p_dev == dev) + d_dev = s_dev; + else + d_dev = p_dev; + + nskb->dev = d_dev; + skb_push(nskb, ETH_HLEN); + dev_queue_xmit(nskb); + + return 1; +} + +/* Check if the frame was received on a port that is part of MRP ring + * and if the frame has MRP eth. In that case process the frame otherwise do + * normal forwarding. + * note: already called with rcu_read_lock + */ +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + /* If there is no MRP instance do normal forwarding */ + if (likely(!(p->flags & BR_MRP_AWARE))) + goto out; + + if (unlikely(skb->protocol == htons(ETH_P_MRP))) + return br_mrp_rcv(p, skb, p->dev); + +out: + return 0; +} + +bool br_mrp_enabled(struct net_bridge *br) +{ + return !list_empty(&br->mrp_list); +} diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c new file mode 100644 index 000000000000..b982db14bbf4 --- /dev/null +++ b/net/bridge/br_mrp_netlink.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +#include +#include "br_private.h" +#include "br_private_mrp.h" + +int br_mrp_port_open(struct net_device *dev, u8 loc) +{ + struct net_bridge_port *p; + int err = 0; + + p = br_port_get_rcu(dev); + if (!p) { + err = -EINVAL; + goto out; + } + + if (loc) + p->flags |= BR_MRP_LOST_CONT; + else + p->flags &= ~BR_MRP_LOST_CONT; + + br_ifinfo_notify(RTM_NEWLINK, NULL, p); + +out: + return err; +} -- cgit v1.2.3-59-g8ed1b From 4d02b8f075153508562803e590f76c4dfe5f4b66 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:06 +0200 Subject: bridge: mrp: Implement netlink interface to configure MRP Implement netlink interface to configure MRP. The implementation will do sanity checks over the attributes and then eventually call the MRP interface. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp_netlink.c | 91 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index b982db14bbf4..503896638be0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -6,6 +6,97 @@ #include "br_private.h" #include "br_private_mrp.h" +static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { + [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_instance)}, + [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_port_role)}, + [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_ring_state)}, + [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_ring_role)}, + [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_start_test)}, +}; + +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1]; + int err; + + if (br->stp_enabled != BR_NO_STP) { + NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled\n"); + return -EINVAL; + } + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_MAX, attr, + br_mrp_policy, extack); + if (err) + return err; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { + struct br_mrp_instance *instance = + nla_data(tb[IFLA_BRIDGE_MRP_INSTANCE]); + + if (cmd == RTM_SETLINK) + err = br_mrp_add(br, instance); + else + err = br_mrp_del(br, instance); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { + enum br_mrp_port_state_type state = + nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE]); + + err = br_mrp_set_port_state(p, state); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { + struct br_mrp_port_role *role = + nla_data(tb[IFLA_BRIDGE_MRP_PORT_ROLE]); + + err = br_mrp_set_port_role(p, role); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { + struct br_mrp_ring_state *state = + nla_data(tb[IFLA_BRIDGE_MRP_RING_STATE]); + + err = br_mrp_set_ring_state(br, state); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { + struct br_mrp_ring_role *role = + nla_data(tb[IFLA_BRIDGE_MRP_RING_ROLE]); + + err = br_mrp_set_ring_role(br, role); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_START_TEST]) { + struct br_mrp_start_test *test = + nla_data(tb[IFLA_BRIDGE_MRP_START_TEST]); + + err = br_mrp_start_test(br, test); + if (err) + return err; + } + + return 0; +} + int br_mrp_port_open(struct net_device *dev, u8 loc) { struct net_bridge_port *p; -- cgit v1.2.3-59-g8ed1b From 6536993371fab3de4e8379649b60e94d03e6ff37 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:07 +0200 Subject: bridge: mrp: Integrate MRP into the bridge To integrate MRP into the bridge, the bridge needs to do the following: - detect if the MRP frame was received on MRP ring port in that case it would be processed otherwise just forward it as usual. - enable parsing of MRP - before whenever the bridge was set up, it would set all the ports in forwarding state. Add an extra check to not set ports in forwarding state if the port is an MRP ring port. The reason of this change is that if the MRP instance initially sets the port in blocked state by setting the bridge up it would overwrite this setting. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_device.c | 3 +++ net/bridge/br_if.c | 2 ++ net/bridge/br_input.c | 3 +++ net/bridge/br_netlink.c | 5 +++++ net/bridge/br_private.h | 31 +++++++++++++++++++++++++++++++ 5 files changed, 44 insertions(+) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0e3dbc5f3c34..8ec1362588af 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -463,6 +463,9 @@ void br_dev_setup(struct net_device *dev) spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); INIT_HLIST_HEAD(&br->fdb_list); +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + INIT_LIST_HEAD(&br->mrp_list); +#endif spin_lock_init(&br->hash_lock); br->bridge_id.prio[0] = 0x80; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4fe30b182ee7..ca685c0cdf95 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -333,6 +333,8 @@ static void del_nbp(struct net_bridge_port *p) br_stp_disable_port(p); spin_unlock_bh(&br->lock); + br_mrp_port_del(br, p); + br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index fcc260840028..d5c34f36f0f4 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -342,6 +342,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } } + if (unlikely(br_mrp_process(p, skb))) + return RX_HANDLER_PASS; + forward: switch (p->state) { case BR_STATE_FORWARDING: diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4084f1ef8641..1a5e681a626a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -672,6 +672,11 @@ static int br_afspec(struct net_bridge *br, if (err) return err; break; + case IFLA_BRIDGE_MRP: + err = br_mrp_parse(br, p, attr, cmd, extack); + if (err) + return err; + break; } } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 835a70f8d3ea..5835828320b6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1308,6 +1308,37 @@ unsigned long br_timer_value(const struct timer_list *timer); extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr); #endif +/* br_mrp.c */ +#if IS_ENABLED(CONFIG_BRIDGE_MRP) +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); +bool br_mrp_enabled(struct net_bridge *br); +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); +#else +static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + return 0; +} + +static inline bool br_mrp_enabled(struct net_bridge *br) +{ + return 0; +} + +static inline void br_mrp_port_del(struct net_bridge *br, + struct net_bridge_port *p) +{ +} +#endif + /* br_netlink.c */ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); -- cgit v1.2.3-59-g8ed1b From 419dba8a49d7cc355e5b495d20dea8217369ed63 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:08 +0200 Subject: net: bridge: Add checks for enabling the STP. It is not possible to have the MRP and STP running at the same time on the bridge, therefore add check when enabling the STP to check if MRP is already enabled. In that case return error. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_ioctl.c | 3 +-- net/bridge/br_netlink.c | 4 +++- net/bridge/br_private.h | 3 ++- net/bridge/br_stp.c | 6 ++++++ net/bridge/br_stp_if.c | 11 ++++++++++- net/bridge/br_sysfs_br.c | 4 +--- 6 files changed, 23 insertions(+), 8 deletions(-) diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index ae22d784b88a..5e71fc8b826f 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -242,8 +242,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - br_stp_set_enabled(br, args[1]); - ret = 0; + ret = br_stp_set_enabled(br, args[1], NULL); break; case BRCTL_SET_BRIDGE_PRIORITY: diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 1a5e681a626a..a774e19c41bb 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1109,7 +1109,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_STP_STATE]) { u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); - br_stp_set_enabled(br, stp_enabled); + err = br_stp_set_enabled(br, stp_enabled, extack); + if (err) + return err; } if (data[IFLA_BR_PRIORITY]) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5835828320b6..c35647cb138a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1283,7 +1283,8 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time); /* br_stp_if.c */ void br_stp_enable_bridge(struct net_bridge *br); void br_stp_disable_bridge(struct net_bridge *br); -void br_stp_set_enabled(struct net_bridge *br, unsigned long val); +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); void br_stp_enable_port(struct net_bridge_port *p); void br_stp_disable_port(struct net_bridge_port *p); bool br_stp_recalculate_bridge_id(struct net_bridge *br); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1f14b8455345..3e88be7aa269 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -36,6 +36,12 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) }; int err; + /* Don't change the state of the ports if they are driven by a different + * protocol. + */ + if (p->flags & BR_MRP_AWARE) + return; + p->state = state; err = switchdev_port_attr_set(p->dev, &attr); if (err && err != -EOPNOTSUPP) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index d174d3a566aa..a42850b7eb9a 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -196,10 +196,17 @@ static void br_stp_stop(struct net_bridge *br) br->stp_enabled = BR_NO_STP; } -void br_stp_set_enabled(struct net_bridge *br, unsigned long val) +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { ASSERT_RTNL(); + if (br_mrp_enabled(br)) { + NL_SET_ERR_MSG_MOD(extack, + "STP can't be enabled if MRP is already enabled\n"); + return -EINVAL; + } + if (val) { if (br->stp_enabled == BR_NO_STP) br_stp_start(br); @@ -207,6 +214,8 @@ void br_stp_set_enabled(struct net_bridge *br, unsigned long val) if (br->stp_enabled != BR_NO_STP) br_stp_stop(br); } + + return 0; } /* called under bridge lock */ diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 9ab0f00b1081..7db06e3f642a 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -126,9 +126,7 @@ static ssize_t stp_state_show(struct device *d, static int set_stp_state(struct net_bridge *br, unsigned long val) { - br_stp_set_enabled(br, val); - - return 0; + return br_stp_set_enabled(br, val, NULL); } static ssize_t stp_state_store(struct device *d, -- cgit v1.2.3-59-g8ed1b From 975e8505e6701c77b43bcdd5acc48d3735063b68 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 26 Apr 2020 23:35:59 +0200 Subject: r8169: improve handling CPCMD_MASK It's sufficient to do the masking once in probe() for clearing unwanted bits that may have been set by the BIOS. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4c616701856a..06a877fe74ba 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -3845,7 +3845,6 @@ static void rtl_hw_start(struct rtl8169_private *tp) { rtl_unlock_config_regs(tp); - tp->cp_cmd &= CPCMD_MASK; RTL_W16(tp, CPlusCmd, tp->cp_cmd); if (tp->mac_version <= RTL_GIGA_MAC_VER_06) @@ -5424,7 +5423,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) tp->mac_version = chipset; - tp->cp_cmd = RTL_R16(tp, CPlusCmd); + tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK; if (sizeof(dma_addr_t) > 4 && tp->mac_version >= RTL_GIGA_MAC_VER_18 && !dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) -- cgit v1.2.3-59-g8ed1b From 10478283f210e64ac682083083437dd5f89b7c4a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 26 Apr 2020 23:36:56 +0200 Subject: r8169: improve configuring RxConfig register Two bits in RxConfig are controlled by the following dev->feature's: - NETIF_F_RXALL - NETIF_F_HW_VLAN_CTAG_RX (since RTL8125) We have to take care that RxConfig gets fully configured in rtl_hw_start() after e.g. resume from hibernation. Therefore: - Factor out setting the feature-controlled RxConfig bits to a new function rtl_set_rx_config_features() that is called from rtl8169_set_features() and rtl_hw_start(). - Don't deal with RX_VLAN_8125 in rtl_init_rxcfg(), it will be set by rtl_set_rx_config_features(). - Don't handle NETIF_F_RXALL in rtl_set_rx_mode(). Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 38 ++++++++++++++++--------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 06a877fe74ba..f70e36c20431 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -388,10 +388,12 @@ enum rtl_register_content { /* rx_mode_bits */ AcceptErr = 0x20, AcceptRunt = 0x10, +#define RX_CONFIG_ACCEPT_ERR_MASK 0x30 AcceptBroadcast = 0x08, AcceptMulticast = 0x04, AcceptMyPhys = 0x02, AcceptAllPhys = 0x01, +#define RX_CONFIG_ACCEPT_OK_MASK 0x0f #define RX_CONFIG_ACCEPT_MASK 0x3f /* TxConfigBits */ @@ -1497,19 +1499,15 @@ static netdev_features_t rtl8169_fix_features(struct net_device *dev, return features; } -static int rtl8169_set_features(struct net_device *dev, - netdev_features_t features) +static void rtl_set_rx_config_features(struct rtl8169_private *tp, + netdev_features_t features) { - struct rtl8169_private *tp = netdev_priv(dev); - u32 rx_config; - - rtl_lock_work(tp); + u32 rx_config = RTL_R32(tp, RxConfig); - rx_config = RTL_R32(tp, RxConfig); if (features & NETIF_F_RXALL) - rx_config |= (AcceptErr | AcceptRunt); + rx_config |= RX_CONFIG_ACCEPT_ERR_MASK; else - rx_config &= ~(AcceptErr | AcceptRunt); + rx_config &= ~RX_CONFIG_ACCEPT_ERR_MASK; if (rtl_is_8125(tp)) { if (features & NETIF_F_HW_VLAN_CTAG_RX) @@ -1519,6 +1517,16 @@ static int rtl8169_set_features(struct net_device *dev, } RTL_W32(tp, RxConfig, rx_config); +} + +static int rtl8169_set_features(struct net_device *dev, + netdev_features_t features) +{ + struct rtl8169_private *tp = netdev_priv(dev); + + rtl_lock_work(tp); + + rtl_set_rx_config_features(tp, features); if (features & NETIF_F_RXCSUM) tp->cp_cmd |= RxChkSum; @@ -2395,8 +2403,6 @@ static void rtl_pll_power_up(struct rtl8169_private *tp) static void rtl_init_rxcfg(struct rtl8169_private *tp) { - u32 vlan; - switch (tp->mac_version) { case RTL_GIGA_MAC_VER_02 ... RTL_GIGA_MAC_VER_06: case RTL_GIGA_MAC_VER_10 ... RTL_GIGA_MAC_VER_17: @@ -2411,9 +2417,7 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF); break; case RTL_GIGA_MAC_VER_60 ... RTL_GIGA_MAC_VER_61: - /* VLAN flags are controlled by NETIF_F_HW_VLAN_CTAG_RX */ - vlan = RTL_R32(tp, RxConfig) & RX_VLAN_8125; - RTL_W32(tp, RxConfig, vlan | RX_FETCH_DFLT_8125 | RX_DMA_BURST); + RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST); break; default: RTL_W32(tp, RxConfig, RX128_INT_EN | RX_DMA_BURST); @@ -2680,14 +2684,11 @@ static void rtl_set_rx_mode(struct net_device *dev) } } - if (dev->features & NETIF_F_RXALL) - rx_mode |= (AcceptErr | AcceptRunt); - RTL_W32(tp, MAR0 + 4, mc_filter[1]); RTL_W32(tp, MAR0 + 0, mc_filter[0]); tmp = RTL_R32(tp, RxConfig); - RTL_W32(tp, RxConfig, (tmp & ~RX_CONFIG_ACCEPT_MASK) | rx_mode); + RTL_W32(tp, RxConfig, (tmp & ~RX_CONFIG_ACCEPT_OK_MASK) | rx_mode); } DECLARE_RTL_COND(rtl_csiar_cond) @@ -3866,6 +3867,7 @@ static void rtl_hw_start(struct rtl8169_private *tp) RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb); rtl_init_rxcfg(tp); rtl_set_tx_config_registers(tp); + rtl_set_rx_config_features(tp, tp->dev->features); rtl_set_rx_mode(tp->dev); rtl_irq_enable(tp); } -- cgit v1.2.3-59-g8ed1b From 49c958ccd2433114ca8a96c996b5016aa89c7ba5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:12:58 +0300 Subject: mlxsw: spectrum_acl: Move block helpers into inline header functions The struct is defined in the header, no need to have the helpers in the c file. Move the helpers to the header. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 60 ++++++++++++++++++---- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 43 ---------------- 2 files changed, 51 insertions(+), 52 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index ca56e72cb4b7..f158cd98f8d8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -677,12 +677,57 @@ struct mlxsw_sp_acl_block { }; struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl); -struct mlxsw_sp *mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block); -unsigned int -mlxsw_sp_acl_block_rule_count(const struct mlxsw_sp_acl_block *block); -void mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block); -void mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block); -bool mlxsw_sp_acl_block_disabled(const struct mlxsw_sp_acl_block *block); + +static inline struct mlxsw_sp * +mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block) +{ + return block->mlxsw_sp; +} + +static inline unsigned int +mlxsw_sp_acl_block_rule_count(const struct mlxsw_sp_acl_block *block) +{ + return block ? block->rule_count : 0; +} + +static inline void +mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block) +{ + if (block) + block->disable_count++; +} + +static inline void +mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block) +{ + if (block) + block->disable_count--; +} + +static inline bool +mlxsw_sp_acl_block_disabled(const struct mlxsw_sp_acl_block *block) +{ + return block->disable_count; +} + +static inline bool +mlxsw_sp_acl_block_is_egress_bound(const struct mlxsw_sp_acl_block *block) +{ + return block->egress_binding_count; +} + +static inline bool +mlxsw_sp_acl_block_is_ingress_bound(const struct mlxsw_sp_acl_block *block) +{ + return block->ingress_binding_count; +} + +static inline bool +mlxsw_sp_acl_block_is_mixed_bound(const struct mlxsw_sp_acl_block *block) +{ + return block->ingress_binding_count && block->egress_binding_count; +} + struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp, struct net *net); void mlxsw_sp_acl_block_destroy(struct mlxsw_sp_acl_block *block); @@ -695,9 +740,6 @@ int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_block *block, struct mlxsw_sp_port *mlxsw_sp_port, bool ingress); -bool mlxsw_sp_acl_block_is_egress_bound(const struct mlxsw_sp_acl_block *block); -bool mlxsw_sp_acl_block_is_ingress_bound(const struct mlxsw_sp_acl_block *block); -bool mlxsw_sp_acl_block_is_mixed_bound(const struct mlxsw_sp_acl_block *block); struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_block *block, u32 chain_index, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index 01cff711bbd2..bb06c007b3f2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -94,49 +94,6 @@ struct mlxsw_sp_fid *mlxsw_sp_acl_dummy_fid(struct mlxsw_sp *mlxsw_sp) return mlxsw_sp->acl->dummy_fid; } -struct mlxsw_sp *mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block) -{ - return block->mlxsw_sp; -} - -unsigned int -mlxsw_sp_acl_block_rule_count(const struct mlxsw_sp_acl_block *block) -{ - return block ? block->rule_count : 0; -} - -void mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block) -{ - if (block) - block->disable_count++; -} - -void mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block) -{ - if (block) - block->disable_count--; -} - -bool mlxsw_sp_acl_block_disabled(const struct mlxsw_sp_acl_block *block) -{ - return block->disable_count; -} - -bool mlxsw_sp_acl_block_is_egress_bound(const struct mlxsw_sp_acl_block *block) -{ - return block->egress_binding_count; -} - -bool mlxsw_sp_acl_block_is_ingress_bound(const struct mlxsw_sp_acl_block *block) -{ - return block->ingress_binding_count; -} - -bool mlxsw_sp_acl_block_is_mixed_bound(const struct mlxsw_sp_acl_block *block) -{ - return block->ingress_binding_count && block->egress_binding_count; -} - static bool mlxsw_sp_acl_ruleset_is_singular(const struct mlxsw_sp_acl_ruleset *ruleset) { -- cgit v1.2.3-59-g8ed1b From 3bc3ffb6e911f9de099d81187ae99b45966cbd05 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:12:59 +0300 Subject: mlxsw: spectrum: Rename acl_block to flow_block The acl_block structure is going to be used for non-acl case - matchall offload. So rename it accordingly. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 71 ++++++++++--------- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 64 ++++++++--------- .../ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c | 14 ++-- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 82 +++++++++++----------- .../net/ethernet/mellanox/mlxsw/spectrum_flower.c | 24 +++---- 5 files changed, 128 insertions(+), 127 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 24ca8d5bc564..f64e8da21d4a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1544,23 +1544,23 @@ static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, } static int -mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_acl_block *acl_block, +mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_flow_block *flow_block, struct flow_cls_offload *f) { - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_acl_block_mlxsw_sp(acl_block); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_flow_block_mlxsw_sp(flow_block); switch (f->command) { case FLOW_CLS_REPLACE: - return mlxsw_sp_flower_replace(mlxsw_sp, acl_block, f); + return mlxsw_sp_flower_replace(mlxsw_sp, flow_block, f); case FLOW_CLS_DESTROY: - mlxsw_sp_flower_destroy(mlxsw_sp, acl_block, f); + mlxsw_sp_flower_destroy(mlxsw_sp, flow_block, f); return 0; case FLOW_CLS_STATS: - return mlxsw_sp_flower_stats(mlxsw_sp, acl_block, f); + return mlxsw_sp_flower_stats(mlxsw_sp, flow_block, f); case FLOW_CLS_TMPLT_CREATE: - return mlxsw_sp_flower_tmplt_create(mlxsw_sp, acl_block, f); + return mlxsw_sp_flower_tmplt_create(mlxsw_sp, flow_block, f); case FLOW_CLS_TMPLT_DESTROY: - mlxsw_sp_flower_tmplt_destroy(mlxsw_sp, acl_block, f); + mlxsw_sp_flower_tmplt_destroy(mlxsw_sp, flow_block, f); return 0; default: return -EOPNOTSUPP; @@ -1607,16 +1607,16 @@ static int mlxsw_sp_setup_tc_block_cb_matchall_eg(enum tc_setup_type type, static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type, void *type_data, void *cb_priv) { - struct mlxsw_sp_acl_block *acl_block = cb_priv; + struct mlxsw_sp_flow_block *flow_block = cb_priv; switch (type) { case TC_SETUP_CLSMATCHALL: return 0; case TC_SETUP_CLSFLOWER: - if (mlxsw_sp_acl_block_disabled(acl_block)) + if (mlxsw_sp_flow_block_disabled(flow_block)) return -EOPNOTSUPP; - return mlxsw_sp_setup_tc_cls_flower(acl_block, type_data); + return mlxsw_sp_setup_tc_cls_flower(flow_block, type_data); default: return -EOPNOTSUPP; } @@ -1624,9 +1624,9 @@ static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type, static void mlxsw_sp_tc_block_flower_release(void *cb_priv) { - struct mlxsw_sp_acl_block *acl_block = cb_priv; + struct mlxsw_sp_flow_block *flow_block = cb_priv; - mlxsw_sp_acl_block_destroy(acl_block); + mlxsw_sp_flow_block_destroy(flow_block); } static LIST_HEAD(mlxsw_sp_block_cb_list); @@ -1636,7 +1636,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, struct flow_block_offload *f, bool ingress) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - struct mlxsw_sp_acl_block *acl_block; + struct mlxsw_sp_flow_block *flow_block; struct flow_block_cb *block_cb; bool register_block = false; int err; @@ -1645,31 +1645,31 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, mlxsw_sp_setup_tc_block_cb_flower, mlxsw_sp); if (!block_cb) { - acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, f->net); - if (!acl_block) + flow_block = mlxsw_sp_flow_block_create(mlxsw_sp, f->net); + if (!flow_block) return -ENOMEM; block_cb = flow_block_cb_alloc(mlxsw_sp_setup_tc_block_cb_flower, - mlxsw_sp, acl_block, + mlxsw_sp, flow_block, mlxsw_sp_tc_block_flower_release); if (IS_ERR(block_cb)) { - mlxsw_sp_acl_block_destroy(acl_block); + mlxsw_sp_flow_block_destroy(flow_block); err = PTR_ERR(block_cb); goto err_cb_register; } register_block = true; } else { - acl_block = flow_block_cb_priv(block_cb); + flow_block = flow_block_cb_priv(block_cb); } flow_block_cb_incref(block_cb); - err = mlxsw_sp_acl_block_bind(mlxsw_sp, acl_block, - mlxsw_sp_port, ingress, f->extack); + err = mlxsw_sp_flow_block_bind(mlxsw_sp, flow_block, + mlxsw_sp_port, ingress, f->extack); if (err) goto err_block_bind; if (ingress) - mlxsw_sp_port->ing_acl_block = acl_block; + mlxsw_sp_port->ing_flow_block = flow_block; else - mlxsw_sp_port->eg_acl_block = acl_block; + mlxsw_sp_port->eg_flow_block = flow_block; if (register_block) { flow_block_cb_add(block_cb, f); @@ -1687,10 +1687,11 @@ err_cb_register: static void mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, - struct flow_block_offload *f, bool ingress) + struct flow_block_offload *f, + bool ingress) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - struct mlxsw_sp_acl_block *acl_block; + struct mlxsw_sp_flow_block *flow_block; struct flow_block_cb *block_cb; int err; @@ -1701,13 +1702,13 @@ mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, return; if (ingress) - mlxsw_sp_port->ing_acl_block = NULL; + mlxsw_sp_port->ing_flow_block = NULL; else - mlxsw_sp_port->eg_acl_block = NULL; + mlxsw_sp_port->eg_flow_block = NULL; - acl_block = flow_block_cb_priv(block_cb); - err = mlxsw_sp_acl_block_unbind(mlxsw_sp, acl_block, - mlxsw_sp_port, ingress); + flow_block = flow_block_cb_priv(block_cb); + err = mlxsw_sp_flow_block_unbind(mlxsw_sp, flow_block, + mlxsw_sp_port, ingress); if (!err && !flow_block_cb_decref(block_cb)) { flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); @@ -1797,17 +1798,17 @@ static int mlxsw_sp_feature_hw_tc(struct net_device *dev, bool enable) struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); if (!enable) { - if (mlxsw_sp_acl_block_rule_count(mlxsw_sp_port->ing_acl_block) || - mlxsw_sp_acl_block_rule_count(mlxsw_sp_port->eg_acl_block) || + if (mlxsw_sp_flow_block_rule_count(mlxsw_sp_port->ing_flow_block) || + mlxsw_sp_flow_block_rule_count(mlxsw_sp_port->eg_flow_block) || !list_empty(&mlxsw_sp_port->mall_tc_list)) { netdev_err(dev, "Active offloaded tc filters, can't turn hw_tc_offload off\n"); return -EINVAL; } - mlxsw_sp_acl_block_disable_inc(mlxsw_sp_port->ing_acl_block); - mlxsw_sp_acl_block_disable_inc(mlxsw_sp_port->eg_acl_block); + mlxsw_sp_flow_block_disable_inc(mlxsw_sp_port->ing_flow_block); + mlxsw_sp_flow_block_disable_inc(mlxsw_sp_port->eg_flow_block); } else { - mlxsw_sp_acl_block_disable_dec(mlxsw_sp_port->ing_acl_block); - mlxsw_sp_acl_block_disable_dec(mlxsw_sp_port->eg_acl_block); + mlxsw_sp_flow_block_disable_dec(mlxsw_sp_port->ing_flow_block); + mlxsw_sp_flow_block_disable_dec(mlxsw_sp_port->eg_flow_block); } return 0; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index f158cd98f8d8..65b1a2d87c2d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -287,8 +287,8 @@ struct mlxsw_sp_port { struct mlxsw_sp_port_vlan *default_vlan; struct mlxsw_sp_qdisc_state *qdisc; unsigned acl_rule_count; - struct mlxsw_sp_acl_block *ing_acl_block; - struct mlxsw_sp_acl_block *eg_acl_block; + struct mlxsw_sp_flow_block *ing_flow_block; + struct mlxsw_sp_flow_block *eg_flow_block; struct { struct delayed_work shaper_dw; struct hwtstamp_config hwtstamp_config; @@ -654,7 +654,7 @@ struct mlxsw_sp_acl_rule_info { unsigned int counter_index; }; -struct mlxsw_sp_acl_block; +struct mlxsw_sp_flow_block; struct mlxsw_sp_acl_ruleset; /* spectrum_acl.c */ @@ -663,7 +663,7 @@ enum mlxsw_sp_acl_profile { MLXSW_SP_ACL_PROFILE_MR, }; -struct mlxsw_sp_acl_block { +struct mlxsw_sp_flow_block { struct list_head binding_list; struct mlxsw_sp_acl_ruleset *ruleset_zero; struct mlxsw_sp *mlxsw_sp; @@ -679,74 +679,74 @@ struct mlxsw_sp_acl_block { struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl); static inline struct mlxsw_sp * -mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block) +mlxsw_sp_flow_block_mlxsw_sp(struct mlxsw_sp_flow_block *block) { return block->mlxsw_sp; } static inline unsigned int -mlxsw_sp_acl_block_rule_count(const struct mlxsw_sp_acl_block *block) +mlxsw_sp_flow_block_rule_count(const struct mlxsw_sp_flow_block *block) { return block ? block->rule_count : 0; } static inline void -mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block) +mlxsw_sp_flow_block_disable_inc(struct mlxsw_sp_flow_block *block) { if (block) block->disable_count++; } static inline void -mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block) +mlxsw_sp_flow_block_disable_dec(struct mlxsw_sp_flow_block *block) { if (block) block->disable_count--; } static inline bool -mlxsw_sp_acl_block_disabled(const struct mlxsw_sp_acl_block *block) +mlxsw_sp_flow_block_disabled(const struct mlxsw_sp_flow_block *block) { return block->disable_count; } static inline bool -mlxsw_sp_acl_block_is_egress_bound(const struct mlxsw_sp_acl_block *block) +mlxsw_sp_flow_block_is_egress_bound(const struct mlxsw_sp_flow_block *block) { return block->egress_binding_count; } static inline bool -mlxsw_sp_acl_block_is_ingress_bound(const struct mlxsw_sp_acl_block *block) +mlxsw_sp_flow_block_is_ingress_bound(const struct mlxsw_sp_flow_block *block) { return block->ingress_binding_count; } static inline bool -mlxsw_sp_acl_block_is_mixed_bound(const struct mlxsw_sp_acl_block *block) +mlxsw_sp_flow_block_is_mixed_bound(const struct mlxsw_sp_flow_block *block) { return block->ingress_binding_count && block->egress_binding_count; } -struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp, - struct net *net); -void mlxsw_sp_acl_block_destroy(struct mlxsw_sp_acl_block *block); -int mlxsw_sp_acl_block_bind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress, - struct netlink_ext_ack *extack); -int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress); +struct mlxsw_sp_flow_block *mlxsw_sp_flow_block_create(struct mlxsw_sp *mlxsw_sp, + struct net *net); +void mlxsw_sp_flow_block_destroy(struct mlxsw_sp_flow_block *block); +int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress, + struct netlink_ext_ack *extack); +int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress); struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, enum mlxsw_sp_acl_profile profile); struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_get(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, enum mlxsw_sp_acl_profile profile, struct mlxsw_afk_element_usage *tmplt_elusage); void mlxsw_sp_acl_ruleset_put(struct mlxsw_sp *mlxsw_sp, @@ -778,7 +778,7 @@ int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei, int mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei); int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_rule_info *rulei, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct net_device *out_dev, struct netlink_ext_ack *extack); int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp, @@ -901,19 +901,19 @@ extern const struct mlxsw_afk_ops mlxsw_sp2_afk_ops; /* spectrum_flower.c */ int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); void mlxsw_sp_flower_tmplt_destroy(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); /* spectrum_qdisc.c */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c index e31ec75ac035..a11d911302f1 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c @@ -9,7 +9,7 @@ struct mlxsw_sp2_mr_tcam { struct mlxsw_sp *mlxsw_sp; - struct mlxsw_sp_acl_block *acl_block; + struct mlxsw_sp_flow_block *flow_block; struct mlxsw_sp_acl_ruleset *ruleset4; struct mlxsw_sp_acl_ruleset *ruleset6; }; @@ -61,7 +61,7 @@ static int mlxsw_sp2_mr_tcam_ipv4_init(struct mlxsw_sp2_mr_tcam *mr_tcam) mlxsw_sp2_mr_tcam_usage_ipv4, ARRAY_SIZE(mlxsw_sp2_mr_tcam_usage_ipv4)); mr_tcam->ruleset4 = mlxsw_sp_acl_ruleset_get(mr_tcam->mlxsw_sp, - mr_tcam->acl_block, + mr_tcam->flow_block, MLXSW_SP_L3_PROTO_IPV4, MLXSW_SP_ACL_PROFILE_MR, &elusage); @@ -111,7 +111,7 @@ static int mlxsw_sp2_mr_tcam_ipv6_init(struct mlxsw_sp2_mr_tcam *mr_tcam) mlxsw_sp2_mr_tcam_usage_ipv6, ARRAY_SIZE(mlxsw_sp2_mr_tcam_usage_ipv6)); mr_tcam->ruleset6 = mlxsw_sp_acl_ruleset_get(mr_tcam->mlxsw_sp, - mr_tcam->acl_block, + mr_tcam->flow_block, MLXSW_SP_L3_PROTO_IPV6, MLXSW_SP_ACL_PROFILE_MR, &elusage); @@ -289,8 +289,8 @@ static int mlxsw_sp2_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv) int err; mr_tcam->mlxsw_sp = mlxsw_sp; - mr_tcam->acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, NULL); - if (!mr_tcam->acl_block) + mr_tcam->flow_block = mlxsw_sp_flow_block_create(mlxsw_sp, NULL); + if (!mr_tcam->flow_block) return -ENOMEM; err = mlxsw_sp2_mr_tcam_ipv4_init(mr_tcam); @@ -306,7 +306,7 @@ static int mlxsw_sp2_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv) err_ipv6_init: mlxsw_sp2_mr_tcam_ipv4_fini(mr_tcam); err_ipv4_init: - mlxsw_sp_acl_block_destroy(mr_tcam->acl_block); + mlxsw_sp_flow_block_destroy(mr_tcam->flow_block); return err; } @@ -316,7 +316,7 @@ static void mlxsw_sp2_mr_tcam_fini(void *priv) mlxsw_sp2_mr_tcam_ipv6_fini(mr_tcam); mlxsw_sp2_mr_tcam_ipv4_fini(mr_tcam); - mlxsw_sp_acl_block_destroy(mr_tcam->acl_block); + mlxsw_sp_flow_block_destroy(mr_tcam->flow_block); } const struct mlxsw_sp_mr_tcam_ops mlxsw_sp2_mr_tcam_ops = { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index bb06c007b3f2..f9524cb95e9f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -40,7 +40,7 @@ struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl) return acl->afk; } -struct mlxsw_sp_acl_block_binding { +struct mlxsw_sp_flow_block_binding { struct list_head list; struct net_device *dev; struct mlxsw_sp_port *mlxsw_sp_port; @@ -48,7 +48,7 @@ struct mlxsw_sp_acl_block_binding { }; struct mlxsw_sp_acl_ruleset_ht_key { - struct mlxsw_sp_acl_block *block; + struct mlxsw_sp_flow_block *block; u32 chain_index; const struct mlxsw_sp_acl_profile_ops *ops; }; @@ -103,8 +103,8 @@ mlxsw_sp_acl_ruleset_is_singular(const struct mlxsw_sp_acl_ruleset *ruleset) static int mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_acl_block_binding *binding) + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding) { struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; @@ -115,8 +115,8 @@ mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, static void mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_acl_block_binding *binding) + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding) { struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; @@ -126,7 +126,7 @@ mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, } static bool -mlxsw_sp_acl_ruleset_block_bound(const struct mlxsw_sp_acl_block *block) +mlxsw_sp_acl_ruleset_block_bound(const struct mlxsw_sp_flow_block *block) { return block->ruleset_zero; } @@ -134,9 +134,9 @@ mlxsw_sp_acl_ruleset_block_bound(const struct mlxsw_sp_acl_block *block) static int mlxsw_sp_acl_ruleset_block_bind(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_ruleset *ruleset, - struct mlxsw_sp_acl_block *block) + struct mlxsw_sp_flow_block *block) { - struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_flow_block_binding *binding; int err; block->ruleset_zero = ruleset; @@ -159,19 +159,19 @@ rollback: static void mlxsw_sp_acl_ruleset_block_unbind(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_ruleset *ruleset, - struct mlxsw_sp_acl_block *block) + struct mlxsw_sp_flow_block *block) { - struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_flow_block_binding *binding; list_for_each_entry(binding, &block->binding_list, list) mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); block->ruleset_zero = NULL; } -struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp, - struct net *net) +struct mlxsw_sp_flow_block * +mlxsw_sp_flow_block_create(struct mlxsw_sp *mlxsw_sp, struct net *net) { - struct mlxsw_sp_acl_block *block; + struct mlxsw_sp_flow_block *block; block = kzalloc(sizeof(*block), GFP_KERNEL); if (!block) @@ -182,17 +182,17 @@ struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp, return block; } -void mlxsw_sp_acl_block_destroy(struct mlxsw_sp_acl_block *block) +void mlxsw_sp_flow_block_destroy(struct mlxsw_sp_flow_block *block) { WARN_ON(!list_empty(&block->binding_list)); kfree(block); } -static struct mlxsw_sp_acl_block_binding * -mlxsw_sp_acl_block_lookup(struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, bool ingress) +static struct mlxsw_sp_flow_block_binding * +mlxsw_sp_flow_block_lookup(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, bool ingress) { - struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_flow_block_binding *binding; list_for_each_entry(binding, &block->binding_list, list) if (binding->mlxsw_sp_port == mlxsw_sp_port && @@ -201,16 +201,16 @@ mlxsw_sp_acl_block_lookup(struct mlxsw_sp_acl_block *block, return NULL; } -int mlxsw_sp_acl_block_bind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress, - struct netlink_ext_ack *extack) +int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress, + struct netlink_ext_ack *extack) { - struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_flow_block_binding *binding; int err; - if (WARN_ON(mlxsw_sp_acl_block_lookup(block, mlxsw_sp_port, ingress))) + if (WARN_ON(mlxsw_sp_flow_block_lookup(block, mlxsw_sp_port, ingress))) return -EEXIST; if (ingress && block->ingress_blocker_rule_count) { @@ -247,14 +247,14 @@ err_ruleset_bind: return err; } -int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress) +int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) { - struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_flow_block_binding *binding; - binding = mlxsw_sp_acl_block_lookup(block, mlxsw_sp_port, ingress); + binding = mlxsw_sp_flow_block_lookup(block, mlxsw_sp_port, ingress); if (!binding) return -ENOENT; @@ -274,7 +274,7 @@ int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp, static struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_create(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, const struct mlxsw_sp_acl_profile_ops *ops, struct mlxsw_afk_element_usage *tmplt_elusage) { @@ -345,7 +345,7 @@ static void mlxsw_sp_acl_ruleset_ref_dec(struct mlxsw_sp *mlxsw_sp, static struct mlxsw_sp_acl_ruleset * __mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp_acl *acl, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, const struct mlxsw_sp_acl_profile_ops *ops) { struct mlxsw_sp_acl_ruleset_ht_key ht_key; @@ -360,7 +360,7 @@ __mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp_acl *acl, struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, enum mlxsw_sp_acl_profile profile) { const struct mlxsw_sp_acl_profile_ops *ops; @@ -378,7 +378,7 @@ mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_get(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, enum mlxsw_sp_acl_profile profile, struct mlxsw_afk_element_usage *tmplt_elusage) { @@ -541,11 +541,11 @@ int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp, int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_rule_info *rulei, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct net_device *out_dev, struct netlink_ext_ack *extack) { - struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_flow_block_binding *binding; struct mlxsw_sp_port *in_port; if (!list_is_singular(&block->binding_list)) { @@ -553,7 +553,7 @@ int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; } binding = list_first_entry(&block->binding_list, - struct mlxsw_sp_acl_block_binding, list); + struct mlxsw_sp_flow_block_binding, list); in_port = binding->mlxsw_sp_port; return mlxsw_afa_block_append_mirror(rulei->act_block, @@ -775,7 +775,7 @@ int mlxsw_sp_acl_rule_add(struct mlxsw_sp *mlxsw_sp, { struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; - struct mlxsw_sp_acl_block *block = ruleset->ht_key.block; + struct mlxsw_sp_flow_block *block = ruleset->ht_key.block; int err; err = ops->rule_add(mlxsw_sp, ruleset->priv, rule->priv, rule->rulei); @@ -819,7 +819,7 @@ void mlxsw_sp_acl_rule_del(struct mlxsw_sp *mlxsw_sp, { struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; - struct mlxsw_sp_acl_block *block = ruleset->ht_key.block; + struct mlxsw_sp_flow_block *block = ruleset->ht_key.block; block->egress_blocker_rule_count -= rule->rulei->egress_bind_blocker; block->ingress_blocker_rule_count -= rule->rulei->ingress_bind_blocker; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 51117a5a6bbf..89c2e9820e95 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -15,7 +15,7 @@ #include "core_acl_flex_keys.h" static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct mlxsw_sp_acl_rule_info *rulei, struct flow_action *flow_action, struct netlink_ext_ack *extack) @@ -53,11 +53,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, case FLOW_ACTION_DROP: { bool ingress; - if (mlxsw_sp_acl_block_is_mixed_bound(block)) { + if (mlxsw_sp_flow_block_is_mixed_bound(block)) { NL_SET_ERR_MSG_MOD(extack, "Drop action is not supported when block is bound to ingress and egress"); return -EOPNOTSUPP; } - ingress = mlxsw_sp_acl_block_is_ingress_bound(block); + ingress = mlxsw_sp_flow_block_is_ingress_bound(block); err = mlxsw_sp_acl_rulei_act_drop(rulei, ingress, act->cookie, extack); if (err) { @@ -106,7 +106,7 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fid *fid; u16 fid_index; - if (mlxsw_sp_acl_block_is_egress_bound(block)) { + if (mlxsw_sp_flow_block_is_egress_bound(block)) { NL_SET_ERR_MSG_MOD(extack, "Redirect action is not supported on egress"); return -EOPNOTSUPP; } @@ -190,7 +190,7 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_flower_parse_meta(struct mlxsw_sp_acl_rule_info *rulei, struct flow_cls_offload *f, - struct mlxsw_sp_acl_block *block) + struct mlxsw_sp_flow_block *block) { struct flow_rule *rule = flow_cls_offload_flow_rule(f); struct mlxsw_sp_port *mlxsw_sp_port; @@ -371,7 +371,7 @@ static int mlxsw_sp_flower_parse_ip(struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct mlxsw_sp_acl_rule_info *rulei, struct flow_cls_offload *f) { @@ -460,7 +460,7 @@ static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, struct flow_match_vlan match; flow_rule_match_vlan(rule, &match); - if (mlxsw_sp_acl_block_is_egress_bound(block)) { + if (mlxsw_sp_flow_block_is_egress_bound(block)) { NL_SET_ERR_MSG_MOD(f->common.extack, "vlan_id key is not supported on egress"); return -EOPNOTSUPP; } @@ -505,7 +505,7 @@ static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, } int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { struct mlxsw_sp_acl_rule_info *rulei; @@ -552,7 +552,7 @@ err_rule_create: } void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { struct mlxsw_sp_acl_ruleset *ruleset; @@ -574,7 +574,7 @@ void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp, } int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { enum flow_action_hw_stats used_hw_stats = FLOW_ACTION_HW_STATS_DISABLED; @@ -611,7 +611,7 @@ err_rule_get_stats: } int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { struct mlxsw_sp_acl_ruleset *ruleset; @@ -632,7 +632,7 @@ int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp, } void mlxsw_sp_flower_tmplt_destroy(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { struct mlxsw_sp_acl_ruleset *ruleset; -- cgit v1.2.3-59-g8ed1b From d52238eb7bcf53225841217af12fe1383205fcaa Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:00 +0300 Subject: mlxsw: spectrum: Push flow_block related functions into a separate file The code around flow_block is currently mixed in spectrum_acl.c. However, as it really does not directly relate to ACL part only, push the bits into a separate file. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/Makefile | 1 + drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 34 ++++-- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 131 +-------------------- .../net/ethernet/mellanox/mlxsw/spectrum_flow.c | 120 +++++++++++++++++++ 4 files changed, 151 insertions(+), 135 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile index 0e86a581d45b..59cbf02d6731 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/Makefile +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -21,6 +21,7 @@ mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ spectrum_acl_atcam.o spectrum_acl_erp.o \ spectrum1_acl_tcam.o spectrum2_acl_tcam.o \ spectrum_acl_bloom_filter.o spectrum_acl.o \ + spectrum_flow.o \ spectrum_flower.o spectrum_cnt.o \ spectrum_fid.o spectrum_ipip.o \ spectrum_acl_flex_actions.o \ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 65b1a2d87c2d..d4ef079aab4b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -654,15 +654,7 @@ struct mlxsw_sp_acl_rule_info { unsigned int counter_index; }; -struct mlxsw_sp_flow_block; -struct mlxsw_sp_acl_ruleset; - -/* spectrum_acl.c */ -enum mlxsw_sp_acl_profile { - MLXSW_SP_ACL_PROFILE_FLOWER, - MLXSW_SP_ACL_PROFILE_MR, -}; - +/* spectrum_flow.c */ struct mlxsw_sp_flow_block { struct list_head binding_list; struct mlxsw_sp_acl_ruleset *ruleset_zero; @@ -676,7 +668,12 @@ struct mlxsw_sp_flow_block { struct net *net; }; -struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl); +struct mlxsw_sp_flow_block_binding { + struct list_head list; + struct net_device *dev; + struct mlxsw_sp_port *mlxsw_sp_port; + bool ingress; +}; static inline struct mlxsw_sp * mlxsw_sp_flow_block_mlxsw_sp(struct mlxsw_sp_flow_block *block) @@ -740,6 +737,23 @@ int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_flow_block *block, struct mlxsw_sp_port *mlxsw_sp_port, bool ingress); + +/* spectrum_acl.c */ +struct mlxsw_sp_acl_ruleset; + +enum mlxsw_sp_acl_profile { + MLXSW_SP_ACL_PROFILE_FLOWER, + MLXSW_SP_ACL_PROFILE_MR, +}; + +struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl); + +int mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding); +void mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding); struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_flow_block *block, u32 chain_index, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index f9524cb95e9f..800eaa6be3c0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -40,13 +40,6 @@ struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl) return acl->afk; } -struct mlxsw_sp_flow_block_binding { - struct list_head list; - struct net_device *dev; - struct mlxsw_sp_port *mlxsw_sp_port; - bool ingress; -}; - struct mlxsw_sp_acl_ruleset_ht_key { struct mlxsw_sp_flow_block *block; u32 chain_index; @@ -101,10 +94,9 @@ mlxsw_sp_acl_ruleset_is_singular(const struct mlxsw_sp_acl_ruleset *ruleset) return ruleset->ref_count == 2; } -static int -mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_flow_block *block, - struct mlxsw_sp_flow_block_binding *binding) +int mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding) { struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; @@ -113,10 +105,9 @@ mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, binding->mlxsw_sp_port, binding->ingress); } -static void -mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_flow_block *block, - struct mlxsw_sp_flow_block_binding *binding) +void mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding) { struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; @@ -125,12 +116,6 @@ mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, binding->mlxsw_sp_port, binding->ingress); } -static bool -mlxsw_sp_acl_ruleset_block_bound(const struct mlxsw_sp_flow_block *block) -{ - return block->ruleset_zero; -} - static int mlxsw_sp_acl_ruleset_block_bind(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_ruleset *ruleset, @@ -168,110 +153,6 @@ mlxsw_sp_acl_ruleset_block_unbind(struct mlxsw_sp *mlxsw_sp, block->ruleset_zero = NULL; } -struct mlxsw_sp_flow_block * -mlxsw_sp_flow_block_create(struct mlxsw_sp *mlxsw_sp, struct net *net) -{ - struct mlxsw_sp_flow_block *block; - - block = kzalloc(sizeof(*block), GFP_KERNEL); - if (!block) - return NULL; - INIT_LIST_HEAD(&block->binding_list); - block->mlxsw_sp = mlxsw_sp; - block->net = net; - return block; -} - -void mlxsw_sp_flow_block_destroy(struct mlxsw_sp_flow_block *block) -{ - WARN_ON(!list_empty(&block->binding_list)); - kfree(block); -} - -static struct mlxsw_sp_flow_block_binding * -mlxsw_sp_flow_block_lookup(struct mlxsw_sp_flow_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, bool ingress) -{ - struct mlxsw_sp_flow_block_binding *binding; - - list_for_each_entry(binding, &block->binding_list, list) - if (binding->mlxsw_sp_port == mlxsw_sp_port && - binding->ingress == ingress) - return binding; - return NULL; -} - -int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_flow_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress, - struct netlink_ext_ack *extack) -{ - struct mlxsw_sp_flow_block_binding *binding; - int err; - - if (WARN_ON(mlxsw_sp_flow_block_lookup(block, mlxsw_sp_port, ingress))) - return -EEXIST; - - if (ingress && block->ingress_blocker_rule_count) { - NL_SET_ERR_MSG_MOD(extack, "Block cannot be bound to ingress because it contains unsupported rules"); - return -EOPNOTSUPP; - } - - if (!ingress && block->egress_blocker_rule_count) { - NL_SET_ERR_MSG_MOD(extack, "Block cannot be bound to egress because it contains unsupported rules"); - return -EOPNOTSUPP; - } - - binding = kzalloc(sizeof(*binding), GFP_KERNEL); - if (!binding) - return -ENOMEM; - binding->mlxsw_sp_port = mlxsw_sp_port; - binding->ingress = ingress; - - if (mlxsw_sp_acl_ruleset_block_bound(block)) { - err = mlxsw_sp_acl_ruleset_bind(mlxsw_sp, block, binding); - if (err) - goto err_ruleset_bind; - } - - if (ingress) - block->ingress_binding_count++; - else - block->egress_binding_count++; - list_add(&binding->list, &block->binding_list); - return 0; - -err_ruleset_bind: - kfree(binding); - return err; -} - -int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_flow_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress) -{ - struct mlxsw_sp_flow_block_binding *binding; - - binding = mlxsw_sp_flow_block_lookup(block, mlxsw_sp_port, ingress); - if (!binding) - return -ENOENT; - - list_del(&binding->list); - - if (ingress) - block->ingress_binding_count--; - else - block->egress_binding_count--; - - if (mlxsw_sp_acl_ruleset_block_bound(block)) - mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); - - kfree(binding); - return 0; -} - static struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_create(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_flow_block *block, u32 chain_index, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c new file mode 100644 index 000000000000..655e1df5c95a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* Copyright (c) 2017-2020 Mellanox Technologies. All rights reserved */ + +#include +#include +#include +#include +#include + +#include "spectrum.h" + +struct mlxsw_sp_flow_block * +mlxsw_sp_flow_block_create(struct mlxsw_sp *mlxsw_sp, struct net *net) +{ + struct mlxsw_sp_flow_block *block; + + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (!block) + return NULL; + INIT_LIST_HEAD(&block->binding_list); + block->mlxsw_sp = mlxsw_sp; + block->net = net; + return block; +} + +void mlxsw_sp_flow_block_destroy(struct mlxsw_sp_flow_block *block) +{ + WARN_ON(!list_empty(&block->binding_list)); + kfree(block); +} + +static struct mlxsw_sp_flow_block_binding * +mlxsw_sp_flow_block_lookup(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, bool ingress) +{ + struct mlxsw_sp_flow_block_binding *binding; + + list_for_each_entry(binding, &block->binding_list, list) + if (binding->mlxsw_sp_port == mlxsw_sp_port && + binding->ingress == ingress) + return binding; + return NULL; +} + +static bool +mlxsw_sp_flow_block_ruleset_bound(const struct mlxsw_sp_flow_block *block) +{ + return block->ruleset_zero; +} + +int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_flow_block_binding *binding; + int err; + + if (WARN_ON(mlxsw_sp_flow_block_lookup(block, mlxsw_sp_port, ingress))) + return -EEXIST; + + if (ingress && block->ingress_blocker_rule_count) { + NL_SET_ERR_MSG_MOD(extack, "Block cannot be bound to ingress because it contains unsupported rules"); + return -EOPNOTSUPP; + } + + if (!ingress && block->egress_blocker_rule_count) { + NL_SET_ERR_MSG_MOD(extack, "Block cannot be bound to egress because it contains unsupported rules"); + return -EOPNOTSUPP; + } + + binding = kzalloc(sizeof(*binding), GFP_KERNEL); + if (!binding) + return -ENOMEM; + binding->mlxsw_sp_port = mlxsw_sp_port; + binding->ingress = ingress; + + if (mlxsw_sp_flow_block_ruleset_bound(block)) { + err = mlxsw_sp_acl_ruleset_bind(mlxsw_sp, block, binding); + if (err) + goto err_ruleset_bind; + } + + if (ingress) + block->ingress_binding_count++; + else + block->egress_binding_count++; + list_add(&binding->list, &block->binding_list); + return 0; + +err_ruleset_bind: + kfree(binding); + return err; +} + +int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + struct mlxsw_sp_flow_block_binding *binding; + + binding = mlxsw_sp_flow_block_lookup(block, mlxsw_sp_port, ingress); + if (!binding) + return -ENOENT; + + list_del(&binding->list); + + if (ingress) + block->ingress_binding_count--; + else + block->egress_binding_count--; + + if (mlxsw_sp_flow_block_ruleset_bound(block)) + mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); + + kfree(binding); + return 0; +} -- cgit v1.2.3-59-g8ed1b From d7fcc986224d77c4ab66f436e7eaa11170c509af Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:01 +0300 Subject: mlxsw: spectrum: Push matchall bits into a separate file Similar to flower, have matchall related code in a separate file. Do some small renaming on the way (consistent "mall" prefixes, dropped "_tc_", dropped "_port_" where suitable). Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/Makefile | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 186 +------------------ drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 28 +-- .../ethernet/mellanox/mlxsw/spectrum_matchall.c | 202 +++++++++++++++++++++ 4 files changed, 214 insertions(+), 204 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile index 59cbf02d6731..4aeabb35c943 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/Makefile +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -21,7 +21,7 @@ mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ spectrum_acl_atcam.o spectrum_acl_erp.o \ spectrum1_acl_tcam.o spectrum2_acl_tcam.o \ spectrum_acl_bloom_filter.o spectrum_acl.o \ - spectrum_flow.o \ + spectrum_flow.o spectrum_matchall.o \ spectrum_flower.o spectrum_cnt.o \ spectrum_fid.o spectrum_ipip.o \ spectrum_acl_flex_actions.o \ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index f64e8da21d4a..ff25f8fc55e9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -25,9 +25,7 @@ #include #include #include -#include #include -#include #include #include "spectrum.h" @@ -582,16 +580,6 @@ static int mlxsw_sp_base_mac_get(struct mlxsw_sp *mlxsw_sp) return 0; } -static int mlxsw_sp_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port, - bool enable, u32 rate) -{ - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - char mpsc_pl[MLXSW_REG_MPSC_LEN]; - - mlxsw_reg_mpsc_pack(mpsc_pl, mlxsw_sp_port->local_port, enable, rate); - return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpsc), mpsc_pl); -} - static int mlxsw_sp_port_admin_status_set(struct mlxsw_sp_port *mlxsw_sp_port, bool is_up) { @@ -1362,181 +1350,15 @@ static int mlxsw_sp_port_kill_vid(struct net_device *dev, return 0; } -static struct mlxsw_sp_port_mall_tc_entry * -mlxsw_sp_port_mall_tc_entry_find(struct mlxsw_sp_port *port, - unsigned long cookie) { - struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; - - list_for_each_entry(mall_tc_entry, &port->mall_tc_list, list) - if (mall_tc_entry->cookie == cookie) - return mall_tc_entry; - - return NULL; -} - -static int -mlxsw_sp_port_add_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port, - struct mlxsw_sp_port_mall_mirror_tc_entry *mirror, - const struct flow_action_entry *act, - bool ingress) -{ - enum mlxsw_sp_span_type span_type; - - if (!act->dev) { - netdev_err(mlxsw_sp_port->dev, "Could not find requested device\n"); - return -EINVAL; - } - - mirror->ingress = ingress; - span_type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; - return mlxsw_sp_span_mirror_add(mlxsw_sp_port, act->dev, span_type, - true, &mirror->span_id); -} - -static void -mlxsw_sp_port_del_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port, - struct mlxsw_sp_port_mall_mirror_tc_entry *mirror) -{ - enum mlxsw_sp_span_type span_type; - - span_type = mirror->ingress ? - MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; - mlxsw_sp_span_mirror_del(mlxsw_sp_port, mirror->span_id, - span_type, true); -} - -static int -mlxsw_sp_port_add_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *cls, - const struct flow_action_entry *act, - bool ingress) -{ - int err; - - if (!mlxsw_sp_port->sample) - return -EOPNOTSUPP; - if (rtnl_dereference(mlxsw_sp_port->sample->psample_group)) { - netdev_err(mlxsw_sp_port->dev, "sample already active\n"); - return -EEXIST; - } - if (act->sample.rate > MLXSW_REG_MPSC_RATE_MAX) { - netdev_err(mlxsw_sp_port->dev, "sample rate not supported\n"); - return -EOPNOTSUPP; - } - - rcu_assign_pointer(mlxsw_sp_port->sample->psample_group, - act->sample.psample_group); - mlxsw_sp_port->sample->truncate = act->sample.truncate; - mlxsw_sp_port->sample->trunc_size = act->sample.trunc_size; - mlxsw_sp_port->sample->rate = act->sample.rate; - - err = mlxsw_sp_port_sample_set(mlxsw_sp_port, true, act->sample.rate); - if (err) - goto err_port_sample_set; - return 0; - -err_port_sample_set: - RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); - return err; -} - -static void -mlxsw_sp_port_del_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port) -{ - if (!mlxsw_sp_port->sample) - return; - - mlxsw_sp_port_sample_set(mlxsw_sp_port, false, 1); - RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); -} - -static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *f, - bool ingress) -{ - struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; - __be16 protocol = f->common.protocol; - struct flow_action_entry *act; - int err; - - if (!flow_offload_has_one_action(&f->rule->action)) { - netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n"); - return -EOPNOTSUPP; - } - - mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); - if (!mall_tc_entry) - return -ENOMEM; - mall_tc_entry->cookie = f->cookie; - - act = &f->rule->action.entries[0]; - - if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) { - struct mlxsw_sp_port_mall_mirror_tc_entry *mirror; - - mall_tc_entry->type = MLXSW_SP_PORT_MALL_MIRROR; - mirror = &mall_tc_entry->mirror; - err = mlxsw_sp_port_add_cls_matchall_mirror(mlxsw_sp_port, - mirror, act, - ingress); - } else if (act->id == FLOW_ACTION_SAMPLE && - protocol == htons(ETH_P_ALL)) { - mall_tc_entry->type = MLXSW_SP_PORT_MALL_SAMPLE; - err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, f, - act, ingress); - } else { - err = -EOPNOTSUPP; - } - - if (err) - goto err_add_action; - - list_add_tail(&mall_tc_entry->list, &mlxsw_sp_port->mall_tc_list); - return 0; - -err_add_action: - kfree(mall_tc_entry); - return err; -} - -static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *f) -{ - struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; - - mall_tc_entry = mlxsw_sp_port_mall_tc_entry_find(mlxsw_sp_port, - f->cookie); - if (!mall_tc_entry) { - netdev_dbg(mlxsw_sp_port->dev, "tc entry not found on port\n"); - return; - } - list_del(&mall_tc_entry->list); - - switch (mall_tc_entry->type) { - case MLXSW_SP_PORT_MALL_MIRROR: - mlxsw_sp_port_del_cls_matchall_mirror(mlxsw_sp_port, - &mall_tc_entry->mirror); - break; - case MLXSW_SP_PORT_MALL_SAMPLE: - mlxsw_sp_port_del_cls_matchall_sample(mlxsw_sp_port); - break; - default: - WARN_ON(1); - } - - kfree(mall_tc_entry); -} - static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, struct tc_cls_matchall_offload *f, bool ingress) { switch (f->command) { case TC_CLSMATCHALL_REPLACE: - return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port, f, - ingress); + return mlxsw_sp_mall_replace(mlxsw_sp_port, f, ingress); case TC_CLSMATCHALL_DESTROY: - mlxsw_sp_port_del_cls_matchall(mlxsw_sp_port, f); + mlxsw_sp_mall_destroy(mlxsw_sp_port, f); return 0; default: return -EOPNOTSUPP; @@ -1800,7 +1622,7 @@ static int mlxsw_sp_feature_hw_tc(struct net_device *dev, bool enable) if (!enable) { if (mlxsw_sp_flow_block_rule_count(mlxsw_sp_port->ing_flow_block) || mlxsw_sp_flow_block_rule_count(mlxsw_sp_port->eg_flow_block) || - !list_empty(&mlxsw_sp_port->mall_tc_list)) { + !list_empty(&mlxsw_sp_port->mall_list)) { netdev_err(dev, "Active offloaded tc filters, can't turn hw_tc_offload off\n"); return -EINVAL; } @@ -3696,7 +3518,7 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, mlxsw_sp_port->mapping = *port_mapping; mlxsw_sp_port->link.autoneg = 1; INIT_LIST_HEAD(&mlxsw_sp_port->vlans_list); - INIT_LIST_HEAD(&mlxsw_sp_port->mall_tc_list); + INIT_LIST_HEAD(&mlxsw_sp_port->mall_list); mlxsw_sp_port->pcpu_stats = netdev_alloc_pcpu_stats(struct mlxsw_sp_port_pcpu_stats); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index d4ef079aab4b..5c2f1af53e53 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -109,25 +109,6 @@ struct mlxsw_sp_mid { unsigned long *ports_in_mid; /* bits array */ }; -enum mlxsw_sp_port_mall_action_type { - MLXSW_SP_PORT_MALL_MIRROR, - MLXSW_SP_PORT_MALL_SAMPLE, -}; - -struct mlxsw_sp_port_mall_mirror_tc_entry { - int span_id; - bool ingress; -}; - -struct mlxsw_sp_port_mall_tc_entry { - struct list_head list; - unsigned long cookie; - enum mlxsw_sp_port_mall_action_type type; - union { - struct mlxsw_sp_port_mall_mirror_tc_entry mirror; - }; -}; - struct mlxsw_sp_sb; struct mlxsw_sp_bridge; struct mlxsw_sp_router; @@ -274,8 +255,7 @@ struct mlxsw_sp_port { * the same localport can have * different mapping. */ - /* TC handles */ - struct list_head mall_tc_list; + struct list_head mall_list; struct { #define MLXSW_HW_STATS_UPDATE_TIME HZ struct rtnl_link_stats64 stats; @@ -913,6 +893,12 @@ extern const struct mlxsw_afa_ops mlxsw_sp2_act_afa_ops; extern const struct mlxsw_afk_ops mlxsw_sp1_afk_ops; extern const struct mlxsw_afk_ops mlxsw_sp2_afk_ops; +/* spectrum_matchall.c */ +int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *f, bool ingress); +void mlxsw_sp_mall_destroy(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *f); + /* spectrum_flower.c */ int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_flow_block *block, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c new file mode 100644 index 000000000000..56f21cfdb48e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* Copyright (c) 2017-2020 Mellanox Technologies. All rights reserved */ + +#include +#include +#include +#include + +#include "spectrum.h" +#include "spectrum_span.h" +#include "reg.h" + +enum mlxsw_sp_mall_action_type { + MLXSW_SP_MALL_ACTION_TYPE_MIRROR, + MLXSW_SP_MALL_ACTION_TYPE_SAMPLE, +}; + +struct mlxsw_sp_mall_mirror_entry { + int span_id; + bool ingress; +}; + +struct mlxsw_sp_mall_entry { + struct list_head list; + unsigned long cookie; + enum mlxsw_sp_mall_action_type type; + union { + struct mlxsw_sp_mall_mirror_entry mirror; + }; +}; + +static struct mlxsw_sp_mall_entry * +mlxsw_sp_mall_entry_find(struct mlxsw_sp_port *port, unsigned long cookie) +{ + struct mlxsw_sp_mall_entry *mall_entry; + + list_for_each_entry(mall_entry, &port->mall_list, list) + if (mall_entry->cookie == cookie) + return mall_entry; + + return NULL; +} + +static int +mlxsw_sp_mall_port_mirror_add(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mall_mirror_entry *mirror, + const struct flow_action_entry *act, + bool ingress) +{ + enum mlxsw_sp_span_type span_type; + + if (!act->dev) { + netdev_err(mlxsw_sp_port->dev, "Could not find requested device\n"); + return -EINVAL; + } + + mirror->ingress = ingress; + span_type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; + return mlxsw_sp_span_mirror_add(mlxsw_sp_port, act->dev, span_type, + true, &mirror->span_id); +} + +static void +mlxsw_sp_mall_port_mirror_del(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mall_mirror_entry *mirror) +{ + enum mlxsw_sp_span_type span_type; + + span_type = mirror->ingress ? MLXSW_SP_SPAN_INGRESS : + MLXSW_SP_SPAN_EGRESS; + mlxsw_sp_span_mirror_del(mlxsw_sp_port, mirror->span_id, + span_type, true); +} + +static int mlxsw_sp_mall_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port, + bool enable, u32 rate) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char mpsc_pl[MLXSW_REG_MPSC_LEN]; + + mlxsw_reg_mpsc_pack(mpsc_pl, mlxsw_sp_port->local_port, enable, rate); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpsc), mpsc_pl); +} + +static int +mlxsw_sp_mall_port_sample_add(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *cls, + const struct flow_action_entry *act, bool ingress) +{ + int err; + + if (!mlxsw_sp_port->sample) + return -EOPNOTSUPP; + if (rtnl_dereference(mlxsw_sp_port->sample->psample_group)) { + netdev_err(mlxsw_sp_port->dev, "sample already active\n"); + return -EEXIST; + } + if (act->sample.rate > MLXSW_REG_MPSC_RATE_MAX) { + netdev_err(mlxsw_sp_port->dev, "sample rate not supported\n"); + return -EOPNOTSUPP; + } + + rcu_assign_pointer(mlxsw_sp_port->sample->psample_group, + act->sample.psample_group); + mlxsw_sp_port->sample->truncate = act->sample.truncate; + mlxsw_sp_port->sample->trunc_size = act->sample.trunc_size; + mlxsw_sp_port->sample->rate = act->sample.rate; + + err = mlxsw_sp_mall_port_sample_set(mlxsw_sp_port, true, + act->sample.rate); + if (err) + goto err_port_sample_set; + return 0; + +err_port_sample_set: + RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); + return err; +} + +static void +mlxsw_sp_mall_port_sample_del(struct mlxsw_sp_port *mlxsw_sp_port) +{ + if (!mlxsw_sp_port->sample) + return; + + mlxsw_sp_mall_port_sample_set(mlxsw_sp_port, false, 1); + RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); +} + +int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *f, bool ingress) +{ + struct mlxsw_sp_mall_entry *mall_entry; + __be16 protocol = f->common.protocol; + struct flow_action_entry *act; + int err; + + if (!flow_offload_has_one_action(&f->rule->action)) { + netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n"); + return -EOPNOTSUPP; + } + + mall_entry = kzalloc(sizeof(*mall_entry), GFP_KERNEL); + if (!mall_entry) + return -ENOMEM; + mall_entry->cookie = f->cookie; + + act = &f->rule->action.entries[0]; + + if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) { + struct mlxsw_sp_mall_mirror_entry *mirror; + + mall_entry->type = MLXSW_SP_MALL_ACTION_TYPE_MIRROR; + mirror = &mall_entry->mirror; + err = mlxsw_sp_mall_port_mirror_add(mlxsw_sp_port, mirror, act, + ingress); + } else if (act->id == FLOW_ACTION_SAMPLE && + protocol == htons(ETH_P_ALL)) { + mall_entry->type = MLXSW_SP_MALL_ACTION_TYPE_SAMPLE; + err = mlxsw_sp_mall_port_sample_add(mlxsw_sp_port, f, act, + ingress); + } else { + err = -EOPNOTSUPP; + } + + if (err) + goto err_add_action; + + list_add_tail(&mall_entry->list, &mlxsw_sp_port->mall_list); + return 0; + +err_add_action: + kfree(mall_entry); + return err; +} + +void mlxsw_sp_mall_destroy(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *f) +{ + struct mlxsw_sp_mall_entry *mall_entry; + + mall_entry = mlxsw_sp_mall_entry_find(mlxsw_sp_port, f->cookie); + if (!mall_entry) { + netdev_dbg(mlxsw_sp_port->dev, "tc entry not found on port\n"); + return; + } + list_del(&mall_entry->list); + + switch (mall_entry->type) { + case MLXSW_SP_MALL_ACTION_TYPE_MIRROR: + mlxsw_sp_mall_port_mirror_del(mlxsw_sp_port, + &mall_entry->mirror); + break; + case MLXSW_SP_MALL_ACTION_TYPE_SAMPLE: + mlxsw_sp_mall_port_sample_del(mlxsw_sp_port); + break; + default: + WARN_ON(1); + } + + kfree(mall_entry); +} -- cgit v1.2.3-59-g8ed1b From 6c8cd435b58780ded6e3e06d722249ec181efb36 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:02 +0300 Subject: mlxsw: spectrum_acl: Use block variable in mlxsw_sp_acl_rule_del() On couple of places in mlxsw_sp_acl_rule_del(), block variable is not used directly as it could be. So do it. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index 800eaa6be3c0..c61f78e30397 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -704,14 +704,13 @@ void mlxsw_sp_acl_rule_del(struct mlxsw_sp *mlxsw_sp, block->egress_blocker_rule_count -= rule->rulei->egress_bind_blocker; block->ingress_blocker_rule_count -= rule->rulei->ingress_bind_blocker; - ruleset->ht_key.block->rule_count--; + block->rule_count--; mutex_lock(&mlxsw_sp->acl->rules_lock); list_del(&rule->list); mutex_unlock(&mlxsw_sp->acl->rules_lock); if (!ruleset->ht_key.chain_index && mlxsw_sp_acl_ruleset_is_singular(ruleset)) - mlxsw_sp_acl_ruleset_block_unbind(mlxsw_sp, ruleset, - ruleset->ht_key.block); + mlxsw_sp_acl_ruleset_block_unbind(mlxsw_sp, ruleset, block); rhashtable_remove_fast(&ruleset->rule_ht, &rule->ht_node, mlxsw_sp_acl_rule_ht_params); ops->rule_del(mlxsw_sp, rule->priv); -- cgit v1.2.3-59-g8ed1b From 780ba878a1b024e176dc27c980e1600a23d7b5c5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:03 +0300 Subject: mlxsw: spectrum_matchall: Pass mall_entry as arg to mlxsw_sp_mall_port_mirror_add() In the preparation for future changes, have the mlxsw_sp_mall_port_mirror_add() function to accept mall_entry including the "to_dev" originally obtained from act pointer. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_matchall.c | 34 +++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c index 56f21cfdb48e..b57267f0c9a1 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c @@ -16,6 +16,7 @@ enum mlxsw_sp_mall_action_type { }; struct mlxsw_sp_mall_mirror_entry { + const struct net_device *to_dev; int span_id; bool ingress; }; @@ -43,32 +44,34 @@ mlxsw_sp_mall_entry_find(struct mlxsw_sp_port *port, unsigned long cookie) static int mlxsw_sp_mall_port_mirror_add(struct mlxsw_sp_port *mlxsw_sp_port, - struct mlxsw_sp_mall_mirror_entry *mirror, - const struct flow_action_entry *act, + struct mlxsw_sp_mall_entry *mall_entry, bool ingress) { enum mlxsw_sp_span_type span_type; - if (!act->dev) { + if (!mall_entry->mirror.to_dev) { netdev_err(mlxsw_sp_port->dev, "Could not find requested device\n"); return -EINVAL; } - mirror->ingress = ingress; - span_type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; - return mlxsw_sp_span_mirror_add(mlxsw_sp_port, act->dev, span_type, - true, &mirror->span_id); + mall_entry->mirror.ingress = ingress; + span_type = mall_entry->mirror.ingress ? MLXSW_SP_SPAN_INGRESS : + MLXSW_SP_SPAN_EGRESS; + return mlxsw_sp_span_mirror_add(mlxsw_sp_port, + mall_entry->mirror.to_dev, + span_type, true, + &mall_entry->mirror.span_id); } static void mlxsw_sp_mall_port_mirror_del(struct mlxsw_sp_port *mlxsw_sp_port, - struct mlxsw_sp_mall_mirror_entry *mirror) + struct mlxsw_sp_mall_entry *mall_entry) { enum mlxsw_sp_span_type span_type; - span_type = mirror->ingress ? MLXSW_SP_SPAN_INGRESS : - MLXSW_SP_SPAN_EGRESS; - mlxsw_sp_span_mirror_del(mlxsw_sp_port, mirror->span_id, + span_type = mall_entry->mirror.ingress ? MLXSW_SP_SPAN_INGRESS : + MLXSW_SP_SPAN_EGRESS; + mlxsw_sp_span_mirror_del(mlxsw_sp_port, mall_entry->mirror.span_id, span_type, true); } @@ -148,11 +151,9 @@ int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, act = &f->rule->action.entries[0]; if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) { - struct mlxsw_sp_mall_mirror_entry *mirror; - mall_entry->type = MLXSW_SP_MALL_ACTION_TYPE_MIRROR; - mirror = &mall_entry->mirror; - err = mlxsw_sp_mall_port_mirror_add(mlxsw_sp_port, mirror, act, + mall_entry->mirror.to_dev = act->dev; + err = mlxsw_sp_mall_port_mirror_add(mlxsw_sp_port, mall_entry, ingress); } else if (act->id == FLOW_ACTION_SAMPLE && protocol == htons(ETH_P_ALL)) { @@ -188,8 +189,7 @@ void mlxsw_sp_mall_destroy(struct mlxsw_sp_port *mlxsw_sp_port, switch (mall_entry->type) { case MLXSW_SP_MALL_ACTION_TYPE_MIRROR: - mlxsw_sp_mall_port_mirror_del(mlxsw_sp_port, - &mall_entry->mirror); + mlxsw_sp_mall_port_mirror_del(mlxsw_sp_port, mall_entry); break; case MLXSW_SP_MALL_ACTION_TYPE_SAMPLE: mlxsw_sp_mall_port_sample_del(mlxsw_sp_port); -- cgit v1.2.3-59-g8ed1b From c7ea0e162fc84602fde67d41edf72ae3f4f4a14f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:04 +0300 Subject: mlxsw: spectrum_matchall: Pass mall_entry as arg to mlxsw_sp_mall_port_sample_add() In the preparation for future changes, have the mlxsw_sp_mall_port_sample_add() function to accept mall_entry including all needed info originally obtained from cls and act pointers. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_matchall.c | 35 ++++++++++++---------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c index b57267f0c9a1..adaaee208655 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c @@ -27,6 +27,7 @@ struct mlxsw_sp_mall_entry { enum mlxsw_sp_mall_action_type type; union { struct mlxsw_sp_mall_mirror_entry mirror; + struct mlxsw_sp_port_sample sample; }; }; @@ -87,8 +88,7 @@ static int mlxsw_sp_mall_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port, static int mlxsw_sp_mall_port_sample_add(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *cls, - const struct flow_action_entry *act, bool ingress) + struct mlxsw_sp_mall_entry *mall_entry) { int err; @@ -98,19 +98,14 @@ mlxsw_sp_mall_port_sample_add(struct mlxsw_sp_port *mlxsw_sp_port, netdev_err(mlxsw_sp_port->dev, "sample already active\n"); return -EEXIST; } - if (act->sample.rate > MLXSW_REG_MPSC_RATE_MAX) { - netdev_err(mlxsw_sp_port->dev, "sample rate not supported\n"); - return -EOPNOTSUPP; - } - rcu_assign_pointer(mlxsw_sp_port->sample->psample_group, - act->sample.psample_group); - mlxsw_sp_port->sample->truncate = act->sample.truncate; - mlxsw_sp_port->sample->trunc_size = act->sample.trunc_size; - mlxsw_sp_port->sample->rate = act->sample.rate; + mall_entry->sample.psample_group); + mlxsw_sp_port->sample->truncate = mall_entry->sample.truncate; + mlxsw_sp_port->sample->trunc_size = mall_entry->sample.trunc_size; + mlxsw_sp_port->sample->rate = mall_entry->sample.rate; err = mlxsw_sp_mall_port_sample_set(mlxsw_sp_port, true, - act->sample.rate); + mall_entry->sample.rate); if (err) goto err_port_sample_set; return 0; @@ -157,20 +152,28 @@ int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, ingress); } else if (act->id == FLOW_ACTION_SAMPLE && protocol == htons(ETH_P_ALL)) { + if (act->sample.rate > MLXSW_REG_MPSC_RATE_MAX) { + netdev_err(mlxsw_sp_port->dev, "sample rate not supported\n"); + err = -EOPNOTSUPP; + goto errout; + } mall_entry->type = MLXSW_SP_MALL_ACTION_TYPE_SAMPLE; - err = mlxsw_sp_mall_port_sample_add(mlxsw_sp_port, f, act, - ingress); + mall_entry->sample.psample_group = act->sample.psample_group; + mall_entry->sample.truncate = act->sample.truncate; + mall_entry->sample.trunc_size = act->sample.trunc_size; + mall_entry->sample.rate = act->sample.rate; + err = mlxsw_sp_mall_port_sample_add(mlxsw_sp_port, mall_entry); } else { err = -EOPNOTSUPP; } if (err) - goto err_add_action; + goto errout; list_add_tail(&mall_entry->list, &mlxsw_sp_port->mall_list); return 0; -err_add_action: +errout: kfree(mall_entry); return err; } -- cgit v1.2.3-59-g8ed1b From 47fa15eae487f3f454d004894671ebea53e77bde Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:05 +0300 Subject: mlxsw: spectrum_matchall: Move ingress indication into mall_entry Instead of having it in mirror_entry structure, move it to mall_entry and set it during rule insertion. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_matchall.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c index adaaee208655..c05e28971d06 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c @@ -18,13 +18,13 @@ enum mlxsw_sp_mall_action_type { struct mlxsw_sp_mall_mirror_entry { const struct net_device *to_dev; int span_id; - bool ingress; }; struct mlxsw_sp_mall_entry { struct list_head list; unsigned long cookie; enum mlxsw_sp_mall_action_type type; + bool ingress; union { struct mlxsw_sp_mall_mirror_entry mirror; struct mlxsw_sp_port_sample sample; @@ -45,8 +45,7 @@ mlxsw_sp_mall_entry_find(struct mlxsw_sp_port *port, unsigned long cookie) static int mlxsw_sp_mall_port_mirror_add(struct mlxsw_sp_port *mlxsw_sp_port, - struct mlxsw_sp_mall_entry *mall_entry, - bool ingress) + struct mlxsw_sp_mall_entry *mall_entry) { enum mlxsw_sp_span_type span_type; @@ -55,9 +54,8 @@ mlxsw_sp_mall_port_mirror_add(struct mlxsw_sp_port *mlxsw_sp_port, return -EINVAL; } - mall_entry->mirror.ingress = ingress; - span_type = mall_entry->mirror.ingress ? MLXSW_SP_SPAN_INGRESS : - MLXSW_SP_SPAN_EGRESS; + span_type = mall_entry->ingress ? MLXSW_SP_SPAN_INGRESS : + MLXSW_SP_SPAN_EGRESS; return mlxsw_sp_span_mirror_add(mlxsw_sp_port, mall_entry->mirror.to_dev, span_type, true, @@ -70,8 +68,8 @@ mlxsw_sp_mall_port_mirror_del(struct mlxsw_sp_port *mlxsw_sp_port, { enum mlxsw_sp_span_type span_type; - span_type = mall_entry->mirror.ingress ? MLXSW_SP_SPAN_INGRESS : - MLXSW_SP_SPAN_EGRESS; + span_type = mall_entry->ingress ? MLXSW_SP_SPAN_INGRESS : + MLXSW_SP_SPAN_EGRESS; mlxsw_sp_span_mirror_del(mlxsw_sp_port, mall_entry->mirror.span_id, span_type, true); } @@ -142,14 +140,14 @@ int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, if (!mall_entry) return -ENOMEM; mall_entry->cookie = f->cookie; + mall_entry->ingress = ingress; act = &f->rule->action.entries[0]; if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) { mall_entry->type = MLXSW_SP_MALL_ACTION_TYPE_MIRROR; mall_entry->mirror.to_dev = act->dev; - err = mlxsw_sp_mall_port_mirror_add(mlxsw_sp_port, mall_entry, - ingress); + err = mlxsw_sp_mall_port_mirror_add(mlxsw_sp_port, mall_entry); } else if (act->id == FLOW_ACTION_SAMPLE && protocol == htons(ETH_P_ALL)) { if (act->sample.rate > MLXSW_REG_MPSC_RATE_MAX) { -- cgit v1.2.3-59-g8ed1b From dd0fbc89d274e392a077c5dc9a21d581de3252d1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:06 +0300 Subject: mlxsw: spectrum_matchall: Push per-port rule add/del into separate functions As the replace/destroy is going to be used later on per-block, push the per-port rule addition/deletion into separate functions. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_matchall.c | 48 ++++++++++++++++------ 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c index c05e28971d06..41301027a47c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c @@ -123,6 +123,37 @@ mlxsw_sp_mall_port_sample_del(struct mlxsw_sp_port *mlxsw_sp_port) RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); } +static int +mlxsw_sp_mall_port_rule_add(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mall_entry *mall_entry) +{ + switch (mall_entry->type) { + case MLXSW_SP_MALL_ACTION_TYPE_MIRROR: + return mlxsw_sp_mall_port_mirror_add(mlxsw_sp_port, mall_entry); + case MLXSW_SP_MALL_ACTION_TYPE_SAMPLE: + return mlxsw_sp_mall_port_sample_add(mlxsw_sp_port, mall_entry); + default: + WARN_ON(1); + return -EINVAL; + } +} + +static void +mlxsw_sp_mall_port_rule_del(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mall_entry *mall_entry) +{ + switch (mall_entry->type) { + case MLXSW_SP_MALL_ACTION_TYPE_MIRROR: + mlxsw_sp_mall_port_mirror_del(mlxsw_sp_port, mall_entry); + break; + case MLXSW_SP_MALL_ACTION_TYPE_SAMPLE: + mlxsw_sp_mall_port_sample_del(mlxsw_sp_port); + break; + default: + WARN_ON(1); + } +} + int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, struct tc_cls_matchall_offload *f, bool ingress) { @@ -147,7 +178,6 @@ int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) { mall_entry->type = MLXSW_SP_MALL_ACTION_TYPE_MIRROR; mall_entry->mirror.to_dev = act->dev; - err = mlxsw_sp_mall_port_mirror_add(mlxsw_sp_port, mall_entry); } else if (act->id == FLOW_ACTION_SAMPLE && protocol == htons(ETH_P_ALL)) { if (act->sample.rate > MLXSW_REG_MPSC_RATE_MAX) { @@ -160,11 +190,12 @@ int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, mall_entry->sample.truncate = act->sample.truncate; mall_entry->sample.trunc_size = act->sample.trunc_size; mall_entry->sample.rate = act->sample.rate; - err = mlxsw_sp_mall_port_sample_add(mlxsw_sp_port, mall_entry); } else { err = -EOPNOTSUPP; + goto errout; } + err = mlxsw_sp_mall_port_rule_add(mlxsw_sp_port, mall_entry); if (err) goto errout; @@ -186,18 +217,9 @@ void mlxsw_sp_mall_destroy(struct mlxsw_sp_port *mlxsw_sp_port, netdev_dbg(mlxsw_sp_port->dev, "tc entry not found on port\n"); return; } - list_del(&mall_entry->list); - switch (mall_entry->type) { - case MLXSW_SP_MALL_ACTION_TYPE_MIRROR: - mlxsw_sp_mall_port_mirror_del(mlxsw_sp_port, mall_entry); - break; - case MLXSW_SP_MALL_ACTION_TYPE_SAMPLE: - mlxsw_sp_mall_port_sample_del(mlxsw_sp_port); - break; - default: - WARN_ON(1); - } + mlxsw_sp_mall_port_rule_del(mlxsw_sp_port, mall_entry); + list_del(&mall_entry->list); kfree(mall_entry); } -- cgit v1.2.3-59-g8ed1b From 481ff57aadf5ea36bb3c5a9e659a2e1c5ecc6725 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:07 +0300 Subject: mlxsw: spectrum: Avoid copying sample values and use RCU pointer direcly instead Currently, only the psample_group is accessed using RCU on RX path. However, it is possible (unlikely) that other sample values get change during RX processing. Fix this by having the port->sample struct accessed as RCU pointer, containing all sample values including psample_group pointer. That avoids extra alloc per-port, copying the values and the race condition described above. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 30 +++++----------------- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 4 +-- .../ethernet/mellanox/mlxsw/spectrum_matchall.c | 17 +++++------- 3 files changed, 14 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index ff25f8fc55e9..5952ec26c169 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3527,13 +3527,6 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, goto err_alloc_stats; } - mlxsw_sp_port->sample = kzalloc(sizeof(*mlxsw_sp_port->sample), - GFP_KERNEL); - if (!mlxsw_sp_port->sample) { - err = -ENOMEM; - goto err_alloc_sample; - } - INIT_DELAYED_WORK(&mlxsw_sp_port->periodic_hw_stats.update_dw, &update_stats_cache); @@ -3720,8 +3713,6 @@ err_dev_addr_init: err_port_swid_set: mlxsw_sp_port_module_unmap(mlxsw_sp_port); err_port_module_map: - kfree(mlxsw_sp_port->sample); -err_alloc_sample: free_percpu(mlxsw_sp_port->pcpu_stats); err_alloc_stats: free_netdev(dev); @@ -3749,7 +3740,6 @@ static void mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u8 local_port) mlxsw_sp_port_tc_mc_mode_set(mlxsw_sp_port, false); mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT); mlxsw_sp_port_module_unmap(mlxsw_sp_port); - kfree(mlxsw_sp_port->sample); free_percpu(mlxsw_sp_port->pcpu_stats); WARN_ON_ONCE(!list_empty(&mlxsw_sp_port->vlans_list)); free_netdev(mlxsw_sp_port->dev); @@ -4236,7 +4226,7 @@ static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port, { struct mlxsw_sp *mlxsw_sp = priv; struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; - struct psample_group *psample_group; + struct mlxsw_sp_port_sample *sample; u32 size; if (unlikely(!mlxsw_sp_port)) { @@ -4244,22 +4234,14 @@ static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port, local_port); goto out; } - if (unlikely(!mlxsw_sp_port->sample)) { - dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received on unsupported port\n", - local_port); - goto out; - } - - size = mlxsw_sp_port->sample->truncate ? - mlxsw_sp_port->sample->trunc_size : skb->len; rcu_read_lock(); - psample_group = rcu_dereference(mlxsw_sp_port->sample->psample_group); - if (!psample_group) + sample = rcu_dereference(mlxsw_sp_port->sample); + if (!sample) goto out_unlock; - psample_sample_packet(psample_group, skb, size, - mlxsw_sp_port->dev->ifindex, 0, - mlxsw_sp_port->sample->rate); + size = sample->truncate ? sample->trunc_size : skb->len; + psample_sample_packet(sample->psample_group, skb, size, + mlxsw_sp_port->dev->ifindex, 0, sample->rate); out_unlock: rcu_read_unlock(); out: diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 5c2f1af53e53..4cdb7f1d7436 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -192,7 +192,7 @@ struct mlxsw_sp_port_pcpu_stats { }; struct mlxsw_sp_port_sample { - struct psample_group __rcu *psample_group; + struct psample_group *psample_group; u32 trunc_size; u32 rate; bool truncate; @@ -262,7 +262,7 @@ struct mlxsw_sp_port { struct mlxsw_sp_port_xstats xstats; struct delayed_work update_dw; } periodic_hw_stats; - struct mlxsw_sp_port_sample *sample; + struct mlxsw_sp_port_sample __rcu *sample; struct list_head vlans_list; struct mlxsw_sp_port_vlan *default_vlan; struct mlxsw_sp_qdisc_state *qdisc; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c index 41301027a47c..bda5fb34162a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c @@ -29,6 +29,7 @@ struct mlxsw_sp_mall_entry { struct mlxsw_sp_mall_mirror_entry mirror; struct mlxsw_sp_port_sample sample; }; + struct rcu_head rcu; }; static struct mlxsw_sp_mall_entry * @@ -90,17 +91,11 @@ mlxsw_sp_mall_port_sample_add(struct mlxsw_sp_port *mlxsw_sp_port, { int err; - if (!mlxsw_sp_port->sample) - return -EOPNOTSUPP; - if (rtnl_dereference(mlxsw_sp_port->sample->psample_group)) { + if (rtnl_dereference(mlxsw_sp_port->sample)) { netdev_err(mlxsw_sp_port->dev, "sample already active\n"); return -EEXIST; } - rcu_assign_pointer(mlxsw_sp_port->sample->psample_group, - mall_entry->sample.psample_group); - mlxsw_sp_port->sample->truncate = mall_entry->sample.truncate; - mlxsw_sp_port->sample->trunc_size = mall_entry->sample.trunc_size; - mlxsw_sp_port->sample->rate = mall_entry->sample.rate; + rcu_assign_pointer(mlxsw_sp_port->sample, &mall_entry->sample); err = mlxsw_sp_mall_port_sample_set(mlxsw_sp_port, true, mall_entry->sample.rate); @@ -109,7 +104,7 @@ mlxsw_sp_mall_port_sample_add(struct mlxsw_sp_port *mlxsw_sp_port, return 0; err_port_sample_set: - RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); + RCU_INIT_POINTER(mlxsw_sp_port->sample, NULL); return err; } @@ -120,7 +115,7 @@ mlxsw_sp_mall_port_sample_del(struct mlxsw_sp_port *mlxsw_sp_port) return; mlxsw_sp_mall_port_sample_set(mlxsw_sp_port, false, 1); - RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); + RCU_INIT_POINTER(mlxsw_sp_port->sample, NULL); } static int @@ -221,5 +216,5 @@ void mlxsw_sp_mall_destroy(struct mlxsw_sp_port *mlxsw_sp_port, mlxsw_sp_mall_port_rule_del(mlxsw_sp_port, mall_entry); list_del(&mall_entry->list); - kfree(mall_entry); + kfree_rcu(mall_entry, rcu); /* sample RX packets may be in-flight */ } -- cgit v1.2.3-59-g8ed1b From 3c650136afba8233e738849149b578d0ad6d2023 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:08 +0300 Subject: mlxsw: spectrum_matchall: Process matchall events from the same cb as flower Currently there are two callbacks registered: one for matchall, one for flower. This causes the user to see "in_hw_count 2" in TC filter dump. Because of this and also as a preparation for future matchall offload for rules equivalent to flower-all-match, move the processing of shared block into matchall.c. Leave only one cb for mlxsw driver per-block. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 125 +++++---------------- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 12 +- .../net/ethernet/mellanox/mlxsw/spectrum_flow.c | 17 ++- .../ethernet/mellanox/mlxsw/spectrum_matchall.c | 90 ++++++++++++--- 4 files changed, 124 insertions(+), 120 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 5952ec26c169..ceaf73ac2008 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1350,15 +1350,15 @@ static int mlxsw_sp_port_kill_vid(struct net_device *dev, return 0; } -static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *f, - bool ingress) +static int +mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_flow_block *flow_block, + struct tc_cls_matchall_offload *f) { switch (f->command) { case TC_CLSMATCHALL_REPLACE: - return mlxsw_sp_mall_replace(mlxsw_sp_port, f, ingress); + return mlxsw_sp_mall_replace(flow_block, f); case TC_CLSMATCHALL_DESTROY: - mlxsw_sp_mall_destroy(mlxsw_sp_port, f); + mlxsw_sp_mall_destroy(flow_block, f); return 0; default: return -EOPNOTSUPP; @@ -1389,62 +1389,25 @@ mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_flow_block *flow_block, } } -static int mlxsw_sp_setup_tc_block_cb_matchall(enum tc_setup_type type, - void *type_data, - void *cb_priv, bool ingress) +static int mlxsw_sp_setup_tc_block_cb(enum tc_setup_type type, + void *type_data, void *cb_priv) { - struct mlxsw_sp_port *mlxsw_sp_port = cb_priv; - - switch (type) { - case TC_SETUP_CLSMATCHALL: - if (!tc_cls_can_offload_and_chain0(mlxsw_sp_port->dev, - type_data)) - return -EOPNOTSUPP; + struct mlxsw_sp_flow_block *flow_block = cb_priv; - return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, type_data, - ingress); - case TC_SETUP_CLSFLOWER: - return 0; - default: + if (mlxsw_sp_flow_block_disabled(flow_block)) return -EOPNOTSUPP; - } -} - -static int mlxsw_sp_setup_tc_block_cb_matchall_ig(enum tc_setup_type type, - void *type_data, - void *cb_priv) -{ - return mlxsw_sp_setup_tc_block_cb_matchall(type, type_data, - cb_priv, true); -} - -static int mlxsw_sp_setup_tc_block_cb_matchall_eg(enum tc_setup_type type, - void *type_data, - void *cb_priv) -{ - return mlxsw_sp_setup_tc_block_cb_matchall(type, type_data, - cb_priv, false); -} - -static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type, - void *type_data, void *cb_priv) -{ - struct mlxsw_sp_flow_block *flow_block = cb_priv; switch (type) { case TC_SETUP_CLSMATCHALL: - return 0; + return mlxsw_sp_setup_tc_cls_matchall(flow_block, type_data); case TC_SETUP_CLSFLOWER: - if (mlxsw_sp_flow_block_disabled(flow_block)) - return -EOPNOTSUPP; - return mlxsw_sp_setup_tc_cls_flower(flow_block, type_data); default: return -EOPNOTSUPP; } } -static void mlxsw_sp_tc_block_flower_release(void *cb_priv) +static void mlxsw_sp_tc_block_release(void *cb_priv) { struct mlxsw_sp_flow_block *flow_block = cb_priv; @@ -1453,9 +1416,9 @@ static void mlxsw_sp_tc_block_flower_release(void *cb_priv) static LIST_HEAD(mlxsw_sp_block_cb_list); -static int -mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, - struct flow_block_offload *f, bool ingress) +static int mlxsw_sp_setup_tc_block_bind(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f, + bool ingress) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_flow_block *flow_block; @@ -1463,16 +1426,15 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, bool register_block = false; int err; - block_cb = flow_block_cb_lookup(f->block, - mlxsw_sp_setup_tc_block_cb_flower, + block_cb = flow_block_cb_lookup(f->block, mlxsw_sp_setup_tc_block_cb, mlxsw_sp); if (!block_cb) { flow_block = mlxsw_sp_flow_block_create(mlxsw_sp, f->net); if (!flow_block) return -ENOMEM; - block_cb = flow_block_cb_alloc(mlxsw_sp_setup_tc_block_cb_flower, + block_cb = flow_block_cb_alloc(mlxsw_sp_setup_tc_block_cb, mlxsw_sp, flow_block, - mlxsw_sp_tc_block_flower_release); + mlxsw_sp_tc_block_release); if (IS_ERR(block_cb)) { mlxsw_sp_flow_block_destroy(flow_block); err = PTR_ERR(block_cb); @@ -1507,18 +1469,16 @@ err_cb_register: return err; } -static void -mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, - struct flow_block_offload *f, - bool ingress) +static void mlxsw_sp_setup_tc_block_unbind(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f, + bool ingress) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_flow_block *flow_block; struct flow_block_cb *block_cb; int err; - block_cb = flow_block_cb_lookup(f->block, - mlxsw_sp_setup_tc_block_cb_flower, + block_cb = flow_block_cb_lookup(f->block, mlxsw_sp_setup_tc_block_cb, mlxsw_sp); if (!block_cb) return; @@ -1540,51 +1500,22 @@ mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, struct flow_block_offload *f) { - struct flow_block_cb *block_cb; - flow_setup_cb_t *cb; bool ingress; - int err; - if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) { - cb = mlxsw_sp_setup_tc_block_cb_matchall_ig; + if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) ingress = true; - } else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) { - cb = mlxsw_sp_setup_tc_block_cb_matchall_eg; + else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) ingress = false; - } else { + else return -EOPNOTSUPP; - } f->driver_block_list = &mlxsw_sp_block_cb_list; switch (f->command) { case FLOW_BLOCK_BIND: - if (flow_block_cb_is_busy(cb, mlxsw_sp_port, - &mlxsw_sp_block_cb_list)) - return -EBUSY; - - block_cb = flow_block_cb_alloc(cb, mlxsw_sp_port, - mlxsw_sp_port, NULL); - if (IS_ERR(block_cb)) - return PTR_ERR(block_cb); - err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port, f, - ingress); - if (err) { - flow_block_cb_free(block_cb); - return err; - } - flow_block_cb_add(block_cb, f); - list_add_tail(&block_cb->driver_list, &mlxsw_sp_block_cb_list); - return 0; + return mlxsw_sp_setup_tc_block_bind(mlxsw_sp_port, f, ingress); case FLOW_BLOCK_UNBIND: - mlxsw_sp_setup_tc_block_flower_unbind(mlxsw_sp_port, - f, ingress); - block_cb = flow_block_cb_lookup(f->block, cb, mlxsw_sp_port); - if (!block_cb) - return -ENOENT; - - flow_block_cb_remove(block_cb, f); - list_del(&block_cb->driver_list); + mlxsw_sp_setup_tc_block_unbind(mlxsw_sp_port, f, ingress); return 0; default: return -EOPNOTSUPP; @@ -1621,8 +1552,7 @@ static int mlxsw_sp_feature_hw_tc(struct net_device *dev, bool enable) if (!enable) { if (mlxsw_sp_flow_block_rule_count(mlxsw_sp_port->ing_flow_block) || - mlxsw_sp_flow_block_rule_count(mlxsw_sp_port->eg_flow_block) || - !list_empty(&mlxsw_sp_port->mall_list)) { + mlxsw_sp_flow_block_rule_count(mlxsw_sp_port->eg_flow_block)) { netdev_err(dev, "Active offloaded tc filters, can't turn hw_tc_offload off\n"); return -EINVAL; } @@ -3518,7 +3448,6 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, mlxsw_sp_port->mapping = *port_mapping; mlxsw_sp_port->link.autoneg = 1; INIT_LIST_HEAD(&mlxsw_sp_port->vlans_list); - INIT_LIST_HEAD(&mlxsw_sp_port->mall_list); mlxsw_sp_port->pcpu_stats = netdev_alloc_pcpu_stats(struct mlxsw_sp_port_pcpu_stats); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 4cdb7f1d7436..57d320728914 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -255,7 +255,6 @@ struct mlxsw_sp_port { * the same localport can have * different mapping. */ - struct list_head mall_list; struct { #define MLXSW_HW_STATS_UPDATE_TIME HZ struct rtnl_link_stats64 stats; @@ -637,6 +636,7 @@ struct mlxsw_sp_acl_rule_info { /* spectrum_flow.c */ struct mlxsw_sp_flow_block { struct list_head binding_list; + struct list_head mall_list; struct mlxsw_sp_acl_ruleset *ruleset_zero; struct mlxsw_sp *mlxsw_sp; unsigned int rule_count; @@ -894,10 +894,14 @@ extern const struct mlxsw_afk_ops mlxsw_sp1_afk_ops; extern const struct mlxsw_afk_ops mlxsw_sp2_afk_ops; /* spectrum_matchall.c */ -int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *f, bool ingress); -void mlxsw_sp_mall_destroy(struct mlxsw_sp_port *mlxsw_sp_port, +int mlxsw_sp_mall_replace(struct mlxsw_sp_flow_block *block, + struct tc_cls_matchall_offload *f); +void mlxsw_sp_mall_destroy(struct mlxsw_sp_flow_block *block, struct tc_cls_matchall_offload *f); +int mlxsw_sp_mall_port_bind(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port); +void mlxsw_sp_mall_port_unbind(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port); /* spectrum_flower.c */ int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c index 655e1df5c95a..51de6aca1930 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c @@ -18,6 +18,7 @@ mlxsw_sp_flow_block_create(struct mlxsw_sp *mlxsw_sp, struct net *net) if (!block) return NULL; INIT_LIST_HEAD(&block->binding_list); + INIT_LIST_HEAD(&block->mall_list); block->mlxsw_sp = mlxsw_sp; block->net = net; return block; @@ -70,9 +71,15 @@ int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; } + err = mlxsw_sp_mall_port_bind(block, mlxsw_sp_port); + if (err) + return err; + binding = kzalloc(sizeof(*binding), GFP_KERNEL); - if (!binding) - return -ENOMEM; + if (!binding) { + err = -ENOMEM; + goto err_binding_alloc; + } binding->mlxsw_sp_port = mlxsw_sp_port; binding->ingress = ingress; @@ -91,6 +98,9 @@ int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, err_ruleset_bind: kfree(binding); +err_binding_alloc: + mlxsw_sp_mall_port_unbind(block, mlxsw_sp_port); + return err; } @@ -116,5 +126,8 @@ int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); kfree(binding); + + mlxsw_sp_mall_port_unbind(block, mlxsw_sp_port); + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c index bda5fb34162a..889da63072be 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c @@ -33,11 +33,11 @@ struct mlxsw_sp_mall_entry { }; static struct mlxsw_sp_mall_entry * -mlxsw_sp_mall_entry_find(struct mlxsw_sp_port *port, unsigned long cookie) +mlxsw_sp_mall_entry_find(struct mlxsw_sp_flow_block *block, unsigned long cookie) { struct mlxsw_sp_mall_entry *mall_entry; - list_for_each_entry(mall_entry, &port->mall_list, list) + list_for_each_entry(mall_entry, &block->mall_list, list) if (mall_entry->cookie == cookie) return mall_entry; @@ -149,16 +149,27 @@ mlxsw_sp_mall_port_rule_del(struct mlxsw_sp_port *mlxsw_sp_port, } } -int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *f, bool ingress) +int mlxsw_sp_mall_replace(struct mlxsw_sp_flow_block *block, + struct tc_cls_matchall_offload *f) { + struct mlxsw_sp_flow_block_binding *binding; struct mlxsw_sp_mall_entry *mall_entry; __be16 protocol = f->common.protocol; struct flow_action_entry *act; int err; if (!flow_offload_has_one_action(&f->rule->action)) { - netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n"); + NL_SET_ERR_MSG(f->common.extack, "Only singular actions are supported"); + return -EOPNOTSUPP; + } + + if (f->common.chain_index) { + NL_SET_ERR_MSG(f->common.extack, "Only chain 0 is supported"); + return -EOPNOTSUPP; + } + + if (mlxsw_sp_flow_block_is_mixed_bound(block)) { + NL_SET_ERR_MSG(f->common.extack, "Only not mixed bound blocks are supported"); return -EOPNOTSUPP; } @@ -166,7 +177,7 @@ int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, if (!mall_entry) return -ENOMEM; mall_entry->cookie = f->cookie; - mall_entry->ingress = ingress; + mall_entry->ingress = mlxsw_sp_flow_block_is_ingress_bound(block); act = &f->rule->action.entries[0]; @@ -176,7 +187,7 @@ int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, } else if (act->id == FLOW_ACTION_SAMPLE && protocol == htons(ETH_P_ALL)) { if (act->sample.rate > MLXSW_REG_MPSC_RATE_MAX) { - netdev_err(mlxsw_sp_port->dev, "sample rate not supported\n"); + NL_SET_ERR_MSG(f->common.extack, "Sample rate not supported"); err = -EOPNOTSUPP; goto errout; } @@ -190,31 +201,78 @@ int mlxsw_sp_mall_replace(struct mlxsw_sp_port *mlxsw_sp_port, goto errout; } - err = mlxsw_sp_mall_port_rule_add(mlxsw_sp_port, mall_entry); - if (err) - goto errout; + list_for_each_entry(binding, &block->binding_list, list) { + err = mlxsw_sp_mall_port_rule_add(binding->mlxsw_sp_port, + mall_entry); + if (err) + goto rollback; + } - list_add_tail(&mall_entry->list, &mlxsw_sp_port->mall_list); + block->rule_count++; + if (mall_entry->ingress) + block->egress_blocker_rule_count++; + else + block->ingress_blocker_rule_count++; + list_add_tail(&mall_entry->list, &block->mall_list); return 0; +rollback: + list_for_each_entry_continue_reverse(binding, &block->binding_list, + list) + mlxsw_sp_mall_port_rule_del(binding->mlxsw_sp_port, mall_entry); errout: kfree(mall_entry); return err; } -void mlxsw_sp_mall_destroy(struct mlxsw_sp_port *mlxsw_sp_port, +void mlxsw_sp_mall_destroy(struct mlxsw_sp_flow_block *block, struct tc_cls_matchall_offload *f) { + struct mlxsw_sp_flow_block_binding *binding; struct mlxsw_sp_mall_entry *mall_entry; - mall_entry = mlxsw_sp_mall_entry_find(mlxsw_sp_port, f->cookie); + mall_entry = mlxsw_sp_mall_entry_find(block, f->cookie); if (!mall_entry) { - netdev_dbg(mlxsw_sp_port->dev, "tc entry not found on port\n"); + NL_SET_ERR_MSG(f->common.extack, "Entry not found"); return; } - mlxsw_sp_mall_port_rule_del(mlxsw_sp_port, mall_entry); - list_del(&mall_entry->list); + if (mall_entry->ingress) + block->egress_blocker_rule_count--; + else + block->ingress_blocker_rule_count--; + block->rule_count--; + list_for_each_entry(binding, &block->binding_list, list) + mlxsw_sp_mall_port_rule_del(binding->mlxsw_sp_port, mall_entry); kfree_rcu(mall_entry, rcu); /* sample RX packets may be in-flight */ } + +int mlxsw_sp_mall_port_bind(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp_mall_entry *mall_entry; + int err; + + list_for_each_entry(mall_entry, &block->mall_list, list) { + err = mlxsw_sp_mall_port_rule_add(mlxsw_sp_port, mall_entry); + if (err) + goto rollback; + } + return 0; + +rollback: + list_for_each_entry_continue_reverse(mall_entry, &block->mall_list, + list) + mlxsw_sp_mall_port_rule_del(mlxsw_sp_port, mall_entry); + return err; +} + +void mlxsw_sp_mall_port_unbind(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp_mall_entry *mall_entry; + + list_for_each_entry(mall_entry, &block->mall_list, list) + mlxsw_sp_mall_port_rule_del(mlxsw_sp_port, mall_entry); +} -- cgit v1.2.3-59-g8ed1b From 19f06771ca3dd8346972bc7627f9bcb7b6a8ce0b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:09 +0300 Subject: mlxsw: spectrum: Move flow offload binding into spectrum_flow.c Move the code taking case of setup of flow offload into spectrum_flow.c Do small renaming of callbacks on the way. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 173 ------------------- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 11 +- .../net/ethernet/mellanox/mlxsw/spectrum_flow.c | 188 ++++++++++++++++++++- 3 files changed, 181 insertions(+), 191 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index ceaf73ac2008..f78bde8bc16e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1350,178 +1350,6 @@ static int mlxsw_sp_port_kill_vid(struct net_device *dev, return 0; } -static int -mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_flow_block *flow_block, - struct tc_cls_matchall_offload *f) -{ - switch (f->command) { - case TC_CLSMATCHALL_REPLACE: - return mlxsw_sp_mall_replace(flow_block, f); - case TC_CLSMATCHALL_DESTROY: - mlxsw_sp_mall_destroy(flow_block, f); - return 0; - default: - return -EOPNOTSUPP; - } -} - -static int -mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_flow_block *flow_block, - struct flow_cls_offload *f) -{ - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_flow_block_mlxsw_sp(flow_block); - - switch (f->command) { - case FLOW_CLS_REPLACE: - return mlxsw_sp_flower_replace(mlxsw_sp, flow_block, f); - case FLOW_CLS_DESTROY: - mlxsw_sp_flower_destroy(mlxsw_sp, flow_block, f); - return 0; - case FLOW_CLS_STATS: - return mlxsw_sp_flower_stats(mlxsw_sp, flow_block, f); - case FLOW_CLS_TMPLT_CREATE: - return mlxsw_sp_flower_tmplt_create(mlxsw_sp, flow_block, f); - case FLOW_CLS_TMPLT_DESTROY: - mlxsw_sp_flower_tmplt_destroy(mlxsw_sp, flow_block, f); - return 0; - default: - return -EOPNOTSUPP; - } -} - -static int mlxsw_sp_setup_tc_block_cb(enum tc_setup_type type, - void *type_data, void *cb_priv) -{ - struct mlxsw_sp_flow_block *flow_block = cb_priv; - - if (mlxsw_sp_flow_block_disabled(flow_block)) - return -EOPNOTSUPP; - - switch (type) { - case TC_SETUP_CLSMATCHALL: - return mlxsw_sp_setup_tc_cls_matchall(flow_block, type_data); - case TC_SETUP_CLSFLOWER: - return mlxsw_sp_setup_tc_cls_flower(flow_block, type_data); - default: - return -EOPNOTSUPP; - } -} - -static void mlxsw_sp_tc_block_release(void *cb_priv) -{ - struct mlxsw_sp_flow_block *flow_block = cb_priv; - - mlxsw_sp_flow_block_destroy(flow_block); -} - -static LIST_HEAD(mlxsw_sp_block_cb_list); - -static int mlxsw_sp_setup_tc_block_bind(struct mlxsw_sp_port *mlxsw_sp_port, - struct flow_block_offload *f, - bool ingress) -{ - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - struct mlxsw_sp_flow_block *flow_block; - struct flow_block_cb *block_cb; - bool register_block = false; - int err; - - block_cb = flow_block_cb_lookup(f->block, mlxsw_sp_setup_tc_block_cb, - mlxsw_sp); - if (!block_cb) { - flow_block = mlxsw_sp_flow_block_create(mlxsw_sp, f->net); - if (!flow_block) - return -ENOMEM; - block_cb = flow_block_cb_alloc(mlxsw_sp_setup_tc_block_cb, - mlxsw_sp, flow_block, - mlxsw_sp_tc_block_release); - if (IS_ERR(block_cb)) { - mlxsw_sp_flow_block_destroy(flow_block); - err = PTR_ERR(block_cb); - goto err_cb_register; - } - register_block = true; - } else { - flow_block = flow_block_cb_priv(block_cb); - } - flow_block_cb_incref(block_cb); - err = mlxsw_sp_flow_block_bind(mlxsw_sp, flow_block, - mlxsw_sp_port, ingress, f->extack); - if (err) - goto err_block_bind; - - if (ingress) - mlxsw_sp_port->ing_flow_block = flow_block; - else - mlxsw_sp_port->eg_flow_block = flow_block; - - if (register_block) { - flow_block_cb_add(block_cb, f); - list_add_tail(&block_cb->driver_list, &mlxsw_sp_block_cb_list); - } - - return 0; - -err_block_bind: - if (!flow_block_cb_decref(block_cb)) - flow_block_cb_free(block_cb); -err_cb_register: - return err; -} - -static void mlxsw_sp_setup_tc_block_unbind(struct mlxsw_sp_port *mlxsw_sp_port, - struct flow_block_offload *f, - bool ingress) -{ - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - struct mlxsw_sp_flow_block *flow_block; - struct flow_block_cb *block_cb; - int err; - - block_cb = flow_block_cb_lookup(f->block, mlxsw_sp_setup_tc_block_cb, - mlxsw_sp); - if (!block_cb) - return; - - if (ingress) - mlxsw_sp_port->ing_flow_block = NULL; - else - mlxsw_sp_port->eg_flow_block = NULL; - - flow_block = flow_block_cb_priv(block_cb); - err = mlxsw_sp_flow_block_unbind(mlxsw_sp, flow_block, - mlxsw_sp_port, ingress); - if (!err && !flow_block_cb_decref(block_cb)) { - flow_block_cb_remove(block_cb, f); - list_del(&block_cb->driver_list); - } -} - -static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, - struct flow_block_offload *f) -{ - bool ingress; - - if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) - ingress = true; - else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) - ingress = false; - else - return -EOPNOTSUPP; - - f->driver_block_list = &mlxsw_sp_block_cb_list; - - switch (f->command) { - case FLOW_BLOCK_BIND: - return mlxsw_sp_setup_tc_block_bind(mlxsw_sp_port, f, ingress); - case FLOW_BLOCK_UNBIND: - mlxsw_sp_setup_tc_block_unbind(mlxsw_sp_port, f, ingress); - return 0; - default: - return -EOPNOTSUPP; - } -} - static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { @@ -1545,7 +1373,6 @@ static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, } } - static int mlxsw_sp_feature_hw_tc(struct net_device *dev, bool enable) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 57d320728914..a12ca673c224 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -708,15 +708,8 @@ mlxsw_sp_flow_block_is_mixed_bound(const struct mlxsw_sp_flow_block *block) struct mlxsw_sp_flow_block *mlxsw_sp_flow_block_create(struct mlxsw_sp *mlxsw_sp, struct net *net); void mlxsw_sp_flow_block_destroy(struct mlxsw_sp_flow_block *block); -int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_flow_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress, - struct netlink_ext_ack *extack); -int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_flow_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress); +int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f); /* spectrum_acl.c */ struct mlxsw_sp_acl_ruleset; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c index 51de6aca1930..ecab581ff956 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c @@ -49,11 +49,11 @@ mlxsw_sp_flow_block_ruleset_bound(const struct mlxsw_sp_flow_block *block) return block->ruleset_zero; } -int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_flow_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress, - struct netlink_ext_ack *extack) +static int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress, + struct netlink_ext_ack *extack) { struct mlxsw_sp_flow_block_binding *binding; int err; @@ -104,10 +104,10 @@ err_binding_alloc: return err; } -int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_flow_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress) +static int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) { struct mlxsw_sp_flow_block_binding *binding; @@ -131,3 +131,173 @@ int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, return 0; } + +static int mlxsw_sp_flow_block_mall_cb(struct mlxsw_sp_flow_block *flow_block, + struct tc_cls_matchall_offload *f) +{ + switch (f->command) { + case TC_CLSMATCHALL_REPLACE: + return mlxsw_sp_mall_replace(flow_block, f); + case TC_CLSMATCHALL_DESTROY: + mlxsw_sp_mall_destroy(flow_block, f); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int mlxsw_sp_flow_block_flower_cb(struct mlxsw_sp_flow_block *flow_block, + struct flow_cls_offload *f) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_flow_block_mlxsw_sp(flow_block); + + switch (f->command) { + case FLOW_CLS_REPLACE: + return mlxsw_sp_flower_replace(mlxsw_sp, flow_block, f); + case FLOW_CLS_DESTROY: + mlxsw_sp_flower_destroy(mlxsw_sp, flow_block, f); + return 0; + case FLOW_CLS_STATS: + return mlxsw_sp_flower_stats(mlxsw_sp, flow_block, f); + case FLOW_CLS_TMPLT_CREATE: + return mlxsw_sp_flower_tmplt_create(mlxsw_sp, flow_block, f); + case FLOW_CLS_TMPLT_DESTROY: + mlxsw_sp_flower_tmplt_destroy(mlxsw_sp, flow_block, f); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int mlxsw_sp_flow_block_cb(enum tc_setup_type type, + void *type_data, void *cb_priv) +{ + struct mlxsw_sp_flow_block *flow_block = cb_priv; + + if (mlxsw_sp_flow_block_disabled(flow_block)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSMATCHALL: + return mlxsw_sp_flow_block_mall_cb(flow_block, type_data); + case TC_SETUP_CLSFLOWER: + return mlxsw_sp_flow_block_flower_cb(flow_block, type_data); + default: + return -EOPNOTSUPP; + } +} + +static void mlxsw_sp_tc_block_release(void *cb_priv) +{ + struct mlxsw_sp_flow_block *flow_block = cb_priv; + + mlxsw_sp_flow_block_destroy(flow_block); +} + +static LIST_HEAD(mlxsw_sp_block_cb_list); + +static int mlxsw_sp_setup_tc_block_bind(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f, + bool ingress) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_flow_block *flow_block; + struct flow_block_cb *block_cb; + bool register_block = false; + int err; + + block_cb = flow_block_cb_lookup(f->block, mlxsw_sp_flow_block_cb, + mlxsw_sp); + if (!block_cb) { + flow_block = mlxsw_sp_flow_block_create(mlxsw_sp, f->net); + if (!flow_block) + return -ENOMEM; + block_cb = flow_block_cb_alloc(mlxsw_sp_flow_block_cb, + mlxsw_sp, flow_block, + mlxsw_sp_tc_block_release); + if (IS_ERR(block_cb)) { + mlxsw_sp_flow_block_destroy(flow_block); + err = PTR_ERR(block_cb); + goto err_cb_register; + } + register_block = true; + } else { + flow_block = flow_block_cb_priv(block_cb); + } + flow_block_cb_incref(block_cb); + err = mlxsw_sp_flow_block_bind(mlxsw_sp, flow_block, + mlxsw_sp_port, ingress, f->extack); + if (err) + goto err_block_bind; + + if (ingress) + mlxsw_sp_port->ing_flow_block = flow_block; + else + mlxsw_sp_port->eg_flow_block = flow_block; + + if (register_block) { + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, &mlxsw_sp_block_cb_list); + } + + return 0; + +err_block_bind: + if (!flow_block_cb_decref(block_cb)) + flow_block_cb_free(block_cb); +err_cb_register: + return err; +} + +static void mlxsw_sp_setup_tc_block_unbind(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f, + bool ingress) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_flow_block *flow_block; + struct flow_block_cb *block_cb; + int err; + + block_cb = flow_block_cb_lookup(f->block, mlxsw_sp_flow_block_cb, + mlxsw_sp); + if (!block_cb) + return; + + if (ingress) + mlxsw_sp_port->ing_flow_block = NULL; + else + mlxsw_sp_port->eg_flow_block = NULL; + + flow_block = flow_block_cb_priv(block_cb); + err = mlxsw_sp_flow_block_unbind(mlxsw_sp, flow_block, + mlxsw_sp_port, ingress); + if (!err && !flow_block_cb_decref(block_cb)) { + flow_block_cb_remove(block_cb, f); + list_del(&block_cb->driver_list); + } +} + +int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f) +{ + bool ingress; + + if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + ingress = true; + else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) + ingress = false; + else + return -EOPNOTSUPP; + + f->driver_block_list = &mlxsw_sp_block_cb_list; + + switch (f->command) { + case FLOW_BLOCK_BIND: + return mlxsw_sp_setup_tc_block_bind(mlxsw_sp_port, f, ingress); + case FLOW_BLOCK_UNBIND: + mlxsw_sp_setup_tc_block_unbind(mlxsw_sp_port, f, ingress); + return 0; + default: + return -EOPNOTSUPP; + } +} -- cgit v1.2.3-59-g8ed1b From 075c8aa79d541ea08c67a2e6d955f6457e98c21c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:10 +0300 Subject: selftests: forwarding: tc_actions.sh: add matchall mirror test Add test for matchall classifier with mirred egress mirror action. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../testing/selftests/net/forwarding/tc_actions.sh | 26 +++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index 813d02d1939d..d9eca227136b 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -2,7 +2,8 @@ # SPDX-License-Identifier: GPL-2.0 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ - mirred_egress_mirror_test gact_trap_test" + mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ + gact_trap_test" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -50,6 +51,9 @@ switch_destroy() mirred_egress_test() { local action=$1 + local protocol=$2 + local classifier=$3 + local classifier_args=$4 RET=0 @@ -62,9 +66,9 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_fail $? "Matched without redirect rule inserted" - tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ - $tcflags dst_ip 192.0.2.2 action mirred egress $action \ - dev $swp2 + tc filter add dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier $tcflags $classifier_args \ + action mirred egress $action dev $swp2 $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ -t ip -q @@ -72,10 +76,11 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_err $? "Did not match incoming $action packet" - tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + tc filter del dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower - log_test "mirred egress $action ($tcflags)" + log_test "mirred egress $classifier $action ($tcflags)" } gact_drop_and_ok_test() @@ -187,12 +192,17 @@ cleanup() mirred_egress_redirect_test() { - mirred_egress_test "redirect" + mirred_egress_test "redirect" "ip" "flower" "dst_ip 192.0.2.2" } mirred_egress_mirror_test() { - mirred_egress_test "mirror" + mirred_egress_test "mirror" "ip" "flower" "dst_ip 192.0.2.2" +} + +matchall_mirred_egress_mirror_test() +{ + mirred_egress_test "mirror" "all" "matchall" "" } trap cleanup EXIT -- cgit v1.2.3-59-g8ed1b From fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:33 +0200 Subject: netfilter: nf_tables: allow up to 64 bytes in the set element data area So far, the set elements could store up to 128-bits in the data area. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 38 ++++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4ff7c81e6717..d4e29c952c40 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -243,6 +243,10 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; + union { + u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; + struct nft_data val; + } data; void *priv; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9780bd93b7e4..3558e76e2733 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4669,6 +4669,25 @@ static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, return 0; } +static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data_desc *desc, + struct nft_data *data, + struct nlattr *attr) +{ + int err; + + err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); + if (err < 0) + return err; + + if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { + nft_data_release(data, desc->type); + return -EINVAL; + } + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -4946,7 +4965,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr = NULL; struct nft_userdata *udata; struct nft_data_desc desc; - struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; u32 flags = 0; @@ -5072,15 +5090,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } if (nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &data, sizeof(data), &desc, - nla[NFTA_SET_ELEM_DATA]); + err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val, + nla[NFTA_SET_ELEM_DATA]); if (err < 0) goto err_parse_key_end; - err = -EINVAL; - if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen) - goto err_parse_data; - dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { @@ -5094,14 +5108,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, continue; err = nft_validate_register_store(&bind_ctx, dreg, - &data, + &elem.data.val, desc.type, desc.len); if (err < 0) goto err_parse_data; if (desc.type == NFT_DATA_VERDICT && - (data.verdict.code == NFT_GOTO || - data.verdict.code == NFT_JUMP)) + (elem.data.val.verdict.code == NFT_GOTO || + elem.data.val.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->net, NFT_VALIDATE_NEED); } @@ -5123,7 +5137,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, - elem.key_end.val.data, data.data, + elem.key_end.val.data, elem.data.val.data, timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) goto err_parse_data; @@ -5201,7 +5215,7 @@ err_trans: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_release(&data, desc.type); + nft_data_release(&elem.data.val, desc.type); err_parse_key_end: nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: -- cgit v1.2.3-59-g8ed1b From 8c1b2bf16d5944cd5c3a8a72e24ed9e22360c1af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:34 +0200 Subject: bpf, cgroup: Remove unused exports Except for a few of the networking hooks called from modular ipv4 or ipv6 code, all of hooks are just called from guaranteed to be built-in code. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/20200424064338.538313-2-hch@lst.de --- kernel/bpf/cgroup.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4d748c5785bc..fc7c7002fd37 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1054,7 +1054,6 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return !allow; } -EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -1207,7 +1206,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, return ret == 1 ? 0 : -EPERM; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, @@ -1312,7 +1310,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int optname, char __user *optval, @@ -1399,7 +1396,6 @@ out: sockopt_free_buf(&ctx); return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, -- cgit v1.2.3-59-g8ed1b From 0c2006b29e5f62784c70209e71da7876267e0e2d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 27 Apr 2020 21:07:00 +0200 Subject: r8169: improve error message if no dedicated PHY driver is found There's a number of consumer mainboards where the BIOS leaves the PHY in a state that it's reporting an invalid PHY ID. To detect such cases add the PHY ID to the error message if no dedicated PHY driver is found. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index f70e36c20431..68d5255568a5 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5197,7 +5197,8 @@ static int r8169_mdio_register(struct rtl8169_private *tp) /* Most chip versions fail with the genphy driver. * Therefore ensure that the dedicated PHY driver is loaded. */ - dev_err(&pdev->dev, "realtek.ko not loaded, maybe it needs to be added to initramfs?\n"); + dev_err(&pdev->dev, "no dedicated PHY driver found for PHY ID 0x%08x, maybe realtek.ko needs to be added to initramfs?\n", + tp->phydev->phy_id); return -EUNATCH; } -- cgit v1.2.3-59-g8ed1b From 2ac757e4152e3322a04a6dfb3d1fa010d3521abf Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 27 Apr 2020 09:33:43 +0000 Subject: net: ethernet: ti: fix return value check in k3_cppi_desc_pool_create_name() In case of error, the function gen_pool_create() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/k3-cppi-desc-pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/k3-cppi-desc-pool.c b/drivers/net/ethernet/ti/k3-cppi-desc-pool.c index ad7cfc1316ce..38cc12f9f133 100644 --- a/drivers/net/ethernet/ti/k3-cppi-desc-pool.c +++ b/drivers/net/ethernet/ti/k3-cppi-desc-pool.c @@ -64,8 +64,8 @@ k3_cppi_desc_pool_create_name(struct device *dev, size_t size, return ERR_PTR(-ENOMEM); pool->gen_pool = gen_pool_create(ilog2(pool->desc_size), -1); - if (IS_ERR(pool->gen_pool)) { - ret = PTR_ERR(pool->gen_pool); + if (!pool->gen_pool) { + ret = -ENOMEM; dev_err(pool->dev, "pool create failed %d\n", ret); kfree_const(pool_name); goto gen_pool_create_fail; -- cgit v1.2.3-59-g8ed1b From 0d7c83463fdf7841350f37960a7abadd3e650b41 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:34 +0200 Subject: netfilter: nft_nat: return EOPNOTSUPP if type or flags are not supported Instead of EINVAL which should be used for malformed netlink messages. Fixes: eb31628e37a0 ("netfilter: nf_tables: Add support for IPv6 NAT") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 8b44a4de5329..bb49a217635e 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -129,7 +129,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->type = NF_NAT_MANIP_DST; break; default: - return -EINVAL; + return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) @@ -196,7 +196,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (tb[NFTA_NAT_FLAGS]) { priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; + return -EOPNOTSUPP; } return nf_ct_netns_get(ctx->net, family); -- cgit v1.2.3-59-g8ed1b From 4566aa440008103c9bd364f38c03ca5309acc8f4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:35 +0200 Subject: netfilter: nft_nat: set flags from initialization path This patch sets the NAT flags from the control plane path. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index bb49a217635e..5c7ff213c030 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -55,7 +55,6 @@ static void nft_nat_eval(const struct nft_expr *expr, ®s->data[priv->sreg_addr_max], sizeof(range.max_addr.ip6)); } - range.flags |= NF_NAT_RANGE_MAP_IPS; } if (priv->sreg_proto_min) { @@ -63,10 +62,9 @@ static void nft_nat_eval(const struct nft_expr *expr, ®s->data[priv->sreg_proto_min]); range.max_proto.all = (__force __be16)nft_reg_load16( ®s->data[priv->sreg_proto_max]); - range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - range.flags |= priv->flags; + range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } @@ -169,6 +167,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else { priv->sreg_addr_max = priv->sreg_addr_min; } + + priv->flags |= NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_addr.all); @@ -191,10 +191,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else { priv->sreg_proto_max = priv->sreg_proto_min; } + + priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) { - priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); + priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) return -EOPNOTSUPP; } -- cgit v1.2.3-59-g8ed1b From acd766e31bb96b90c2dc4954f86e573c9ac16c66 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:36 +0200 Subject: netfilter: nft_nat: add helper function to set up NAT address and protocol This patch add nft_nat_setup_addr() and nft_nat_setup_proto() to set up the NAT mangling. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 56 ++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 5c7ff213c030..7442aa8b1555 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -30,6 +30,36 @@ struct nft_nat { u16 flags; }; +static void nft_nat_setup_addr(struct nf_nat_range2 *range, + const struct nft_regs *regs, + const struct nft_nat *priv) +{ + switch (priv->family) { + case AF_INET: + range->min_addr.ip = (__force __be32) + regs->data[priv->sreg_addr_min]; + range->max_addr.ip = (__force __be32) + regs->data[priv->sreg_addr_max]; + break; + case AF_INET6: + memcpy(range->min_addr.ip6, ®s->data[priv->sreg_addr_min], + sizeof(range->min_addr.ip6)); + memcpy(range->max_addr.ip6, ®s->data[priv->sreg_addr_max], + sizeof(range->max_addr.ip6)); + break; + } +} + +static void nft_nat_setup_proto(struct nf_nat_range2 *range, + const struct nft_regs *regs, + const struct nft_nat *priv) +{ + range->min_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_min]); + range->max_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_max]); +} + static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -40,29 +70,11 @@ static void nft_nat_eval(const struct nft_expr *expr, struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); - if (priv->sreg_addr_min) { - if (priv->family == AF_INET) { - range.min_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_min]; - range.max_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_max]; - - } else { - memcpy(range.min_addr.ip6, - ®s->data[priv->sreg_addr_min], - sizeof(range.min_addr.ip6)); - memcpy(range.max_addr.ip6, - ®s->data[priv->sreg_addr_max], - sizeof(range.max_addr.ip6)); - } - } + if (priv->sreg_addr_min) + nft_nat_setup_addr(&range, regs, priv); - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - } + if (priv->sreg_proto_min) + nft_nat_setup_proto(&range, regs, priv); range.flags = priv->flags; -- cgit v1.2.3-59-g8ed1b From 3ff7ddb1353da9b535e65702704cbadea1da9a00 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:37 +0200 Subject: netfilter: nft_nat: add netmap support This patch allows you to NAT the network address prefix onto another network address prefix, a.k.a. netmapping. Userspace must specify the NF_NAT_RANGE_NETMAP flag and the prefix address through the NFTA_NAT_REG_ADDR_MIN and NFTA_NAT_REG_ADDR_MAX netlink attributes. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_nat.h | 4 ++- net/netfilter/nft_nat.c | 46 ++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index 4a95c0db14d4..a64586e77b24 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -11,6 +11,7 @@ #define NF_NAT_RANGE_PERSISTENT (1 << 3) #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) #define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) +#define NF_NAT_RANGE_NETMAP (1 << 6) #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -18,7 +19,8 @@ #define NF_NAT_RANGE_MASK \ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ - NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET) + NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \ + NF_NAT_RANGE_NETMAP) struct nf_nat_ipv4_range { unsigned int flags; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 7442aa8b1555..23a7bfd10521 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -60,6 +60,46 @@ static void nft_nat_setup_proto(struct nf_nat_range2 *range, nft_reg_load16(®s->data[priv->sreg_proto_max]); } +static void nft_nat_setup_netmap(struct nf_nat_range2 *range, + const struct nft_pktinfo *pkt, + const struct nft_nat *priv) +{ + struct sk_buff *skb = pkt->skb; + union nf_inet_addr new_addr; + __be32 netmask; + int i, len = 0; + + switch (priv->type) { + case NFT_NAT_SNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->saddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->saddr; + len = sizeof(struct in6_addr); + } + break; + case NFT_NAT_DNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->daddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->daddr; + len = sizeof(struct in6_addr); + } + break; + } + + for (i = 0; i < len / sizeof(__be32); i++) { + netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); + new_addr.ip6[i] &= ~netmask; + new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; + } + + range->min_addr = new_addr; + range->max_addr = new_addr; +} + static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -70,8 +110,12 @@ static void nft_nat_eval(const struct nft_expr *expr, struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); - if (priv->sreg_addr_min) + + if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); + if (priv->flags & NF_NAT_RANGE_NETMAP) + nft_nat_setup_netmap(&range, pkt, priv); + } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); -- cgit v1.2.3-59-g8ed1b From 3d8bf50860c7de09c9713b97ec2f87ad42338c7e Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Fri, 24 Apr 2020 18:12:55 +0800 Subject: rtw88: fix sparse warnings for download firmware routine sparse warnings: (new ones prefixed by >>) >> drivers/net/wireless/realtek/rtw88/mac.c:653:5: sparse: sparse: symbol '__rtw_download_firmware' was not declared. Should it be static? >> drivers/net/wireless/realtek/rtw88/mac.c:817:5: sparse: sparse: symbol '__rtw_download_firmware_legacy' was not declared. Should it be static? Fixes: 15d2fcc6b2de ("rtw88: add legacy firmware download for 8723D devices") Reported-by: kbuild test robot Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200424101255.28239-1-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/mac.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/mac.c b/drivers/net/wireless/realtek/rtw88/mac.c index 645207a01525..6969379ba37e 100644 --- a/drivers/net/wireless/realtek/rtw88/mac.c +++ b/drivers/net/wireless/realtek/rtw88/mac.c @@ -696,7 +696,8 @@ static void download_firmware_end_flow(struct rtw_dev *rtwdev) rtw_write16(rtwdev, REG_MCUFW_CTRL, fw_ctrl); } -int __rtw_download_firmware(struct rtw_dev *rtwdev, struct rtw_fw_state *fw) +static int __rtw_download_firmware(struct rtw_dev *rtwdev, + struct rtw_fw_state *fw) { struct rtw_backup_info bckp[DLFW_RESTORE_REG_NUM]; const u8 *data = fw->firmware->data; @@ -860,7 +861,8 @@ static int download_firmware_validate_legacy(struct rtw_dev *rtwdev) return -EINVAL; } -int __rtw_download_firmware_legacy(struct rtw_dev *rtwdev, struct rtw_fw_state *fw) +static int __rtw_download_firmware_legacy(struct rtw_dev *rtwdev, + struct rtw_fw_state *fw) { int ret = 0; -- cgit v1.2.3-59-g8ed1b From 2aad9f81d34c3ecb6b5dbfd7c64a76298d40c361 Mon Sep 17 00:00:00 2001 From: John Oldman Date: Fri, 24 Apr 2020 18:50:43 +0100 Subject: ssb: sprom: fix block comments coding style issues Fixed coding style issues Signed-off-by: John Oldman Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200424175043.16261-1-john.oldman@polehill.co.uk --- drivers/ssb/sprom.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/ssb/sprom.c b/drivers/ssb/sprom.c index 52d2e0f33be7..42d620cee8a9 100644 --- a/drivers/ssb/sprom.c +++ b/drivers/ssb/sprom.c @@ -78,7 +78,8 @@ ssize_t ssb_attr_sprom_show(struct ssb_bus *bus, char *buf, /* Use interruptible locking, as the SPROM write might * be holding the lock for several seconds. So allow userspace - * to cancel operation. */ + * to cancel operation. + */ err = -ERESTARTSYS; if (mutex_lock_interruptible(&bus->sprom_mutex)) goto out_kfree; @@ -121,7 +122,8 @@ ssize_t ssb_attr_sprom_store(struct ssb_bus *bus, /* Use interruptible locking, as the SPROM write might * be holding the lock for several seconds. So allow userspace - * to cancel operation. */ + * to cancel operation. + */ err = -ERESTARTSYS; if (mutex_lock_interruptible(&bus->sprom_mutex)) goto out_kfree; @@ -188,9 +190,11 @@ int ssb_fill_sprom_with_fallback(struct ssb_bus *bus, struct ssb_sprom *out) bool ssb_is_sprom_available(struct ssb_bus *bus) { /* status register only exists on chipcomon rev >= 11 and we need check - for >= 31 only */ + * for >= 31 only + */ /* this routine differs from specs as we do not access SPROM directly - on PCMCIA */ + * on PCMCIA + */ if (bus->bustype == SSB_BUSTYPE_PCI && bus->chipco.dev && /* can be unavailable! */ bus->chipco.dev->id.revision >= 31) -- cgit v1.2.3-59-g8ed1b From 86501437d88532952a21b427d2ffd1683870d369 Mon Sep 17 00:00:00 2001 From: John Oldman Date: Sat, 25 Apr 2020 16:52:33 +0100 Subject: ssb: scan: fix block comments coding style issues Fixed coding style issues Signed-off-by: John Oldman Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200425155233.19624-1-john.oldman@polehill.co.uk --- drivers/ssb/scan.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/ssb/scan.c b/drivers/ssb/scan.c index 6ceee98ed6ff..b97a5c32d44a 100644 --- a/drivers/ssb/scan.c +++ b/drivers/ssb/scan.c @@ -400,7 +400,8 @@ int ssb_bus_scan(struct ssb_bus *bus, #ifdef CONFIG_SSB_DRIVER_PCICORE if (bus->bustype == SSB_BUSTYPE_PCI) { /* Ignore PCI cores on PCI-E cards. - * Ignore PCI-E cores on PCI cards. */ + * Ignore PCI-E cores on PCI cards. + */ if (dev->id.coreid == SSB_DEV_PCI) { if (pci_is_pcie(bus->host_pci)) continue; @@ -421,7 +422,8 @@ int ssb_bus_scan(struct ssb_bus *bus, if (bus->host_pci->vendor == PCI_VENDOR_ID_BROADCOM && (bus->host_pci->device & 0xFF00) == 0x4300) { /* This is a dangling ethernet core on a - * wireless device. Ignore it. */ + * wireless device. Ignore it. + */ continue; } } -- cgit v1.2.3-59-g8ed1b From d6cae2bc195b558ba79315eae699138ebdf41b57 Mon Sep 17 00:00:00 2001 From: Sergey Ryazanov Date: Fri, 24 Apr 2020 03:49:18 +0300 Subject: ath9k: fix AR9002 ADC and NF calibrations ADC calibration is only required for a 80 MHz sampling rate (i.e. for 40 MHz channels), when the chip utilizes the pair of ADCs in interleved mode. Calibration on a 20 MHz channel will never be completed. Previous channel check is trying to exclude all channels where the calibration will get stuck. It effectively blocks the calibration run for HT20 channels, but fails to exclude 20 MHz channels without HT (e.g. legacy mode channels). Fix this issue by reworking the channel check to explicitly allow ADCs gain & DC offset calibrations for HT40 channels only. Also update the complicated comment to make it clear that these calibrations are for multi-ADC mode only. Stuck ADCs calibration blocks the NF calibration, what could make it impossible to work in a noisy evironment: too big Rx attentuation, invalid RSSI value, etc. So this change is actually more of a NF calibration fix rather then the ADC calibration fix. Run tested with AR9220. Signed-off-by: Sergey Ryazanov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200424004923.17129-2-ryazanov.s.a@gmail.com --- drivers/net/wireless/ath/ath9k/ar9002_calib.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/ar9002_calib.c b/drivers/net/wireless/ath/ath9k/ar9002_calib.c index fd9db8ca99d7..14eee06744ed 100644 --- a/drivers/net/wireless/ath/ath9k/ar9002_calib.c +++ b/drivers/net/wireless/ath/ath9k/ar9002_calib.c @@ -37,9 +37,8 @@ static bool ar9002_hw_is_cal_supported(struct ath_hw *ah, break; case ADC_GAIN_CAL: case ADC_DC_CAL: - /* Run ADC Gain Cal for non-CCK & non 2GHz-HT20 only */ - if (!((IS_CHAN_2GHZ(chan) || IS_CHAN_A_FAST_CLOCK(ah, chan)) && - IS_CHAN_HT20(chan))) + /* Run even/odd ADCs calibrations for HT40 channels only */ + if (IS_CHAN_HT40(chan)) supported = true; break; } -- cgit v1.2.3-59-g8ed1b From 93f8d4223163c6ba4f2ec8ade4832ef40e2b1d0c Mon Sep 17 00:00:00 2001 From: Sergey Ryazanov Date: Fri, 24 Apr 2020 03:49:19 +0300 Subject: ath9k: remove needless NFCAL_PENDING flag setting The NFCAL_PENDING flag is set by the ath9k_hw_start_nfcal() routine, so there is no reason to set it manually after calling it during the AR9002 calibrations initialization. Signed-off-by: Sergey Ryazanov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200424004923.17129-3-ryazanov.s.a@gmail.com --- drivers/net/wireless/ath/ath9k/ar9002_calib.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/ar9002_calib.c b/drivers/net/wireless/ath/ath9k/ar9002_calib.c index 14eee06744ed..0f7c5812e5c2 100644 --- a/drivers/net/wireless/ath/ath9k/ar9002_calib.c +++ b/drivers/net/wireless/ath/ath9k/ar9002_calib.c @@ -857,9 +857,6 @@ static bool ar9002_hw_init_cal(struct ath_hw *ah, struct ath9k_channel *chan) ath9k_hw_loadnf(ah, chan); ath9k_hw_start_nfcal(ah, true); - if (ah->caldata) - set_bit(NFCAL_PENDING, &ah->caldata->cal_flags); - ah->cal_list = ah->cal_list_last = ah->cal_list_curr = NULL; /* Enable IQ, ADC Gain and ADC DC offset CALs */ -- cgit v1.2.3-59-g8ed1b From 41ba50fd6cac084a93e7651c528ef4bc37996ee2 Mon Sep 17 00:00:00 2001 From: Sergey Ryazanov Date: Fri, 24 Apr 2020 03:49:20 +0300 Subject: ath9k: do not miss longcal on AR9002 Each of AGC & I/Q calibrations can take a long time. Long calibration and NF calibration in particular are forbiden for parallel run with ADC & I/Q calibrations. So, the chip could not be ready to perform the long calibration at the time of request. And a request to perform the long calibration may be lost. In order to fix this, preserve the long calibration request as a calibration state flag and restore the long calibration request each time the calibration function is called again (i.e. on each subsequent ivocation of the short calibration). This feature will be twice useful after the next change, which will make it possible to start the long calibration before all ADCs & I/Q calibrations are completed. Run tested with AR9220. Signed-off-by: Sergey Ryazanov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200424004923.17129-4-ryazanov.s.a@gmail.com --- drivers/net/wireless/ath/ath9k/ar9002_calib.c | 10 +++++++++- drivers/net/wireless/ath/ath9k/hw.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/ar9002_calib.c b/drivers/net/wireless/ath/ath9k/ar9002_calib.c index 0f7c5812e5c2..ad8db7720993 100644 --- a/drivers/net/wireless/ath/ath9k/ar9002_calib.c +++ b/drivers/net/wireless/ath/ath9k/ar9002_calib.c @@ -663,8 +663,13 @@ static int ar9002_hw_calibrate(struct ath_hw *ah, struct ath9k_channel *chan, int ret; nfcal = !!(REG_READ(ah, AR_PHY_AGC_CONTROL) & AR_PHY_AGC_CONTROL_NF); - if (ah->caldata) + if (ah->caldata) { nfcal_pending = test_bit(NFCAL_PENDING, &ah->caldata->cal_flags); + if (longcal) /* Remember to not miss */ + set_bit(LONGCAL_PENDING, &ah->caldata->cal_flags); + else if (test_bit(LONGCAL_PENDING, &ah->caldata->cal_flags)) + longcal = true; /* Respin a previous one */ + } percal_pending = (currCal && (currCal->calState == CAL_RUNNING || @@ -700,6 +705,9 @@ static int ar9002_hw_calibrate(struct ath_hw *ah, struct ath9k_channel *chan, } if (longcal) { + if (ah->caldata) + clear_bit(LONGCAL_PENDING, + &ah->caldata->cal_flags); ath9k_hw_start_nfcal(ah, false); /* Do periodic PAOffset Cal */ ar9002_hw_pa_cal(ah, false); diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h index 2e4489700a85..c99f3c77c823 100644 --- a/drivers/net/wireless/ath/ath9k/hw.h +++ b/drivers/net/wireless/ath/ath9k/hw.h @@ -427,6 +427,7 @@ enum ath9k_cal_flags { TXIQCAL_DONE, TXCLCAL_DONE, SW_PKDET_DONE, + LONGCAL_PENDING, }; struct ath9k_hw_cal_data { -- cgit v1.2.3-59-g8ed1b From 2bb7027b64b68bf8620b849d6ec1c223572c7e92 Mon Sep 17 00:00:00 2001 From: Sergey Ryazanov Date: Fri, 24 Apr 2020 03:49:21 +0300 Subject: ath9k: interleaved NF calibration on AR9002 NF calibration and other elements of long calibration are usually faster than ADCs & I/Q calibrations due to independence of receiption of the OFDM signal. Moreover sometime I/Q calibration can not be completed at all without preceding NF calibration. This is due to AGC, which has a habit to block a weak signal without regular NF calibration. Thus, we do not need to deferr the long calibration forever. So, if the long calibration is requested, then deferr the ADCs & I/Q calibration(s) and run the longcal (the NF calibration in particular) to obtain fresh noise data. Run tested with AR9220. Signed-off-by: Sergey Ryazanov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200424004923.17129-5-ryazanov.s.a@gmail.com --- drivers/net/wireless/ath/ath9k/ar9002_calib.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/ar9002_calib.c b/drivers/net/wireless/ath/ath9k/ar9002_calib.c index ad8db7720993..68188f500949 100644 --- a/drivers/net/wireless/ath/ath9k/ar9002_calib.c +++ b/drivers/net/wireless/ath/ath9k/ar9002_calib.c @@ -680,8 +680,12 @@ static int ar9002_hw_calibrate(struct ath_hw *ah, struct ath9k_channel *chan, return 0; ah->cal_list_curr = currCal = currCal->calNext; - if (currCal->calState == CAL_WAITING) - ath9k_hw_reset_calibration(ah, currCal); + percal_pending = currCal->calState == CAL_WAITING; + } + + /* Do not start a next calibration if the longcal is in action */ + if (percal_pending && !nfcal && !longcal) { + ath9k_hw_reset_calibration(ah, currCal); return 0; } -- cgit v1.2.3-59-g8ed1b From ded6ff15a1911af7dd641b4cc1a1a2e161f08e1f Mon Sep 17 00:00:00 2001 From: Sergey Ryazanov Date: Fri, 24 Apr 2020 03:49:22 +0300 Subject: ath9k: invalidate all calibrations at once Previously after the calibration validity period is over, calibrations are invalidated in a one at time manner. So, for AR9002 family, which has three calibrations, the full recalibration interval becomes 3 x ATH_RESTART_CALINTERVAL. And each next calibration will be separated by the ATH_RESTART_CALINTERVAL time from a previous one. It seems like it is better to do whole recalibration at once. Also, this change makes the driver behaviour a little simpler. So, invalidate all calibrations at once at the end of the calibration validity interval. This change affects only AR9002 chips family, since the AR9003 utilize only a single calibration. Signed-off-by: Sergey Ryazanov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200424004923.17129-6-ryazanov.s.a@gmail.com --- drivers/net/wireless/ath/ath9k/calib.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/calib.c b/drivers/net/wireless/ath/ath9k/calib.c index 695c779ae8cf..2ac3eefd3851 100644 --- a/drivers/net/wireless/ath/ath9k/calib.c +++ b/drivers/net/wireless/ath/ath9k/calib.c @@ -209,14 +209,17 @@ bool ath9k_hw_reset_calvalid(struct ath_hw *ah) return true; } - if (!(ah->supp_cals & currCal->calData->calType)) - return true; + currCal = ah->cal_list; + do { + ath_dbg(common, CALIBRATE, "Resetting Cal %d state for channel %u\n", + currCal->calData->calType, + ah->curchan->chan->center_freq); - ath_dbg(common, CALIBRATE, "Resetting Cal %d state for channel %u\n", - currCal->calData->calType, ah->curchan->chan->center_freq); + ah->caldata->CalValid &= ~currCal->calData->calType; + currCal->calState = CAL_WAITING; - ah->caldata->CalValid &= ~currCal->calData->calType; - currCal->calState = CAL_WAITING; + currCal = currCal->calNext; + } while (currCal != ah->cal_list); return false; } -- cgit v1.2.3-59-g8ed1b From d8d20845c7f129359c845c526929056651d4e5df Mon Sep 17 00:00:00 2001 From: Sergey Ryazanov Date: Fri, 24 Apr 2020 03:49:23 +0300 Subject: ath9k: add calibration timeout for AR9002 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADC & I/Q calibrations could take infinite time to comple, since they depend on received frames. In particular the I/Q mismatch calibration requires receiving of OFDM frames for completion. But in the 2.4GHz band, a station could receive only CCK frames for a very long time. And while we wait for the completion of one of the mentioned calibrations, the NF calibration is blocked. Moreover, in some environments, I/Q calibration is unable to complete until a correct noise calibration will be performed due to AGC behaviour. In order to avoid delaying NF calibration on forever, limit the maximum duration of ADCs & I/Q calibrations. If the calibration is not completed within the maximum time, it will be interrupted and a next calibration will be performed. The code that selects the next calibration has been reworked to the loop so incompleted calibration will be respinned later. А maximum calibration time of 30 seconds was selected to give the calibration enough time to complete and to not interfere with the long (NF) calibration. Run tested with AR9220. Signed-off-by: Sergey Ryazanov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200424004923.17129-7-ryazanov.s.a@gmail.com --- drivers/net/wireless/ath/ath9k/ar9002_calib.c | 25 +++++++++++++++++++++++-- drivers/net/wireless/ath/ath9k/calib.c | 1 + drivers/net/wireless/ath/ath9k/hw.h | 1 + 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/ar9002_calib.c b/drivers/net/wireless/ath/ath9k/ar9002_calib.c index 68188f500949..fd53b5f9e9b5 100644 --- a/drivers/net/wireless/ath/ath9k/ar9002_calib.c +++ b/drivers/net/wireless/ath/ath9k/ar9002_calib.c @@ -19,6 +19,8 @@ #include "ar9002_phy.h" #define AR9285_CLCAL_REDO_THRESH 1 +/* AGC & I/Q calibrations time limit, ms */ +#define AR9002_CAL_MAX_TIME 30000 enum ar9002_cal_types { ADC_GAIN_CAL = BIT(0), @@ -104,6 +106,14 @@ static bool ar9002_hw_per_calibration(struct ath_hw *ah, } else { ar9002_hw_setup_calibration(ah, currCal); } + } else if (time_after(jiffies, ah->cal_start_time + + msecs_to_jiffies(AR9002_CAL_MAX_TIME))) { + REG_CLR_BIT(ah, AR_PHY_TIMING_CTRL4(0), + AR_PHY_TIMING_CTRL4_DO_CAL); + ath_dbg(ath9k_hw_common(ah), CALIBRATE, + "calibration timeout\n"); + currCal->calState = CAL_WAITING; /* Try later */ + iscaldone = true; } } else if (!(caldata->CalValid & currCal->calData->calType)) { ath9k_hw_reset_calibration(ah, currCal); @@ -679,8 +689,19 @@ static int ar9002_hw_calibrate(struct ath_hw *ah, struct ath9k_channel *chan, if (!ar9002_hw_per_calibration(ah, chan, rxchainmask, currCal)) return 0; - ah->cal_list_curr = currCal = currCal->calNext; - percal_pending = currCal->calState == CAL_WAITING; + /* Looking for next waiting calibration if any */ + for (currCal = currCal->calNext; currCal != ah->cal_list_curr; + currCal = currCal->calNext) { + if (currCal->calState == CAL_WAITING) + break; + } + if (currCal->calState == CAL_WAITING) { + percal_pending = true; + ah->cal_list_curr = currCal; + } else { + percal_pending = false; + ah->cal_list_curr = ah->cal_list; + } } /* Do not start a next calibration if the longcal is in action */ diff --git a/drivers/net/wireless/ath/ath9k/calib.c b/drivers/net/wireless/ath/ath9k/calib.c index 2ac3eefd3851..0422a33395b7 100644 --- a/drivers/net/wireless/ath/ath9k/calib.c +++ b/drivers/net/wireless/ath/ath9k/calib.c @@ -176,6 +176,7 @@ void ath9k_hw_reset_calibration(struct ath_hw *ah, ath9k_hw_setup_calibration(ah, currCal); + ah->cal_start_time = jiffies; currCal->calState = CAL_RUNNING; for (i = 0; i < AR5416_MAX_CHAINS; i++) { diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h index c99f3c77c823..023599e10dd5 100644 --- a/drivers/net/wireless/ath/ath9k/hw.h +++ b/drivers/net/wireless/ath/ath9k/hw.h @@ -834,6 +834,7 @@ struct ath_hw { /* Calibration */ u32 supp_cals; + unsigned long cal_start_time; struct ath9k_cal_list iq_caldata; struct ath9k_cal_list adcgain_caldata; struct ath9k_cal_list adcdc_caldata; -- cgit v1.2.3-59-g8ed1b From c26b01d5ec1ab4dbfbdeb93ef4bc9e34951b6688 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Sun, 26 Apr 2020 17:40:37 +0800 Subject: ath5k: remove conversion to bool in ath5k_ani_calibration() The '>' expression itself is bool, no need to convert it to bool again. This fixes the following coccicheck warning: drivers/net/wireless/ath/ath5k/ani.c:504:56-61: WARNING: conversion to bool not needed here Signed-off-by: Jason Yan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200426094037.23048-1-yanaijie@huawei.com --- drivers/net/wireless/ath/ath5k/ani.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath5k/ani.c b/drivers/net/wireless/ath/ath5k/ani.c index 0624333f5430..850c608b43a3 100644 --- a/drivers/net/wireless/ath/ath5k/ani.c +++ b/drivers/net/wireless/ath/ath5k/ani.c @@ -501,7 +501,7 @@ ath5k_ani_calibration(struct ath5k_hw *ah) if (as->ofdm_errors > ofdm_high || as->cck_errors > cck_high) { /* too many PHY errors - we have to raise immunity */ - bool ofdm_flag = as->ofdm_errors > ofdm_high ? true : false; + bool ofdm_flag = as->ofdm_errors > ofdm_high; ath5k_ani_raise_immunity(ah, as, ofdm_flag); ath5k_ani_period_restart(as); -- cgit v1.2.3-59-g8ed1b From 2289bef25e32808bb6d748edc667ca297792bf8f Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Mon, 27 Apr 2020 16:04:13 +0800 Subject: ath10k: enable firmware peer stats info for wmi tlv For wmi tlv type, firmware disable peer stats info by default, after enable it, firmware will report WMI_TLV_PEER_STATS_INFO_EVENTID if ath10k send WMI_TLV_REQUEST_PEER_STATS_INFO_CMDID to firmware. Enable it will only set a flag in firmware, firmware will not report it without receive request WMI command. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00042. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200427080416.8265-2-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/mac.c | 5 +++++ drivers/net/wireless/ath/ath10k/wmi-tlv.c | 1 + drivers/net/wireless/ath/ath10k/wmi-tlv.h | 1 + drivers/net/wireless/ath/ath10k/wmi.h | 1 + 4 files changed, 8 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index a1147ccc09bf..0fb082c9d04b 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -2959,6 +2959,11 @@ static void ath10k_bss_assoc(struct ieee80211_hw *hw, arvif->aid = bss_conf->aid; ether_addr_copy(arvif->bssid, bss_conf->bssid); + ret = ath10k_wmi_pdev_set_param(ar, + ar->wmi.pdev_param->peer_stats_info_enable, 1); + if (ret) + ath10k_warn(ar, "failed to enable peer stats info: %d\n", ret); + ret = ath10k_wmi_vdev_up(ar, arvif->vdev_id, arvif->aid, arvif->bssid); if (ret) { ath10k_warn(ar, "failed to set vdev %d up: %d\n", diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c index e1ab900f2662..27aaa48615d2 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c @@ -4269,6 +4269,7 @@ static struct wmi_pdev_param_map wmi_tlv_pdev_param_map = { .arp_dstaddr = WMI_PDEV_PARAM_UNSUPPORTED, .rfkill_config = WMI_TLV_PDEV_PARAM_HW_RFKILL_CONFIG, .rfkill_enable = WMI_TLV_PDEV_PARAM_RFKILL_ENABLE, + .peer_stats_info_enable = WMI_TLV_PDEV_PARAM_PEER_STATS_INFO_ENABLE, }; static struct wmi_peer_param_map wmi_tlv_peer_param_map = { diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.h b/drivers/net/wireless/ath/ath10k/wmi-tlv.h index 4972dc12991c..cd400b19a64d 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.h +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.h @@ -451,6 +451,7 @@ enum wmi_tlv_pdev_param { WMI_TLV_PDEV_PARAM_VDEV_RATE_STATS_UPDATE_PERIOD, WMI_TLV_PDEV_PARAM_TXPOWER_REASON_NONE, WMI_TLV_PDEV_PARAM_TXPOWER_REASON_SAR, + WMI_TLV_PDEV_PARAM_PEER_STATS_INFO_ENABLE = 0x8b, WMI_TLV_PDEV_PARAM_TXPOWER_REASON_MAX, }; diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index 209070714d1a..46740e16f3ce 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -3798,6 +3798,7 @@ struct wmi_pdev_param_map { u32 enable_btcoex; u32 rfkill_config; u32 rfkill_enable; + u32 peer_stats_info_enable; }; #define WMI_PDEV_PARAM_UNSUPPORTED 0 -- cgit v1.2.3-59-g8ed1b From 0f7cb26830a6e740455a7064e46ff1e926197ecb Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Mon, 27 Apr 2020 16:04:14 +0800 Subject: ath10k: add rx bitrate report for SDIO For SDIO chip, its rx indication is struct htt_rx_indication_hl, which does not include the bitrate info as well as PCIe, for PCIe, it use function ath10k_htt_rx_h_rates to parse the bitrate info in struct rx_ppdu_start and then report it to mac80211 via ieee80211_rx_status. SDIO does not have the same info as PCIe, then iw command can not get the rx bitrate by "iw wlan0 station dump". for example, it always show 6.0 MBit/s localhost ~ # iw wlan0 link Connected to 3c:28:6d:96:fd:69 (on wlan0) SSID: kukui_test freq: 5180 RX: 111800 bytes (595 packets) TX: 35419 bytes (202 packets) signal: -41 dBm rx bitrate: 6.0 MBit/s This patch is to send WMI_TLV_REQUEST_PEER_STATS_INFO_CMDID to firmware for ath10k_sta_statistics and save the rx bitrate for WMI event WMI_TLV_PEER_STATS_INFO_EVENTID. This patch only effect SDIO chip, ath10k_mac_sta_get_peer_stats_info has check for bitrate_statistics of hw_params, this patch only enable it for "qca6174 hw3.2 sdio". Tested with QCA6174 SDIO firmware WLAN.RMH.4.4.1-00042. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200427080416.8265-3-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/core.c | 2 + drivers/net/wireless/ath/ath10k/core.h | 3 + drivers/net/wireless/ath/ath10k/hw.h | 3 + drivers/net/wireless/ath/ath10k/mac.c | 40 ++++++++++ drivers/net/wireless/ath/ath10k/wmi-ops.h | 30 ++++++++ drivers/net/wireless/ath/ath10k/wmi-tlv.c | 118 ++++++++++++++++++++++++++++++ drivers/net/wireless/ath/ath10k/wmi-tlv.h | 100 +++++++++++++++++++++++++ drivers/net/wireless/ath/ath10k/wmi.h | 8 ++ 8 files changed, 304 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index d96d178b4980..22b6937ac225 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -190,6 +190,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = { .uart_pin_workaround = true, .tx_stats_over_pktlog = false, .bmi_large_size_download = true, + .supports_peer_stats_info = true, }, { .id = QCA6174_HW_2_1_VERSION, @@ -3277,6 +3278,7 @@ struct ath10k *ath10k_core_create(size_t priv_size, struct device *dev, init_completion(&ar->thermal.wmi_sync); init_completion(&ar->bss_survey_done); init_completion(&ar->peer_delete_done); + init_completion(&ar->peer_stats_info_complete); INIT_DELAYED_WORK(&ar->scan.timeout, ath10k_scan_timeout_work); diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index 07935d39d6d6..11d9132be4fd 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -504,6 +504,8 @@ struct ath10k_sta { u32 tx_failed; u32 last_tx_bitrate; + u32 rx_rate_code; + u32 rx_bitrate_kbps; struct work_struct update_wk; u64 rx_duration; struct ath10k_htt_tx_stats *tx_stats; @@ -1089,6 +1091,7 @@ struct ath10k { int last_wmi_vdev_start_status; struct completion vdev_setup_done; struct completion vdev_delete_done; + struct completion peer_stats_info_complete; struct workqueue_struct *workqueue; /* Auxiliary workqueue */ diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h index 2a7af5861788..d9907a4648a8 100644 --- a/drivers/net/wireless/ath/ath10k/hw.h +++ b/drivers/net/wireless/ath/ath10k/hw.h @@ -623,6 +623,9 @@ struct ath10k_hw_params { /* tx stats support over pktlog */ bool tx_stats_over_pktlog; + + /* provides bitrates for sta_statistics using WMI_TLV_PEER_STATS_INFO_EVENTID */ + bool supports_peer_stats_info; }; struct htt_rx_desc; diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 0fb082c9d04b..d0401d3adde4 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -8305,6 +8305,44 @@ static void ath10k_mac_op_sta_pre_rcu_remove(struct ieee80211_hw *hw, peer->removed = true; } +static void ath10k_mac_sta_get_peer_stats_info(struct ath10k *ar, + struct ieee80211_sta *sta, + struct station_info *sinfo) +{ + struct ath10k_sta *arsta = (struct ath10k_sta *)sta->drv_priv; + struct ath10k_peer *peer; + unsigned long time_left; + int ret; + + if (!(ar->hw_params.supports_peer_stats_info && + arsta->arvif->vdev_type == WMI_VDEV_TYPE_STA)) + return; + + spin_lock_bh(&ar->data_lock); + peer = ath10k_peer_find(ar, arsta->arvif->vdev_id, sta->addr); + spin_unlock_bh(&ar->data_lock); + if (!peer) + return; + + reinit_completion(&ar->peer_stats_info_complete); + + ret = ath10k_wmi_request_peer_stats_info(ar, + arsta->arvif->vdev_id, + WMI_REQUEST_ONE_PEER_STATS_INFO, + arsta->arvif->bssid, + 0); + if (ret && ret != -EOPNOTSUPP) { + ath10k_warn(ar, "could not request peer stats info: %d\n", ret); + return; + } + + time_left = wait_for_completion_timeout(&ar->peer_stats_info_complete, 3 * HZ); + if (time_left == 0) { + ath10k_warn(ar, "timed out waiting peer stats info\n"); + return; + } +} + static void ath10k_sta_statistics(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, @@ -8340,6 +8378,8 @@ static void ath10k_sta_statistics(struct ieee80211_hw *hw, sinfo->tx_failed = arsta->tx_failed; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } + + ath10k_mac_sta_get_peer_stats_info(ar, sta, sinfo); } static const struct ieee80211_ops ath10k_ops = { diff --git a/drivers/net/wireless/ath/ath10k/wmi-ops.h b/drivers/net/wireless/ath/ath10k/wmi-ops.h index 1491c25518bb..6b730f59fd5b 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-ops.h +++ b/drivers/net/wireless/ath/ath10k/wmi-ops.h @@ -126,6 +126,13 @@ struct wmi_ops { struct sk_buff *(*gen_pdev_set_wmm)(struct ath10k *ar, const struct wmi_wmm_params_all_arg *arg); struct sk_buff *(*gen_request_stats)(struct ath10k *ar, u32 stats_mask); + struct sk_buff *(*gen_request_peer_stats_info)(struct ath10k *ar, + u32 vdev_id, + enum + wmi_peer_stats_info_request_type + type, + u8 *addr, + u32 reset); struct sk_buff *(*gen_force_fw_hang)(struct ath10k *ar, enum wmi_force_fw_hang_type type, u32 delay_ms); @@ -1064,6 +1071,29 @@ ath10k_wmi_request_stats(struct ath10k *ar, u32 stats_mask) return ath10k_wmi_cmd_send(ar, skb, ar->wmi.cmd->request_stats_cmdid); } +static inline int +ath10k_wmi_request_peer_stats_info(struct ath10k *ar, + u32 vdev_id, + enum wmi_peer_stats_info_request_type type, + u8 *addr, + u32 reset) +{ + struct sk_buff *skb; + + if (!ar->wmi.ops->gen_request_peer_stats_info) + return -EOPNOTSUPP; + + skb = ar->wmi.ops->gen_request_peer_stats_info(ar, + vdev_id, + type, + addr, + reset); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + return ath10k_wmi_cmd_send(ar, skb, ar->wmi.cmd->request_peer_stats_info_cmdid); +} + static inline int ath10k_wmi_force_fw_hang(struct ath10k *ar, enum wmi_force_fw_hang_type type, u32 delay_ms) diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c index 27aaa48615d2..eec1f1f27dec 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c @@ -219,6 +219,89 @@ static void ath10k_wmi_tlv_event_vdev_delete_resp(struct ath10k *ar, complete(&ar->vdev_delete_done); } +static int ath10k_wmi_tlv_parse_peer_stats_info(struct ath10k *ar, u16 tag, u16 len, + const void *ptr, void *data) +{ + const struct wmi_tlv_peer_stats_info *stat = ptr; + struct ieee80211_sta *sta; + struct ath10k_sta *arsta; + + if (tag != WMI_TLV_TAG_STRUCT_PEER_STATS_INFO) + return -EPROTO; + + ath10k_dbg(ar, ATH10K_DBG_WMI, + "wmi tlv stats peer addr %pMF rx rate code 0x%x bit rate %d kbps\n", + stat->peer_macaddr.addr, + __le32_to_cpu(stat->last_rx_rate_code), + __le32_to_cpu(stat->last_rx_bitrate_kbps)); + + ath10k_dbg(ar, ATH10K_DBG_WMI, + "wmi tlv stats tx rate code 0x%x bit rate %d kbps\n", + __le32_to_cpu(stat->last_tx_rate_code), + __le32_to_cpu(stat->last_tx_bitrate_kbps)); + + sta = ieee80211_find_sta_by_ifaddr(ar->hw, stat->peer_macaddr.addr, NULL); + if (!sta) { + ath10k_warn(ar, "not found station for peer stats\n"); + return -EINVAL; + } + + arsta = (struct ath10k_sta *)sta->drv_priv; + arsta->rx_rate_code = __le32_to_cpu(stat->last_rx_rate_code); + arsta->rx_bitrate_kbps = __le32_to_cpu(stat->last_rx_bitrate_kbps); + + return 0; +} + +static int ath10k_wmi_tlv_op_pull_peer_stats_info(struct ath10k *ar, + struct sk_buff *skb) +{ + const void **tb; + const struct wmi_tlv_peer_stats_info_ev *ev; + const void *data; + u32 num_peer_stats; + int ret; + + tb = ath10k_wmi_tlv_parse_alloc(ar, skb->data, skb->len, GFP_ATOMIC); + if (IS_ERR(tb)) { + ret = PTR_ERR(tb); + ath10k_warn(ar, "failed to parse tlv: %d\n", ret); + return ret; + } + + ev = tb[WMI_TLV_TAG_STRUCT_PEER_STATS_INFO_EVENT]; + data = tb[WMI_TLV_TAG_ARRAY_STRUCT]; + + if (!ev || !data) { + kfree(tb); + return -EPROTO; + } + + num_peer_stats = __le32_to_cpu(ev->num_peers); + + ath10k_dbg(ar, ATH10K_DBG_WMI, + "wmi tlv peer stats info update peer vdev id %d peers %i more data %d\n", + __le32_to_cpu(ev->vdev_id), + num_peer_stats, + __le32_to_cpu(ev->more_data)); + + ret = ath10k_wmi_tlv_iter(ar, data, ath10k_wmi_tlv_len(data), + ath10k_wmi_tlv_parse_peer_stats_info, NULL); + if (ret) + ath10k_warn(ar, "failed to parse stats info tlv: %d\n", ret); + + kfree(tb); + return 0; +} + +static void ath10k_wmi_tlv_event_peer_stats_info(struct ath10k *ar, + struct sk_buff *skb) +{ + ath10k_dbg(ar, ATH10K_DBG_WMI, "WMI_PEER_STATS_INFO_EVENTID\n"); + ath10k_wmi_tlv_op_pull_peer_stats_info(ar, skb); + complete(&ar->peer_stats_info_complete); +} + static int ath10k_wmi_tlv_event_diag_data(struct ath10k *ar, struct sk_buff *skb) { @@ -576,6 +659,9 @@ static void ath10k_wmi_tlv_op_rx(struct ath10k *ar, struct sk_buff *skb) case WMI_TLV_UPDATE_STATS_EVENTID: ath10k_wmi_event_update_stats(ar, skb); break; + case WMI_TLV_PEER_STATS_INFO_EVENTID: + ath10k_wmi_tlv_event_peer_stats_info(ar, skb); + break; case WMI_TLV_VDEV_START_RESP_EVENTID: ath10k_wmi_event_vdev_start_resp(ar, skb); break; @@ -2897,6 +2983,36 @@ ath10k_wmi_tlv_op_gen_request_stats(struct ath10k *ar, u32 stats_mask) return skb; } +static struct sk_buff * +ath10k_wmi_tlv_op_gen_request_peer_stats_info(struct ath10k *ar, + u32 vdev_id, + enum wmi_peer_stats_info_request_type type, + u8 *addr, + u32 reset) +{ + struct wmi_tlv_request_peer_stats_info *cmd; + struct wmi_tlv *tlv; + struct sk_buff *skb; + + skb = ath10k_wmi_alloc_skb(ar, sizeof(*tlv) + sizeof(*cmd)); + if (!skb) + return ERR_PTR(-ENOMEM); + + tlv = (void *)skb->data; + tlv->tag = __cpu_to_le16(WMI_TLV_TAG_STRUCT_REQUEST_PEER_STATS_INFO_CMD); + tlv->len = __cpu_to_le16(sizeof(*cmd)); + cmd = (void *)tlv->value; + cmd->vdev_id = __cpu_to_le32(vdev_id); + cmd->request_type = __cpu_to_le32(type); + + if (type == WMI_REQUEST_ONE_PEER_STATS_INFO) + ether_addr_copy(cmd->peer_macaddr.addr, addr); + + cmd->reset_after_request = reset; + ath10k_dbg(ar, ATH10K_DBG_WMI, "wmi tlv request peer stats info\n"); + return skb; +} + static int ath10k_wmi_mgmt_tx_alloc_msdu_id(struct ath10k *ar, struct sk_buff *skb, dma_addr_t paddr) @@ -4113,6 +4229,7 @@ static struct wmi_cmd_map wmi_tlv_cmd_map = { .vdev_spectral_scan_configure_cmdid = WMI_TLV_SPECTRAL_SCAN_CONF_CMDID, .vdev_spectral_scan_enable_cmdid = WMI_TLV_SPECTRAL_SCAN_ENABLE_CMDID, .request_stats_cmdid = WMI_TLV_REQUEST_STATS_CMDID, + .request_peer_stats_info_cmdid = WMI_TLV_REQUEST_PEER_STATS_INFO_CMDID, .set_arp_ns_offload_cmdid = WMI_TLV_SET_ARP_NS_OFFLOAD_CMDID, .network_list_offload_config_cmdid = WMI_TLV_NETWORK_LIST_OFFLOAD_CONFIG_CMDID, @@ -4417,6 +4534,7 @@ static const struct wmi_ops wmi_tlv_ops = { .gen_beacon_dma = ath10k_wmi_tlv_op_gen_beacon_dma, .gen_pdev_set_wmm = ath10k_wmi_tlv_op_gen_pdev_set_wmm, .gen_request_stats = ath10k_wmi_tlv_op_gen_request_stats, + .gen_request_peer_stats_info = ath10k_wmi_tlv_op_gen_request_peer_stats_info, .gen_force_fw_hang = ath10k_wmi_tlv_op_gen_force_fw_hang, /* .gen_mgmt_tx = not implemented; HTT is used */ .gen_mgmt_tx_send = ath10k_wmi_tlv_op_gen_mgmt_tx_send, diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.h b/drivers/net/wireless/ath/ath10k/wmi-tlv.h index cd400b19a64d..2153e2d9a955 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.h +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.h @@ -198,6 +198,12 @@ enum wmi_tlv_cmd_id { WMI_TLV_REQUEST_LINK_STATS_CMDID, WMI_TLV_START_LINK_STATS_CMDID, WMI_TLV_CLEAR_LINK_STATS_CMDID, + WMI_TLV_CGET_FW_MEM_DUMP_CMDID, + WMI_TLV_CDEBUG_MESG_FLUSH_CMDID, + WMI_TLV_CDIAG_EVENT_LOG_CONFIG_CMDID, + WMI_TLV_CREQUEST_WLAN_STATS_CMDID, + WMI_TLV_CREQUEST_RCPI_CMDID, + WMI_TLV_REQUEST_PEER_STATS_INFO_CMDID, WMI_TLV_SET_ARP_NS_OFFLOAD_CMDID = WMI_TLV_CMD(WMI_TLV_GRP_ARP_NS_OFL), WMI_TLV_ADD_PROACTIVE_ARP_RSP_PATTERN_CMDID, WMI_TLV_DEL_PROACTIVE_ARP_RSP_PATTERN_CMDID, @@ -338,6 +344,13 @@ enum wmi_tlv_event_id { WMI_TLV_IFACE_LINK_STATS_EVENTID, WMI_TLV_PEER_LINK_STATS_EVENTID, WMI_TLV_RADIO_LINK_STATS_EVENTID, + WMI_TLV_UPDATE_FW_MEM_DUMP_EVENTID, + WMI_TLV_DIAG_EVENT_LOG_SUPPORTED_EVENTID, + WMI_TLV_INST_RSSI_STATS_EVENTID, + WMI_TLV_RADIO_TX_POWER_LEVEL_STATS_EVENTID, + WMI_TLV_REPORT_STATS_EVENTID, + WMI_TLV_UPDATE_RCPI_EVENTID, + WMI_TLV_PEER_STATS_INFO_EVENTID, WMI_TLV_NLO_MATCH_EVENTID = WMI_TLV_EV(WMI_TLV_GRP_NLO_OFL), WMI_TLV_NLO_SCAN_COMPLETE_EVENTID, WMI_TLV_APFIND_EVENTID, @@ -2082,6 +2095,85 @@ struct wmi_tlv_stats_ev { __le32 num_peer_stats_extd; } __packed; +struct wmi_tlv_peer_stats_info_ev { + __le32 vdev_id; + __le32 num_peers; + __le32 more_data; +} __packed; + +#define WMI_TLV_MAX_CHAINS 8 + +struct wmi_tlv_peer_stats_info { + struct wmi_mac_addr peer_macaddr; + struct { + /* lower 32 bits of the tx_bytes value */ + __le32 low_32; + /* upper 32 bits of the tx_bytes value */ + __le32 high_32; + } __packed tx_bytes; + struct { + /* lower 32 bits of the tx_packets value */ + __le32 low_32; + /* upper 32 bits of the tx_packets value */ + __le32 high_32; + } __packed tx_packets; + struct { + /* lower 32 bits of the rx_bytes value */ + __le32 low_32; + /* upper 32 bits of the rx_bytes value */ + __le32 high_32; + } __packed rx_bytes; + struct { + /* lower 32 bits of the rx_packets value */ + __le32 low_32; + /* upper 32 bits of the rx_packets value */ + __le32 high_32; + } __packed rx_packets; + __le32 tx_retries; + __le32 tx_failed; + + /* rate information, it is output of WMI_ASSEMBLE_RATECODE_V1 + * (in format of 0x1000RRRR) + * The rate-code is a 4-bytes field in which, + * for given rate, nss and preamble + * + * b'31-b'29 unused / reserved + * b'28 indicate the version of rate-code (1 = RATECODE_V1) + * b'27-b'11 unused / reserved + * b'10-b'8 indicate the preamble (0 OFDM, 1 CCK, 2 HT, 3 VHT) + * b'7-b'5 indicate the NSS (0 - 1x1, 1 - 2x2, 2 - 3x3, 3 - 4x4) + * b'4-b'0 indicate the rate, which is indicated as follows: + * OFDM : 0: OFDM 48 Mbps + * 1: OFDM 24 Mbps + * 2: OFDM 12 Mbps + * 3: OFDM 6 Mbps + * 4: OFDM 54 Mbps + * 5: OFDM 36 Mbps + * 6: OFDM 18 Mbps + * 7: OFDM 9 Mbps + * CCK (pream == 1) + * 0: CCK 11 Mbps Long + * 1: CCK 5.5 Mbps Long + * 2: CCK 2 Mbps Long + * 3: CCK 1 Mbps Long + * 4: CCK 11 Mbps Short + * 5: CCK 5.5 Mbps Short + * 6: CCK 2 Mbps Short + * HT/VHT (pream == 2/3) + * 0..7: MCS0..MCS7 (HT) + * 0..9: MCS0..MCS9 (11AC VHT) + * 0..11: MCS0..MCS11 (11AX VHT) + * rate-code of the last transmission + */ + __le32 last_tx_rate_code; + __le32 last_rx_rate_code; + __le32 last_tx_bitrate_kbps; + __le32 last_rx_bitrate_kbps; + __le32 peer_rssi; + __le32 tx_succeed; + __le32 peer_rssi_per_chain[WMI_TLV_MAX_CHAINS]; +} __packed; + struct wmi_tlv_p2p_noa_ev { __le32 vdev_id; } __packed; @@ -2098,6 +2190,14 @@ struct wmi_tlv_wow_add_del_event_cmd { __le32 event_bitmap; } __packed; +struct wmi_tlv_request_peer_stats_info { + __le32 request_type; + __le32 vdev_id; + /* peer MAC address */ + struct wmi_mac_addr peer_macaddr; + __le32 reset_after_request; +} __packed; + /* Command to set/unset chip in quiet mode */ struct wmi_tlv_set_quiet_cmd { __le32 vdev_id; diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index 46740e16f3ce..0f05405bebc0 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -940,6 +940,7 @@ struct wmi_cmd_map { u32 vdev_spectral_scan_configure_cmdid; u32 vdev_spectral_scan_enable_cmdid; u32 request_stats_cmdid; + u32 request_peer_stats_info_cmdid; u32 set_arp_ns_offload_cmdid; u32 network_list_offload_config_cmdid; u32 gtk_offload_cmdid; @@ -4579,6 +4580,13 @@ struct wmi_request_stats_cmd { struct wlan_inst_rssi_args inst_rssi_args; } __packed; +enum wmi_peer_stats_info_request_type { + /* request stats of one specified peer */ + WMI_REQUEST_ONE_PEER_STATS_INFO = 0x01, + /* request stats of all peers belong to specified VDEV */ + WMI_REQUEST_VDEV_ALL_PEER_STATS_INFO = 0x02, +}; + /* Suspend option */ enum { /* suspend */ -- cgit v1.2.3-59-g8ed1b From 3344b99d69ab6b479c5a54c7b72c325aaa4bdad0 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Mon, 27 Apr 2020 16:04:15 +0800 Subject: ath10k: add bitrate parse for peer stats info The rate code and rate kbps report by WMI_TLV_PEER_STATS_INFO_EVENTID from firmware contains all the bitrate info which include OFDM, CCK, HT/VHT, and mac80211 need the struct rate_info which include below parameters: flags: bitflag of flags from &enum rate_info_flags mcs: mcs index if struct describes an HT/VHT/HE rate legacy: bitrate in 100kbit/s for 802.11abg nss: number of streams (VHT & HE only) bw: bandwidth (from &enum rate_info_bw) For OFDM/CCK, its rate kbps indicate the bitrate, for HT/VHT, mac80211 need the above 5 parameters to cacluate the bitrate and show by iw. After parse the bitrate info, iw show the correct rx bitrate: localhost ~ # iw wlan0 link rx bitrate: 234.0 MBit/s VHT-MCS 3 80MHz VHT-NSS 2 rx bitrate: 40.5 MBit/s MCS 2 40MHz rx bitrate: 72.2 MBit/s MCS 7 short GI rx bitrate: 54.0 MBit/s rx bitrate: 48.0 MBit/s Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00042. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200427080416.8265-4-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/core.h | 20 ++++ drivers/net/wireless/ath/ath10k/mac.c | 161 ++++++++++++++++++++++++++++++ drivers/net/wireless/ath/ath10k/wmi-tlv.h | 9 ++ 3 files changed, 190 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index 11d9132be4fd..1700bf59e8fa 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -149,6 +149,26 @@ static inline u32 host_interest_item_address(u32 item_offset) return QCA988X_HOST_INTEREST_ADDRESS + item_offset; } +enum ath10k_phy_mode { + ATH10K_PHY_MODE_LEGACY = 0, + ATH10K_PHY_MODE_HT = 1, + ATH10K_PHY_MODE_VHT = 2, +}; + +/* Data rate 100KBPS based on IE Index */ +struct ath10k_index_ht_data_rate_type { + u8 beacon_rate_index; + u16 supported_rate[4]; +}; + +/* Data rate 100KBPS based on IE Index */ +struct ath10k_index_vht_data_rate_type { + u8 beacon_rate_index; + u16 supported_VHT80_rate[2]; + u16 supported_VHT40_rate[2]; + u16 supported_VHT20_rate[2]; +}; + struct ath10k_bmi { bool done_sent; }; diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index d0401d3adde4..38fc8cb3aac9 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -8305,6 +8305,157 @@ static void ath10k_mac_op_sta_pre_rcu_remove(struct ieee80211_hw *hw, peer->removed = true; } +/* HT MCS parameters with Nss = 1 */ +static const struct ath10k_index_ht_data_rate_type supported_ht_mcs_rate_nss1[] = { + /* MCS L20 L40 S20 S40 */ + {0, { 65, 135, 72, 150} }, + {1, { 130, 270, 144, 300} }, + {2, { 195, 405, 217, 450} }, + {3, { 260, 540, 289, 600} }, + {4, { 390, 810, 433, 900} }, + {5, { 520, 1080, 578, 1200} }, + {6, { 585, 1215, 650, 1350} }, + {7, { 650, 1350, 722, 1500} } +}; + +/* HT MCS parameters with Nss = 2 */ +static const struct ath10k_index_ht_data_rate_type supported_ht_mcs_rate_nss2[] = { + /* MCS L20 L40 S20 S40 */ + {0, {130, 270, 144, 300} }, + {1, {260, 540, 289, 600} }, + {2, {390, 810, 433, 900} }, + {3, {520, 1080, 578, 1200} }, + {4, {780, 1620, 867, 1800} }, + {5, {1040, 2160, 1156, 2400} }, + {6, {1170, 2430, 1300, 2700} }, + {7, {1300, 2700, 1444, 3000} } +}; + +/* MCS parameters with Nss = 1 */ +static const struct ath10k_index_vht_data_rate_type supported_vht_mcs_rate_nss1[] = { + /* MCS L80 S80 L40 S40 L20 S20 */ + {0, {293, 325}, {135, 150}, {65, 72} }, + {1, {585, 650}, {270, 300}, {130, 144} }, + {2, {878, 975}, {405, 450}, {195, 217} }, + {3, {1170, 1300}, {540, 600}, {260, 289} }, + {4, {1755, 1950}, {810, 900}, {390, 433} }, + {5, {2340, 2600}, {1080, 1200}, {520, 578} }, + {6, {2633, 2925}, {1215, 1350}, {585, 650} }, + {7, {2925, 3250}, {1350, 1500}, {650, 722} }, + {8, {3510, 3900}, {1620, 1800}, {780, 867} }, + {9, {3900, 4333}, {1800, 2000}, {780, 867} } +}; + +/*MCS parameters with Nss = 2 */ +static const struct ath10k_index_vht_data_rate_type supported_vht_mcs_rate_nss2[] = { + /* MCS L80 S80 L40 S40 L20 S20 */ + {0, {585, 650}, {270, 300}, {130, 144} }, + {1, {1170, 1300}, {540, 600}, {260, 289} }, + {2, {1755, 1950}, {810, 900}, {390, 433} }, + {3, {2340, 2600}, {1080, 1200}, {520, 578} }, + {4, {3510, 3900}, {1620, 1800}, {780, 867} }, + {5, {4680, 5200}, {2160, 2400}, {1040, 1156} }, + {6, {5265, 5850}, {2430, 2700}, {1170, 1300} }, + {7, {5850, 6500}, {2700, 3000}, {1300, 1444} }, + {8, {7020, 7800}, {3240, 3600}, {1560, 1733} }, + {9, {7800, 8667}, {3600, 4000}, {1560, 1733} } +}; + +static void ath10k_mac_get_rate_flags_ht(struct ath10k *ar, u32 rate, u8 nss, u8 mcs, + u8 *flags, u8 *bw) +{ + struct ath10k_index_ht_data_rate_type *mcs_rate; + + mcs_rate = (struct ath10k_index_ht_data_rate_type *) + ((nss == 1) ? &supported_ht_mcs_rate_nss1 : + &supported_ht_mcs_rate_nss2); + + if (rate == mcs_rate[mcs].supported_rate[0]) { + *bw = RATE_INFO_BW_20; + } else if (rate == mcs_rate[mcs].supported_rate[1]) { + *bw |= RATE_INFO_BW_40; + } else if (rate == mcs_rate[mcs].supported_rate[2]) { + *bw |= RATE_INFO_BW_20; + *flags |= RATE_INFO_FLAGS_SHORT_GI; + } else if (rate == mcs_rate[mcs].supported_rate[3]) { + *bw |= RATE_INFO_BW_40; + *flags |= RATE_INFO_FLAGS_SHORT_GI; + } else { + ath10k_warn(ar, "invalid ht params rate %d 100kbps nss %d mcs %d", + rate, nss, mcs); + } +} + +static void ath10k_mac_get_rate_flags_vht(struct ath10k *ar, u32 rate, u8 nss, u8 mcs, + u8 *flags, u8 *bw) +{ + struct ath10k_index_vht_data_rate_type *mcs_rate; + + mcs_rate = (struct ath10k_index_vht_data_rate_type *) + ((nss == 1) ? &supported_vht_mcs_rate_nss1 : + &supported_vht_mcs_rate_nss2); + + if (rate == mcs_rate[mcs].supported_VHT80_rate[0]) { + *bw = RATE_INFO_BW_80; + } else if (rate == mcs_rate[mcs].supported_VHT80_rate[1]) { + *bw = RATE_INFO_BW_80; + *flags |= RATE_INFO_FLAGS_SHORT_GI; + } else if (rate == mcs_rate[mcs].supported_VHT40_rate[0]) { + *bw = RATE_INFO_BW_40; + } else if (rate == mcs_rate[mcs].supported_VHT40_rate[1]) { + *bw = RATE_INFO_BW_40; + *flags |= RATE_INFO_FLAGS_SHORT_GI; + } else if (rate == mcs_rate[mcs].supported_VHT20_rate[0]) { + *bw = RATE_INFO_BW_20; + } else if (rate == mcs_rate[mcs].supported_VHT20_rate[1]) { + *bw = RATE_INFO_BW_20; + *flags |= RATE_INFO_FLAGS_SHORT_GI; + } else { + ath10k_warn(ar, "invalid vht params rate %d 100kbps nss %d mcs %d", + rate, nss, mcs); + } +} + +static void ath10k_mac_get_rate_flags(struct ath10k *ar, u32 rate, + enum ath10k_phy_mode mode, u8 nss, u8 mcs, + u8 *flags, u8 *bw) +{ + if (mode == ATH10K_PHY_MODE_HT) { + *flags = RATE_INFO_FLAGS_MCS; + ath10k_mac_get_rate_flags_ht(ar, rate, nss, mcs, flags, bw); + } else if (mode == ATH10K_PHY_MODE_VHT) { + *flags = RATE_INFO_FLAGS_VHT_MCS; + ath10k_mac_get_rate_flags_vht(ar, rate, nss, mcs, flags, bw); + } +} + +static void ath10k_mac_parse_bitrate(struct ath10k *ar, u32 rate_code, + u32 bitrate_kbps, struct rate_info *rate) +{ + enum ath10k_phy_mode mode = ATH10K_PHY_MODE_LEGACY; + enum wmi_rate_preamble preamble = WMI_TLV_GET_HW_RC_PREAM_V1(rate_code); + u8 nss = WMI_TLV_GET_HW_RC_NSS_V1(rate_code) + 1; + u8 mcs = WMI_TLV_GET_HW_RC_RATE_V1(rate_code); + u8 flags = 0, bw = 0; + + if (preamble == WMI_RATE_PREAMBLE_HT) + mode = ATH10K_PHY_MODE_HT; + else if (preamble == WMI_RATE_PREAMBLE_VHT) + mode = ATH10K_PHY_MODE_VHT; + + ath10k_mac_get_rate_flags(ar, bitrate_kbps / 100, mode, nss, mcs, &flags, &bw); + + ath10k_dbg(ar, ATH10K_DBG_MAC, + "mac parse bitrate preamble %d mode %d nss %d mcs %d flags %x bw %d\n", + preamble, mode, nss, mcs, flags, bw); + + rate->flags = flags; + rate->bw = bw; + rate->legacy = bitrate_kbps / 100; + rate->nss = nss; + rate->mcs = mcs; +} + static void ath10k_mac_sta_get_peer_stats_info(struct ath10k *ar, struct ieee80211_sta *sta, struct station_info *sinfo) @@ -8341,6 +8492,16 @@ static void ath10k_mac_sta_get_peer_stats_info(struct ath10k *ar, ath10k_warn(ar, "timed out waiting peer stats info\n"); return; } + + if (arsta->rx_rate_code != 0 && arsta->rx_bitrate_kbps != 0) { + ath10k_mac_parse_bitrate(ar, arsta->rx_rate_code, + arsta->rx_bitrate_kbps, + &sinfo->rxrate); + + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); + arsta->rx_rate_code = 0; + arsta->rx_bitrate_kbps = 0; + } } static void ath10k_sta_statistics(struct ieee80211_hw *hw, diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.h b/drivers/net/wireless/ath/ath10k/wmi-tlv.h index 2153e2d9a955..6e0537dabd1d 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.h +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.h @@ -2174,6 +2174,15 @@ struct wmi_tlv_peer_stats_info { __le32 peer_rssi_per_chain[WMI_TLV_MAX_CHAINS]; } __packed; +#define HW_RATECODE_PREAM_V1_MASK GENMASK(10, 8) +#define WMI_TLV_GET_HW_RC_PREAM_V1(rc) FIELD_GET(HW_RATECODE_PREAM_V1_MASK, rc) + +#define HW_RATECODE_NSS_V1_MASK GENMASK(7, 5) +#define WMI_TLV_GET_HW_RC_NSS_V1(rc) FIELD_GET(HW_RATECODE_NSS_V1_MASK, rc) + +#define HW_RATECODE_RATE_V1_MASK GENMASK(4, 0) +#define WMI_TLV_GET_HW_RC_RATE_V1(rc) FIELD_GET(HW_RATECODE_RATE_V1_MASK, rc) + struct wmi_tlv_p2p_noa_ev { __le32 vdev_id; } __packed; -- cgit v1.2.3-59-g8ed1b From 4cc02c7c14944b16020c8da44572b3c5d189d386 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Mon, 27 Apr 2020 16:04:16 +0800 Subject: ath10k: correct tx bitrate of iw for SDIO For legacy mode, tx bitrate not show correct sometimes, for example: iw wlan0 link Connected to 8c:21:0a:b3:5a:64 (on wlan0) SSID: tplinkgw freq: 2462 RX: 19672 bytes (184 packets) TX: 9851 bytes (87 packets) signal: -51 dBm rx bitrate: 54.0 MBit/s tx bitrate: 2.8 MBit/s This patch use the tx bitrate info from WMI_TLV_PEER_STATS_INFO_EVENTID report from firmware, and tx bitrate show correct. iw wlan0 link Connected to 8c:21:0a:b3:5a:64 (on wlan0) SSID: tplinkgw freq: 2462 RX: 13973 bytes (120 packets) TX: 6737 bytes (57 packets) signal: -52 dBm rx bitrate: 54.0 MBit/s tx bitrate: 54.0 MBit/s This patch only effect SDIO chip, ath10k_mac_sta_get_peer_stats_info has check for bitrate_statistics of hw_params, it is enabled only for "qca6174 hw3.2 sdio". Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00042. Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200427080416.8265-5-wgong@codeaurora.org --- drivers/net/wireless/ath/ath10k/core.h | 2 ++ drivers/net/wireless/ath/ath10k/mac.c | 10 ++++++++++ drivers/net/wireless/ath/ath10k/wmi-tlv.c | 2 ++ 3 files changed, 14 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index 1700bf59e8fa..ad6ef8d492c8 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -526,6 +526,8 @@ struct ath10k_sta { u32 rx_rate_code; u32 rx_bitrate_kbps; + u32 tx_rate_code; + u32 tx_bitrate_kbps; struct work_struct update_wk; u64 rx_duration; struct ath10k_htt_tx_stats *tx_stats; diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 38fc8cb3aac9..0b7d510d2725 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -8502,6 +8502,16 @@ static void ath10k_mac_sta_get_peer_stats_info(struct ath10k *ar, arsta->rx_rate_code = 0; arsta->rx_bitrate_kbps = 0; } + + if (arsta->tx_rate_code != 0 && arsta->tx_bitrate_kbps != 0) { + ath10k_mac_parse_bitrate(ar, arsta->tx_rate_code, + arsta->tx_bitrate_kbps, + &sinfo->txrate); + + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); + arsta->tx_rate_code = 0; + arsta->tx_bitrate_kbps = 0; + } } static void ath10k_sta_statistics(struct ieee80211_hw *hw, diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c index eec1f1f27dec..9187b62b331c 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c @@ -249,6 +249,8 @@ static int ath10k_wmi_tlv_parse_peer_stats_info(struct ath10k *ar, u16 tag, u16 arsta = (struct ath10k_sta *)sta->drv_priv; arsta->rx_rate_code = __le32_to_cpu(stat->last_rx_rate_code); arsta->rx_bitrate_kbps = __le32_to_cpu(stat->last_rx_bitrate_kbps); + arsta->tx_rate_code = __le32_to_cpu(stat->last_tx_rate_code); + arsta->tx_bitrate_kbps = __le32_to_cpu(stat->last_tx_bitrate_kbps); return 0; } -- cgit v1.2.3-59-g8ed1b From 69c93f9674c97dc439cdc0527811f8ad104c2e35 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 27 Apr 2020 09:24:17 +0000 Subject: ath11k: use GFP_ATOMIC under spin lock A spin lock is taken here so we should use GFP_ATOMIC. Fixes: d5c65159f289 ("ath11k: driver for Qualcomm IEEE 802.11ax devices") Signed-off-by: Wei Yongjun Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200427092417.56236-1-weiyongjun1@huawei.com --- drivers/net/wireless/ath/ath11k/dp_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c index d3d2a335cc40..47ad3bd9e1c6 100644 --- a/drivers/net/wireless/ath/ath11k/dp_rx.c +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c @@ -896,7 +896,7 @@ int ath11k_peer_rx_tid_setup(struct ath11k *ar, const u8 *peer_mac, int vdev_id, else hw_desc_sz = ath11k_hal_reo_qdesc_size(DP_BA_WIN_SZ_MAX, tid); - vaddr = kzalloc(hw_desc_sz + HAL_LINK_DESC_ALIGN - 1, GFP_KERNEL); + vaddr = kzalloc(hw_desc_sz + HAL_LINK_DESC_ALIGN - 1, GFP_ATOMIC); if (!vaddr) { spin_unlock_bh(&ab->base_lock); return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 0146dca70b877b73c5fd9c67912b8a0ca8a7bac7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 27 Apr 2020 17:59:34 +0200 Subject: xfrm: add support for UDPv6 encapsulation of ESP This patch adds support for encapsulation of ESP over UDPv6. The code is very similar to the IPv4 encapsulation implementation, and allows to easily add espintcp on IPv6 as a follow-up. Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/ipv6_stubs.h | 3 + include/net/xfrm.h | 5 + net/ipv4/udp.c | 10 +- net/ipv6/af_inet6.c | 4 + net/ipv6/ah6.c | 1 + net/ipv6/esp6.c | 226 +++++++++++++++++++++++++++++++++++++++++----- net/ipv6/esp6_offload.c | 7 +- net/ipv6/ip6_vti.c | 18 +++- net/ipv6/ipcomp6.c | 1 + net/ipv6/xfrm6_input.c | 106 +++++++++++++++++++++- net/ipv6/xfrm6_protocol.c | 48 ++++++++++ net/xfrm/xfrm_interface.c | 3 + 12 files changed, 395 insertions(+), 37 deletions(-) diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 3e7d2c0e79ca..f033a17b53b6 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -56,6 +56,9 @@ struct ipv6_stub { void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); +#if IS_ENABLED(CONFIG_XFRM) + int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); +#endif struct neigh_table *nd_tbl; }; extern const struct ipv6_stub *ipv6_stub __read_mostly; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8f71c111e65a..2577666c34c8 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1406,6 +1406,8 @@ struct xfrm4_protocol { struct xfrm6_protocol { int (*handler)(struct sk_buff *skb); + int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); @@ -1590,6 +1592,8 @@ int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); +int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type); int xfrm6_transport_finish(struct sk_buff *skb, int async); int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); @@ -1610,6 +1614,7 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, #ifdef CONFIG_XFRM int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); +int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen); #else diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32564b350823..1b7ebbcae497 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -112,6 +112,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -2563,7 +2566,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: - up->encap_rcv = xfrm4_udp_encap_rcv; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + up->encap_rcv = ipv6_stub->xfrm6_udp_encap_rcv; + else +#endif + up->encap_rcv = xfrm4_udp_encap_rcv; #endif fallthrough; case UDP_ENCAP_L2TPINUDP: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 345baa0a754f..b0b99c08350a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -961,6 +962,9 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, +#if IS_ENABLED(CONFIG_XFRM) + .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, +#endif .nd_tbl = &nd_tbl, }; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 45e2adc56610..d88d97617f7e 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -767,6 +767,7 @@ static const struct xfrm_type ah6_type = { static struct xfrm6_protocol ah6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = ah6_rcv_cb, .err_handler = ah6_err, .priority = 0, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 11143d039f16..e8800968e209 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -26,10 +26,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -39,6 +41,11 @@ struct esp_skb_cb { void *tmp; }; +struct esp_output_extra { + __be32 seqhi; + u32 esphoff; +}; + #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) /* @@ -72,9 +79,9 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) return kmalloc(len, GFP_ATOMIC); } -static inline __be32 *esp_tmp_seqhi(void *tmp) +static inline void *esp_tmp_extra(void *tmp) { - return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); + return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra)); } static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) @@ -104,16 +111,17 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, static void esp_ssg_unref(struct xfrm_state *x, void *tmp) { + struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; - int seqhilen = 0; + int extralen = 0; u8 *iv; struct aead_request *req; struct scatterlist *sg; if (x->props.flags & XFRM_STATE_ESN) - seqhilen += sizeof(__be32); + extralen += sizeof(*extra); - iv = esp_tmp_iv(aead, tmp, seqhilen); + iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); /* Unref skb_frag_pages in the src scatterlist if necessary. @@ -124,6 +132,23 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +static void esp_output_encap_csum(struct sk_buff *skb) +{ + /* UDP encap with IPv6 requires a valid checksum */ + if (*skb_mac_header(skb) == IPPROTO_UDP) { + struct udphdr *uh = udp_hdr(skb); + struct ipv6hdr *ip6h = ipv6_hdr(skb); + int len = ntohs(uh->len); + unsigned int offset = skb_transport_offset(skb); + __wsum csum = skb_checksum(skb, offset, skb->len - offset, 0); + + uh->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } +} + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -143,6 +168,8 @@ static void esp_output_done(struct crypto_async_request *base, int err) esp_ssg_unref(x, tmp); kfree(tmp); + esp_output_encap_csum(skb); + if (xo && (xo->flags & XFRM_DEV_RESUME)) { if (err) { XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); @@ -163,7 +190,7 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset) { struct ip_esp_hdr *esph = (void *)(skb->data + offset); void *tmp = ESP_SKB_CB(skb)->tmp; - __be32 *seqhi = esp_tmp_seqhi(tmp); + __be32 *seqhi = esp_tmp_extra(tmp); esph->seq_no = esph->spi; esph->spi = *seqhi; @@ -171,27 +198,36 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset) static void esp_output_restore_header(struct sk_buff *skb) { - esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); + void *tmp = ESP_SKB_CB(skb)->tmp; + struct esp_output_extra *extra = esp_tmp_extra(tmp); + + esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff - + sizeof(__be32)); } static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, struct xfrm_state *x, struct ip_esp_hdr *esph, - __be32 *seqhi) + struct esp_output_extra *extra) { /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after * encryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { + __u32 seqhi; struct xfrm_offload *xo = xfrm_offload(skb); - esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); - *seqhi = esph->spi; if (xo) - esph->seq_no = htonl(xo->seq.hi); + seqhi = xo->seq.hi; else - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + seqhi = XFRM_SKB_CB(skb)->seq.output.hi; + + extra->esphoff = (unsigned char *)esph - + skb_transport_header(skb); + esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); + extra->seqhi = esph->spi; + esph->seq_no = htonl(seqhi); } esph->spi = x->id.spi; @@ -207,15 +243,84 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } +static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, + int encap_type, + struct esp_info *esp, + __be16 sport, + __be16 dport) +{ + struct udphdr *uh; + __be32 *udpdata32; + unsigned int len; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > U16_MAX) + return ERR_PTR(-EMSGSIZE); + + uh = (struct udphdr *)esp->esph; + uh->source = sport; + uh->dest = dport; + uh->len = htons(len); + uh->check = 0; + + *skb_mac_header(skb) = IPPROTO_UDP; + + if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + return (struct ip_esp_hdr *)(udpdata32 + 2); + } + + return (struct ip_esp_hdr *)(uh + 1); +} + +static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, + struct esp_info *esp) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph; + __be16 sport, dport; + int encap_type; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); + break; + } + + if (IS_ERR(esph)) + return PTR_ERR(esph); + + esp->esph = esph; + + return 0; +} + int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + if (x->encap) { + int err = esp6_output_encap(x, skb, esp); + + if (err < 0) + return err; + } + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; @@ -274,10 +379,13 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); @@ -295,20 +403,20 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info void *tmp; int ivlen; int assoclen; - int seqhilen; - __be32 *seqhi; + int extralen; struct page *page; struct ip_esp_hdr *esph; struct aead_request *req; struct crypto_aead *aead; struct scatterlist *sg, *dsg; + struct esp_output_extra *extra; int err = -ENOMEM; assoclen = sizeof(struct ip_esp_hdr); - seqhilen = 0; + extralen = 0; if (x->props.flags & XFRM_STATE_ESN) { - seqhilen += sizeof(__be32); + extralen += sizeof(*extra); assoclen += sizeof(__be32); } @@ -316,12 +424,12 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info alen = crypto_aead_authsize(aead); ivlen = crypto_aead_ivsize(aead); - tmp = esp_alloc_tmp(aead, esp->nfrags + 2, seqhilen); + tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen); if (!tmp) goto error; - seqhi = esp_tmp_seqhi(tmp); - iv = esp_tmp_iv(aead, tmp, seqhilen); + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); @@ -330,7 +438,8 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info else dsg = &sg[esp->nfrags]; - esph = esp_output_set_esn(skb, x, ip_esp_hdr(skb), seqhi); + esph = esp_output_set_esn(skb, x, esp->esph, extra); + esp->esph = esph; sg_init_table(sg, esp->nfrags); err = skb_to_sgvec(skb, sg, @@ -394,6 +503,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info case 0: if ((x->props.flags & XFRM_STATE_ESN)) esp_output_restore_header(skb); + esp_output_encap_csum(skb); } if (sg != dsg) @@ -438,11 +548,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; + esp.esph = ip_esp_hdr(skb); + esp.nfrags = esp6_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; - esph = ip_esp_hdr(skb); + esph = esp.esph; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); @@ -517,6 +629,56 @@ int esp6_input_done2(struct sk_buff *skb, int err) if (unlikely(err < 0)) goto out; + if (x->encap) { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct xfrm_encap_tmpl *encap = x->encap; + struct udphdr *uh = (void *)(skb_network_header(skb) + hdr_len); + __be16 source; + + switch (x->encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + source = uh->source; + break; + default: + WARN_ON_ONCE(1); + err = -EINVAL; + goto out; + } + + /* + * 1) if the NAT-T peer's IP or port changed then + * advertize the change to the keying daemon. + * This is an inbound SA, so just compare + * SRC ports. + */ + if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) || + source != encap->encap_sport) { + xfrm_address_t ipaddr; + + memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6)); + km_new_mapping(x, &ipaddr, source); + + /* XXX: perhaps add an extra + * policy check here, to see + * if we should allow or + * reject a packet from a + * different source + * address/port. + */ + } + + /* + * 2) ignore UDP/TCP checksums in case + * of NAT-T in Transport Mode, or + * perform other post-processing fixes + * as per draft-ietf-ipsec-udp-encaps-06, + * section 3.1.2 + */ + if (x->props.mode == XFRM_MODE_TRANSPORT) + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); @@ -632,7 +794,7 @@ skip_cow: goto out; ESP_SKB_CB(skb)->tmp = tmp; - seqhi = esp_tmp_seqhi(tmp); + seqhi = esp_tmp_extra(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); @@ -836,9 +998,6 @@ static int esp6_init_state(struct xfrm_state *x) u32 align; int err; - if (x->encap) - return -EINVAL; - x->data = NULL; if (x->aead) @@ -867,6 +1026,22 @@ static int esp6_init_state(struct xfrm_state *x) break; } + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + + switch (encap->encap_type) { + default: + err = -EINVAL; + goto error; + case UDP_ENCAP_ESPINUDP: + x->props.header_len += sizeof(struct udphdr); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); + break; + } + } + align = ALIGN(crypto_aead_blocksize(aead), 4); x->props.trailer_len = align + 1 + crypto_aead_authsize(aead); @@ -893,6 +1068,7 @@ static const struct xfrm_type esp6_type = { static struct xfrm6_protocol esp6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = esp6_rcv_cb, .err_handler = esp6_err, .priority = 0, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 8eab2c869d61..06163cc15844 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -271,7 +271,6 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features int alen; int blksize; struct xfrm_offload *xo; - struct ip_esp_hdr *esph; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; @@ -312,13 +311,13 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features seq = xo->seq.low; - esph = ip_esp_hdr(skb); - esph->spi = x->id.spi; + esp.esph = ip_esp_hdr(skb); + esp.esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(seq); + esp.esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index cc6180e08a4f..1147f647b9a0 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -296,7 +296,8 @@ static void vti6_dev_uninit(struct net_device *dev) dev_put(dev); } -static int vti6_rcv(struct sk_buff *skb) +static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); @@ -323,7 +324,10 @@ static int vti6_rcv(struct sk_buff *skb) rcu_read_unlock(); - return xfrm6_rcv_tnl(skb, t); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + return xfrm_input(skb, nexthdr, spi, encap_type); } rcu_read_unlock(); return -EINVAL; @@ -332,6 +336,13 @@ discard: return 0; } +static int vti6_rcv(struct sk_buff *skb) +{ + int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff]; + + return vti6_input_proto(skb, nexthdr, 0, 0); +} + static int vti6_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -1185,6 +1196,7 @@ static struct pernet_operations vti6_net_ops = { static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, @@ -1192,6 +1204,7 @@ static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, @@ -1199,6 +1212,7 @@ static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 3752bd3e92ce..99668bfebd85 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -183,6 +183,7 @@ static const struct xfrm_type ipcomp6_type = { static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, .priority = 0, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index a52cb3fc6df5..56f52353b324 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -35,9 +35,12 @@ EXPORT_SYMBOL(xfrm6_rcv_spi); static int xfrm6_transport_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { - if (xfrm_trans_queue(skb, ip6_rcv_finish)) - __kfree_skb(skb); - return -1; + if (xfrm_trans_queue(skb, ip6_rcv_finish)) { + kfree_skb(skb); + return NET_RX_DROP; + } + + return 0; } int xfrm6_transport_finish(struct sk_buff *skb, int async) @@ -60,13 +63,106 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); skb_reset_transport_header(skb); - return -1; + return 0; } NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm6_transport_finish2); - return -1; + return 0; +} + +/* If it's a keepalive packet, then just eat it. + * If it's an encapsulated packet, then pass it to the + * IPsec xfrm input. + * Returns 0 if skb passed to xfrm or was dropped. + * Returns >0 if skb should be passed to UDP. + * Returns <0 if skb should be resubmitted (-ret is protocol) + */ +int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + struct udphdr *uh; + struct ipv6hdr *ip6h; + int len; + int ip6hlen = sizeof(struct ipv6hdr); + + __u8 *udpdata; + __be32 *udpdata32; + __u16 encap_type = up->encap_type; + + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; + + /* If this is a paged skb, make sure we pull up + * whatever data we need to look at. */ + len = skb->len - sizeof(struct udphdr); + if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) + return 1; + + /* Now we can get the pointers */ + uh = udp_hdr(skb); + udpdata = (__u8 *)uh + sizeof(struct udphdr); + udpdata32 = (__be32 *)udpdata; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { + /* ESP Packet without Non-ESP header */ + len = sizeof(struct udphdr); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && + udpdata32[0] == 0 && udpdata32[1] == 0) { + + /* ESP Packet with Non-IKE marker */ + len = sizeof(struct udphdr) + 2 * sizeof(u32); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + } + + /* At this point we are sure that this is an ESPinUDP packet, + * so we need to remove 'len' bytes from the packet (the UDP + * header and optional ESP marker bytes) and then modify the + * protocol to ESP, and then call into the transform receiver. + */ + if (skb_unclone(skb, GFP_ATOMIC)) + goto drop; + + /* Now we can update and verify the packet length... */ + ip6h = ipv6_hdr(skb); + ip6h->payload_len = htons(ntohs(ip6h->payload_len) - len); + if (skb->len < ip6hlen + len) { + /* packet is too small!?! */ + goto drop; + } + + /* pull the data buffer up to the ESP header and set the + * transport header to point to ESP. Keep UDP on the stack + * for later. + */ + __skb_pull(skb, len); + skb_reset_transport_header(skb); + + /* process ESP */ + return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, encap_type); + +drop: + kfree_skb(skb); + return 0; } int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index 34cb65c7d5a7..ea2f805d3b01 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,53 @@ static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) return 0; } +int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + int ret; + struct xfrm6_protocol *handler; + struct xfrm6_protocol __rcu **head = proto_handlers(nexthdr); + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + + if (!head) + goto out; + + if (!skb_dst(skb)) { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, + skb, flags); + if (dst->error) + goto drop; + skb_dst_set(skb, dst); + } + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) + return ret; + +out: + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm6_rcv_encap); + static int xfrm6_esp_rcv(struct sk_buff *skb) { int ret; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index eb9928c0a87c..02f8f46d0cc5 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -755,6 +755,7 @@ static struct pernet_operations xfrmi_net_ops = { static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -762,6 +763,7 @@ static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -769,6 +771,7 @@ static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, -- cgit v1.2.3-59-g8ed1b From 26333c37fc285e7372f1b9461f3ae0ba3dc699c9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 27 Apr 2020 17:59:35 +0200 Subject: xfrm: add IPv6 support for espintcp This extends espintcp to support IPv6, building on the existing code and the new UDPv6 encapsulation support. Most of the code is either reused directly (stream parser, ULP) or very similar to the IPv4 variant (net/ipv6/esp6.c changes). The separation of config options for IPv4 and IPv6 espintcp requires a bit of Kconfig gymnastics to enable the core code. Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/ipv6_stubs.h | 2 + net/ipv4/Kconfig | 1 + net/ipv6/Kconfig | 12 +++ net/ipv6/af_inet6.c | 1 + net/ipv6/esp6.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/Kconfig | 3 + net/xfrm/Makefile | 2 +- net/xfrm/espintcp.c | 56 +++++++++++--- 8 files changed, 252 insertions(+), 13 deletions(-) diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index f033a17b53b6..1e9e0cf7dc75 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -58,6 +58,8 @@ struct ipv6_stub { bool router, bool solicited, bool override, bool inc_opt); #if IS_ENABLED(CONFIG_XFRM) int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); + int (*xfrm6_rcv_encap)(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type); #endif struct neigh_table *nd_tbl; }; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 25a8888826b8..014aaa17dc79 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -384,6 +384,7 @@ config INET_ESPINTCP depends on XFRM && INET_ESP select STREAM_PARSER select NET_SOCK_MSG + select XFRM_ESPINTCP help Support for RFC 8229 encapsulation of ESP and IKE over TCP/IPv4 sockets. diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 2ccaee98fddb..468a2faadc7d 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -88,6 +88,18 @@ config INET6_ESP_OFFLOAD If unsure, say N. +config INET6_ESPINTCP + bool "IPv6: ESP in TCP encapsulation (RFC 8229)" + depends on XFRM && INET6_ESP + select STREAM_PARSER + select NET_SOCK_MSG + select XFRM_ESPINTCP + help + Support for RFC 8229 encapsulation of ESP and IKE over + TCP/IPv6 sockets. + + If unsure, say N. + config INET6_IPCOMP tristate "IPv6: IPComp transformation" select INET6_XFRM_TUNNEL diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b0b99c08350a..cbbb00bad20e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -964,6 +964,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .ndisc_send_na = ndisc_send_na, #if IS_ENABLED(CONFIG_XFRM) .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, + .xfrm6_rcv_encap = xfrm6_rcv_encap, #endif .nd_tbl = &nd_tbl, }; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index e8800968e209..c43592771126 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include #include @@ -132,6 +135,132 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +#ifdef CONFIG_INET6_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + +static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct esp_tcp_sk *esk; + __be16 sport, dport; + struct sock *nsk; + struct sock *sk; + + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } + spin_unlock_bh(&x->lock); + + sk = __inet6_lookup_established(xs_net(x), &tcp_hashinfo, &x->id.daddr.in6, + dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + if (!sk) + return ERR_PTR(-ENOENT); + + if (!tcp_is_ulp_esp(sk)) { + sock_put(sk); + return ERR_PTR(-EINVAL); + } + + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + + return sk; +} + +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) +{ + struct sock *sk; + int err; + + rcu_read_lock(); + + sk = esp6_find_tcp_sk(x); + err = PTR_ERR_OR_ZERO(sk); + if (err) + goto out; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + err = espintcp_queue_out(sk, skb); + else + err = espintcp_push_skb(sk, skb); + bh_unlock_sock(sk); + +out: + rcu_read_unlock(); + return err; +} + +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + return esp_output_tcp_finish(x, skb); +} + +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + local_bh_disable(); + err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb); + local_bh_enable(); + + /* EINPROGRESS just happens to do the right thing. It + * actually means that the skb has been consumed and + * isn't coming back. + */ + return err ?: -EINPROGRESS; +} +#else +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + kfree_skb(skb); + + return -EOPNOTSUPP; +} +#endif + static void esp_output_encap_csum(struct sk_buff *skb) { /* UDP encap with IPv6 requires a valid checksum */ @@ -181,7 +310,11 @@ static void esp_output_done(struct crypto_async_request *base, int err) secpath_reset(skb); xfrm_dev_resume(skb); } else { - xfrm_output_resume(skb, err); + if (!err && + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + esp_output_tail_tcp(x, skb); + else + xfrm_output_resume(skb, err); } } @@ -274,6 +407,41 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, return (struct ip_esp_hdr *)(uh + 1); } +#ifdef CONFIG_INET6_ESPINTCP +static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + __be16 *lenp = (void *)esp->esph; + struct ip_esp_hdr *esph; + unsigned int len; + struct sock *sk; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); + + rcu_read_lock(); + sk = esp6_find_tcp_sk(x); + rcu_read_unlock(); + + if (IS_ERR(sk)) + return ERR_CAST(sk); + + *lenp = htons(len); + esph = (struct ip_esp_hdr *)(lenp + 1); + + return esph; +} +#else +static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif + static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { @@ -294,6 +462,9 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); break; + case TCP_ENCAP_ESPINTCP: + esph = esp6_output_tcp_encap(x, skb, esp); + break; } if (IS_ERR(esph)) @@ -509,6 +680,9 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info if (sg != dsg) esp_ssg_unref(x, tmp); + if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + err = esp_output_tail_tcp(x, skb); + error_free: kfree(tmp); error: @@ -633,9 +807,13 @@ int esp6_input_done2(struct sk_buff *skb, int err) const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct xfrm_encap_tmpl *encap = x->encap; struct udphdr *uh = (void *)(skb_network_header(skb) + hdr_len); + struct tcphdr *th = (void *)(skb_network_header(skb) + hdr_len); __be16 source; switch (x->encap->encap_type) { + case TCP_ENCAP_ESPINTCP: + source = th->source; + break; case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; @@ -1039,6 +1217,14 @@ static int esp6_init_state(struct xfrm_state *x) case UDP_ENCAP_ESPINUDP_NON_IKE: x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); break; +#ifdef CONFIG_INET6_ESPINTCP + case TCP_ENCAP_ESPINTCP: + /* only the length field, TCP encap is done by + * the socket + */ + x->props.header_len += 2; + break; +#endif } } diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 6921a18201a0..b7fd9c838416 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -99,4 +99,7 @@ config NET_KEY_MIGRATE If unsure, say N. +config XFRM_ESPINTCP + bool + endif # INET diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 212a4fcb4a88..2d4bb4b9f75e 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -11,4 +11,4 @@ obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o -obj-$(CONFIG_INET_ESPINTCP) += espintcp.o +obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 037ea156d2f9..2132a3b6df0f 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -6,6 +6,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, struct sock *sk) @@ -31,7 +34,12 @@ static void handle_esp(struct sk_buff *skb, struct sock *sk) rcu_read_lock(); skb->dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); local_bh_disable(); - xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + ipv6_stub->xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); + else +#endif + xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); local_bh_enable(); rcu_read_unlock(); } @@ -347,6 +355,9 @@ unlock: static struct proto espintcp_prot __ro_after_init; static struct proto_ops espintcp_ops __ro_after_init; +static struct proto espintcp6_prot; +static struct proto_ops espintcp6_ops; +static DEFINE_MUTEX(tcpv6_prot_mutex); static void espintcp_data_ready(struct sock *sk) { @@ -384,10 +395,14 @@ static void espintcp_destruct(struct sock *sk) bool tcp_is_ulp_esp(struct sock *sk) { - return sk->sk_prot == &espintcp_prot; + return sk->sk_prot == &espintcp_prot || sk->sk_prot == &espintcp6_prot; } EXPORT_SYMBOL_GPL(tcp_is_ulp_esp); +static void build_protos(struct proto *espintcp_prot, + struct proto_ops *espintcp_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops); static int espintcp_init_sk(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -415,8 +430,19 @@ static int espintcp_init_sk(struct sock *sk) strp_check_rcv(&ctx->strp); skb_queue_head_init(&ctx->ike_queue); skb_queue_head_init(&ctx->out_queue); - sk->sk_prot = &espintcp_prot; - sk->sk_socket->ops = &espintcp_ops; + + if (sk->sk_family == AF_INET) { + sk->sk_prot = &espintcp_prot; + sk->sk_socket->ops = &espintcp_ops; + } else { + mutex_lock(&tcpv6_prot_mutex); + if (!espintcp6_prot.recvmsg) + build_protos(&espintcp6_prot, &espintcp6_ops, sk->sk_prot, sk->sk_socket->ops); + mutex_unlock(&tcpv6_prot_mutex); + + sk->sk_prot = &espintcp6_prot; + sk->sk_socket->ops = &espintcp6_ops; + } ctx->saved_data_ready = sk->sk_data_ready; ctx->saved_write_space = sk->sk_write_space; sk->sk_data_ready = espintcp_data_ready; @@ -489,6 +515,20 @@ static __poll_t espintcp_poll(struct file *file, struct socket *sock, return mask; } +static void build_protos(struct proto *espintcp_prot, + struct proto_ops *espintcp_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops) +{ + memcpy(espintcp_prot, orig_prot, sizeof(struct proto)); + memcpy(espintcp_ops, orig_ops, sizeof(struct proto_ops)); + espintcp_prot->sendmsg = espintcp_sendmsg; + espintcp_prot->recvmsg = espintcp_recvmsg; + espintcp_prot->close = espintcp_close; + espintcp_prot->release_cb = espintcp_release; + espintcp_ops->poll = espintcp_poll; +} + static struct tcp_ulp_ops espintcp_ulp __read_mostly = { .name = "espintcp", .owner = THIS_MODULE, @@ -497,13 +537,7 @@ static struct tcp_ulp_ops espintcp_ulp __read_mostly = { void __init espintcp_init(void) { - memcpy(&espintcp_prot, &tcp_prot, sizeof(tcp_prot)); - memcpy(&espintcp_ops, &inet_stream_ops, sizeof(inet_stream_ops)); - espintcp_prot.sendmsg = espintcp_sendmsg; - espintcp_prot.recvmsg = espintcp_recvmsg; - espintcp_prot.close = espintcp_close; - espintcp_prot.release_cb = espintcp_release; - espintcp_ops.poll = espintcp_poll; + build_protos(&espintcp_prot, &espintcp_ops, &tcp_prot, &inet_stream_ops); tcp_register_ulp(&espintcp_ulp); } -- cgit v1.2.3-59-g8ed1b From 220915857e29795ae5ba4222806268b4a99c19c1 Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Thu, 23 Apr 2020 14:43:27 +0000 Subject: Bluetooth: Adding driver and quirk defs for multi-role LE This change adds the relevant driver and quirk to allow drivers to report the le_states as being trustworthy. This has historically been disabled as controllers did not reliably support this. In particular, this will be used to relax this condition for controllers that have been well tested and reliable. /* Most controller will fail if we try to create new connections * while we have an existing one in slave role. */ if (hdev->conn_hash.le_num_slave > 0) return NULL; Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btusb.c | 1 + include/net/bluetooth/hci.h | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 871162790a0e..9a0ac333c886 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -58,6 +58,7 @@ static struct usb_driver btusb_driver; #define BTUSB_CW6622 0x100000 #define BTUSB_MEDIATEK 0x200000 #define BTUSB_WIDEBAND_SPEECH 0x400000 +#define BTUSB_VALID_LE_STATES 0x800000 static const struct usb_device_id btusb_table[] = { /* Generic Bluetooth USB device */ diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 1da8cec8e210..e5bc1dfe809a 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -218,6 +218,15 @@ enum { * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, + + /* When this quirk is set, the controller has validated that + * LE states reported through the HCI_LE_READ_SUPPORTED_STATES are + * valid. This mechanism is necessary as many controllers have + * been seen has having trouble initiating a connectable + * advertisement despite the state combination being reported as + * supported. + */ + HCI_QUIRK_VALID_LE_STATES, }; /* HCI device flags */ -- cgit v1.2.3-59-g8ed1b From 4364f2e91f0d44fa0e233d2a55e3ec35053d9bd9 Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Thu, 23 Apr 2020 14:43:29 +0000 Subject: Bluetooth: allow scatternet connections if supported. This change allows scatternet connections to be created if the controller reports support and the HCI_QUIRK_VALID_LE_STATES indicates that the reported LE states can be trusted. Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 966fc543c01d..006c24e04b44 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5288,7 +5288,9 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, /* Most controller will fail if we try to create new connections * while we have an existing one in slave role. */ - if (hdev->conn_hash.le_num_slave > 0) + if (hdev->conn_hash.le_num_slave > 0 && + (!test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) || + !(hdev->le_states[3] & 0x10))) return NULL; /* If we're not connectable only connect devices that we have in -- cgit v1.2.3-59-g8ed1b From aff8c489256ea1e32b35a007906a16dce7c6b4db Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Thu, 23 Apr 2020 14:43:31 +0000 Subject: Bluetooth: btusb: Adding support for LE scatternet to Jfp and ThP This change adds support for LE scatternet connections to Intel's JfP and ThP controllers. Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btusb.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 9a0ac333c886..8ae3ad7a6013 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -336,7 +336,8 @@ static const struct usb_device_id blacklist_table[] = { /* Intel Bluetooth devices */ { USB_DEVICE(0x8087, 0x0025), .driver_info = BTUSB_INTEL_NEW | - BTUSB_WIDEBAND_SPEECH }, + BTUSB_WIDEBAND_SPEECH | + BTUSB_VALID_LE_STATES }, { USB_DEVICE(0x8087, 0x0026), .driver_info = BTUSB_INTEL_NEW | BTUSB_WIDEBAND_SPEECH }, { USB_DEVICE(0x8087, 0x0029), .driver_info = BTUSB_INTEL_NEW | @@ -349,7 +350,8 @@ static const struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x8087, 0x0aa7), .driver_info = BTUSB_INTEL | BTUSB_WIDEBAND_SPEECH }, { USB_DEVICE(0x8087, 0x0aaa), .driver_info = BTUSB_INTEL_NEW | - BTUSB_WIDEBAND_SPEECH }, + BTUSB_WIDEBAND_SPEECH | + BTUSB_VALID_LE_STATES }, /* Other Intel Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x8087, 0xe0, 0x01, 0x01), @@ -3973,6 +3975,9 @@ static int btusb_probe(struct usb_interface *intf, if (id->driver_info & BTUSB_WIDEBAND_SPEECH) set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + if (id->driver_info & BTUSB_VALID_LE_STATES) + set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); + if (id->driver_info & BTUSB_DIGIANSWER) { data->cmdreq_type = USB_TYPE_VENDOR; set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); -- cgit v1.2.3-59-g8ed1b From 65749009242bbf67d5be129b7b67c4f9bfd93360 Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Thu, 23 Apr 2020 01:34:28 +0000 Subject: dt-bindings: net: bluetooth: Add device tree bindings for QCA9377 QCA9377 is a QCA ROME device frequently found in Android TV boxes. Signed-off-by: Christian Hewitt Signed-off-by: Marcel Holtmann --- Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt index badf597c0e58..f21ea24fb0ca 100644 --- a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt +++ b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt @@ -10,6 +10,7 @@ device the slave device is attached to. Required properties: - compatible: should contain one of the following: * "qcom,qca6174-bt" + * "qcom,qca9377-bt" * "qcom,wcn3990-bt" * "qcom,wcn3991-bt" * "qcom,wcn3998-bt" @@ -21,6 +22,10 @@ Optional properties for compatible string qcom,qca6174-bt: - clocks: clock provided to the controller (SUSCLK_32KHZ) - firmware-name: specify the name of nvm firmware to load +Optional properties for compatible string qcom,qca9377-bt: + + - max-speed: see Documentation/devicetree/bindings/serial/serial.yaml + Required properties for compatible string qcom,wcn399x-bt: - vddio-supply: VDD_IO supply regulator handle. -- cgit v1.2.3-59-g8ed1b From 31d4ab856e2d57a8877d90012b42d029f1fe0fe9 Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Thu, 23 Apr 2020 01:34:29 +0000 Subject: Bluetooth: hci_qca: add compatible for QCA9377 Add a compatible so QCA9377 devices can be defined in device-tree. Signed-off-by: Christian Hewitt Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_qca.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index d0ac554584a4..072983dc07e3 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -2058,6 +2058,7 @@ static SIMPLE_DEV_PM_OPS(qca_pm_ops, qca_suspend, qca_resume); static const struct of_device_id qca_bluetooth_of_match[] = { { .compatible = "qcom,qca6174-bt" }, { .compatible = "qcom,qca6390-bt", .data = &qca_soc_data_qca6390}, + { .compatible = "qcom,qca9377-bt" }, { .compatible = "qcom,wcn3990-bt", .data = &qca_soc_data_wcn3990}, { .compatible = "qcom,wcn3991-bt", .data = &qca_soc_data_wcn3991}, { .compatible = "qcom,wcn3998-bt", .data = &qca_soc_data_wcn3998}, -- cgit v1.2.3-59-g8ed1b From 37aee136f8c4e4989099e7d4972d617ea7ad3e5c Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Thu, 23 Apr 2020 01:34:30 +0000 Subject: Bluetooth: hci_qca: allow max-speed to be set for QCA9377 devices Move the read of max-speed from device-tree out of the qca_is_wcn399x if block so oper_speed can be set for QCA9377 devices as well. Suggested-by: Abhishek Pandit-Subedi Signed-off-by: Christian Hewitt Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_qca.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 072983dc07e3..b3fd07a6f812 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -597,10 +597,12 @@ static int qca_open(struct hci_uart *hu) if (hu->serdev) { qcadev = serdev_device_get_drvdata(hu->serdev); - if (qca_is_wcn399x(qcadev->btsoc_type)) { + + if (qca_is_wcn399x(qcadev->btsoc_type)) hu->init_speed = qcadev->init_speed; + + if (qcadev->oper_speed) hu->oper_speed = qcadev->oper_speed; - } } timer_setup(&qca->wake_retrans_timer, hci_ibs_wake_retrans_timeout, 0); @@ -1871,6 +1873,11 @@ static int qca_serdev_probe(struct serdev_device *serdev) serdev_device_set_drvdata(serdev, qcadev); device_property_read_string(&serdev->dev, "firmware-name", &qcadev->firmware_name); + device_property_read_u32(&serdev->dev, "max-speed", + &qcadev->oper_speed); + if (!qcadev->oper_speed) + BT_DBG("UART will pick default operating speed"); + if (data && qca_is_wcn399x(data->soc_type)) { qcadev->btsoc_type = data->soc_type; qcadev->bt_power = devm_kzalloc(&serdev->dev, @@ -1895,11 +1902,6 @@ static int qca_serdev_probe(struct serdev_device *serdev) return PTR_ERR(qcadev->susclk); } - device_property_read_u32(&serdev->dev, "max-speed", - &qcadev->oper_speed); - if (!qcadev->oper_speed) - BT_DBG("UART will pick default operating speed"); - err = hci_uart_register_device(&qcadev->serdev_hu, &qca_proto); if (err) { BT_ERR("wcn3990 serdev registration failed"); -- cgit v1.2.3-59-g8ed1b From b26d1e2b60284dc9f66ffad9ccd5c5da1100bb4b Mon Sep 17 00:00:00 2001 From: Veronika Kabatova Date: Tue, 28 Apr 2020 19:37:42 +0200 Subject: selftests/bpf: Copy runqslower to OUTPUT directory $(OUTPUT)/runqslower makefile target doesn't actually create runqslower binary in the $(OUTPUT) directory. As lib.mk expects all TEST_GEN_PROGS_EXTENDED (which runqslower is a part of) to be present in the OUTPUT directory, this results in an error when running e.g. `make install`: rsync: link_stat "tools/testing/selftests/bpf/runqslower" failed: No such file or directory (2) Copy the binary into the OUTPUT directory after building it to fix the error. Fixes: 3a0d3092a4ed ("selftests/bpf: Build runqslower from selftests") Signed-off-by: Veronika Kabatova Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200428173742.2988395-1-vkabatov@redhat.com --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7729892e0b04..4e654d41c7af 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -141,7 +141,8 @@ VMLINUX_BTF := $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) $(OUTPUT)/runqslower: $(BPFOBJ) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(SCRATCH_DIR)/ VMLINUX_BTF=$(VMLINUX_BTF) \ - BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) + BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) && \ + cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) -- cgit v1.2.3-59-g8ed1b From d65dbedfd298344747033f17c1efd2afc8082bc7 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 24 Apr 2020 12:45:02 -0700 Subject: net/mlx5: Add support for COPY steering action Add COPY type to modify_header action. IPsec feature is the first feature that needs COPY steering action. Signed-off-by: Huy Nguyen Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed Acked-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/flow.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 8 ++++---- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 862b7bf3e646..69cb7e6e8955 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -427,7 +427,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( num_actions = uverbs_attr_ptr_get_array_size( attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, - MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)); + MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)); if (num_actions < 0) return num_actions; @@ -648,7 +648,7 @@ DECLARE_UVERBS_NAMED_METHOD( UA_MANDATORY), UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES( - set_action_in_add_action_in_auto)), + set_add_copy_action_in_auto)), UA_MANDATORY, UA_ALLOC_AND_COPY), UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index ad3e3a65d403..91464f70a3fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -385,7 +385,7 @@ mlx5_tc_ct_entry_create_nat(struct mlx5_tc_ct_priv *ct_priv, char *modact; int err, i; - action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto); + action_size = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto); flow_action_for_each(i, act, flow_action) { switch (act->id) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 88c0e460e995..12c5ca5b93ca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -61,7 +61,7 @@ #include "lib/geneve.h" #include "diag/en_tc_tracepoint.h" -#define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto) +#define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) struct mlx5_nic_flow_attr { u32 action; @@ -2660,7 +2660,7 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, set_vals = &hdrs[0].vals; add_vals = &hdrs[1].vals; - action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto); + action_size = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto); for (i = 0; i < ARRAY_SIZE(fields); i++) { bool skip; @@ -2793,7 +2793,7 @@ int alloc_mod_hdr_actions(struct mlx5_core_dev *mdev, if (mod_hdr_acts->num_actions < mod_hdr_acts->max_actions) return 0; - action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto); + action_size = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto); max_hw_actions = mlx5e_flow_namespace_max_modify_action(mdev, namespace); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c index 029001040737..d5bf908dfecd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.c @@ -274,7 +274,7 @@ mlx5_esw_chains_destroy_fdb_table(struct mlx5_eswitch *esw, static int create_fdb_chain_restore(struct fdb_chain *fdb_chain) { - char modact[MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)]; + char modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)]; struct mlx5_eswitch *esw = fdb_chain->esw; struct mlx5_modify_hdr *mod_hdr; u32 index; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index dc098bb58973..703f307c5967 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1490,7 +1490,7 @@ static void esw_destroy_restore_table(struct mlx5_eswitch *esw) static int esw_create_restore_table(struct mlx5_eswitch *esw) { - u8 modact[MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)] = {}; + u8 modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *dev = esw->dev; @@ -1900,7 +1900,7 @@ static int esw_vport_ingress_prio_tag_config(struct mlx5_eswitch *esw, static int esw_vport_add_ingress_acl_modify_metadata(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { - u8 action[MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)] = {}; + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; struct mlx5_flow_act flow_act = {}; int err = 0; u32 key; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 304d1e4f0541..1a8e826ac86b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -791,7 +791,7 @@ static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, return -EOPNOTSUPP; } - actions_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto) * num_actions; + actions_size = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) * num_actions; inlen = MLX5_ST_SZ_BYTES(alloc_modify_header_context_in) + actions_size; in = kzalloc(inlen, GFP_KERNEL); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 3b3f5b9d4f95..8887b2440c7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -576,7 +576,7 @@ static int mlx5_cmd_dr_modify_header_alloc(struct mlx5_flow_root_namespace *ns, struct mlx5dr_action *action; size_t actions_sz; - actions_sz = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto) * + actions_sz = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) * num_actions; action = mlx5dr_action_create_modify_header(dr_domain, 0, actions_sz, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6fa24918eade..3ad2c51ccde9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5670,9 +5670,9 @@ struct mlx5_ifc_copy_action_in_bits { u8 reserved_at_38[0x8]; }; -union mlx5_ifc_set_action_in_add_action_in_auto_bits { - struct mlx5_ifc_set_action_in_bits set_action_in; - struct mlx5_ifc_add_action_in_bits add_action_in; +union mlx5_ifc_set_add_copy_action_in_auto_bits { + struct mlx5_ifc_set_action_in_bits set_action_in; + struct mlx5_ifc_add_action_in_bits add_action_in; struct mlx5_ifc_copy_action_in_bits copy_action_in; u8 reserved_at_0[0x40]; }; @@ -5746,7 +5746,7 @@ struct mlx5_ifc_alloc_modify_header_context_in_bits { u8 reserved_at_68[0x10]; u8 num_of_actions[0x8]; - union mlx5_ifc_set_action_in_add_action_in_auto_bits actions[0]; + union mlx5_ifc_set_add_copy_action_in_auto_bits actions[0]; }; struct mlx5_ifc_dealloc_modify_header_context_out_bits { -- cgit v1.2.3-59-g8ed1b From 2b58f6d9df50f534fe465113b69de60a2ef0e74a Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Fri, 24 Apr 2020 12:45:03 -0700 Subject: net/mlx5: Introduce IPsec Connect-X offload hardware bits and structures Add IPsec offload related IFC structs, layouts and enumerations. Signed-off-by: Raed Salem Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 +++ include/linux/mlx5/mlx5_ifc.h | 78 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2b90097a6cf9..7b57877e501e 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1107,6 +1107,7 @@ enum mlx5_cap_type { MLX5_CAP_TLS, MLX5_CAP_VDPA_EMULATION = 0x13, MLX5_CAP_DEV_EVENT = 0x14, + MLX5_CAP_IPSEC, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1324,6 +1325,9 @@ enum mlx5_qcam_feature_groups { MLX5_GET64(device_virtio_emulation_cap, \ (mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap) +#define MLX5_CAP_IPSEC(mdev, cap)\ + MLX5_GET(ipsec_cap, (mdev)->caps.hca_cur[MLX5_CAP_IPSEC], cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3ad2c51ccde9..cf971d341189 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -886,7 +886,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 tunnel_stateless_vxlan_gpe[0x1]; u8 tunnel_stateless_ipv4_over_vxlan[0x1]; u8 tunnel_stateless_ip_over_ip[0x1]; - u8 reserved_at_2a[0x6]; + u8 insert_trailer[0x1]; + u8 reserved_at_2b[0x5]; u8 max_vxlan_udp_ports[0x8]; u8 reserved_at_38[0x6]; u8 max_geneve_opt_len[0x1]; @@ -1100,6 +1101,23 @@ struct mlx5_ifc_tls_cap_bits { u8 reserved_at_20[0x7e0]; }; +struct mlx5_ifc_ipsec_cap_bits { + u8 ipsec_full_offload[0x1]; + u8 ipsec_crypto_offload[0x1]; + u8 ipsec_esn[0x1]; + u8 ipsec_crypto_esp_aes_gcm_256_encrypt[0x1]; + u8 ipsec_crypto_esp_aes_gcm_128_encrypt[0x1]; + u8 ipsec_crypto_esp_aes_gcm_256_decrypt[0x1]; + u8 ipsec_crypto_esp_aes_gcm_128_decrypt[0x1]; + u8 reserved_at_7[0x4]; + u8 log_max_ipsec_offload[0x5]; + u8 reserved_at_10[0x10]; + + u8 min_log_ipsec_full_replay_window[0x8]; + u8 max_log_ipsec_full_replay_window[0x8]; + u8 reserved_at_30[0x7d0]; +}; + enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, @@ -1464,7 +1482,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_460[0x3]; u8 log_max_uctx[0x5]; - u8 reserved_at_468[0x3]; + u8 reserved_at_468[0x2]; + u8 ipsec_offload[0x1]; u8 log_max_umem[0x5]; u8 max_num_eqs[0x10]; @@ -4143,7 +4162,8 @@ enum { MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION = 0x0, MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_TAG = 0x1, MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST = 0x2, - MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS = 0x3 + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS = 0x3, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_IPSEC_OBJ_ID = 0x4 }; struct mlx5_ifc_set_fte_out_bits { @@ -10468,10 +10488,62 @@ struct mlx5_ifc_affiliated_event_header_bits { enum { MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT(0xc), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT(0x13), }; enum { MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc, + MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, +}; + +enum { + MLX5_IPSEC_OBJECT_ICV_LEN_16B, + MLX5_IPSEC_OBJECT_ICV_LEN_12B, + MLX5_IPSEC_OBJECT_ICV_LEN_8B, +}; + +struct mlx5_ifc_ipsec_obj_bits { + u8 modify_field_select[0x40]; + u8 full_offload[0x1]; + u8 reserved_at_41[0x1]; + u8 esn_en[0x1]; + u8 esn_overlap[0x1]; + u8 reserved_at_44[0x2]; + u8 icv_length[0x2]; + u8 reserved_at_48[0x4]; + u8 aso_return_reg[0x4]; + u8 reserved_at_50[0x10]; + + u8 esn_msb[0x20]; + + u8 reserved_at_80[0x8]; + u8 dekn[0x18]; + + u8 salt[0x20]; + + u8 implicit_iv[0x40]; + + u8 reserved_at_100[0x700]; +}; + +struct mlx5_ifc_create_ipsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_ipsec_obj_bits ipsec_object; +}; + +enum { + MLX5_MODIFY_IPSEC_BITMASK_ESN_OVERLAP = BIT(0), + MLX5_MODIFY_IPSEC_BITMASK_ESN_MSB = BIT(1), +}; + +struct mlx5_ifc_query_ipsec_obj_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; + struct mlx5_ifc_ipsec_obj_bits ipsec_object; +}; + +struct mlx5_ifc_modify_ipsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_ipsec_obj_bits ipsec_object; }; struct mlx5_ifc_encryption_key_obj_bits { -- cgit v1.2.3-59-g8ed1b From dff8e2d15283dd92582ddeec25ca86e4cf2618c7 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Fri, 24 Apr 2020 12:45:04 -0700 Subject: net/mlx5: Use aligned variable while allocating ICM memory The alignment value is part of the input structure, so use it and spare extra memory allocation when is not needed. Now, using the new ability when allocating icm for Direct-Rule insertion. Signed-off-by: Ariel Levkovich Signed-off-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c | 15 ++++-- .../mellanox/mlx5/core/steering/dr_icm_pool.c | 53 ++++++++++------------ include/linux/mlx5/driver.h | 3 +- 4 files changed, 38 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index f10675213115..65e0e24d463b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2444,7 +2444,7 @@ static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx, act_size = roundup_pow_of_two(act_size); dm->size = act_size; - err = mlx5_dm_sw_icm_alloc(dev, type, act_size, + err = mlx5_dm_sw_icm_alloc(dev, type, act_size, attr->alignment, to_mucontext(ctx)->devx_uid, &dm->dev_addr, &dm->icm_dm.obj_id); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c index 6cbccba56f70..3d5e57ff558c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c @@ -90,7 +90,8 @@ void mlx5_dm_cleanup(struct mlx5_core_dev *dev) } int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, - u64 length, u16 uid, phys_addr_t *addr, u32 *obj_id) + u64 length, u32 log_alignment, u16 uid, + phys_addr_t *addr, u32 *obj_id) { u32 num_blocks = DIV_ROUND_UP_ULL(length, MLX5_SW_ICM_BLOCK_SIZE(dev)); u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; @@ -99,6 +100,7 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, unsigned long *block_map; u64 icm_start_addr; u32 log_icm_size; + u64 align_mask; u32 max_blocks; u64 block_idx; void *sw_icm; @@ -136,11 +138,14 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, return -EOPNOTSUPP; max_blocks = BIT(log_icm_size - MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)); + + if (log_alignment < MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) + log_alignment = MLX5_LOG_SW_ICM_BLOCK_SIZE(dev); + align_mask = BIT(log_alignment - MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) - 1; + spin_lock(&dm->lock); - block_idx = bitmap_find_next_zero_area(block_map, - max_blocks, - 0, - num_blocks, 0); + block_idx = bitmap_find_next_zero_area(block_map, max_blocks, 0, + num_blocks, align_mask); if (block_idx < max_blocks) bitmap_set(block_map, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c index 30d2d7376f56..cc33515b9aba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c @@ -95,13 +95,12 @@ static int dr_icm_create_dm_mkey(struct mlx5_core_dev *mdev, } static struct mlx5dr_icm_mr * -dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool, - enum mlx5_sw_icm_type type, - size_t align_base) +dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool) { struct mlx5_core_dev *mdev = pool->dmn->mdev; + enum mlx5_sw_icm_type dm_type; struct mlx5dr_icm_mr *icm_mr; - size_t align_diff; + size_t log_align_base; int err; icm_mr = kvzalloc(sizeof(*icm_mr), GFP_KERNEL); @@ -111,14 +110,22 @@ dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool, icm_mr->pool = pool; INIT_LIST_HEAD(&icm_mr->mr_list); - icm_mr->dm.type = type; - - /* 2^log_biggest_table * entry-size * double-for-alignment */ icm_mr->dm.length = mlx5dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, - pool->icm_type) * 2; + pool->icm_type); + + if (pool->icm_type == DR_ICM_TYPE_STE) { + dm_type = MLX5_SW_ICM_TYPE_STEERING; + log_align_base = ilog2(icm_mr->dm.length); + } else { + dm_type = MLX5_SW_ICM_TYPE_HEADER_MODIFY; + /* Align base is 64B */ + log_align_base = ilog2(DR_ICM_MODIFY_HDR_ALIGN_BASE); + } + icm_mr->dm.type = dm_type; - err = mlx5_dm_sw_icm_alloc(mdev, icm_mr->dm.type, icm_mr->dm.length, 0, - &icm_mr->dm.addr, &icm_mr->dm.obj_id); + err = mlx5_dm_sw_icm_alloc(mdev, icm_mr->dm.type, icm_mr->dm.length, + log_align_base, 0, &icm_mr->dm.addr, + &icm_mr->dm.obj_id); if (err) { mlx5dr_err(pool->dmn, "Failed to allocate SW ICM memory, err (%d)\n", err); goto free_icm_mr; @@ -137,15 +144,18 @@ dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool, icm_mr->icm_start_addr = icm_mr->dm.addr; - /* align_base is always a power of 2 */ - align_diff = icm_mr->icm_start_addr & (align_base - 1); - if (align_diff) - icm_mr->used_length = align_base - align_diff; + if (icm_mr->icm_start_addr & (BIT(log_align_base) - 1)) { + mlx5dr_err(pool->dmn, "Failed to get Aligned ICM mem (asked: %zu)\n", + log_align_base); + goto free_mkey; + } list_add_tail(&icm_mr->mr_list, &pool->icm_mr_list); return icm_mr; +free_mkey: + mlx5_core_destroy_mkey(mdev, &icm_mr->mkey); free_dm: mlx5_dm_sw_icm_dealloc(mdev, icm_mr->dm.type, icm_mr->dm.length, 0, icm_mr->dm.addr, icm_mr->dm.obj_id); @@ -200,24 +210,11 @@ static int dr_icm_chunks_create(struct mlx5dr_icm_bucket *bucket) struct mlx5dr_icm_pool *pool = bucket->pool; struct mlx5dr_icm_mr *icm_mr = NULL; struct mlx5dr_icm_chunk *chunk; - enum mlx5_sw_icm_type dm_type; - size_t align_base; int i, err = 0; mr_req_size = bucket->num_of_entries * bucket->entry_size; mr_row_size = mlx5dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, pool->icm_type); - - if (pool->icm_type == DR_ICM_TYPE_STE) { - dm_type = MLX5_SW_ICM_TYPE_STEERING; - /* Align base is the biggest chunk size / row size */ - align_base = mr_row_size; - } else { - dm_type = MLX5_SW_ICM_TYPE_HEADER_MODIFY; - /* Align base is 64B */ - align_base = DR_ICM_MODIFY_HDR_ALIGN_BASE; - } - mutex_lock(&pool->mr_mutex); if (!list_empty(&pool->icm_mr_list)) { icm_mr = list_last_entry(&pool->icm_mr_list, @@ -228,7 +225,7 @@ static int dr_icm_chunks_create(struct mlx5dr_icm_bucket *bucket) } if (!icm_mr || mr_free_size < mr_row_size) { - icm_mr = dr_icm_pool_mr_create(pool, dm_type, align_base); + icm_mr = dr_icm_pool_mr_create(pool); if (!icm_mr) { err = -ENOMEM; goto out_err; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b60e5ab7906b..b46537a81703 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1080,7 +1080,8 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, - u64 length, u16 uid, phys_addr_t *addr, u32 *obj_id); + u64 length, u32 log_alignment, u16 uid, + phys_addr_t *addr, u32 *obj_id); int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, u64 length, u16 uid, phys_addr_t addr, u32 obj_id); -- cgit v1.2.3-59-g8ed1b From 244faedfd4d8e8c8e9f3c628d29bb74196b49743 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Fri, 24 Apr 2020 12:45:05 -0700 Subject: net/mlx5: Refactor imm_inval_pkey field in cqe struct The imm_inval_pkey field can hold four different types of data, depends on the usage, the data could be one of the below: - Immediate field of the received message - Invalidate rkey - Pkey of the packet - Flow table metadata Current implementation doesn't reflect the intended usage of the field at usage time. Reflect the different types by replace this field with a union, modify code where this field is used to reflect its intended usage. Signed-off-by: Raed Salem Reviewed-by: Huy Nguyen Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/cq.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- include/linux/mlx5/device.h | 7 ++++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 32c05730dfe9..0c18cb6a2f14 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -202,7 +202,7 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, case MLX5_CQE_RESP_WR_IMM: wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = cqe->imm_inval_pkey; + wc->ex.imm_data = cqe->immediate; break; case MLX5_CQE_RESP_SEND: wc->opcode = IB_WC_RECV; @@ -214,12 +214,12 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, case MLX5_CQE_RESP_SEND_IMM: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = cqe->imm_inval_pkey; + wc->ex.imm_data = cqe->immediate; break; case MLX5_CQE_RESP_SEND_INV: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_INVALIDATE; - wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + wc->ex.invalidate_rkey = be32_to_cpu(cqe->inval_rkey); break; } wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; @@ -227,7 +227,7 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; wc->wc_flags |= g ? IB_WC_GRH : 0; if (unlikely(is_qp1(qp->ibqp.qp_type))) { - u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; + u16 pkey = be32_to_cpu(cqe->pkey) & 0xffff; ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey, &wc->pkey_index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 12c5ca5b93ca..5b632434866f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4891,7 +4891,7 @@ bool mlx5e_tc_rep_update_skb(struct mlx5_cqe64 *cqe, reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK); if (reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG) reg_c0 = 0; - reg_c1 = be32_to_cpu(cqe->imm_inval_pkey); + reg_c1 = be32_to_cpu(cqe->ft_metadata); if (!reg_c0) return true; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 7b57877e501e..746e17473d72 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -767,7 +767,12 @@ struct mlx5_cqe64 { u8 l4_l3_hdr_type; __be16 vlan_info; __be32 srqn; /* [31:24]: lro_num_seg, [23:0]: srqn */ - __be32 imm_inval_pkey; + union { + __be32 immediate; + __be32 inval_rkey; + __be32 pkey; + __be32 ft_metadata; + }; u8 rsvd40[4]; __be32 byte_cnt; __be32 timestamp_h; -- cgit v1.2.3-59-g8ed1b From 06939536263d684073a30543930622eede633af1 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 24 Apr 2020 12:45:06 -0700 Subject: net/mlx5: Add structure layout and defines for MFRL register Add needed structure layouts and defines for MFRL (Management Firmware Reset Level) register. This structure will be used for the firmware upgrade and reset flow in the downstream patches. Signed-off-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b46537a81703..d82dbbab8179 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -130,6 +130,7 @@ enum { MLX5_REG_NODE_DESC = 0x6001, MLX5_REG_HOST_ENDIANNESS = 0x7004, MLX5_REG_MCIA = 0x9014, + MLX5_REG_MFRL = 0x9028, MLX5_REG_MLCR = 0x902b, MLX5_REG_MTRC_CAP = 0x9040, MLX5_REG_MTRC_CONF = 0x9041, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cf971d341189..9e6a3cec1e32 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9703,6 +9703,29 @@ struct mlx5_ifc_mcda_reg_bits { u8 data[0][0x20]; }; +enum { + MLX5_MFRL_REG_RESET_TYPE_FULL_CHIP = BIT(0), + MLX5_MFRL_REG_RESET_TYPE_NET_PORT_ALIVE = BIT(1), +}; + +enum { + MLX5_MFRL_REG_RESET_LEVEL0 = BIT(0), + MLX5_MFRL_REG_RESET_LEVEL3 = BIT(3), + MLX5_MFRL_REG_RESET_LEVEL6 = BIT(6), +}; + +struct mlx5_ifc_mfrl_reg_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x2]; + u8 pci_sync_for_fw_update_start[0x1]; + u8 pci_sync_for_fw_update_resp[0x2]; + u8 rst_type_sel[0x3]; + u8 reserved_at_28[0x8]; + u8 reset_type[0x8]; + u8 reset_level[0x8]; +}; + struct mlx5_ifc_mirc_reg_bits { u8 reserved_at_0[0x18]; u8 status_code[0x8]; @@ -9766,6 +9789,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mcc_reg_bits mcc_reg; struct mlx5_ifc_mcda_reg_bits mcda_reg; struct mlx5_ifc_mirc_reg_bits mirc_reg; + struct mlx5_ifc_mfrl_reg_bits mfrl_reg; u8 reserved_at_0[0x60e0]; }; -- cgit v1.2.3-59-g8ed1b From 3df0107784ceb388039b1fe510a8c7b8816de8f0 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 24 Apr 2020 12:45:07 -0700 Subject: net/mlx5: Add structure and defines for pci sync for fw update event Add needed structure layouts and defines for pci sync for fw update event. The downstream patches will include event handlers for this event type. Signed-off-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 15 +++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 746e17473d72..de93f0b67973 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -364,6 +364,7 @@ enum { enum { MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1, MLX5_GENERAL_SUBTYPE_PCI_POWER_CHANGE_EVENT = 0x5, + MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT = 0x8, }; enum { @@ -689,6 +690,19 @@ struct mlx5_eqe_temp_warning { __be64 sensor_warning_lsb; } __packed; +#define SYNC_RST_STATE_MASK 0xf + +enum sync_rst_state_type { + MLX5_SYNC_RST_STATE_RESET_REQUEST = 0x0, + MLX5_SYNC_RST_STATE_RESET_NOW = 0x1, + MLX5_SYNC_RST_STATE_RESET_ABORT = 0x2, +}; + +struct mlx5_eqe_sync_fw_update { + u8 reserved_at_0[3]; + u8 sync_rst_state; +}; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -707,6 +721,7 @@ union ev_data { struct mlx5_eqe_dct dct; struct mlx5_eqe_temp_warning temp_warning; struct mlx5_eqe_xrq_err xrq_err; + struct mlx5_eqe_sync_fw_update sync_fw_update; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9e6a3cec1e32..058ded202b65 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1317,7 +1317,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 wol_p[0x1]; u8 stat_rate_support[0x10]; - u8 reserved_at_1f0[0xc]; + u8 reserved_at_1f0[0x1]; + u8 pci_sync_for_fw_update_event[0x1]; + u8 reserved_at_1f2[0xa]; u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; -- cgit v1.2.3-59-g8ed1b From ee5cdf7a5e8945372c7496e98de2b364e095b60b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Fri, 24 Apr 2020 12:45:08 -0700 Subject: net/mlx5: Introduce TLS RX offload hardware bits Add TLS RX offload related IFC hardware fields and enumerations. Signed-off-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Reviewed-by: Boris Pismenny Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 18 ++++++++++++++++-- include/linux/mlx5/mlx5_ifc.h | 5 +++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index de93f0b67973..1bc27aca648b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -450,10 +450,12 @@ enum { enum { MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS = 0x1, + MLX5_OPC_MOD_TLS_TIR_STATIC_PARAMS = 0x2, }; enum { MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS = 0x1, + MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS = 0x2, }; enum { @@ -764,7 +766,7 @@ struct mlx5_err_cqe { }; struct mlx5_cqe64 { - u8 outer_l3_tunneled; + u8 tls_outer_l3_tunneled; u8 rsvd0; __be16 wqe_id; u8 lro_tcppsh_abort_dupack; @@ -854,7 +856,12 @@ static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) static inline bool cqe_is_tunneled(struct mlx5_cqe64 *cqe) { - return cqe->outer_l3_tunneled & 0x1; + return cqe->tls_outer_l3_tunneled & 0x1; +} + +static inline u8 get_cqe_tls_offload(struct mlx5_cqe64 *cqe) +{ + return (cqe->tls_outer_l3_tunneled >> 3) & 0x3; } static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe) @@ -942,6 +949,13 @@ enum { CQE_L4_OK = 1 << 2, }; +enum { + CQE_TLS_OFFLOAD_NOT_DECRYPTED = 0x0, + CQE_TLS_OFFLOAD_DECRYPTED = 0x1, + CQE_TLS_OFFLOAD_RESYNC = 0x2, + CQE_TLS_OFFLOAD_ERROR = 0x3, +}; + struct mlx5_sig_err_cqe { u8 rsvd0[16]; __be32 expected_trans_sig; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 058ded202b65..6a6bb5dc7916 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1491,7 +1491,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_480[0x1]; u8 tls_tx[0x1]; - u8 reserved_at_482[0x1]; + u8 tls_rx[0x1]; u8 log_max_l2_table[0x5]; u8 reserved_at_488[0x8]; u8 log_uar_page_sz[0x10]; @@ -3136,7 +3136,8 @@ struct mlx5_ifc_tirc_bits { u8 reserved_at_0[0x20]; u8 disp_type[0x4]; - u8 reserved_at_24[0x1c]; + u8 tls_en[0x1]; + u8 reserved_at_25[0x1b]; u8 reserved_at_40[0x40]; -- cgit v1.2.3-59-g8ed1b From 0e1533bb9cce2c6b2aecdfddfcc0de3beeaddc7b Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Fri, 24 Apr 2020 12:45:09 -0700 Subject: net/mlx5: Add release all pages capability bit Add a bit in HCA capabilities layout to indicate if release all pages is supported. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6a6bb5dc7916..fb243848132d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1244,7 +1244,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_130[0xa]; u8 log_max_ra_res_dc[0x6]; - u8 reserved_at_140[0x9]; + u8 reserved_at_140[0x6]; + u8 release_all_pages[0x1]; + u8 reserved_at_147[0x2]; u8 roce_accl[0x1]; u8 log_max_ra_req_qp[0x6]; u8 reserved_at_150[0xa]; -- cgit v1.2.3-59-g8ed1b From 2dc8b5246d2c94f732c02e7a688d8a9c0c65361f Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Fri, 24 Apr 2020 12:45:10 -0700 Subject: net/mlx5: TX WQE Add trailer insertion field Add new TX WQE field for Connect-X6DX trailer insertion support, when set, the HW adds a trailer to the packet, the WQE trailer association flags are used to set to HW the header which the trailer belongs. Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed --- include/linux/mlx5/qp.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index ef127a156a62..f23eb18526fe 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -229,6 +229,11 @@ enum { enum { MLX5_ETH_WQE_SVLAN = 1 << 0, + MLX5_ETH_WQE_TRAILER_HDR_OUTER_IP_ASSOC = 1 << 26, + MLX5_ETH_WQE_TRAILER_HDR_OUTER_L4_ASSOC = 1 << 27, + MLX5_ETH_WQE_TRAILER_HDR_INNER_IP_ASSOC = 3 << 26, + MLX5_ETH_WQE_TRAILER_HDR_INNER_L4_ASSOC = 1 << 28, + MLX5_ETH_WQE_INSERT_TRAILER = 1 << 30, MLX5_ETH_WQE_INSERT_VLAN = 1 << 15, }; @@ -257,6 +262,7 @@ struct mlx5_wqe_eth_seg { __be16 type; __be16 vlan_tci; } insert; + __be32 trailer; }; }; -- cgit v1.2.3-59-g8ed1b From a6bbdf2e750f245d219d39f3c3d06ace2c5871e6 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Tue, 28 Apr 2020 17:07:09 +0800 Subject: libbpf: Remove unneeded semicolon in btf_dump_emit_type Fixes the following coccicheck warning: tools/lib/bpf/btf_dump.c:661:4-5: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1588064829-70613-1-git-send-email-zou_wei@huawei.com --- tools/lib/bpf/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 0c28ee82834b..de07e559a11d 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -658,7 +658,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) if (!btf_dump_is_blacklisted(d, id)) { btf_dump_emit_typedef_def(d, id, t, 0); btf_dump_printf(d, ";\n\n"); - }; + } tstate->fwd_emitted = 1; break; default: -- cgit v1.2.3-59-g8ed1b From 11dd74b338bf83f8bca70b57bad33a903fedfa6e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 27 Apr 2020 13:56:45 -0700 Subject: net: ipv6: new arg skip_notify to ip6_rt_del Used in subsequent work to skip route delete notifications on nexthop deletes. Suggested-by: David Ahern Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- include/net/ipv6_stubs.h | 2 +- net/ipv4/nexthop.c | 2 +- net/ipv6/addrconf.c | 12 ++++++------ net/ipv6/addrconf_core.c | 3 ++- net/ipv6/anycast.c | 4 ++-- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 11 +++++++---- 8 files changed, 21 insertions(+), 17 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9947eb1e9eb6..e525f003e619 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -123,7 +123,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct fib6_info *f6i); -int ip6_del_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify); void rt6_flush_exceptions(struct fib6_info *f6i); void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 3e7d2c0e79ca..a5f7c12c326a 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -48,7 +48,7 @@ struct ipv6_stub { struct netlink_ext_ack *extack); void (*fib6_nh_release)(struct fib6_nh *fib6_nh); void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt); - int (*ip6_del_rt)(struct net *net, struct fib6_info *rt); + int (*ip6_del_rt)(struct net *net, struct fib6_info *rt, bool skip_notify); void (*fib6_rt_update)(struct net *net, struct fib6_info *rt, struct nl_info *info); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index fdfca534d094..9999687ad6dc 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -784,7 +784,7 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); - ipv6_stub->ip6_del_rt(net, f6i); + ipv6_stub->ip6_del_rt(net, f6i, false); } } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 27b4fb6e452b..2c4f20ec1e2a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1238,7 +1238,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, ifp->idev->dev, 0, RTF_DEFAULT, true); if (f6i) { if (del_rt) - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { if (!(f6i->fib6_flags & RTF_EXPIRES)) fib6_set_expires(f6i, expires); @@ -2718,7 +2718,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -3813,7 +3813,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -4652,7 +4652,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF; if (f6i->fib6_metric != prio) { /* delete old one */ - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); /* add new one */ addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr, @@ -6073,10 +6073,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ifp->idev->dev, 0, 0, false); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); } if (ifp->rt) { - ip6_del_rt(net, ifp->rt); + ip6_del_rt(net, ifp->rt, false); ifp->rt = NULL; } rt_genid_bump_ipv6(net); diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index ea00ce3d4117..9ebf3fe0d2b1 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -185,7 +185,8 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } -static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt, + bool skip_notify) { return -EAFNOSUPPORT; } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index fed91ab7ec46..893261230ffc 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -364,7 +364,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); return 0; @@ -393,7 +393,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..2d09c4da03ee 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1302,7 +1302,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } } if (rt && lifetime == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..486c36a14f24 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -984,7 +984,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } @@ -3729,9 +3729,12 @@ out: return err; } -int ip6_del_rt(struct net *net, struct fib6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { - struct nl_info info = { .nl_net = net }; + struct nl_info info = { + .nl_net = net, + .skip_notify = skip_notify + }; return __ip6_del_rt(rt, &info); } @@ -4252,7 +4255,7 @@ restart: (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); goto restart; } } -- cgit v1.2.3-59-g8ed1b From 4f80116d3df3b23ee4b83ea8557629e1799bc230 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 27 Apr 2020 13:56:46 -0700 Subject: net: ipv4: add sysctl for nexthop api compatibility mode Current route nexthop API maintains user space compatibility with old route API by default. Dumps and netlink notifications support both new and old API format. In systems which have moved to the new API, this compatibility mode cancels some of the performance benefits provided by the new nexthop API. This patch adds new sysctl nexthop_compat_mode which is on by default but provides the ability to turn off compatibility mode allowing systems to run entirely with the new routing API. Old route API behaviour and support is not modified by this sysctl. Uses a single sysctl to cover both ipv4 and ipv6 following other sysctls. Covers dumps and delete notifications as suggested by David Ahern. Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 12 ++++++++++++ include/net/netns/ipv4.h | 2 ++ net/ipv4/af_inet.c | 1 + net/ipv4/fib_semantics.c | 3 +++ net/ipv4/nexthop.c | 5 +++-- net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ net/ipv6/route.c | 3 ++- 7 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 9375324aa8e1..5cdc37c34830 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1560,6 +1560,18 @@ skip_notify_on_dev_down - BOOLEAN on userspace caches to track link events and evict routes. Default: false (generate message) +nexthop_compat_mode - BOOLEAN + New nexthop API provides a means for managing nexthops independent of + prefixes. Backwards compatibilty with old route format is enabled by + default which means route dumps and notifications contain the new + nexthop attribute but also the full, expanded nexthop definition. + Further, updates or deletes of a nexthop configuration generate route + notifications for each fib entry using the nexthop. Once a system + understands the new API, this sysctl can be disabled to achieve full + performance benefits of the new API by disabling the nexthop expansion + and extraneous notifications. + Default: true (backward compat mode) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 154b8f01499b..5acdb4d414c4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,8 @@ struct netns_ipv4 { int sysctl_tcp_early_demux; int sysctl_udp_early_demux; + int sysctl_nexthop_compat_mode; + int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; #ifdef CONFIG_NET_L3_MASTER_DEV diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c618e242490f..6177c4ba0037 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1835,6 +1835,7 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; + net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 55ca2e521828..e53871e4a097 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1780,6 +1780,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; + if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode) + goto offload; } if (nhs == 1) { @@ -1805,6 +1807,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } +offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 9999687ad6dc..3957364d556c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -784,7 +784,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); - ipv6_stub->ip6_del_rt(net, f6i, false); + ipv6_stub->ip6_del_rt(net, f6i, + !net->ipv4.sysctl_nexthop_compat_mode); } } @@ -1041,7 +1042,7 @@ out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); - if (replace_notify) + if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..95ad71e76cc3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -710,6 +710,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_tcp_early_demux }, + { + .procname = "nexthop_compat_mode", + .data = &init_net.ipv4.sysctl_nexthop_compat_mode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 486c36a14f24..803212aae4ca 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5557,7 +5557,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; - if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + if (net->ipv4.sysctl_nexthop_compat_mode && + rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; -- cgit v1.2.3-59-g8ed1b From 4dddb5be136a7b151c11f0fbe350feff75a89867 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 27 Apr 2020 13:56:47 -0700 Subject: selftests: net: add new testcases for nexthop API compat mode sysctl New tests to check route dump and notifications with net.ipv4.nexthop_compat_mode on and off. Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 198 +++++++++++++++++++++++++++- 1 file changed, 196 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index b785241127df..dd0e5fec6367 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -253,6 +253,33 @@ check_route6() check_output "${out}" "${expected}" } +start_ip_monitor() +{ + local mtype=$1 + + # start the monitor in the background + tmpfile=`mktemp /var/run/nexthoptestXXX` + mpid=`($IP monitor $mtype > $tmpfile & echo $!) 2>/dev/null` + sleep 0.2 + echo "$mpid $tmpfile" +} + +stop_ip_monitor() +{ + local mpid=$1 + local tmpfile=$2 + local el=$3 + + # check the monitor results + kill $mpid + lines=`wc -l $tmpfile | cut "-d " -f1` + test $lines -eq $el + rc=$? + rm -rf $tmpfile + + return $rc +} + ################################################################################ # basic operations (add, delete, replace) on nexthops and nexthop groups # @@ -883,6 +910,173 @@ ipv4_fcnal_runtime() log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check" } +sysctl_nexthop_compat_mode_check() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local lprefix=$1 + + IPE="ip netns exec me" + + $IPE sysctl -q $sysctlname 2>&1 >/dev/null + if [ $? -ne 0 ]; then + echo "SKIP: kernel lacks nexthop compat mode sysctl control" + return $ksft_skip + fi + + out=$($IPE sysctl $sysctlname 2>/dev/null) + log_test $? 0 "$lprefix default nexthop compat mode check" + check_output "${out}" "$sysctlname = 1" +} + +sysctl_nexthop_compat_mode_set() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local mode=$1 + local lprefix=$2 + + IPE="ip netns exec me" + + out=$($IPE sysctl -w $sysctlname=$mode) + log_test $? 0 "$lprefix set compat mode - $mode" + check_output "${out}" "net.ipv4.nexthop_compat_mode = $mode" +} + +ipv6_compat_mode() +{ + local rc + + echo + echo "IPv6 nexthop api compat mode test" + echo "--------------------------------" + + sysctl_nexthop_compat_mode_check "IPv6" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should contain expanded nexthops + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv6 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1" + log_test $? 0 "IPv6 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 3 + + log_test $? 0 "IPv6 compat mode on - nexthop change" + + # set compat mode off + sysctl_nexthop_compat_mode_set 0 "IPv6" + + run_cmd "$IP -6 ro del 2001:db8:101::1/128 nhid 122" + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should not contain expanded nexthops + stop_ip_monitor $ipmout 1 + log_test $? 0 "IPv6 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium" + log_test $? 0 "IPv6 compat mode off - route dump" + + # change in nexthop group should not generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop delete" + + # set compat mode back on + sysctl_nexthop_compat_mode_set 1 "IPv6" +} + +ipv4_compat_mode() +{ + local rc + + echo + echo "IPv4 nexthop api compat mode" + echo "----------------------------" + + sysctl_nexthop_compat_mode_check "IPv4" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 122 group 21/22" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 3 + + # route add notification should contain expanded nexthops + log_test $? 0 "IPv4 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1" + log_test $? 0 "IPv4 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 23 via 172.16.1.3 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/23" + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv4 compat mode on - nexthop change" + + sysctl_nexthop_compat_mode_set 0 "IPv4" + + # cleanup + run_cmd "$IP ro del 172.16.101.1/32 nhid 122" + + ipmout=$(start_ip_monitor route) + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 1 + # route add notification should not contain expanded nexthops + log_test $? 0 "IPv4 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122" + log_test $? 0 "IPv4 compat mode off - route dump" + + # change in nexthop group should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/22" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv4 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv4 compat mode off - nexthop delete" + + sysctl_nexthop_compat_mode_set 1 "IPv4" +} + basic() { echo -- cgit v1.2.3-59-g8ed1b From 1a89595c2272aa9b4cd3fda562545dc1d9cd89ed Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:47 -0700 Subject: kselftest: factor out list manipulation to a helper Kees suggest to factor out the list append code to a macro, since following commits need it, which leads to code duplication. Suggested-by: Kees Cook Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller --- tools/testing/selftests/kselftest_harness.h | 42 ++++++++++++++++------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 2bb8c81fc0b4..77f754854f0d 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -631,6 +631,29 @@ } \ } while (0); OPTIONAL_HANDLER(_assert) +/* List helpers */ +#define __LIST_APPEND(head, item) \ +{ \ + /* Circular linked list where only prev is circular. */ \ + if (head == NULL) { \ + head = item; \ + item->next = NULL; \ + item->prev = item; \ + return; \ + } \ + if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \ + item->next = NULL; \ + item->prev = head->prev; \ + item->prev->next = item; \ + head->prev = item; \ + } else { \ + item->next = head; \ + item->next->prev = item; \ + item->prev = item; \ + head = item; \ + } \ +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; @@ -667,24 +690,7 @@ static int __constructor_order; static inline void __register_test(struct __test_metadata *t) { __test_count++; - /* Circular linked list where only prev is circular. */ - if (__test_list == NULL) { - __test_list = t; - t->next = NULL; - t->prev = t; - return; - } - if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { - t->next = NULL; - t->prev = __test_list->prev; - t->prev->next = t; - __test_list->prev = t; - } else { - t->next = __test_list; - t->next->prev = t; - t->prev = t; - __test_list = t; - } + __LIST_APPEND(__test_list, t); } static inline int __bail(int for_realz, bool no_print, __u8 step) -- cgit v1.2.3-59-g8ed1b From 142aca6b388c8ab83dc41bd71150cb23115bd285 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:48 -0700 Subject: kselftest: create fixture objects Grouping tests by fixture will allow us to parametrize test runs. Create full objects for fixtures. Add a "global" fixture for tests without a fixture. Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller --- tools/testing/selftests/kselftest_harness.h | 51 +++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 77f754854f0d..de283fd6fc4d 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -169,8 +169,10 @@ #define __TEST_IMPL(test_name, _signal) \ static void test_name(struct __test_metadata *_metadata); \ static struct __test_metadata _##test_name##_object = \ - { .name = "global." #test_name, \ - .fn = &test_name, .termsig = _signal, \ + { .name = #test_name, \ + .fn = &test_name, \ + .fixture = &_fixture_global, \ + .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ static void __attribute__((constructor)) _register_##test_name(void) \ { \ @@ -212,10 +214,12 @@ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN(). */ #define FIXTURE(fixture_name) \ + static struct __fixture_metadata _##fixture_name##_fixture_object = \ + { .name = #fixture_name, }; \ static void __attribute__((constructor)) \ _register_##fixture_name##_data(void) \ { \ - __fixture_count++; \ + __register_fixture(&_##fixture_name##_fixture_object); \ } \ FIXTURE_DATA(fixture_name) @@ -309,8 +313,9 @@ } \ static struct __test_metadata \ _##fixture_name##_##test_name##_object = { \ - .name = #fixture_name "." #test_name, \ + .name = #test_name, \ .fn = &wrapper_##fixture_name##_##test_name, \ + .fixture = &_##fixture_name##_fixture_object, \ .termsig = signal, \ .timeout = tmout, \ }; \ @@ -654,11 +659,34 @@ } \ } +/* Contains all the information about a fixture. */ +struct __fixture_metadata { + const char *name; + struct __fixture_metadata *prev, *next; +} _fixture_global __attribute__((unused)) = { + .name = "global", + .prev = &_fixture_global, +}; + +static struct __fixture_metadata *__fixture_list = &_fixture_global; +static unsigned int __fixture_count; +static int __constructor_order; + +#define _CONSTRUCTOR_ORDER_FORWARD 1 +#define _CONSTRUCTOR_ORDER_BACKWARD -1 + +static inline void __register_fixture(struct __fixture_metadata *f) +{ + __fixture_count++; + __LIST_APPEND(__fixture_list, f); +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; void (*fn)(struct __test_metadata *); pid_t pid; /* pid of test when being run */ + struct __fixture_metadata *fixture; int termsig; int passed; int trigger; /* extra handler after the evaluation */ @@ -672,11 +700,6 @@ struct __test_metadata { /* Storage for the (global) tests to be run. */ static struct __test_metadata *__test_list; static unsigned int __test_count; -static unsigned int __fixture_count; -static int __constructor_order; - -#define _CONSTRUCTOR_ORDER_FORWARD 1 -#define _CONSTRUCTOR_ORDER_BACKWARD -1 /* * Since constructors are called in reverse order, reverse the test @@ -796,11 +819,12 @@ void __wait_for_test(struct __test_metadata *t) } } -void __run_test(struct __test_metadata *t) +void __run_test(struct __fixture_metadata *f, + struct __test_metadata *t) { t->passed = 1; t->trigger = 0; - printf("[ RUN ] %s\n", t->name); + printf("[ RUN ] %s.%s\n", f->name, t->name); t->pid = fork(); if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); @@ -812,7 +836,8 @@ void __run_test(struct __test_metadata *t) } else { __wait_for_test(t); } - printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name); + printf("[ %4s ] %s.%s\n", (t->passed ? "OK" : "FAIL"), + f->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, @@ -828,7 +853,7 @@ static int test_harness_run(int __attribute__((unused)) argc, __test_count, __fixture_count + 1); for (t = __test_list; t; t = t->next) { count++; - __run_test(t); + __run_test(t->fixture, t); if (t->passed) pass_count++; else -- cgit v1.2.3-59-g8ed1b From e7f304607778e31bfd8e6b00ce2a8f990b265e14 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:49 -0700 Subject: kselftest: run tests by fixture Now that all tests have a fixture object move from a global list of tests to a list of tests per fixture. Order of tests may change as we will now group and run test fixture by fixture, rather than in declaration order. Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller --- tools/testing/selftests/kselftest_harness.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index de283fd6fc4d..fa7185e45472 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -659,9 +659,12 @@ } \ } +struct __test_metadata; + /* Contains all the information about a fixture. */ struct __fixture_metadata { const char *name; + struct __test_metadata *tests; struct __fixture_metadata *prev, *next; } _fixture_global __attribute__((unused)) = { .name = "global", @@ -698,7 +701,6 @@ struct __test_metadata { }; /* Storage for the (global) tests to be run. */ -static struct __test_metadata *__test_list; static unsigned int __test_count; /* @@ -713,7 +715,7 @@ static unsigned int __test_count; static inline void __register_test(struct __test_metadata *t) { __test_count++; - __LIST_APPEND(__test_list, t); + __LIST_APPEND(t->fixture->tests, t); } static inline int __bail(int for_realz, bool no_print, __u8 step) @@ -843,6 +845,7 @@ void __run_test(struct __fixture_metadata *f, static int test_harness_run(int __attribute__((unused)) argc, char __attribute__((unused)) **argv) { + struct __fixture_metadata *f; struct __test_metadata *t; int ret = 0; unsigned int count = 0; @@ -851,13 +854,15 @@ static int test_harness_run(int __attribute__((unused)) argc, /* TODO(wad) add optional arguments similar to gtest. */ printf("[==========] Running %u tests from %u test cases.\n", __test_count, __fixture_count + 1); - for (t = __test_list; t; t = t->next) { - count++; - __run_test(t->fixture, t); - if (t->passed) - pass_count++; - else - ret = 1; + for (f = __fixture_list; f; f = f->next) { + for (t = f->tests; t; t = t->next) { + count++; + __run_test(f, t); + if (t->passed) + pass_count++; + else + ret = 1; + } } printf("[==========] %u / %u tests passed.\n", pass_count, count); printf("[ %s ]\n", (ret ? "FAILED" : "PASSED")); -- cgit v1.2.3-59-g8ed1b From 74bc7c97fa88ae334752e7b45702d23813df8873 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:50 -0700 Subject: kselftest: add fixture variants Allow users to build parameterized variants of fixtures. If fixtures want variants, they call FIXTURE_VARIANT() to declare the structure to fill for each variant. Each fixture will be re-run for each of the variants defined by calling FIXTURE_VARIANT_ADD() with the differing parameters initializing the structure. Since tests are being re-run, additional initialization (steps, no_print) is also added. Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller --- Documentation/dev-tools/kselftest.rst | 3 +- tools/testing/selftests/kselftest_harness.h | 148 +++++++++++++++++++++++----- 2 files changed, 124 insertions(+), 27 deletions(-) diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index 61ae13c44f91..5d1f56fcd2e7 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -301,7 +301,8 @@ Helpers .. kernel-doc:: tools/testing/selftests/kselftest_harness.h :functions: TH_LOG TEST TEST_SIGNAL FIXTURE FIXTURE_DATA FIXTURE_SETUP - FIXTURE_TEARDOWN TEST_F TEST_HARNESS_MAIN + FIXTURE_TEARDOWN TEST_F TEST_HARNESS_MAIN FIXTURE_VARIANT + FIXTURE_VARIANT_ADD Operators --------- diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index fa7185e45472..c9f03ef93338 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -168,9 +168,15 @@ #define __TEST_IMPL(test_name, _signal) \ static void test_name(struct __test_metadata *_metadata); \ + static inline void wrapper_##test_name( \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ + { \ + test_name(_metadata); \ + } \ static struct __test_metadata _##test_name##_object = \ { .name = #test_name, \ - .fn = &test_name, \ + .fn = &wrapper_##test_name, \ .fixture = &_fixture_global, \ .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ @@ -214,6 +220,7 @@ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN(). */ #define FIXTURE(fixture_name) \ + FIXTURE_VARIANT(fixture_name); \ static struct __fixture_metadata _##fixture_name##_fixture_object = \ { .name = #fixture_name, }; \ static void __attribute__((constructor)) \ @@ -245,7 +252,10 @@ #define FIXTURE_SETUP(fixture_name) \ void fixture_name##_setup( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) + /** * FIXTURE_TEARDOWN(fixture_name) * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. @@ -267,6 +277,59 @@ struct __test_metadata __attribute__((unused)) *_metadata, \ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) +/** + * FIXTURE_VARIANT(fixture_name) - Optionally called once per fixture + * to declare fixture variant + * + * @fixture_name: fixture name + * + * .. code-block:: c + * + * FIXTURE_VARIANT(datatype name) { + * type property1; + * ... + * }; + * + * Defines type of constant parameters provided to FIXTURE_SETUP() and TEST_F() + * as *variant*. Variants allow the same tests to be run with different + * arguments. + */ +#define FIXTURE_VARIANT(fixture_name) struct _fixture_variant_##fixture_name + +/** + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) - Called once per fixture + * variant to setup and register the data + * + * @fixture_name: fixture name + * @variant_name: name of the parameter set + * + * .. code-block:: c + * + * FIXTURE_ADD(datatype name) { + * .property1 = val1; + * ... + * }; + * + * Defines a variant of the test fixture, provided to FIXTURE_SETUP() and + * TEST_F() as *variant*. Tests of each fixture will be run once for each + * variant. + */ +#define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \ + extern FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant; \ + static struct __fixture_variant_metadata \ + _##fixture_name##_##variant_name##_object = \ + { .name = #variant_name, \ + .data = &_##fixture_name##_##variant_name##_variant}; \ + static void __attribute__((constructor)) \ + _register_##fixture_name##_##variant_name(void) \ + { \ + __register_fixture_variant(&_##fixture_name##_fixture_object, \ + &_##fixture_name##_##variant_name##_object); \ + } \ + FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant = + /** * TEST_F(fixture_name, test_name) - Emits test registration and helpers for * fixture-based test cases @@ -297,18 +360,20 @@ #define __TEST_F_IMPL(fixture_name, test_name, signal, tmout) \ static void fixture_name##_##test_name( \ struct __test_metadata *_metadata, \ - FIXTURE_DATA(fixture_name) *self); \ + FIXTURE_DATA(fixture_name) *self, \ + const FIXTURE_VARIANT(fixture_name) *variant); \ static inline void wrapper_##fixture_name##_##test_name( \ - struct __test_metadata *_metadata) \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ { \ /* fixture data is alloced, setup, and torn down per call. */ \ FIXTURE_DATA(fixture_name) self; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ - fixture_name##_setup(_metadata, &self); \ + fixture_name##_setup(_metadata, &self, variant->data); \ /* Let setup failure terminate early. */ \ if (!_metadata->passed) \ return; \ - fixture_name##_##test_name(_metadata, &self); \ + fixture_name##_##test_name(_metadata, &self, variant->data); \ fixture_name##_teardown(_metadata, &self); \ } \ static struct __test_metadata \ @@ -326,7 +391,9 @@ } \ static void fixture_name##_##test_name( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) /** * TEST_HARNESS_MAIN - Simple wrapper to run the test harness @@ -660,11 +727,13 @@ } struct __test_metadata; +struct __fixture_variant_metadata; /* Contains all the information about a fixture. */ struct __fixture_metadata { const char *name; struct __test_metadata *tests; + struct __fixture_variant_metadata *variant; struct __fixture_metadata *prev, *next; } _fixture_global __attribute__((unused)) = { .name = "global", @@ -672,7 +741,6 @@ struct __fixture_metadata { }; static struct __fixture_metadata *__fixture_list = &_fixture_global; -static unsigned int __fixture_count; static int __constructor_order; #define _CONSTRUCTOR_ORDER_FORWARD 1 @@ -680,14 +748,27 @@ static int __constructor_order; static inline void __register_fixture(struct __fixture_metadata *f) { - __fixture_count++; __LIST_APPEND(__fixture_list, f); } +struct __fixture_variant_metadata { + const char *name; + const void *data; + struct __fixture_variant_metadata *prev, *next; +}; + +static inline void +__register_fixture_variant(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant) +{ + __LIST_APPEND(f->variant, variant); +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; - void (*fn)(struct __test_metadata *); + void (*fn)(struct __test_metadata *, + struct __fixture_variant_metadata *); pid_t pid; /* pid of test when being run */ struct __fixture_metadata *fixture; int termsig; @@ -700,9 +781,6 @@ struct __test_metadata { struct __test_metadata *prev, *next; }; -/* Storage for the (global) tests to be run. */ -static unsigned int __test_count; - /* * Since constructors are called in reverse order, reverse the test * list so tests are run in source declaration order. @@ -714,7 +792,6 @@ static unsigned int __test_count; */ static inline void __register_test(struct __test_metadata *t) { - __test_count++; __LIST_APPEND(t->fixture->tests, t); } @@ -822,46 +899,65 @@ void __wait_for_test(struct __test_metadata *t) } void __run_test(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant, struct __test_metadata *t) { + /* reset test struct */ t->passed = 1; t->trigger = 0; - printf("[ RUN ] %s.%s\n", f->name, t->name); + t->step = 0; + t->no_print = 0; + + printf("[ RUN ] %s%s%s.%s\n", + f->name, variant->name[0] ? "." : "", variant->name, t->name); t->pid = fork(); if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; } else if (t->pid == 0) { - t->fn(t); + t->fn(t, variant); /* return the step that failed or 0 */ _exit(t->passed ? 0 : t->step); } else { __wait_for_test(t); } - printf("[ %4s ] %s.%s\n", (t->passed ? "OK" : "FAIL"), - f->name, t->name); + printf("[ %4s ] %s%s%s.%s\n", (t->passed ? "OK" : "FAIL"), + f->name, variant->name[0] ? "." : "", variant->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, char __attribute__((unused)) **argv) { + struct __fixture_variant_metadata no_variant = { .name = "", }; + struct __fixture_variant_metadata *v; struct __fixture_metadata *f; struct __test_metadata *t; int ret = 0; + unsigned int case_count = 0, test_count = 0; unsigned int count = 0; unsigned int pass_count = 0; + for (f = __fixture_list; f; f = f->next) { + for (v = f->variant ?: &no_variant; v; v = v->next) { + case_count++; + for (t = f->tests; t; t = t->next) + test_count++; + } + } + /* TODO(wad) add optional arguments similar to gtest. */ printf("[==========] Running %u tests from %u test cases.\n", - __test_count, __fixture_count + 1); + test_count, case_count); for (f = __fixture_list; f; f = f->next) { - for (t = f->tests; t; t = t->next) { - count++; - __run_test(f, t); - if (t->passed) - pass_count++; - else - ret = 1; + for (v = f->variant ?: &no_variant; v; v = v->next) { + for (t = f->tests; t; t = t->next) { + count++; + __run_test(f, v, t); + if (t->passed) + pass_count++; + else + ret = 1; + } } } printf("[==========] %u / %u tests passed.\n", pass_count, count); -- cgit v1.2.3-59-g8ed1b From 0feba2219b7348dce7d59312f4701a4805768f2d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:51 -0700 Subject: selftests: tls: run all tests for TLS 1.2 and TLS 1.3 TLS 1.2 and TLS 1.3 differ in the implementation. Use fixture parameters to run all tests for both versions, and remove the one-off TLS 1.2 test. Signed-off-by: Jakub Kicinski Reviewed-by: Kees Cook Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 93 +++++++-------------------------------- 1 file changed, 17 insertions(+), 76 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 0ea44d975b6c..c5282e62df75 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -101,6 +101,21 @@ FIXTURE(tls) bool notls; }; +FIXTURE_VARIANT(tls) +{ + unsigned int tls_version; +}; + +FIXTURE_VARIANT_ADD(tls, 12) +{ + .tls_version = TLS_1_2_VERSION, +}; + +FIXTURE_VARIANT_ADD(tls, 13) +{ + .tls_version = TLS_1_3_VERSION, +}; + FIXTURE_SETUP(tls) { struct tls12_crypto_info_aes_gcm_128 tls12; @@ -112,7 +127,7 @@ FIXTURE_SETUP(tls) len = sizeof(addr); memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; addr.sin_family = AF_INET; @@ -733,7 +748,7 @@ TEST_F(tls, bidir) struct tls12_crypto_info_aes_gcm_128 tls12; memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12, @@ -1258,78 +1273,4 @@ TEST(keysizes) { close(cfd); } -TEST(tls12) { - int fd, cfd; - bool notls; - - struct tls12_crypto_info_aes_gcm_128 tls12; - struct sockaddr_in addr; - socklen_t len; - int sfd, ret; - - notls = false; - len = sizeof(addr); - - memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_2_VERSION; - tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; - - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(INADDR_ANY); - addr.sin_port = 0; - - fd = socket(AF_INET, SOCK_STREAM, 0); - sfd = socket(AF_INET, SOCK_STREAM, 0); - - ret = bind(sfd, &addr, sizeof(addr)); - ASSERT_EQ(ret, 0); - ret = listen(sfd, 10); - ASSERT_EQ(ret, 0); - - ret = getsockname(sfd, &addr, &len); - ASSERT_EQ(ret, 0); - - ret = connect(fd, &addr, sizeof(addr)); - ASSERT_EQ(ret, 0); - - ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")); - if (ret != 0) { - notls = true; - printf("Failure setting TCP_ULP, testing without tls\n"); - } - - if (!notls) { - ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - cfd = accept(sfd, &addr, &len); - ASSERT_GE(cfd, 0); - - if (!notls) { - ret = setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", - sizeof("tls")); - ASSERT_EQ(ret, 0); - - ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - close(sfd); - - char const *test_str = "test_read"; - int send_len = 10; - char buf[10]; - - send_len = strlen(test_str) + 1; - EXPECT_EQ(send(fd, test_str, send_len, 0), send_len); - EXPECT_NE(recv(cfd, buf, send_len, 0), -1); - EXPECT_EQ(memcmp(buf, test_str, send_len), 0); - - close(fd); - close(cfd); -} - TEST_HARNESS_MAIN -- cgit v1.2.3-59-g8ed1b From 9d42205036d4be7e45fb4dafe5173cd804fd9d5f Mon Sep 17 00:00:00 2001 From: ChenTao Date: Tue, 28 Apr 2020 09:48:04 +0800 Subject: net: phy: bcm54140: Make a bunch of functions static Fix the following warning: drivers/net/phy/bcm54140.c:663:5: warning: symbol 'bcm54140_did_interrupt' was not declared. Should it be static? drivers/net/phy/bcm54140.c:672:5: warning: symbol 'bcm54140_ack_intr' was not declared. Should it be static? drivers/net/phy/bcm54140.c:684:5: warning: symbol 'bcm54140_config_intr' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: ChenTao Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm54140.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c index 7341f0126cc4..c009ac2856a5 100644 --- a/drivers/net/phy/bcm54140.c +++ b/drivers/net/phy/bcm54140.c @@ -660,7 +660,7 @@ static int bcm54140_config_init(struct phy_device *phydev) BCM54140_RDB_C_PWR_ISOLATE, 0); } -int bcm54140_did_interrupt(struct phy_device *phydev) +static int bcm54140_did_interrupt(struct phy_device *phydev) { int ret; @@ -669,7 +669,7 @@ int bcm54140_did_interrupt(struct phy_device *phydev) return (ret < 0) ? 0 : ret; } -int bcm54140_ack_intr(struct phy_device *phydev) +static int bcm54140_ack_intr(struct phy_device *phydev) { int reg; @@ -681,7 +681,7 @@ int bcm54140_ack_intr(struct phy_device *phydev) return 0; } -int bcm54140_config_intr(struct phy_device *phydev) +static int bcm54140_config_intr(struct phy_device *phydev) { struct bcm54140_priv *priv = phydev->priv; static const u16 port_to_imr_bit[] = { -- cgit v1.2.3-59-g8ed1b From 88fb831f773e6c51ec528eacb524f00f518f7e54 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 28 Apr 2020 10:42:22 -0700 Subject: dpaa2-eth: Use proper division helper in dpaa2_dbg_ch_show When building arm32 allmodconfig: ERROR: modpost: "__aeabi_uldivmod" [drivers/net/ethernet/freescale/dpaa2/fsl-dpaa2-eth.ko] undefined! frames and cdan are both of type __u64 (unsigned long long) so we need to use div64_u64 to avoid this issues. Fixes: 460fd830dd9d ("dpaa2-eth: add channel stat to debugfs") Link: https://github.com/ClangBuiltLinux/linux/issues/1012 Signed-off-by: Nathan Chancellor Reported-by: kernelci.org bot Reviewed-by: Nick Desaulniers Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c index 80291afff3ea..0a31e4268dfb 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c @@ -139,7 +139,7 @@ static int dpaa2_dbg_ch_show(struct seq_file *file, void *offset) ch->stats.dequeue_portal_busy, ch->stats.frames, ch->stats.cdan, - ch->stats.frames / ch->stats.cdan, + div64_u64(ch->stats.frames, ch->stats.cdan), ch->buf_count); } -- cgit v1.2.3-59-g8ed1b From 790ab249b55d75fdb427b92f81964cd7cb525eec Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 28 Apr 2020 19:58:33 +0200 Subject: net: ethernet: fec: Prevent MII event after MII_SPEED write The change to polled IO for MDIO completion assumes that MII events are only generated for MDIO transactions. However on some SoCs writing to the MII_SPEED register can also trigger an MII event. As a result, the next MDIO read has a pending MII event, and immediately reads the data registers before it contains useful data. When the read does complete, another MII event is posted, which results in the next read also going wrong, and the cycle continues. By writing 0 to the MII_DATA register before writing to the speed register, this MII event for the MII_SPEED is suppressed, and polled IO works as expected. Fixes: 29ae6bd1b0d8 ("net: ethernet: fec: Replace interrupt driven MDIO with polled IO") Reported-by: Andy Duan Suggested-by: Andy Duan Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 1ae075a246a3..aa5e744ec098 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -996,6 +996,9 @@ fec_restart(struct net_device *ndev) writel(0x0, fep->hwp + FEC_X_CNTRL); } + /* Prevent an MII event being report when changing speed */ + writel(0, fep->hwp + FEC_MII_DATA); + /* Set MII speed */ writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); @@ -1182,6 +1185,10 @@ fec_stop(struct net_device *ndev) writel(val, fep->hwp + FEC_ECNTRL); fec_enet_stop_mode(fep, true); } + + /* Prevent an MII event being report when changing speed */ + writel(0, fep->hwp + FEC_MII_DATA); + writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); /* We have to keep ENET enabled to have MII interrupt stay working */ @@ -2142,6 +2149,16 @@ static int fec_enet_mii_init(struct platform_device *pdev) if (suppress_preamble) fep->phy_speed |= BIT(7); + /* Clear MMFR to avoid to generate MII event by writing MSCR. + * MII event generation condition: + * - writing MSCR: + * - mmfr[31:0]_not_zero & mscr[7:0]_is_zero & + * mscr_reg_data_in[7:0] != 0 + * - writing MMFR: + * - mscr[7:0]_not_zero + */ + writel(0, fep->hwp + FEC_MII_DATA); + writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); /* Clear any pending transaction complete indication */ -- cgit v1.2.3-59-g8ed1b From da50d57abd7ecaba151600e726ccb944e7ddf81a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:16 +0200 Subject: docs: networking: convert caif files to ReST There are two text files for caif, plus one already converted file. Convert the two remaining ones to ReST, create a new index.rst file for CAIF, adding it to the main networking documentation index. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/caif/Linux-CAIF.txt | 175 -------------------- Documentation/networking/caif/caif.rst | 2 - Documentation/networking/caif/index.rst | 13 ++ Documentation/networking/caif/linux_caif.rst | 195 ++++++++++++++++++++++ Documentation/networking/caif/spi_porting.rst | 229 ++++++++++++++++++++++++++ Documentation/networking/caif/spi_porting.txt | 208 ----------------------- Documentation/networking/index.rst | 1 + drivers/net/caif/Kconfig | 2 +- 8 files changed, 439 insertions(+), 386 deletions(-) delete mode 100644 Documentation/networking/caif/Linux-CAIF.txt create mode 100644 Documentation/networking/caif/index.rst create mode 100644 Documentation/networking/caif/linux_caif.rst create mode 100644 Documentation/networking/caif/spi_porting.rst delete mode 100644 Documentation/networking/caif/spi_porting.txt diff --git a/Documentation/networking/caif/Linux-CAIF.txt b/Documentation/networking/caif/Linux-CAIF.txt deleted file mode 100644 index 0aa4bd381bec..000000000000 --- a/Documentation/networking/caif/Linux-CAIF.txt +++ /dev/null @@ -1,175 +0,0 @@ -Linux CAIF -=========== -copyright (C) ST-Ericsson AB 2010 -Author: Sjur Brendeland/ sjur.brandeland@stericsson.com -License terms: GNU General Public License (GPL) version 2 - - -Introduction ------------- -CAIF is a MUX protocol used by ST-Ericsson cellular modems for -communication between Modem and host. The host processes can open virtual AT -channels, initiate GPRS Data connections, Video channels and Utility Channels. -The Utility Channels are general purpose pipes between modem and host. - -ST-Ericsson modems support a number of transports between modem -and host. Currently, UART and Loopback are available for Linux. - - -Architecture: ------------- -The implementation of CAIF is divided into: -* CAIF Socket Layer and GPRS IP Interface. -* CAIF Core Protocol Implementation -* CAIF Link Layer, implemented as NET devices. - - - RTNL - ! - ! +------+ +------+ - ! +------+! +------+! - ! ! IP !! !Socket!! - +-------> !interf!+ ! API !+ <- CAIF Client APIs - ! +------+ +------! - ! ! ! - ! +-----------+ - ! ! - ! +------+ <- CAIF Core Protocol - ! ! CAIF ! - ! ! Core ! - ! +------+ - ! +----------!---------+ - ! ! ! ! - ! +------+ +-----+ +------+ - +--> ! HSI ! ! TTY ! ! USB ! <- Link Layer (Net Devices) - +------+ +-----+ +------+ - - - -I M P L E M E N T A T I O N -=========================== - - -CAIF Core Protocol Layer -========================================= - -CAIF Core layer implements the CAIF protocol as defined by ST-Ericsson. -It implements the CAIF protocol stack in a layered approach, where -each layer described in the specification is implemented as a separate layer. -The architecture is inspired by the design patterns "Protocol Layer" and -"Protocol Packet". - -== CAIF structure == -The Core CAIF implementation contains: - - Simple implementation of CAIF. - - Layered architecture (a la Streams), each layer in the CAIF - specification is implemented in a separate c-file. - - Clients must call configuration function to add PHY layer. - - Clients must implement CAIF layer to consume/produce - CAIF payload with receive and transmit functions. - - Clients must call configuration function to add and connect the - Client layer. - - When receiving / transmitting CAIF Packets (cfpkt), ownership is passed - to the called function (except for framing layers' receive function) - -Layered Architecture --------------------- -The CAIF protocol can be divided into two parts: Support functions and Protocol -Implementation. The support functions include: - - - CFPKT CAIF Packet. Implementation of CAIF Protocol Packet. The - CAIF Packet has functions for creating, destroying and adding content - and for adding/extracting header and trailers to protocol packets. - -The CAIF Protocol implementation contains: - - - CFCNFG CAIF Configuration layer. Configures the CAIF Protocol - Stack and provides a Client interface for adding Link-Layer and - Driver interfaces on top of the CAIF Stack. - - - CFCTRL CAIF Control layer. Encodes and Decodes control messages - such as enumeration and channel setup. Also matches request and - response messages. - - - CFSERVL General CAIF Service Layer functionality; handles flow - control and remote shutdown requests. - - - CFVEI CAIF VEI layer. Handles CAIF AT Channels on VEI (Virtual - External Interface). This layer encodes/decodes VEI frames. - - - CFDGML CAIF Datagram layer. Handles CAIF Datagram layer (IP - traffic), encodes/decodes Datagram frames. - - - CFMUX CAIF Mux layer. Handles multiplexing between multiple - physical bearers and multiple channels such as VEI, Datagram, etc. - The MUX keeps track of the existing CAIF Channels and - Physical Instances and selects the appropriate instance based - on Channel-Id and Physical-ID. - - - CFFRML CAIF Framing layer. Handles Framing i.e. Frame length - and frame checksum. - - - CFSERL CAIF Serial layer. Handles concatenation/split of frames - into CAIF Frames with correct length. - - - - +---------+ - | Config | - | CFCNFG | - +---------+ - ! - +---------+ +---------+ +---------+ - | AT | | Control | | Datagram| - | CFVEIL | | CFCTRL | | CFDGML | - +---------+ +---------+ +---------+ - \_____________!______________/ - ! - +---------+ - | MUX | - | | - +---------+ - _____!_____ - / \ - +---------+ +---------+ - | CFFRML | | CFFRML | - | Framing | | Framing | - +---------+ +---------+ - ! ! - +---------+ +---------+ - | | | Serial | - | | | CFSERL | - +---------+ +---------+ - - -In this layered approach the following "rules" apply. - - All layers embed the same structure "struct cflayer" - - A layer does not depend on any other layer's private data. - - Layers are stacked by setting the pointers - layer->up , layer->dn - - In order to send data upwards, each layer should do - layer->up->receive(layer->up, packet); - - In order to send data downwards, each layer should do - layer->dn->transmit(layer->dn, packet); - - -CAIF Socket and IP interface -=========================== - -The IP interface and CAIF socket API are implemented on top of the -CAIF Core protocol. The IP Interface and CAIF socket have an instance of -'struct cflayer', just like the CAIF Core protocol stack. -Net device and Socket implement the 'receive()' function defined by -'struct cflayer', just like the rest of the CAIF stack. In this way, transmit and -receive of packets is handled as by the rest of the layers: the 'dn->transmit()' -function is called in order to transmit data. - -Configuration of Link Layer ---------------------------- -The Link Layer is implemented as Linux network devices (struct net_device). -Payload handling and registration is done using standard Linux mechanisms. - -The CAIF Protocol relies on a loss-less link layer without implementing -retransmission. This implies that packet drops must not happen. -Therefore a flow-control mechanism is implemented where the physical -interface can initiate flow stop for all CAIF Channels. diff --git a/Documentation/networking/caif/caif.rst b/Documentation/networking/caif/caif.rst index 07afc8063d4d..a07213030ccf 100644 --- a/Documentation/networking/caif/caif.rst +++ b/Documentation/networking/caif/caif.rst @@ -1,5 +1,3 @@ -:orphan: - .. SPDX-License-Identifier: GPL-2.0 .. include:: diff --git a/Documentation/networking/caif/index.rst b/Documentation/networking/caif/index.rst new file mode 100644 index 000000000000..86e5b7832ec3 --- /dev/null +++ b/Documentation/networking/caif/index.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +CAIF +==== + +Contents: + +.. toctree:: + :maxdepth: 2 + + linux_caif + caif + spi_porting diff --git a/Documentation/networking/caif/linux_caif.rst b/Documentation/networking/caif/linux_caif.rst new file mode 100644 index 000000000000..a0480862ab8c --- /dev/null +++ b/Documentation/networking/caif/linux_caif.rst @@ -0,0 +1,195 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +========== +Linux CAIF +========== + +Copyright |copy| ST-Ericsson AB 2010 + +:Author: Sjur Brendeland/ sjur.brandeland@stericsson.com +:License terms: GNU General Public License (GPL) version 2 + + +Introduction +============ + +CAIF is a MUX protocol used by ST-Ericsson cellular modems for +communication between Modem and host. The host processes can open virtual AT +channels, initiate GPRS Data connections, Video channels and Utility Channels. +The Utility Channels are general purpose pipes between modem and host. + +ST-Ericsson modems support a number of transports between modem +and host. Currently, UART and Loopback are available for Linux. + + +Architecture +============ + +The implementation of CAIF is divided into: + +* CAIF Socket Layer and GPRS IP Interface. +* CAIF Core Protocol Implementation +* CAIF Link Layer, implemented as NET devices. + +:: + + RTNL + ! + ! +------+ +------+ + ! +------+! +------+! + ! ! IP !! !Socket!! + +-------> !interf!+ ! API !+ <- CAIF Client APIs + ! +------+ +------! + ! ! ! + ! +-----------+ + ! ! + ! +------+ <- CAIF Core Protocol + ! ! CAIF ! + ! ! Core ! + ! +------+ + ! +----------!---------+ + ! ! ! ! + ! +------+ +-----+ +------+ + +--> ! HSI ! ! TTY ! ! USB ! <- Link Layer (Net Devices) + +------+ +-----+ +------+ + + + +Implementation +============== + + +CAIF Core Protocol Layer +------------------------ + +CAIF Core layer implements the CAIF protocol as defined by ST-Ericsson. +It implements the CAIF protocol stack in a layered approach, where +each layer described in the specification is implemented as a separate layer. +The architecture is inspired by the design patterns "Protocol Layer" and +"Protocol Packet". + +CAIF structure +^^^^^^^^^^^^^^ + +The Core CAIF implementation contains: + + - Simple implementation of CAIF. + - Layered architecture (a la Streams), each layer in the CAIF + specification is implemented in a separate c-file. + - Clients must call configuration function to add PHY layer. + - Clients must implement CAIF layer to consume/produce + CAIF payload with receive and transmit functions. + - Clients must call configuration function to add and connect the + Client layer. + - When receiving / transmitting CAIF Packets (cfpkt), ownership is passed + to the called function (except for framing layers' receive function) + +Layered Architecture +==================== + +The CAIF protocol can be divided into two parts: Support functions and Protocol +Implementation. The support functions include: + + - CFPKT CAIF Packet. Implementation of CAIF Protocol Packet. The + CAIF Packet has functions for creating, destroying and adding content + and for adding/extracting header and trailers to protocol packets. + +The CAIF Protocol implementation contains: + + - CFCNFG CAIF Configuration layer. Configures the CAIF Protocol + Stack and provides a Client interface for adding Link-Layer and + Driver interfaces on top of the CAIF Stack. + + - CFCTRL CAIF Control layer. Encodes and Decodes control messages + such as enumeration and channel setup. Also matches request and + response messages. + + - CFSERVL General CAIF Service Layer functionality; handles flow + control and remote shutdown requests. + + - CFVEI CAIF VEI layer. Handles CAIF AT Channels on VEI (Virtual + External Interface). This layer encodes/decodes VEI frames. + + - CFDGML CAIF Datagram layer. Handles CAIF Datagram layer (IP + traffic), encodes/decodes Datagram frames. + + - CFMUX CAIF Mux layer. Handles multiplexing between multiple + physical bearers and multiple channels such as VEI, Datagram, etc. + The MUX keeps track of the existing CAIF Channels and + Physical Instances and selects the appropriate instance based + on Channel-Id and Physical-ID. + + - CFFRML CAIF Framing layer. Handles Framing i.e. Frame length + and frame checksum. + + - CFSERL CAIF Serial layer. Handles concatenation/split of frames + into CAIF Frames with correct length. + +:: + + +---------+ + | Config | + | CFCNFG | + +---------+ + ! + +---------+ +---------+ +---------+ + | AT | | Control | | Datagram| + | CFVEIL | | CFCTRL | | CFDGML | + +---------+ +---------+ +---------+ + \_____________!______________/ + ! + +---------+ + | MUX | + | | + +---------+ + _____!_____ + / \ + +---------+ +---------+ + | CFFRML | | CFFRML | + | Framing | | Framing | + +---------+ +---------+ + ! ! + +---------+ +---------+ + | | | Serial | + | | | CFSERL | + +---------+ +---------+ + + +In this layered approach the following "rules" apply. + + - All layers embed the same structure "struct cflayer" + - A layer does not depend on any other layer's private data. + - Layers are stacked by setting the pointers:: + + layer->up , layer->dn + + - In order to send data upwards, each layer should do:: + + layer->up->receive(layer->up, packet); + + - In order to send data downwards, each layer should do:: + + layer->dn->transmit(layer->dn, packet); + + +CAIF Socket and IP interface +============================ + +The IP interface and CAIF socket API are implemented on top of the +CAIF Core protocol. The IP Interface and CAIF socket have an instance of +'struct cflayer', just like the CAIF Core protocol stack. +Net device and Socket implement the 'receive()' function defined by +'struct cflayer', just like the rest of the CAIF stack. In this way, transmit and +receive of packets is handled as by the rest of the layers: the 'dn->transmit()' +function is called in order to transmit data. + +Configuration of Link Layer +--------------------------- +The Link Layer is implemented as Linux network devices (struct net_device). +Payload handling and registration is done using standard Linux mechanisms. + +The CAIF Protocol relies on a loss-less link layer without implementing +retransmission. This implies that packet drops must not happen. +Therefore a flow-control mechanism is implemented where the physical +interface can initiate flow stop for all CAIF Channels. diff --git a/Documentation/networking/caif/spi_porting.rst b/Documentation/networking/caif/spi_porting.rst new file mode 100644 index 000000000000..d49f874b20ac --- /dev/null +++ b/Documentation/networking/caif/spi_porting.rst @@ -0,0 +1,229 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================ +CAIF SPI porting +================ + +CAIF SPI basics +=============== + +Running CAIF over SPI needs some extra setup, owing to the nature of SPI. +Two extra GPIOs have been added in order to negotiate the transfers +between the master and the slave. The minimum requirement for running +CAIF over SPI is a SPI slave chip and two GPIOs (more details below). +Please note that running as a slave implies that you need to keep up +with the master clock. An overrun or underrun event is fatal. + +CAIF SPI framework +================== + +To make porting as easy as possible, the CAIF SPI has been divided in +two parts. The first part (called the interface part) deals with all +generic functionality such as length framing, SPI frame negotiation +and SPI frame delivery and transmission. The other part is the CAIF +SPI slave device part, which is the module that you have to write if +you want to run SPI CAIF on a new hardware. This part takes care of +the physical hardware, both with regard to SPI and to GPIOs. + +- Implementing a CAIF SPI device: + + - Functionality provided by the CAIF SPI slave device: + + In order to implement a SPI device you will, as a minimum, + need to implement the following + functions: + + :: + + int (*init_xfer) (struct cfspi_xfer * xfer, struct cfspi_dev *dev): + + This function is called by the CAIF SPI interface to give + you a chance to set up your hardware to be ready to receive + a stream of data from the master. The xfer structure contains + both physical and logical addresses, as well as the total length + of the transfer in both directions.The dev parameter can be used + to map to different CAIF SPI slave devices. + + :: + + void (*sig_xfer) (bool xfer, struct cfspi_dev *dev): + + This function is called by the CAIF SPI interface when the output + (SPI_INT) GPIO needs to change state. The boolean value of the xfer + variable indicates whether the GPIO should be asserted (HIGH) or + deasserted (LOW). The dev parameter can be used to map to different CAIF + SPI slave devices. + + - Functionality provided by the CAIF SPI interface: + + :: + + void (*ss_cb) (bool assert, struct cfspi_ifc *ifc); + + This function is called by the CAIF SPI slave device in order to + signal a change of state of the input GPIO (SS) to the interface. + Only active edges are mandatory to be reported. + This function can be called from IRQ context (recommended in order + not to introduce latency). The ifc parameter should be the pointer + returned from the platform probe function in the SPI device structure. + + :: + + void (*xfer_done_cb) (struct cfspi_ifc *ifc); + + This function is called by the CAIF SPI slave device in order to + report that a transfer is completed. This function should only be + called once both the transmission and the reception are completed. + This function can be called from IRQ context (recommended in order + not to introduce latency). The ifc parameter should be the pointer + returned from the platform probe function in the SPI device structure. + + - Connecting the bits and pieces: + + - Filling in the SPI slave device structure: + + Connect the necessary callback functions. + + Indicate clock speed (used to calculate toggle delays). + + Chose a suitable name (helps debugging if you use several CAIF + SPI slave devices). + + Assign your private data (can be used to map to your + structure). + + - Filling in the SPI slave platform device structure: + + Add name of driver to connect to ("cfspi_sspi"). + + Assign the SPI slave device structure as platform data. + +Padding +======= + +In order to optimize throughput, a number of SPI padding options are provided. +Padding can be enabled independently for uplink and downlink transfers. +Padding can be enabled for the head, the tail and for the total frame size. +The padding needs to be correctly configured on both sides of the link. +The padding can be changed via module parameters in cfspi_sspi.c or via +the sysfs directory of the cfspi_sspi driver (before device registration). + +- CAIF SPI device template:: + + /* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Daniel Martensson / Daniel.Martensson@stericsson.com + * License terms: GNU General Public License (GPL), version 2. + * + */ + + #include + #include + #include + #include + #include + #include + #include + + MODULE_LICENSE("GPL"); + + struct sspi_struct { + struct cfspi_dev sdev; + struct cfspi_xfer *xfer; + }; + + static struct sspi_struct slave; + static struct platform_device slave_device; + + static irqreturn_t sspi_irq(int irq, void *arg) + { + /* You only need to trigger on an edge to the active state of the + * SS signal. Once a edge is detected, the ss_cb() function should be + * called with the parameter assert set to true. It is OK + * (and even advised) to call the ss_cb() function in IRQ context in + * order not to add any delay. */ + + return IRQ_HANDLED; + } + + static void sspi_complete(void *context) + { + /* Normally the DMA or the SPI framework will call you back + * in something similar to this. The only thing you need to + * do is to call the xfer_done_cb() function, providing the pointer + * to the CAIF SPI interface. It is OK to call this function + * from IRQ context. */ + } + + static int sspi_init_xfer(struct cfspi_xfer *xfer, struct cfspi_dev *dev) + { + /* Store transfer info. For a normal implementation you should + * set up your DMA here and make sure that you are ready to + * receive the data from the master SPI. */ + + struct sspi_struct *sspi = (struct sspi_struct *)dev->priv; + + sspi->xfer = xfer; + + return 0; + } + + void sspi_sig_xfer(bool xfer, struct cfspi_dev *dev) + { + /* If xfer is true then you should assert the SPI_INT to indicate to + * the master that you are ready to receive the data from the master + * SPI. If xfer is false then you should de-assert SPI_INT to indicate + * that the transfer is done. + */ + + struct sspi_struct *sspi = (struct sspi_struct *)dev->priv; + } + + static void sspi_release(struct device *dev) + { + /* + * Here you should release your SPI device resources. + */ + } + + static int __init sspi_init(void) + { + /* Here you should initialize your SPI device by providing the + * necessary functions, clock speed, name and private data. Once + * done, you can register your device with the + * platform_device_register() function. This function will return + * with the CAIF SPI interface initialized. This is probably also + * the place where you should set up your GPIOs, interrupts and SPI + * resources. */ + + int res = 0; + + /* Initialize slave device. */ + slave.sdev.init_xfer = sspi_init_xfer; + slave.sdev.sig_xfer = sspi_sig_xfer; + slave.sdev.clk_mhz = 13; + slave.sdev.priv = &slave; + slave.sdev.name = "spi_sspi"; + slave_device.dev.release = sspi_release; + + /* Initialize platform device. */ + slave_device.name = "cfspi_sspi"; + slave_device.dev.platform_data = &slave.sdev; + + /* Register platform device. */ + res = platform_device_register(&slave_device); + if (res) { + printk(KERN_WARNING "sspi_init: failed to register dev.\n"); + return -ENODEV; + } + + return res; + } + + static void __exit sspi_exit(void) + { + platform_device_del(&slave_device); + } + + module_init(sspi_init); + module_exit(sspi_exit); diff --git a/Documentation/networking/caif/spi_porting.txt b/Documentation/networking/caif/spi_porting.txt deleted file mode 100644 index 9efd0687dc4c..000000000000 --- a/Documentation/networking/caif/spi_porting.txt +++ /dev/null @@ -1,208 +0,0 @@ -- CAIF SPI porting - - -- CAIF SPI basics: - -Running CAIF over SPI needs some extra setup, owing to the nature of SPI. -Two extra GPIOs have been added in order to negotiate the transfers - between the master and the slave. The minimum requirement for running -CAIF over SPI is a SPI slave chip and two GPIOs (more details below). -Please note that running as a slave implies that you need to keep up -with the master clock. An overrun or underrun event is fatal. - -- CAIF SPI framework: - -To make porting as easy as possible, the CAIF SPI has been divided in -two parts. The first part (called the interface part) deals with all -generic functionality such as length framing, SPI frame negotiation -and SPI frame delivery and transmission. The other part is the CAIF -SPI slave device part, which is the module that you have to write if -you want to run SPI CAIF on a new hardware. This part takes care of -the physical hardware, both with regard to SPI and to GPIOs. - -- Implementing a CAIF SPI device: - - - Functionality provided by the CAIF SPI slave device: - - In order to implement a SPI device you will, as a minimum, - need to implement the following - functions: - - int (*init_xfer) (struct cfspi_xfer * xfer, struct cfspi_dev *dev): - - This function is called by the CAIF SPI interface to give - you a chance to set up your hardware to be ready to receive - a stream of data from the master. The xfer structure contains - both physical and logical addresses, as well as the total length - of the transfer in both directions.The dev parameter can be used - to map to different CAIF SPI slave devices. - - void (*sig_xfer) (bool xfer, struct cfspi_dev *dev): - - This function is called by the CAIF SPI interface when the output - (SPI_INT) GPIO needs to change state. The boolean value of the xfer - variable indicates whether the GPIO should be asserted (HIGH) or - deasserted (LOW). The dev parameter can be used to map to different CAIF - SPI slave devices. - - - Functionality provided by the CAIF SPI interface: - - void (*ss_cb) (bool assert, struct cfspi_ifc *ifc); - - This function is called by the CAIF SPI slave device in order to - signal a change of state of the input GPIO (SS) to the interface. - Only active edges are mandatory to be reported. - This function can be called from IRQ context (recommended in order - not to introduce latency). The ifc parameter should be the pointer - returned from the platform probe function in the SPI device structure. - - void (*xfer_done_cb) (struct cfspi_ifc *ifc); - - This function is called by the CAIF SPI slave device in order to - report that a transfer is completed. This function should only be - called once both the transmission and the reception are completed. - This function can be called from IRQ context (recommended in order - not to introduce latency). The ifc parameter should be the pointer - returned from the platform probe function in the SPI device structure. - - - Connecting the bits and pieces: - - - Filling in the SPI slave device structure: - - Connect the necessary callback functions. - Indicate clock speed (used to calculate toggle delays). - Chose a suitable name (helps debugging if you use several CAIF - SPI slave devices). - Assign your private data (can be used to map to your structure). - - - Filling in the SPI slave platform device structure: - Add name of driver to connect to ("cfspi_sspi"). - Assign the SPI slave device structure as platform data. - -- Padding: - -In order to optimize throughput, a number of SPI padding options are provided. -Padding can be enabled independently for uplink and downlink transfers. -Padding can be enabled for the head, the tail and for the total frame size. -The padding needs to be correctly configured on both sides of the link. -The padding can be changed via module parameters in cfspi_sspi.c or via -the sysfs directory of the cfspi_sspi driver (before device registration). - -- CAIF SPI device template: - -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Daniel Martensson / Daniel.Martensson@stericsson.com - * License terms: GNU General Public License (GPL), version 2. - * - */ - -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); - -struct sspi_struct { - struct cfspi_dev sdev; - struct cfspi_xfer *xfer; -}; - -static struct sspi_struct slave; -static struct platform_device slave_device; - -static irqreturn_t sspi_irq(int irq, void *arg) -{ - /* You only need to trigger on an edge to the active state of the - * SS signal. Once a edge is detected, the ss_cb() function should be - * called with the parameter assert set to true. It is OK - * (and even advised) to call the ss_cb() function in IRQ context in - * order not to add any delay. */ - - return IRQ_HANDLED; -} - -static void sspi_complete(void *context) -{ - /* Normally the DMA or the SPI framework will call you back - * in something similar to this. The only thing you need to - * do is to call the xfer_done_cb() function, providing the pointer - * to the CAIF SPI interface. It is OK to call this function - * from IRQ context. */ -} - -static int sspi_init_xfer(struct cfspi_xfer *xfer, struct cfspi_dev *dev) -{ - /* Store transfer info. For a normal implementation you should - * set up your DMA here and make sure that you are ready to - * receive the data from the master SPI. */ - - struct sspi_struct *sspi = (struct sspi_struct *)dev->priv; - - sspi->xfer = xfer; - - return 0; -} - -void sspi_sig_xfer(bool xfer, struct cfspi_dev *dev) -{ - /* If xfer is true then you should assert the SPI_INT to indicate to - * the master that you are ready to receive the data from the master - * SPI. If xfer is false then you should de-assert SPI_INT to indicate - * that the transfer is done. - */ - - struct sspi_struct *sspi = (struct sspi_struct *)dev->priv; -} - -static void sspi_release(struct device *dev) -{ - /* - * Here you should release your SPI device resources. - */ -} - -static int __init sspi_init(void) -{ - /* Here you should initialize your SPI device by providing the - * necessary functions, clock speed, name and private data. Once - * done, you can register your device with the - * platform_device_register() function. This function will return - * with the CAIF SPI interface initialized. This is probably also - * the place where you should set up your GPIOs, interrupts and SPI - * resources. */ - - int res = 0; - - /* Initialize slave device. */ - slave.sdev.init_xfer = sspi_init_xfer; - slave.sdev.sig_xfer = sspi_sig_xfer; - slave.sdev.clk_mhz = 13; - slave.sdev.priv = &slave; - slave.sdev.name = "spi_sspi"; - slave_device.dev.release = sspi_release; - - /* Initialize platform device. */ - slave_device.name = "cfspi_sspi"; - slave_device.dev.platform_data = &slave.sdev; - - /* Register platform device. */ - res = platform_device_register(&slave_device); - if (res) { - printk(KERN_WARNING "sspi_init: failed to register dev.\n"); - return -ENODEV; - } - - return res; -} - -static void __exit sspi_exit(void) -{ - platform_device_del(&slave_device); -} - -module_init(sspi_init); -module_exit(sspi_exit); diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 6538ede29661..5b3421ec25ec 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -15,6 +15,7 @@ Contents: device_drivers/index dsa/index devlink/index + caif/index ethtool-netlink ieee802154 j1939 diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig index 661c25eb1c46..1538ad194cf4 100644 --- a/drivers/net/caif/Kconfig +++ b/drivers/net/caif/Kconfig @@ -28,7 +28,7 @@ config CAIF_SPI_SLAVE The CAIF Link layer SPI Protocol driver for Slave SPI interface. This driver implements a platform driver to accommodate for a platform specific SPI device. A sample CAIF SPI Platform device is - provided in . + provided in . config CAIF_SPI_SYNC bool "Next command and length in start of frame" -- cgit v1.2.3-59-g8ed1b From a434aaba17f56c0a25edff4104dd5f9d5b3ceba2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:17 +0200 Subject: docs: networking: convert 6pack.txt to ReST - add SPDX header; - use title markups; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/6pack.rst | 191 +++++++++++++++++++++++++++++++++++++ Documentation/networking/6pack.txt | 175 --------------------------------- Documentation/networking/index.rst | 1 + drivers/net/hamradio/Kconfig | 2 +- 4 files changed, 193 insertions(+), 176 deletions(-) create mode 100644 Documentation/networking/6pack.rst delete mode 100644 Documentation/networking/6pack.txt diff --git a/Documentation/networking/6pack.rst b/Documentation/networking/6pack.rst new file mode 100644 index 000000000000..bc5bf1f1a98f --- /dev/null +++ b/Documentation/networking/6pack.rst @@ -0,0 +1,191 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +6pack Protocol +============== + +This is the 6pack-mini-HOWTO, written by + +Andreas Könsgen DG3KQ + +:Internet: ajk@comnets.uni-bremen.de +:AMPR-net: dg3kq@db0pra.ampr.org +:AX.25: dg3kq@db0ach.#nrw.deu.eu + +Last update: April 7, 1998 + +1. What is 6pack, and what are the advantages to KISS? +====================================================== + +6pack is a transmission protocol for data exchange between the PC and +the TNC over a serial line. It can be used as an alternative to KISS. + +6pack has two major advantages: + +- The PC is given full control over the radio + channel. Special control data is exchanged between the PC and the TNC so + that the PC knows at any time if the TNC is receiving data, if a TNC + buffer underrun or overrun has occurred, if the PTT is + set and so on. This control data is processed at a higher priority than + normal data, so a data stream can be interrupted at any time to issue an + important event. This helps to improve the channel access and timing + algorithms as everything is computed in the PC. It would even be possible + to experiment with something completely different from the known CSMA and + DAMA channel access methods. + This kind of real-time control is especially important to supply several + TNCs that are connected between each other and the PC by a daisy chain + (however, this feature is not supported yet by the Linux 6pack driver). + +- Each packet transferred over the serial line is supplied with a checksum, + so it is easy to detect errors due to problems on the serial line. + Received packets that are corrupt are not passed on to the AX.25 layer. + Damaged packets that the TNC has received from the PC are not transmitted. + +More details about 6pack are described in the file 6pack.ps that is located +in the doc directory of the AX.25 utilities package. + +2. Who has developed the 6pack protocol? +======================================== + +The 6pack protocol has been developed by Ekki Plicht DF4OR, Henning Rech +DF9IC and Gunter Jost DK7WJ. A driver for 6pack, written by Gunter Jost and +Matthias Welwarsky DG2FEF, comes along with the PC version of FlexNet. +They have also written a firmware for TNCs to perform the 6pack +protocol (see section 4 below). + +3. Where can I get the latest version of 6pack for LinuX? +========================================================= + +At the moment, the 6pack stuff can obtained via anonymous ftp from +db0bm.automation.fh-aachen.de. In the directory /incoming/dg3kq, +there is a file named 6pack.tgz. + +4. Preparing the TNC for 6pack operation +======================================== + +To be able to use 6pack, a special firmware for the TNC is needed. The EPROM +of a newly bought TNC does not contain 6pack, so you will have to +program an EPROM yourself. The image file for 6pack EPROMs should be +available on any packet radio box where PC/FlexNet can be found. The name of +the file is 6pack.bin. This file is copyrighted and maintained by the FlexNet +team. It can be used under the terms of the license that comes along +with PC/FlexNet. Please do not ask me about the internals of this file as I +don't know anything about it. I used a textual description of the 6pack +protocol to program the Linux driver. + +TNCs contain a 64kByte EPROM, the lower half of which is used for +the firmware/KISS. The upper half is either empty or is sometimes +programmed with software called TAPR. In the latter case, the TNC +is supplied with a DIP switch so you can easily change between the +two systems. When programming a new EPROM, one of the systems is replaced +by 6pack. It is useful to replace TAPR, as this software is rarely used +nowadays. If your TNC is not equipped with the switch mentioned above, you +can build in one yourself that switches over the highest address pin +of the EPROM between HIGH and LOW level. After having inserted the new EPROM +and switched to 6pack, apply power to the TNC for a first test. The connect +and the status LED are lit for about a second if the firmware initialises +the TNC correctly. + +5. Building and installing the 6pack driver +=========================================== + +The driver has been tested with kernel version 2.1.90. Use with older +kernels may lead to a compilation error because the interface to a kernel +function has been changed in the 2.1.8x kernels. + +How to turn on 6pack support: +============================= + +- In the linux kernel configuration program, select the code maturity level + options menu and turn on the prompting for development drivers. + +- Select the amateur radio support menu and turn on the serial port 6pack + driver. + +- Compile and install the kernel and the modules. + +To use the driver, the kissattach program delivered with the AX.25 utilities +has to be modified. + +- Do a cd to the directory that holds the kissattach sources. Edit the + kissattach.c file. At the top, insert the following lines:: + + #ifndef N_6PACK + #define N_6PACK (N_AX25+1) + #endif + + Then find the line: + + int disc = N_AX25; + + and replace N_AX25 by N_6PACK. + +- Recompile kissattach. Rename it to spattach to avoid confusions. + +Installing the driver: +---------------------- + +- Do an insmod 6pack. Look at your /var/log/messages file to check if the + module has printed its initialization message. + +- Do a spattach as you would launch kissattach when starting a KISS port. + Check if the kernel prints the message '6pack: TNC found'. + +- From here, everything should work as if you were setting up a KISS port. + The only difference is that the network device that represents + the 6pack port is called sp instead of sl or ax. So, sp0 would be the + first 6pack port. + +Although the driver has been tested on various platforms, I still declare it +ALPHA. BE CAREFUL! Sync your disks before insmoding the 6pack module +and spattaching. Watch out if your computer behaves strangely. Read section +6 of this file about known problems. + +Note that the connect and status LEDs of the TNC are controlled in a +different way than they are when the TNC is used with PC/FlexNet. When using +FlexNet, the connect LED is on if there is a connection; the status LED is +on if there is data in the buffer of the PC's AX.25 engine that has to be +transmitted. Under Linux, the 6pack layer is beyond the AX.25 layer, +so the 6pack driver doesn't know anything about connects or data that +has not yet been transmitted. Therefore the LEDs are controlled +as they are in KISS mode: The connect LED is turned on if data is transferred +from the PC to the TNC over the serial line, the status LED if data is +sent to the PC. + +6. Known problems +================= + +When testing the driver with 2.0.3x kernels and +operating with data rates on the radio channel of 9600 Baud or higher, +the driver may, on certain systems, sometimes print the message '6pack: +bad checksum', which is due to data loss if the other station sends two +or more subsequent packets. I have been told that this is due to a problem +with the serial driver of 2.0.3x kernels. I don't know yet if the problem +still exists with 2.1.x kernels, as I have heard that the serial driver +code has been changed with 2.1.x. + +When shutting down the sp interface with ifconfig, the kernel crashes if +there is still an AX.25 connection left over which an IP connection was +running, even if that IP connection is already closed. The problem does not +occur when there is a bare AX.25 connection still running. I don't know if +this is a problem of the 6pack driver or something else in the kernel. + +The driver has been tested as a module, not yet as a kernel-builtin driver. + +The 6pack protocol supports daisy-chaining of TNCs in a token ring, which is +connected to one serial port of the PC. This feature is not implemented +and at least at the moment I won't be able to do it because I do not have +the opportunity to build a TNC daisy-chain and test it. + +Some of the comments in the source code are inaccurate. They are left from +the SLIP/KISS driver, from which the 6pack driver has been derived. +I haven't modified or removed them yet -- sorry! The code itself needs +some cleaning and optimizing. This will be done in a later release. + +If you encounter a bug or if you have a question or suggestion concerning the +driver, feel free to mail me, using the addresses given at the beginning of +this file. + +Have fun! + +Andreas diff --git a/Documentation/networking/6pack.txt b/Documentation/networking/6pack.txt deleted file mode 100644 index 8f339428fdf4..000000000000 --- a/Documentation/networking/6pack.txt +++ /dev/null @@ -1,175 +0,0 @@ -This is the 6pack-mini-HOWTO, written by - -Andreas Könsgen DG3KQ -Internet: ajk@comnets.uni-bremen.de -AMPR-net: dg3kq@db0pra.ampr.org -AX.25: dg3kq@db0ach.#nrw.deu.eu - -Last update: April 7, 1998 - -1. What is 6pack, and what are the advantages to KISS? - -6pack is a transmission protocol for data exchange between the PC and -the TNC over a serial line. It can be used as an alternative to KISS. - -6pack has two major advantages: -- The PC is given full control over the radio - channel. Special control data is exchanged between the PC and the TNC so - that the PC knows at any time if the TNC is receiving data, if a TNC - buffer underrun or overrun has occurred, if the PTT is - set and so on. This control data is processed at a higher priority than - normal data, so a data stream can be interrupted at any time to issue an - important event. This helps to improve the channel access and timing - algorithms as everything is computed in the PC. It would even be possible - to experiment with something completely different from the known CSMA and - DAMA channel access methods. - This kind of real-time control is especially important to supply several - TNCs that are connected between each other and the PC by a daisy chain - (however, this feature is not supported yet by the Linux 6pack driver). - -- Each packet transferred over the serial line is supplied with a checksum, - so it is easy to detect errors due to problems on the serial line. - Received packets that are corrupt are not passed on to the AX.25 layer. - Damaged packets that the TNC has received from the PC are not transmitted. - -More details about 6pack are described in the file 6pack.ps that is located -in the doc directory of the AX.25 utilities package. - -2. Who has developed the 6pack protocol? - -The 6pack protocol has been developed by Ekki Plicht DF4OR, Henning Rech -DF9IC and Gunter Jost DK7WJ. A driver for 6pack, written by Gunter Jost and -Matthias Welwarsky DG2FEF, comes along with the PC version of FlexNet. -They have also written a firmware for TNCs to perform the 6pack -protocol (see section 4 below). - -3. Where can I get the latest version of 6pack for LinuX? - -At the moment, the 6pack stuff can obtained via anonymous ftp from -db0bm.automation.fh-aachen.de. In the directory /incoming/dg3kq, -there is a file named 6pack.tgz. - -4. Preparing the TNC for 6pack operation - -To be able to use 6pack, a special firmware for the TNC is needed. The EPROM -of a newly bought TNC does not contain 6pack, so you will have to -program an EPROM yourself. The image file for 6pack EPROMs should be -available on any packet radio box where PC/FlexNet can be found. The name of -the file is 6pack.bin. This file is copyrighted and maintained by the FlexNet -team. It can be used under the terms of the license that comes along -with PC/FlexNet. Please do not ask me about the internals of this file as I -don't know anything about it. I used a textual description of the 6pack -protocol to program the Linux driver. - -TNCs contain a 64kByte EPROM, the lower half of which is used for -the firmware/KISS. The upper half is either empty or is sometimes -programmed with software called TAPR. In the latter case, the TNC -is supplied with a DIP switch so you can easily change between the -two systems. When programming a new EPROM, one of the systems is replaced -by 6pack. It is useful to replace TAPR, as this software is rarely used -nowadays. If your TNC is not equipped with the switch mentioned above, you -can build in one yourself that switches over the highest address pin -of the EPROM between HIGH and LOW level. After having inserted the new EPROM -and switched to 6pack, apply power to the TNC for a first test. The connect -and the status LED are lit for about a second if the firmware initialises -the TNC correctly. - -5. Building and installing the 6pack driver - -The driver has been tested with kernel version 2.1.90. Use with older -kernels may lead to a compilation error because the interface to a kernel -function has been changed in the 2.1.8x kernels. - -How to turn on 6pack support: - -- In the linux kernel configuration program, select the code maturity level - options menu and turn on the prompting for development drivers. - -- Select the amateur radio support menu and turn on the serial port 6pack - driver. - -- Compile and install the kernel and the modules. - -To use the driver, the kissattach program delivered with the AX.25 utilities -has to be modified. - -- Do a cd to the directory that holds the kissattach sources. Edit the - kissattach.c file. At the top, insert the following lines: - - #ifndef N_6PACK - #define N_6PACK (N_AX25+1) - #endif - - Then find the line - - int disc = N_AX25; - - and replace N_AX25 by N_6PACK. - -- Recompile kissattach. Rename it to spattach to avoid confusions. - -Installing the driver: - -- Do an insmod 6pack. Look at your /var/log/messages file to check if the - module has printed its initialization message. - -- Do a spattach as you would launch kissattach when starting a KISS port. - Check if the kernel prints the message '6pack: TNC found'. - -- From here, everything should work as if you were setting up a KISS port. - The only difference is that the network device that represents - the 6pack port is called sp instead of sl or ax. So, sp0 would be the - first 6pack port. - -Although the driver has been tested on various platforms, I still declare it -ALPHA. BE CAREFUL! Sync your disks before insmoding the 6pack module -and spattaching. Watch out if your computer behaves strangely. Read section -6 of this file about known problems. - -Note that the connect and status LEDs of the TNC are controlled in a -different way than they are when the TNC is used with PC/FlexNet. When using -FlexNet, the connect LED is on if there is a connection; the status LED is -on if there is data in the buffer of the PC's AX.25 engine that has to be -transmitted. Under Linux, the 6pack layer is beyond the AX.25 layer, -so the 6pack driver doesn't know anything about connects or data that -has not yet been transmitted. Therefore the LEDs are controlled -as they are in KISS mode: The connect LED is turned on if data is transferred -from the PC to the TNC over the serial line, the status LED if data is -sent to the PC. - -6. Known problems - -When testing the driver with 2.0.3x kernels and -operating with data rates on the radio channel of 9600 Baud or higher, -the driver may, on certain systems, sometimes print the message '6pack: -bad checksum', which is due to data loss if the other station sends two -or more subsequent packets. I have been told that this is due to a problem -with the serial driver of 2.0.3x kernels. I don't know yet if the problem -still exists with 2.1.x kernels, as I have heard that the serial driver -code has been changed with 2.1.x. - -When shutting down the sp interface with ifconfig, the kernel crashes if -there is still an AX.25 connection left over which an IP connection was -running, even if that IP connection is already closed. The problem does not -occur when there is a bare AX.25 connection still running. I don't know if -this is a problem of the 6pack driver or something else in the kernel. - -The driver has been tested as a module, not yet as a kernel-builtin driver. - -The 6pack protocol supports daisy-chaining of TNCs in a token ring, which is -connected to one serial port of the PC. This feature is not implemented -and at least at the moment I won't be able to do it because I do not have -the opportunity to build a TNC daisy-chain and test it. - -Some of the comments in the source code are inaccurate. They are left from -the SLIP/KISS driver, from which the 6pack driver has been derived. -I haven't modified or removed them yet -- sorry! The code itself needs -some cleaning and optimizing. This will be done in a later release. - -If you encounter a bug or if you have a question or suggestion concerning the -driver, feel free to mail me, using the addresses given at the beginning of -this file. - -Have fun! - -Andreas diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5b3421ec25ec..dc37fc8d5bee 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -37,6 +37,7 @@ Contents: tls-offload nfc 6lowpan + 6pack .. only:: subproject and html diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig index 8e05b5c31a77..bf306fed04cc 100644 --- a/drivers/net/hamradio/Kconfig +++ b/drivers/net/hamradio/Kconfig @@ -30,7 +30,7 @@ config 6PACK Note that this driver is still experimental and might cause problems. For details about the features and the usage of the - driver, read . + driver, read . To compile this driver as a module, choose M here: the module will be called 6pack. -- cgit v1.2.3-59-g8ed1b From 5a7f3132121bbcafd61f616170a08e511d675347 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:18 +0200 Subject: docs: networking: convert altera_tse.txt to ReST - add SPDX header; - use copyright symbol; - adjust titles and chapters, adding proper markups; - mark lists as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/altera_tse.rst | 286 ++++++++++++++++++++++++++++++++ Documentation/networking/altera_tse.txt | 263 ----------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 287 insertions(+), 263 deletions(-) create mode 100644 Documentation/networking/altera_tse.rst delete mode 100644 Documentation/networking/altera_tse.txt diff --git a/Documentation/networking/altera_tse.rst b/Documentation/networking/altera_tse.rst new file mode 100644 index 000000000000..7a7040072e58 --- /dev/null +++ b/Documentation/networking/altera_tse.rst @@ -0,0 +1,286 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: + +======================================= +Altera Triple-Speed Ethernet MAC driver +======================================= + +Copyright |copy| 2008-2014 Altera Corporation + +This is the driver for the Altera Triple-Speed Ethernet (TSE) controllers +using the SGDMA and MSGDMA soft DMA IP components. The driver uses the +platform bus to obtain component resources. The designs used to test this +driver were built for a Cyclone(R) V SOC FPGA board, a Cyclone(R) V FPGA board, +and tested with ARM and NIOS processor hosts separately. The anticipated use +cases are simple communications between an embedded system and an external peer +for status and simple configuration of the embedded system. + +For more information visit www.altera.com and www.rocketboards.org. Support +forums for the driver may be found on www.rocketboards.org, and a design used +to test this driver may be found there as well. Support is also available from +the maintainer of this driver, found in MAINTAINERS. + +The Triple-Speed Ethernet, SGDMA, and MSGDMA components are all soft IP +components that can be assembled and built into an FPGA using the Altera +Quartus toolchain. Quartus 13.1 and 14.0 were used to build the design that +this driver was tested against. The sopc2dts tool is used to create the +device tree for the driver, and may be found at rocketboards.org. + +The driver probe function examines the device tree and determines if the +Triple-Speed Ethernet instance is using an SGDMA or MSGDMA component. The +probe function then installs the appropriate set of DMA routines to +initialize, setup transmits, receives, and interrupt handling primitives for +the respective configurations. + +The SGDMA component is to be deprecated in the near future (over the next 1-2 +years as of this writing in early 2014) in favor of the MSGDMA component. +SGDMA support is included for existing designs and reference in case a +developer wishes to support their own soft DMA logic and driver support. Any +new designs should not use the SGDMA. + +The SGDMA supports only a single transmit or receive operation at a time, and +therefore will not perform as well compared to the MSGDMA soft IP. Please +visit www.altera.com for known, documented SGDMA errata. + +Scatter-gather DMA is not supported by the SGDMA or MSGDMA at this time. +Scatter-gather DMA will be added to a future maintenance update to this +driver. + +Jumbo frames are not supported at this time. + +The driver limits PHY operations to 10/100Mbps, and has not yet been fully +tested for 1Gbps. This support will be added in a future maintenance update. + +1. Kernel Configuration +======================= + +The kernel configuration option is ALTERA_TSE: + + Device Drivers ---> Network device support ---> Ethernet driver support ---> + Altera Triple-Speed Ethernet MAC support (ALTERA_TSE) + +2. Driver parameters list +========================= + + - debug: message level (0: no output, 16: all); + - dma_rx_num: Number of descriptors in the RX list (default is 64); + - dma_tx_num: Number of descriptors in the TX list (default is 64). + +3. Command line options +======================= + +Driver parameters can be also passed in command line by using:: + + altera_tse=dma_rx_num:128,dma_tx_num:512 + +4. Driver information and notes +=============================== + +4.1. Transmit process +--------------------- +When the driver's transmit routine is called by the kernel, it sets up a +transmit descriptor by calling the underlying DMA transmit routine (SGDMA or +MSGDMA), and initiates a transmit operation. Once the transmit is complete, an +interrupt is driven by the transmit DMA logic. The driver handles the transmit +completion in the context of the interrupt handling chain by recycling +resource required to send and track the requested transmit operation. + +4.2. Receive process +-------------------- +The driver will post receive buffers to the receive DMA logic during driver +initialization. Receive buffers may or may not be queued depending upon the +underlying DMA logic (MSGDMA is able queue receive buffers, SGDMA is not able +to queue receive buffers to the SGDMA receive logic). When a packet is +received, the DMA logic generates an interrupt. The driver handles a receive +interrupt by obtaining the DMA receive logic status, reaping receive +completions until no more receive completions are available. + +4.3. Interrupt Mitigation +------------------------- +The driver is able to mitigate the number of its DMA interrupts +using NAPI for receive operations. Interrupt mitigation is not yet supported +for transmit operations, but will be added in a future maintenance release. + +4.4) Ethtool support +-------------------- +Ethtool is supported. Driver statistics and internal errors can be taken using: +ethtool -S ethX command. It is possible to dump registers etc. + +4.5) PHY Support +---------------- +The driver is compatible with PAL to work with PHY and GPHY devices. + +4.7) List of source files: +-------------------------- + - Kconfig + - Makefile + - altera_tse_main.c: main network device driver + - altera_tse_ethtool.c: ethtool support + - altera_tse.h: private driver structure and common definitions + - altera_msgdma.h: MSGDMA implementation function definitions + - altera_sgdma.h: SGDMA implementation function definitions + - altera_msgdma.c: MSGDMA implementation + - altera_sgdma.c: SGDMA implementation + - altera_sgdmahw.h: SGDMA register and descriptor definitions + - altera_msgdmahw.h: MSGDMA register and descriptor definitions + - altera_utils.c: Driver utility functions + - altera_utils.h: Driver utility function definitions + +5. Debug Information +==================== + +The driver exports debug information such as internal statistics, +debug information, MAC and DMA registers etc. + +A user may use the ethtool support to get statistics: +e.g. using: ethtool -S ethX (that shows the statistics counters) +or sees the MAC registers: e.g. using: ethtool -d ethX + +The developer can also use the "debug" module parameter to get +further debug information. + +6. Statistics Support +===================== + +The controller and driver support a mix of IEEE standard defined statistics, +RFC defined statistics, and driver or Altera defined statistics. The four +specifications containing the standard definitions for these statistics are +as follows: + + - IEEE 802.3-2012 - IEEE Standard for Ethernet. + - RFC 2863 found at http://www.rfc-editor.org/rfc/rfc2863.txt. + - RFC 2819 found at http://www.rfc-editor.org/rfc/rfc2819.txt. + - Altera Triple Speed Ethernet User Guide, found at http://www.altera.com + +The statistics supported by the TSE and the device driver are as follows: + +"tx_packets" is equivalent to aFramesTransmittedOK defined in IEEE 802.3-2012, +Section 5.2.2.1.2. This statistics is the count of frames that are successfully +transmitted. + +"rx_packets" is equivalent to aFramesReceivedOK defined in IEEE 802.3-2012, +Section 5.2.2.1.5. This statistic is the count of frames that are successfully +received. This count does not include any error packets such as CRC errors, +length errors, or alignment errors. + +"rx_crc_errors" is equivalent to aFrameCheckSequenceErrors defined in IEEE +802.3-2012, Section 5.2.2.1.6. This statistic is the count of frames that are +an integral number of bytes in length and do not pass the CRC test as the frame +is received. + +"rx_align_errors" is equivalent to aAlignmentErrors defined in IEEE 802.3-2012, +Section 5.2.2.1.7. This statistic is the count of frames that are not an +integral number of bytes in length and do not pass the CRC test as the frame is +received. + +"tx_bytes" is equivalent to aOctetsTransmittedOK defined in IEEE 802.3-2012, +Section 5.2.2.1.8. This statistic is the count of data and pad bytes +successfully transmitted from the interface. + +"rx_bytes" is equivalent to aOctetsReceivedOK defined in IEEE 802.3-2012, +Section 5.2.2.1.14. This statistic is the count of data and pad bytes +successfully received by the controller. + +"tx_pause" is equivalent to aPAUSEMACCtrlFramesTransmitted defined in IEEE +802.3-2012, Section 30.3.4.2. This statistic is a count of PAUSE frames +transmitted from the network controller. + +"rx_pause" is equivalent to aPAUSEMACCtrlFramesReceived defined in IEEE +802.3-2012, Section 30.3.4.3. This statistic is a count of PAUSE frames +received by the network controller. + +"rx_errors" is equivalent to ifInErrors defined in RFC 2863. This statistic is +a count of the number of packets received containing errors that prevented the +packet from being delivered to a higher level protocol. + +"tx_errors" is equivalent to ifOutErrors defined in RFC 2863. This statistic +is a count of the number of packets that could not be transmitted due to errors. + +"rx_unicast" is equivalent to ifInUcastPkts defined in RFC 2863. This +statistic is a count of the number of packets received that were not addressed +to the broadcast address or a multicast group. + +"rx_multicast" is equivalent to ifInMulticastPkts defined in RFC 2863. This +statistic is a count of the number of packets received that were addressed to +a multicast address group. + +"rx_broadcast" is equivalent to ifInBroadcastPkts defined in RFC 2863. This +statistic is a count of the number of packets received that were addressed to +the broadcast address. + +"tx_discards" is equivalent to ifOutDiscards defined in RFC 2863. This +statistic is the number of outbound packets not transmitted even though an +error was not detected. An example of a reason this might occur is to free up +internal buffer space. + +"tx_unicast" is equivalent to ifOutUcastPkts defined in RFC 2863. This +statistic counts the number of packets transmitted that were not addressed to +a multicast group or broadcast address. + +"tx_multicast" is equivalent to ifOutMulticastPkts defined in RFC 2863. This +statistic counts the number of packets transmitted that were addressed to a +multicast group. + +"tx_broadcast" is equivalent to ifOutBroadcastPkts defined in RFC 2863. This +statistic counts the number of packets transmitted that were addressed to a +broadcast address. + +"ether_drops" is equivalent to etherStatsDropEvents defined in RFC 2819. +This statistic counts the number of packets dropped due to lack of internal +controller resources. + +"rx_total_bytes" is equivalent to etherStatsOctets defined in RFC 2819. +This statistic counts the total number of bytes received by the controller, +including error and discarded packets. + +"rx_total_packets" is equivalent to etherStatsPkts defined in RFC 2819. +This statistic counts the total number of packets received by the controller, +including error, discarded, unicast, multicast, and broadcast packets. + +"rx_undersize" is equivalent to etherStatsUndersizePkts defined in RFC 2819. +This statistic counts the number of correctly formed packets received less +than 64 bytes long. + +"rx_oversize" is equivalent to etherStatsOversizePkts defined in RFC 2819. +This statistic counts the number of correctly formed packets greater than 1518 +bytes long. + +"rx_64_bytes" is equivalent to etherStatsPkts64Octets defined in RFC 2819. +This statistic counts the total number of packets received that were 64 octets +in length. + +"rx_65_127_bytes" is equivalent to etherStatsPkts65to127Octets defined in RFC +2819. This statistic counts the total number of packets received that were +between 65 and 127 octets in length inclusive. + +"rx_128_255_bytes" is equivalent to etherStatsPkts128to255Octets defined in +RFC 2819. This statistic is the total number of packets received that were +between 128 and 255 octets in length inclusive. + +"rx_256_511_bytes" is equivalent to etherStatsPkts256to511Octets defined in +RFC 2819. This statistic is the total number of packets received that were +between 256 and 511 octets in length inclusive. + +"rx_512_1023_bytes" is equivalent to etherStatsPkts512to1023Octets defined in +RFC 2819. This statistic is the total number of packets received that were +between 512 and 1023 octets in length inclusive. + +"rx_1024_1518_bytes" is equivalent to etherStatsPkts1024to1518Octets define +in RFC 2819. This statistic is the total number of packets received that were +between 1024 and 1518 octets in length inclusive. + +"rx_gte_1519_bytes" is a statistic defined specific to the behavior of the +Altera TSE. This statistics counts the number of received good and errored +frames between the length of 1519 and the maximum frame length configured +in the frm_length register. See the Altera TSE User Guide for More details. + +"rx_jabbers" is equivalent to etherStatsJabbers defined in RFC 2819. This +statistic is the total number of packets received that were longer than 1518 +octets, and had either a bad CRC with an integral number of octets (CRC Error) +or a bad CRC with a non-integral number of octets (Alignment Error). + +"rx_runts" is equivalent to etherStatsFragments defined in RFC 2819. This +statistic is the total number of packets received that were less than 64 octets +in length and had either a bad CRC with an integral number of octets (CRC +error) or a bad CRC with a non-integral number of octets (Alignment Error). diff --git a/Documentation/networking/altera_tse.txt b/Documentation/networking/altera_tse.txt deleted file mode 100644 index 50b8589d12fd..000000000000 --- a/Documentation/networking/altera_tse.txt +++ /dev/null @@ -1,263 +0,0 @@ - Altera Triple-Speed Ethernet MAC driver - -Copyright (C) 2008-2014 Altera Corporation - -This is the driver for the Altera Triple-Speed Ethernet (TSE) controllers -using the SGDMA and MSGDMA soft DMA IP components. The driver uses the -platform bus to obtain component resources. The designs used to test this -driver were built for a Cyclone(R) V SOC FPGA board, a Cyclone(R) V FPGA board, -and tested with ARM and NIOS processor hosts separately. The anticipated use -cases are simple communications between an embedded system and an external peer -for status and simple configuration of the embedded system. - -For more information visit www.altera.com and www.rocketboards.org. Support -forums for the driver may be found on www.rocketboards.org, and a design used -to test this driver may be found there as well. Support is also available from -the maintainer of this driver, found in MAINTAINERS. - -The Triple-Speed Ethernet, SGDMA, and MSGDMA components are all soft IP -components that can be assembled and built into an FPGA using the Altera -Quartus toolchain. Quartus 13.1 and 14.0 were used to build the design that -this driver was tested against. The sopc2dts tool is used to create the -device tree for the driver, and may be found at rocketboards.org. - -The driver probe function examines the device tree and determines if the -Triple-Speed Ethernet instance is using an SGDMA or MSGDMA component. The -probe function then installs the appropriate set of DMA routines to -initialize, setup transmits, receives, and interrupt handling primitives for -the respective configurations. - -The SGDMA component is to be deprecated in the near future (over the next 1-2 -years as of this writing in early 2014) in favor of the MSGDMA component. -SGDMA support is included for existing designs and reference in case a -developer wishes to support their own soft DMA logic and driver support. Any -new designs should not use the SGDMA. - -The SGDMA supports only a single transmit or receive operation at a time, and -therefore will not perform as well compared to the MSGDMA soft IP. Please -visit www.altera.com for known, documented SGDMA errata. - -Scatter-gather DMA is not supported by the SGDMA or MSGDMA at this time. -Scatter-gather DMA will be added to a future maintenance update to this -driver. - -Jumbo frames are not supported at this time. - -The driver limits PHY operations to 10/100Mbps, and has not yet been fully -tested for 1Gbps. This support will be added in a future maintenance update. - -1) Kernel Configuration -The kernel configuration option is ALTERA_TSE: - Device Drivers ---> Network device support ---> Ethernet driver support ---> - Altera Triple-Speed Ethernet MAC support (ALTERA_TSE) - -2) Driver parameters list: - debug: message level (0: no output, 16: all); - dma_rx_num: Number of descriptors in the RX list (default is 64); - dma_tx_num: Number of descriptors in the TX list (default is 64). - -3) Command line options -Driver parameters can be also passed in command line by using: - altera_tse=dma_rx_num:128,dma_tx_num:512 - -4) Driver information and notes - -4.1) Transmit process -When the driver's transmit routine is called by the kernel, it sets up a -transmit descriptor by calling the underlying DMA transmit routine (SGDMA or -MSGDMA), and initiates a transmit operation. Once the transmit is complete, an -interrupt is driven by the transmit DMA logic. The driver handles the transmit -completion in the context of the interrupt handling chain by recycling -resource required to send and track the requested transmit operation. - -4.2) Receive process -The driver will post receive buffers to the receive DMA logic during driver -initialization. Receive buffers may or may not be queued depending upon the -underlying DMA logic (MSGDMA is able queue receive buffers, SGDMA is not able -to queue receive buffers to the SGDMA receive logic). When a packet is -received, the DMA logic generates an interrupt. The driver handles a receive -interrupt by obtaining the DMA receive logic status, reaping receive -completions until no more receive completions are available. - -4.3) Interrupt Mitigation -The driver is able to mitigate the number of its DMA interrupts -using NAPI for receive operations. Interrupt mitigation is not yet supported -for transmit operations, but will be added in a future maintenance release. - -4.4) Ethtool support -Ethtool is supported. Driver statistics and internal errors can be taken using: -ethtool -S ethX command. It is possible to dump registers etc. - -4.5) PHY Support -The driver is compatible with PAL to work with PHY and GPHY devices. - -4.7) List of source files: - o Kconfig - o Makefile - o altera_tse_main.c: main network device driver - o altera_tse_ethtool.c: ethtool support - o altera_tse.h: private driver structure and common definitions - o altera_msgdma.h: MSGDMA implementation function definitions - o altera_sgdma.h: SGDMA implementation function definitions - o altera_msgdma.c: MSGDMA implementation - o altera_sgdma.c: SGDMA implementation - o altera_sgdmahw.h: SGDMA register and descriptor definitions - o altera_msgdmahw.h: MSGDMA register and descriptor definitions - o altera_utils.c: Driver utility functions - o altera_utils.h: Driver utility function definitions - -5) Debug Information - -The driver exports debug information such as internal statistics, -debug information, MAC and DMA registers etc. - -A user may use the ethtool support to get statistics: -e.g. using: ethtool -S ethX (that shows the statistics counters) -or sees the MAC registers: e.g. using: ethtool -d ethX - -The developer can also use the "debug" module parameter to get -further debug information. - -6) Statistics Support - -The controller and driver support a mix of IEEE standard defined statistics, -RFC defined statistics, and driver or Altera defined statistics. The four -specifications containing the standard definitions for these statistics are -as follows: - - o IEEE 802.3-2012 - IEEE Standard for Ethernet. - o RFC 2863 found at http://www.rfc-editor.org/rfc/rfc2863.txt. - o RFC 2819 found at http://www.rfc-editor.org/rfc/rfc2819.txt. - o Altera Triple Speed Ethernet User Guide, found at http://www.altera.com - -The statistics supported by the TSE and the device driver are as follows: - -"tx_packets" is equivalent to aFramesTransmittedOK defined in IEEE 802.3-2012, -Section 5.2.2.1.2. This statistics is the count of frames that are successfully -transmitted. - -"rx_packets" is equivalent to aFramesReceivedOK defined in IEEE 802.3-2012, -Section 5.2.2.1.5. This statistic is the count of frames that are successfully -received. This count does not include any error packets such as CRC errors, -length errors, or alignment errors. - -"rx_crc_errors" is equivalent to aFrameCheckSequenceErrors defined in IEEE -802.3-2012, Section 5.2.2.1.6. This statistic is the count of frames that are -an integral number of bytes in length and do not pass the CRC test as the frame -is received. - -"rx_align_errors" is equivalent to aAlignmentErrors defined in IEEE 802.3-2012, -Section 5.2.2.1.7. This statistic is the count of frames that are not an -integral number of bytes in length and do not pass the CRC test as the frame is -received. - -"tx_bytes" is equivalent to aOctetsTransmittedOK defined in IEEE 802.3-2012, -Section 5.2.2.1.8. This statistic is the count of data and pad bytes -successfully transmitted from the interface. - -"rx_bytes" is equivalent to aOctetsReceivedOK defined in IEEE 802.3-2012, -Section 5.2.2.1.14. This statistic is the count of data and pad bytes -successfully received by the controller. - -"tx_pause" is equivalent to aPAUSEMACCtrlFramesTransmitted defined in IEEE -802.3-2012, Section 30.3.4.2. This statistic is a count of PAUSE frames -transmitted from the network controller. - -"rx_pause" is equivalent to aPAUSEMACCtrlFramesReceived defined in IEEE -802.3-2012, Section 30.3.4.3. This statistic is a count of PAUSE frames -received by the network controller. - -"rx_errors" is equivalent to ifInErrors defined in RFC 2863. This statistic is -a count of the number of packets received containing errors that prevented the -packet from being delivered to a higher level protocol. - -"tx_errors" is equivalent to ifOutErrors defined in RFC 2863. This statistic -is a count of the number of packets that could not be transmitted due to errors. - -"rx_unicast" is equivalent to ifInUcastPkts defined in RFC 2863. This -statistic is a count of the number of packets received that were not addressed -to the broadcast address or a multicast group. - -"rx_multicast" is equivalent to ifInMulticastPkts defined in RFC 2863. This -statistic is a count of the number of packets received that were addressed to -a multicast address group. - -"rx_broadcast" is equivalent to ifInBroadcastPkts defined in RFC 2863. This -statistic is a count of the number of packets received that were addressed to -the broadcast address. - -"tx_discards" is equivalent to ifOutDiscards defined in RFC 2863. This -statistic is the number of outbound packets not transmitted even though an -error was not detected. An example of a reason this might occur is to free up -internal buffer space. - -"tx_unicast" is equivalent to ifOutUcastPkts defined in RFC 2863. This -statistic counts the number of packets transmitted that were not addressed to -a multicast group or broadcast address. - -"tx_multicast" is equivalent to ifOutMulticastPkts defined in RFC 2863. This -statistic counts the number of packets transmitted that were addressed to a -multicast group. - -"tx_broadcast" is equivalent to ifOutBroadcastPkts defined in RFC 2863. This -statistic counts the number of packets transmitted that were addressed to a -broadcast address. - -"ether_drops" is equivalent to etherStatsDropEvents defined in RFC 2819. -This statistic counts the number of packets dropped due to lack of internal -controller resources. - -"rx_total_bytes" is equivalent to etherStatsOctets defined in RFC 2819. -This statistic counts the total number of bytes received by the controller, -including error and discarded packets. - -"rx_total_packets" is equivalent to etherStatsPkts defined in RFC 2819. -This statistic counts the total number of packets received by the controller, -including error, discarded, unicast, multicast, and broadcast packets. - -"rx_undersize" is equivalent to etherStatsUndersizePkts defined in RFC 2819. -This statistic counts the number of correctly formed packets received less -than 64 bytes long. - -"rx_oversize" is equivalent to etherStatsOversizePkts defined in RFC 2819. -This statistic counts the number of correctly formed packets greater than 1518 -bytes long. - -"rx_64_bytes" is equivalent to etherStatsPkts64Octets defined in RFC 2819. -This statistic counts the total number of packets received that were 64 octets -in length. - -"rx_65_127_bytes" is equivalent to etherStatsPkts65to127Octets defined in RFC -2819. This statistic counts the total number of packets received that were -between 65 and 127 octets in length inclusive. - -"rx_128_255_bytes" is equivalent to etherStatsPkts128to255Octets defined in -RFC 2819. This statistic is the total number of packets received that were -between 128 and 255 octets in length inclusive. - -"rx_256_511_bytes" is equivalent to etherStatsPkts256to511Octets defined in -RFC 2819. This statistic is the total number of packets received that were -between 256 and 511 octets in length inclusive. - -"rx_512_1023_bytes" is equivalent to etherStatsPkts512to1023Octets defined in -RFC 2819. This statistic is the total number of packets received that were -between 512 and 1023 octets in length inclusive. - -"rx_1024_1518_bytes" is equivalent to etherStatsPkts1024to1518Octets define -in RFC 2819. This statistic is the total number of packets received that were -between 1024 and 1518 octets in length inclusive. - -"rx_gte_1519_bytes" is a statistic defined specific to the behavior of the -Altera TSE. This statistics counts the number of received good and errored -frames between the length of 1519 and the maximum frame length configured -in the frm_length register. See the Altera TSE User Guide for More details. - -"rx_jabbers" is equivalent to etherStatsJabbers defined in RFC 2819. This -statistic is the total number of packets received that were longer than 1518 -octets, and had either a bad CRC with an integral number of octets (CRC Error) -or a bad CRC with a non-integral number of octets (Alignment Error). - -"rx_runts" is equivalent to etherStatsFragments defined in RFC 2819. This -statistic is the total number of packets received that were less than 64 octets -in length and had either a bad CRC with an integral number of octets (CRC -error) or a bad CRC with a non-integral number of octets (Alignment Error). diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index dc37fc8d5bee..96ffad845fd9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -38,6 +38,7 @@ Contents: nfc 6lowpan 6pack + altera_tse .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From aa92320b3e38f2b64b2d91a20761db1683e6c531 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:19 +0200 Subject: docs: networking: convert arcnet-hardware.txt to ReST - add SPDX header; - add document title markup; - add notes markups; - mark tables as such; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/arcnet-hardware.rst | 3234 ++++++++++++++++++++++++++ Documentation/networking/arcnet-hardware.txt | 3133 ------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 3235 insertions(+), 3133 deletions(-) create mode 100644 Documentation/networking/arcnet-hardware.rst delete mode 100644 Documentation/networking/arcnet-hardware.txt diff --git a/Documentation/networking/arcnet-hardware.rst b/Documentation/networking/arcnet-hardware.rst new file mode 100644 index 000000000000..b5a1a020c824 --- /dev/null +++ b/Documentation/networking/arcnet-hardware.rst @@ -0,0 +1,3234 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +ARCnet Hardware +=============== + +.. note:: + + 1) This file is a supplement to arcnet.txt. Please read that for general + driver configuration help. + 2) This file is no longer Linux-specific. It should probably be moved out + of the kernel sources. Ideas? + +Because so many people (myself included) seem to have obtained ARCnet cards +without manuals, this file contains a quick introduction to ARCnet hardware, +some cabling tips, and a listing of all jumper settings I can find. Please +e-mail apenwarr@worldvisions.ca with any settings for your particular card, +or any other information you have! + + +Introduction to ARCnet +====================== + +ARCnet is a network type which works in a way similar to popular Ethernet +networks but which is also different in some very important ways. + +First of all, you can get ARCnet cards in at least two speeds: 2.5 Mbps +(slower than Ethernet) and 100 Mbps (faster than normal Ethernet). In fact, +there are others as well, but these are less common. The different hardware +types, as far as I'm aware, are not compatible and so you cannot wire a +100 Mbps card to a 2.5 Mbps card, and so on. From what I hear, my driver does +work with 100 Mbps cards, but I haven't been able to verify this myself, +since I only have the 2.5 Mbps variety. It is probably not going to saturate +your 100 Mbps card. Stop complaining. :) + +You also cannot connect an ARCnet card to any kind of Ethernet card and +expect it to work. + +There are two "types" of ARCnet - STAR topology and BUS topology. This +refers to how the cards are meant to be wired together. According to most +available documentation, you can only connect STAR cards to STAR cards and +BUS cards to BUS cards. That makes sense, right? Well, it's not quite +true; see below under "Cabling." + +Once you get past these little stumbling blocks, ARCnet is actually quite a +well-designed standard. It uses something called "modified token passing" +which makes it completely incompatible with so-called "Token Ring" cards, +but which makes transfers much more reliable than Ethernet does. In fact, +ARCnet will guarantee that a packet arrives safely at the destination, and +even if it can't possibly be delivered properly (ie. because of a cable +break, or because the destination computer does not exist) it will at least +tell the sender about it. + +Because of the carefully defined action of the "token", it will always make +a pass around the "ring" within a maximum length of time. This makes it +useful for realtime networks. + +In addition, all known ARCnet cards have an (almost) identical programming +interface. This means that with one ARCnet driver you can support any +card, whereas with Ethernet each manufacturer uses what is sometimes a +completely different programming interface, leading to a lot of different, +sometimes very similar, Ethernet drivers. Of course, always using the same +programming interface also means that when high-performance hardware +facilities like PCI bus mastering DMA appear, it's hard to take advantage of +them. Let's not go into that. + +One thing that makes ARCnet cards difficult to program for, however, is the +limit on their packet sizes; standard ARCnet can only send packets that are +up to 508 bytes in length. This is smaller than the Internet "bare minimum" +of 576 bytes, let alone the Ethernet MTU of 1500. To compensate, an extra +level of encapsulation is defined by RFC1201, which I call "packet +splitting," that allows "virtual packets" to grow as large as 64K each, +although they are generally kept down to the Ethernet-style 1500 bytes. + +For more information on the advantages and disadvantages (mostly the +advantages) of ARCnet networks, you might try the "ARCnet Trade Association" +WWW page: + + http://www.arcnet.com + + +Cabling ARCnet Networks +======================= + +This section was rewritten by + + Vojtech Pavlik + +using information from several people, including: + + - Avery Pennraun + - Stephen A. Wood + - John Paul Morrison + - Joachim Koenig + +and Avery touched it up a bit, at Vojtech's request. + +ARCnet (the classic 2.5 Mbps version) can be connected by two different +types of cabling: coax and twisted pair. The other ARCnet-type networks +(100 Mbps TCNS and 320 kbps - 32 Mbps ARCnet Plus) use different types of +cabling (Type1, Fiber, C1, C4, C5). + +For a coax network, you "should" use 93 Ohm RG-62 cable. But other cables +also work fine, because ARCnet is a very stable network. I personally use 75 +Ohm TV antenna cable. + +Cards for coax cabling are shipped in two different variants: for BUS and +STAR network topologies. They are mostly the same. The only difference +lies in the hybrid chip installed. BUS cards use high impedance output, +while STAR use low impedance. Low impedance card (STAR) is electrically +equal to a high impedance one with a terminator installed. + +Usually, the ARCnet networks are built up from STAR cards and hubs. There +are two types of hubs - active and passive. Passive hubs are small boxes +with four BNC connectors containing four 47 Ohm resistors:: + + | | wires + R + junction + -R-+-R- R 47 Ohm resistors + R + | + +The shielding is connected together. Active hubs are much more complicated; +they are powered and contain electronics to amplify the signal and send it +to other segments of the net. They usually have eight connectors. Active +hubs come in two variants - dumb and smart. The dumb variant just +amplifies, but the smart one decodes to digital and encodes back all packets +coming through. This is much better if you have several hubs in the net, +since many dumb active hubs may worsen the signal quality. + +And now to the cabling. What you can connect together: + +1. A card to a card. This is the simplest way of creating a 2-computer + network. + +2. A card to a passive hub. Remember that all unused connectors on the hub + must be properly terminated with 93 Ohm (or something else if you don't + have the right ones) terminators. + + (Avery's note: oops, I didn't know that. Mine (TV cable) works + anyway, though.) + +3. A card to an active hub. Here is no need to terminate the unused + connectors except some kind of aesthetic feeling. But, there may not be + more than eleven active hubs between any two computers. That of course + doesn't limit the number of active hubs on the network. + +4. An active hub to another. + +5. An active hub to passive hub. + +Remember that you cannot connect two passive hubs together. The power loss +implied by such a connection is too high for the net to operate reliably. + +An example of a typical ARCnet network:: + + R S - STAR type card + S------H--------A-------S R - Terminator + | | H - Hub + | | A - Active hub + | S----H----S + S | + | + S + +The BUS topology is very similar to the one used by Ethernet. The only +difference is in cable and terminators: they should be 93 Ohm. Ethernet +uses 50 Ohm impedance. You use T connectors to put the computers on a single +line of cable, the bus. You have to put terminators at both ends of the +cable. A typical BUS ARCnet network looks like:: + + RT----T------T------T------T------TR + B B B B B B + + B - BUS type card + R - Terminator + T - T connector + +But that is not all! The two types can be connected together. According to +the official documentation the only way of connecting them is using an active +hub:: + + A------T------T------TR + | B B B + S---H---S + | + S + +The official docs also state that you can use STAR cards at the ends of +BUS network in place of a BUS card and a terminator:: + + S------T------T------S + B B + +But, according to my own experiments, you can simply hang a BUS type card +anywhere in middle of a cable in a STAR topology network. And more - you +can use the bus card in place of any star card if you use a terminator. Then +you can build very complicated networks fulfilling all your needs! An +example:: + + S + | + RT------T-------T------H------S + B B B | + | R + S------A------T-------T-------A-------H------TR + | B B | | B + | S BT | + | | | S----A-----S + S------H---A----S | | + | | S------T----H---S | + S S B R S + +A basically different cabling scheme is used with Twisted Pair cabling. Each +of the TP cards has two RJ (phone-cord style) connectors. The cards are +then daisy-chained together using a cable connecting every two neighboring +cards. The ends are terminated with RJ 93 Ohm terminators which plug into +the empty connectors of cards on the ends of the chain. An example:: + + ___________ ___________ + _R_|_ _|_|_ _|_R_ + | | | | | | + |Card | |Card | |Card | + |_____| |_____| |_____| + + +There are also hubs for the TP topology. There is nothing difficult +involved in using them; you just connect a TP chain to a hub on any end or +even at both. This way you can create almost any network configuration. +The maximum of 11 hubs between any two computers on the net applies here as +well. An example:: + + RP-------P--------P--------H-----P------P-----PR + | + RP-----H--------P--------H-----P------PR + | | + PR PR + + R - RJ Terminator + P - TP Card + H - TP Hub + +Like any network, ARCnet has a limited cable length. These are the maximum +cable lengths between two active ends (an active end being an active hub or +a STAR card). + + ========== ======= =========== + RG-62 93 Ohm up to 650 m + RG-59/U 75 Ohm up to 457 m + RG-11/U 75 Ohm up to 533 m + IBM Type 1 150 Ohm up to 200 m + IBM Type 3 100 Ohm up to 100 m + ========== ======= =========== + +The maximum length of all cables connected to a passive hub is limited to 65 +meters for RG-62 cabling; less for others. You can see that using passive +hubs in a large network is a bad idea. The maximum length of a single "BUS +Trunk" is about 300 meters for RG-62. The maximum distance between the two +most distant points of the net is limited to 3000 meters. The maximum length +of a TP cable between two cards/hubs is 650 meters. + + +Setting the Jumpers +=================== + +All ARCnet cards should have a total of four or five different settings: + + - the I/O address: this is the "port" your ARCnet card is on. Probed + values in the Linux ARCnet driver are only from 0x200 through 0x3F0. (If + your card has additional ones, which is possible, please tell me.) This + should not be the same as any other device on your system. According to + a doc I got from Novell, MS Windows prefers values of 0x300 or more, + eating net connections on my system (at least) otherwise. My guess is + this may be because, if your card is at 0x2E0, probing for a serial port + at 0x2E8 will reset the card and probably mess things up royally. + + - Avery's favourite: 0x300. + + - the IRQ: on 8-bit cards, it might be 2 (9), 3, 4, 5, or 7. + on 16-bit cards, it might be 2 (9), 3, 4, 5, 7, or 10-15. + + Make sure this is different from any other card on your system. Note + that IRQ2 is the same as IRQ9, as far as Linux is concerned. You can + "cat /proc/interrupts" for a somewhat complete list of which ones are in + use at any given time. Here is a list of common usages from Vojtech + Pavlik : + + ("Not on bus" means there is no way for a card to generate this + interrupt) + + ====== ========================================================= + IRQ 0 Timer 0 (Not on bus) + IRQ 1 Keyboard (Not on bus) + IRQ 2 IRQ Controller 2 (Not on bus, nor does interrupt the CPU) + IRQ 3 COM2 + IRQ 4 COM1 + IRQ 5 FREE (LPT2 if you have it; sometimes COM3; maybe PLIP) + IRQ 6 Floppy disk controller + IRQ 7 FREE (LPT1 if you don't use the polling driver; PLIP) + IRQ 8 Realtime Clock Interrupt (Not on bus) + IRQ 9 FREE (VGA vertical sync interrupt if enabled) + IRQ 10 FREE + IRQ 11 FREE + IRQ 12 FREE + IRQ 13 Numeric Coprocessor (Not on bus) + IRQ 14 Fixed Disk Controller + IRQ 15 FREE (Fixed Disk Controller 2 if you have it) + ====== ========================================================= + + + .. note:: + + IRQ 9 is used on some video cards for the "vertical retrace" + interrupt. This interrupt would have been handy for things like + video games, as it occurs exactly once per screen refresh, but + unfortunately IBM cancelled this feature starting with the original + VGA and thus many VGA/SVGA cards do not support it. For this + reason, no modern software uses this interrupt and it can almost + always be safely disabled, if your video card supports it at all. + + If your card for some reason CANNOT disable this IRQ (usually there + is a jumper), one solution would be to clip the printed circuit + contact on the board: it's the fourth contact from the left on the + back side. I take no responsibility if you try this. + + - Avery's favourite: IRQ2 (actually IRQ9). Watch that VGA, though. + + - the memory address: Unlike most cards, ARCnets use "shared memory" for + copying buffers around. Make SURE it doesn't conflict with any other + used memory in your system! + + :: + + A0000 - VGA graphics memory (ok if you don't have VGA) + B0000 - Monochrome text mode + C0000 \ One of these is your VGA BIOS - usually C0000. + E0000 / + F0000 - System BIOS + + Anything less than 0xA0000 is, well, a BAD idea since it isn't above + 640k. + + - Avery's favourite: 0xD0000 + + - the station address: Every ARCnet card has its own "unique" network + address from 0 to 255. Unlike Ethernet, you can set this address + yourself with a jumper or switch (or on some cards, with special + software). Since it's only 8 bits, you can only have 254 ARCnet cards + on a network. DON'T use 0 or 255, since these are reserved (although + neat stuff will probably happen if you DO use them). By the way, if you + haven't already guessed, don't set this the same as any other ARCnet on + your network! + + - Avery's favourite: 3 and 4. Not that it matters. + + - There may be ETS1 and ETS2 settings. These may or may not make a + difference on your card (many manuals call them "reserved"), but are + used to change the delays used when powering up a computer on the + network. This is only necessary when wiring VERY long range ARCnet + networks, on the order of 4km or so; in any case, the only real + requirement here is that all cards on the network with ETS1 and ETS2 + jumpers have them in the same position. Chris Hindy + sent in a chart with actual values for this: + + ======= ======= =============== ==================== + ET1 ET2 Response Time Reconfiguration Time + ======= ======= =============== ==================== + open open 74.7us 840us + open closed 283.4us 1680us + closed open 561.8us 1680us + closed closed 1118.6us 1680us + ======= ======= =============== ==================== + + Make sure you set ETS1 and ETS2 to the SAME VALUE for all cards on your + network. + +Also, on many cards (not mine, though) there are red and green LED's. +Vojtech Pavlik tells me this is what they mean: + + =============== =============== ===================================== + GREEN RED Status + =============== =============== ===================================== + OFF OFF Power off + OFF Short flashes Cabling problems (broken cable or not + terminated) + OFF (short) ON Card init + ON ON Normal state - everything OK, nothing + happens + ON Long flashes Data transfer + ON OFF Never happens (maybe when wrong ID) + =============== =============== ===================================== + + +The following is all the specific information people have sent me about +their own particular ARCnet cards. It is officially a mess, and contains +huge amounts of duplicated information. I have no time to fix it. If you +want to, PLEASE DO! Just send me a 'diff -u' of all your changes. + +The model # is listed right above specifics for that card, so you should be +able to use your text viewer's "search" function to find the entry you want. +If you don't KNOW what kind of card you have, try looking through the +various diagrams to see if you can tell. + +If your model isn't listed and/or has different settings, PLEASE PLEASE +tell me. I had to figure mine out without the manual, and it WASN'T FUN! + +Even if your ARCnet model isn't listed, but has the same jumpers as another +model that is, please e-mail me to say so. + +Cards Listed in this file (in this order, mostly): + + =============== ======================= ==== + Manufacturer Model # Bits + =============== ======================= ==== + SMC PC100 8 + SMC PC110 8 + SMC PC120 8 + SMC PC130 8 + SMC PC270E 8 + SMC PC500 16 + SMC PC500Longboard 16 + SMC PC550Longboard 16 + SMC PC600 16 + SMC PC710 8 + SMC? LCS-8830(-T) 8/16 + Puredata PDI507 8 + CNet Tech CN120-Series 8 + CNet Tech CN160-Series 16 + Lantech? UM9065L chipset 8 + Acer 5210-003 8 + Datapoint? LAN-ARC-8 8 + Topware TA-ARC/10 8 + Thomas-Conrad 500-6242-0097 REV A 8 + Waterloo? (C)1985 Waterloo Micro. 8 + No Name -- 8/16 + No Name Taiwan R.O.C? 8 + No Name Model 9058 8 + Tiara Tiara Lancard? 8 + =============== ======================= ==== + + +* SMC = Standard Microsystems Corp. +* CNet Tech = CNet Technology, Inc. + +Unclassified Stuff +================== + + - Please send any other information you can find. + + - And some other stuff (more info is welcome!):: + + From: root@ultraworld.xs4all.nl (Timo Hilbrink) + To: apenwarr@foxnet.net (Avery Pennarun) + Date: Wed, 26 Oct 1994 02:10:32 +0000 (GMT) + Reply-To: timoh@xs4all.nl + + [...parts deleted...] + + About the jumpers: On my PC130 there is one more jumper, located near the + cable-connector and it's for changing to star or bus topology; + closed: star - open: bus + On the PC500 are some more jumper-pins, one block labeled with RX,PDN,TXI + and another with ALE,LA17,LA18,LA19 these are undocumented.. + + [...more parts deleted...] + + --- CUT --- + +Standard Microsystems Corp (SMC) +================================ + +PC100, PC110, PC120, PC130 (8-bit cards) and PC500, PC600 (16-bit cards) +------------------------------------------------------------------------ + + - mainly from Avery Pennarun . Values depicted + are from Avery's setup. + - special thanks to Timo Hilbrink for noting that PC120, + 130, 500, and 600 all have the same switches as Avery's PC100. + PC500/600 have several extra, undocumented pins though. (?) + - PC110 settings were verified by Stephen A. Wood + - Also, the JP- and S-numbers probably don't match your card exactly. Try + to find jumpers/switches with the same number of settings - it's + probably more reliable. + +:: + + JP5 [|] : : : : + (IRQ Setting) IRQ2 IRQ3 IRQ4 IRQ5 IRQ7 + Put exactly one jumper on exactly one set of pins. + + + 1 2 3 4 5 6 7 8 9 10 + S1 /----------------------------------\ + (I/O and Memory | 1 1 * 0 0 0 0 * 1 1 0 1 | + addresses) \----------------------------------/ + |--| |--------| |--------| + (a) (b) (m) + + WARNING. It's very important when setting these which way + you're holding the card, and which way you think is '1'! + + If you suspect that your settings are not being made + correctly, try reversing the direction or inverting the + switch positions. + + a: The first digit of the I/O address. + Setting Value + ------- ----- + 00 0 + 01 1 + 10 2 + 11 3 + + b: The second digit of the I/O address. + Setting Value + ------- ----- + 0000 0 + 0001 1 + 0010 2 + ... ... + 1110 E + 1111 F + + The I/O address is in the form ab0. For example, if + a is 0x2 and b is 0xE, the address will be 0x2E0. + + DO NOT SET THIS LESS THAN 0x200!!!!! + + + m: The first digit of the memory address. + Setting Value + ------- ----- + 0000 0 + 0001 1 + 0010 2 + ... ... + 1110 E + 1111 F + + The memory address is in the form m0000. For example, if + m is D, the address will be 0xD0000. + + DO NOT SET THIS TO C0000, F0000, OR LESS THAN A0000! + + 1 2 3 4 5 6 7 8 + S2 /--------------------------\ + (Station Address) | 1 1 0 0 0 0 0 0 | + \--------------------------/ + + Setting Value + ------- ----- + 00000000 00 + 10000000 01 + 01000000 02 + ... + 01111111 FE + 11111111 FF + + Note that this is binary with the digits reversed! + + DO NOT SET THIS TO 0 OR 255 (0xFF)! + + +PC130E/PC270E (8-bit cards) +--------------------------- + + - from Juergen Seifert + +This description has been written by Juergen Seifert +using information from the following Original SMC Manual + + "Configuration Guide for ARCNET(R)-PC130E/PC270 Network + Controller Boards Pub. # 900.044A June, 1989" + +ARCNET is a registered trademark of the Datapoint Corporation +SMC is a registered trademark of the Standard Microsystems Corporation + +The PC130E is an enhanced version of the PC130 board, is equipped with a +standard BNC female connector for connection to RG-62/U coax cable. +Since this board is designed both for point-to-point connection in star +networks and for connection to bus networks, it is downwardly compatible +with all the other standard boards designed for coax networks (that is, +the PC120, PC110 and PC100 star topology boards and the PC220, PC210 and +PC200 bus topology boards). + +The PC270E is an enhanced version of the PC260 board, is equipped with two +modular RJ11-type jacks for connection to twisted pair wiring. +It can be used in a star or a daisy-chained network. + +:: + + 8 7 6 5 4 3 2 1 + ________________________________________________________________ + | | S1 | | + | |_________________| | + | Offs|Base |I/O Addr | + | RAM Addr | ___| + | ___ ___ CR3 |___| + | | \/ | CR4 |___| + | | PROM | ___| + | | | N | | 8 + | | SOCKET | o | | 7 + | |________| d | | 6 + | ___________________ e | | 5 + | | | A | S | 4 + | |oo| EXT2 | | d | 2 | 3 + | |oo| EXT1 | SMC | d | | 2 + | |oo| ROM | 90C63 | r |___| 1 + | |oo| IRQ7 | | |o| _____| + | |oo| IRQ5 | | |o| | J1 | + | |oo| IRQ4 | | STAR |_____| + | |oo| IRQ3 | | | J2 | + | |oo| IRQ2 |___________________| |_____| + |___ ______________| + | | + |_____________________________________________| + +Legend:: + + SMC 90C63 ARCNET Controller / Transceiver /Logic + S1 1-3: I/O Base Address Select + 4-6: Memory Base Address Select + 7-8: RAM Offset Select + S2 1-8: Node ID Select + EXT Extended Timeout Select + ROM ROM Enable Select + STAR Selected - Star Topology (PC130E only) + Deselected - Bus Topology (PC130E only) + CR3/CR4 Diagnostic LEDs + J1 BNC RG62/U Connector (PC130E only) + J1 6-position Telephone Jack (PC270E only) + J2 6-position Telephone Jack (PC270E only) + +Setting one of the switches to Off/Open means "1", On/Closed means "0". + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in group S2 are used to set the node ID. +These switches work in a way similar to the PC100-series cards; see that +entry for more information. + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The first three switches in switch group S1 are used to select one +of eight possible I/O Base addresses using the following table:: + + + Switch | Hex I/O + 1 2 3 | Address + -------|-------- + 0 0 0 | 260 + 0 0 1 | 290 + 0 1 0 | 2E0 (Manufacturer's default) + 0 1 1 | 2F0 + 1 0 0 | 300 + 1 0 1 | 350 + 1 1 0 | 380 + 1 1 1 | 3E0 + + +Setting the Base Memory (RAM) buffer Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The memory buffer requires 2K of a 16K block of RAM. The base of this +16K block can be located in any of eight positions. +Switches 4-6 of switch group S1 select the Base of the 16K block. +Within that 16K address space, the buffer may be assigned any one of four +positions, determined by the offset, switches 7 and 8 of group S1. + +:: + + Switch | Hex RAM | Hex ROM + 4 5 6 7 8 | Address | Address *) + -----------|---------|----------- + 0 0 0 0 0 | C0000 | C2000 + 0 0 0 0 1 | C0800 | C2000 + 0 0 0 1 0 | C1000 | C2000 + 0 0 0 1 1 | C1800 | C2000 + | | + 0 0 1 0 0 | C4000 | C6000 + 0 0 1 0 1 | C4800 | C6000 + 0 0 1 1 0 | C5000 | C6000 + 0 0 1 1 1 | C5800 | C6000 + | | + 0 1 0 0 0 | CC000 | CE000 + 0 1 0 0 1 | CC800 | CE000 + 0 1 0 1 0 | CD000 | CE000 + 0 1 0 1 1 | CD800 | CE000 + | | + 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default) + 0 1 1 0 1 | D0800 | D2000 + 0 1 1 1 0 | D1000 | D2000 + 0 1 1 1 1 | D1800 | D2000 + | | + 1 0 0 0 0 | D4000 | D6000 + 1 0 0 0 1 | D4800 | D6000 + 1 0 0 1 0 | D5000 | D6000 + 1 0 0 1 1 | D5800 | D6000 + | | + 1 0 1 0 0 | D8000 | DA000 + 1 0 1 0 1 | D8800 | DA000 + 1 0 1 1 0 | D9000 | DA000 + 1 0 1 1 1 | D9800 | DA000 + | | + 1 1 0 0 0 | DC000 | DE000 + 1 1 0 0 1 | DC800 | DE000 + 1 1 0 1 0 | DD000 | DE000 + 1 1 0 1 1 | DD800 | DE000 + | | + 1 1 1 0 0 | E0000 | E2000 + 1 1 1 0 1 | E0800 | E2000 + 1 1 1 1 0 | E1000 | E2000 + 1 1 1 1 1 | E1800 | E2000 + + *) To enable the 8K Boot PROM install the jumper ROM. + The default is jumper ROM not installed. + + +Setting the Timeouts and Interrupt +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The jumpers labeled EXT1 and EXT2 are used to determine the timeout +parameters. These two jumpers are normally left open. + +To select a hardware interrupt level set one (only one!) of the jumpers +IRQ2, IRQ3, IRQ4, IRQ5, IRQ7. The Manufacturer's default is IRQ2. + + +Configuring the PC130E for Star or Bus Topology +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The single jumper labeled STAR is used to configure the PC130E board for +star or bus topology. +When the jumper is installed, the board may be used in a star network, when +it is removed, the board can be used in a bus topology. + + +Diagnostic LEDs +^^^^^^^^^^^^^^^ + +Two diagnostic LEDs are visible on the rear bracket of the board. +The green LED monitors the network activity: the red one shows the +board activity:: + + Green | Status Red | Status + -------|------------------- ---------|------------------- + on | normal activity flash/on | data transfer + blink | reconfiguration off | no data transfer; + off | defective board or | incorrect memory or + | node ID is zero | I/O address + + +PC500/PC550 Longboard (16-bit cards) +------------------------------------ + + - from Juergen Seifert + + + .. note:: + + There is another Version of the PC500 called Short Version, which + is different in hard- and software! The most important differences + are: + + - The long board has no Shared memory. + - On the long board the selection of the interrupt is done by binary + coded switch, on the short board directly by jumper. + +[Avery's note: pay special attention to that: the long board HAS NO SHARED +MEMORY. This means the current Linux-ARCnet driver can't use these cards. +I have obtained a PC500Longboard and will be doing some experiments on it in +the future, but don't hold your breath. Thanks again to Juergen Seifert for +his advice about this!] + +This description has been written by Juergen Seifert +using information from the following Original SMC Manual + + "Configuration Guide for SMC ARCNET-PC500/PC550 + Series Network Controller Boards Pub. # 900.033 Rev. A + November, 1989" + +ARCNET is a registered trademark of the Datapoint Corporation +SMC is a registered trademark of the Standard Microsystems Corporation + +The PC500 is equipped with a standard BNC female connector for connection +to RG-62/U coax cable. +The board is designed both for point-to-point connection in star networks +and for connection to bus networks. + +The PC550 is equipped with two modular RJ11-type jacks for connection +to twisted pair wiring. +It can be used in a star or a daisy-chained (BUS) network. + +:: + + 1 + 0 9 8 7 6 5 4 3 2 1 6 5 4 3 2 1 + ____________________________________________________________________ + < | SW1 | | SW2 | | + > |_____________________| |_____________| | + < IRQ |I/O Addr | + > ___| + < CR4 |___| + > CR3 |___| + < ___| + > N | | 8 + < o | | 7 + > d | S | 6 + < e | W | 5 + > A | 3 | 4 + < d | | 3 + > d | | 2 + < r |___| 1 + > |o| _____| + < |o| | J1 | + > 3 1 JP6 |_____| + < |o|o| JP2 | J2 | + > |o|o| |_____| + < 4 2__ ______________| + > | | | + <____| |_____________________________________________| + +Legend:: + + SW1 1-6: I/O Base Address Select + 7-10: Interrupt Select + SW2 1-6: Reserved for Future Use + SW3 1-8: Node ID Select + JP2 1-4: Extended Timeout Select + JP6 Selected - Star Topology (PC500 only) + Deselected - Bus Topology (PC500 only) + CR3 Green Monitors Network Activity + CR4 Red Monitors Board Activity + J1 BNC RG62/U Connector (PC500 only) + J1 6-position Telephone Jack (PC550 only) + J2 6-position Telephone Jack (PC550 only) + +Setting one of the switches to Off/Open means "1", On/Closed means "0". + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in group SW3 are used to set the node ID. Each node +attached to the network must have an unique node ID which must be +different from 0. +Switch 1 serves as the least significant bit (LSB). + +The node ID is the sum of the values of all switches set to "1" +These values are:: + + Switch | Value + -------|------- + 1 | 1 + 2 | 2 + 3 | 4 + 4 | 8 + 5 | 16 + 6 | 32 + 7 | 64 + 8 | 128 + +Some Examples:: + + Switch | Hex | Decimal + 8 7 6 5 4 3 2 1 | Node ID | Node ID + ----------------|---------|--------- + 0 0 0 0 0 0 0 0 | not allowed + 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 1 0 | 2 | 2 + 0 0 0 0 0 0 1 1 | 3 | 3 + . . . | | + 0 1 0 1 0 1 0 1 | 55 | 85 + . . . | | + 1 0 1 0 1 0 1 0 | AA | 170 + . . . | | + 1 1 1 1 1 1 0 1 | FD | 253 + 1 1 1 1 1 1 1 0 | FE | 254 + 1 1 1 1 1 1 1 1 | FF | 255 + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The first six switches in switch group SW1 are used to select one +of 32 possible I/O Base addresses using the following table:: + + Switch | Hex I/O + 6 5 4 3 2 1 | Address + -------------|-------- + 0 1 0 0 0 0 | 200 + 0 1 0 0 0 1 | 210 + 0 1 0 0 1 0 | 220 + 0 1 0 0 1 1 | 230 + 0 1 0 1 0 0 | 240 + 0 1 0 1 0 1 | 250 + 0 1 0 1 1 0 | 260 + 0 1 0 1 1 1 | 270 + 0 1 1 0 0 0 | 280 + 0 1 1 0 0 1 | 290 + 0 1 1 0 1 0 | 2A0 + 0 1 1 0 1 1 | 2B0 + 0 1 1 1 0 0 | 2C0 + 0 1 1 1 0 1 | 2D0 + 0 1 1 1 1 0 | 2E0 (Manufacturer's default) + 0 1 1 1 1 1 | 2F0 + 1 1 0 0 0 0 | 300 + 1 1 0 0 0 1 | 310 + 1 1 0 0 1 0 | 320 + 1 1 0 0 1 1 | 330 + 1 1 0 1 0 0 | 340 + 1 1 0 1 0 1 | 350 + 1 1 0 1 1 0 | 360 + 1 1 0 1 1 1 | 370 + 1 1 1 0 0 0 | 380 + 1 1 1 0 0 1 | 390 + 1 1 1 0 1 0 | 3A0 + 1 1 1 0 1 1 | 3B0 + 1 1 1 1 0 0 | 3C0 + 1 1 1 1 0 1 | 3D0 + 1 1 1 1 1 0 | 3E0 + 1 1 1 1 1 1 | 3F0 + + +Setting the Interrupt +^^^^^^^^^^^^^^^^^^^^^ + +Switches seven through ten of switch group SW1 are used to select the +interrupt level. The interrupt level is binary coded, so selections +from 0 to 15 would be possible, but only the following eight values will +be supported: 3, 4, 5, 7, 9, 10, 11, 12. + +:: + + Switch | IRQ + 10 9 8 7 | + ---------|-------- + 0 0 1 1 | 3 + 0 1 0 0 | 4 + 0 1 0 1 | 5 + 0 1 1 1 | 7 + 1 0 0 1 | 9 (=2) (default) + 1 0 1 0 | 10 + 1 0 1 1 | 11 + 1 1 0 0 | 12 + + +Setting the Timeouts +^^^^^^^^^^^^^^^^^^^^ + +The two jumpers JP2 (1-4) are used to determine the timeout parameters. +These two jumpers are normally left open. +Refer to the COM9026 Data Sheet for alternate configurations. + + +Configuring the PC500 for Star or Bus Topology +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The single jumper labeled JP6 is used to configure the PC500 board for +star or bus topology. +When the jumper is installed, the board may be used in a star network, when +it is removed, the board can be used in a bus topology. + + +Diagnostic LEDs +^^^^^^^^^^^^^^^ + +Two diagnostic LEDs are visible on the rear bracket of the board. +The green LED monitors the network activity: the red one shows the +board activity:: + + Green | Status Red | Status + -------|------------------- ---------|------------------- + on | normal activity flash/on | data transfer + blink | reconfiguration off | no data transfer; + off | defective board or | incorrect memory or + | node ID is zero | I/O address + + +PC710 (8-bit card) +------------------ + + - from J.S. van Oosten + +Note: this data is gathered by experimenting and looking at info of other +cards. However, I'm sure I got 99% of the settings right. + +The SMC710 card resembles the PC270 card, but is much more basic (i.e. no +LEDs, RJ11 jacks, etc.) and 8 bit. Here's a little drawing:: + + _______________________________________ + | +---------+ +---------+ |____ + | | S2 | | S1 | | + | +---------+ +---------+ | + | | + | +===+ __ | + | | R | | | X-tal ###___ + | | O | |__| ####__'| + | | M | || ### + | +===+ | + | | + | .. JP1 +----------+ | + | .. | big chip | | + | .. | 90C63 | | + | .. | | | + | .. +----------+ | + ------- ----------- + ||||||||||||||||||||| + +The row of jumpers at JP1 actually consists of 8 jumpers, (sometimes +labelled) the same as on the PC270, from top to bottom: EXT2, EXT1, ROM, +IRQ7, IRQ5, IRQ4, IRQ3, IRQ2 (gee, wonder what they would do? :-) ) + +S1 and S2 perform the same function as on the PC270, only their numbers +are swapped (S1 is the nodeaddress, S2 sets IO- and RAM-address). + +I know it works when connected to a PC110 type ARCnet board. + + +***************************************************************************** + +Possibly SMC +============ + +LCS-8830(-T) (8 and 16-bit cards) +--------------------------------- + + - from Mathias Katzer + - Marek Michalkiewicz says the + LCS-8830 is slightly different from LCS-8830-T. These are 8 bit, BUS + only (the JP0 jumper is hardwired), and BNC only. + +This is a LCS-8830-T made by SMC, I think ('SMC' only appears on one PLCC, +nowhere else, not even on the few Xeroxed sheets from the manual). + +SMC ARCnet Board Type LCS-8830-T:: + + ------------------------------------ + | | + | JP3 88 8 JP2 | + | ##### | \ | + | ##### ET1 ET2 ###| + | 8 ###| + | U3 SW 1 JP0 ###| Phone Jacks + | -- ###| + | | | | + | | | SW2 | + | | | | + | | | ##### | + | -- ##### #### BNC Connector + | #### + | 888888 JP1 | + | 234567 | + -- ------- + ||||||||||||||||||||||||||| + -------------------------- + + + SW1: DIP-Switches for Station Address + SW2: DIP-Switches for Memory Base and I/O Base addresses + + JP0: If closed, internal termination on (default open) + JP1: IRQ Jumpers + JP2: Boot-ROM enabled if closed + JP3: Jumpers for response timeout + + U3: Boot-ROM Socket + + + ET1 ET2 Response Time Idle Time Reconfiguration Time + + 78 86 840 + X 285 316 1680 + X 563 624 1680 + X X 1130 1237 1680 + + (X means closed jumper) + + (DIP-Switch downwards means "0") + +The station address is binary-coded with SW1. + +The I/O base address is coded with DIP-Switches 6,7 and 8 of SW2: + +======== ======== +Switches Base +678 Address +======== ======== +000 260-26f +100 290-29f +010 2e0-2ef +110 2f0-2ff +001 300-30f +101 350-35f +011 380-38f +111 3e0-3ef +======== ======== + + +DIP Switches 1-5 of SW2 encode the RAM and ROM Address Range: + +======== ============= ================ +Switches RAM ROM +12345 Address Range Address Range +======== ============= ================ +00000 C:0000-C:07ff C:2000-C:3fff +10000 C:0800-C:0fff +01000 C:1000-C:17ff +11000 C:1800-C:1fff +00100 C:4000-C:47ff C:6000-C:7fff +10100 C:4800-C:4fff +01100 C:5000-C:57ff +11100 C:5800-C:5fff +00010 C:C000-C:C7ff C:E000-C:ffff +10010 C:C800-C:Cfff +01010 C:D000-C:D7ff +11010 C:D800-C:Dfff +00110 D:0000-D:07ff D:2000-D:3fff +10110 D:0800-D:0fff +01110 D:1000-D:17ff +11110 D:1800-D:1fff +00001 D:4000-D:47ff D:6000-D:7fff +10001 D:4800-D:4fff +01001 D:5000-D:57ff +11001 D:5800-D:5fff +00101 D:8000-D:87ff D:A000-D:bfff +10101 D:8800-D:8fff +01101 D:9000-D:97ff +11101 D:9800-D:9fff +00011 D:C000-D:c7ff D:E000-D:ffff +10011 D:C800-D:cfff +01011 D:D000-D:d7ff +11011 D:D800-D:dfff +00111 E:0000-E:07ff E:2000-E:3fff +10111 E:0800-E:0fff +01111 E:1000-E:17ff +11111 E:1800-E:1fff +======== ============= ================ + + +PureData Corp +============= + +PDI507 (8-bit card) +-------------------- + + - from Mark Rejhon (slight modifications by Avery) + - Avery's note: I think PDI508 cards (but definitely NOT PDI508Plus cards) + are mostly the same as this. PDI508Plus cards appear to be mainly + software-configured. + +Jumpers: + + There is a jumper array at the bottom of the card, near the edge + connector. This array is labelled J1. They control the IRQs and + something else. Put only one jumper on the IRQ pins. + + ETS1, ETS2 are for timing on very long distance networks. See the + more general information near the top of this file. + + There is a J2 jumper on two pins. A jumper should be put on them, + since it was already there when I got the card. I don't know what + this jumper is for though. + + There is a two-jumper array for J3. I don't know what it is for, + but there were already two jumpers on it when I got the card. It's + a six pin grid in a two-by-three fashion. The jumpers were + configured as follows:: + + .-------. + o | o o | + :-------: ------> Accessible end of card with connectors + o | o o | in this direction -------> + `-------' + +Carl de Billy explains J3 and J4: + + J3 Diagram:: + + .-------. + o | o o | + :-------: TWIST Technology + o | o o | + `-------' + .-------. + | o o | o + :-------: COAX Technology + | o o | o + `-------' + + - If using coax cable in a bus topology the J4 jumper must be removed; + place it on one pin. + + - If using bus topology with twisted pair wiring move the J3 + jumpers so they connect the middle pin and the pins closest to the RJ11 + Connectors. Also the J4 jumper must be removed; place it on one pin of + J4 jumper for storage. + + - If using star topology with twisted pair wiring move the J3 + jumpers so they connect the middle pin and the pins closest to the RJ11 + connectors. + + +DIP Switches: + + The DIP switches accessible on the accessible end of the card while + it is installed, is used to set the ARCnet address. There are 8 + switches. Use an address from 1 to 254 + + ========== ========================= + Switch No. ARCnet address + 12345678 + ========== ========================= + 00000000 FF (Don't use this!) + 00000001 FE + 00000010 FD + ... + 11111101 2 + 11111110 1 + 11111111 0 (Don't use this!) + ========== ========================= + + There is another array of eight DIP switches at the top of the + card. There are five labelled MS0-MS4 which seem to control the + memory address, and another three labelled IO0-IO2 which seem to + control the base I/O address of the card. + + This was difficult to test by trial and error, and the I/O addresses + are in a weird order. This was tested by setting the DIP switches, + rebooting the computer, and attempting to load ARCETHER at various + addresses (mostly between 0x200 and 0x400). The address that caused + the red transmit LED to blink, is the one that I thought works. + + Also, the address 0x3D0 seem to have a special meaning, since the + ARCETHER packet driver loaded fine, but without the red LED + blinking. I don't know what 0x3D0 is for though. I recommend using + an address of 0x300 since Windows may not like addresses below + 0x300. + + ============= =========== + IO Switch No. I/O address + 210 + ============= =========== + 111 0x260 + 110 0x290 + 101 0x2E0 + 100 0x2F0 + 011 0x300 + 010 0x350 + 001 0x380 + 000 0x3E0 + ============= =========== + + The memory switches set a reserved address space of 0x1000 bytes + (0x100 segment units, or 4k). For example if I set an address of + 0xD000, it will use up addresses 0xD000 to 0xD100. + + The memory switches were tested by booting using QEMM386 stealth, + and using LOADHI to see what address automatically became excluded + from the upper memory regions, and then attempting to load ARCETHER + using these addresses. + + I recommend using an ARCnet memory address of 0xD000, and putting + the EMS page frame at 0xC000 while using QEMM stealth mode. That + way, you get contiguous high memory from 0xD100 almost all the way + the end of the megabyte. + + Memory Switch 0 (MS0) didn't seem to work properly when set to OFF + on my card. It could be malfunctioning on my card. Experiment with + it ON first, and if it doesn't work, set it to OFF. (It may be a + modifier for the 0x200 bit?) + + ============= ============================================ + MS Switch No. + 43210 Memory address + ============= ============================================ + 00001 0xE100 (guessed - was not detected by QEMM) + 00011 0xE000 (guessed - was not detected by QEMM) + 00101 0xDD00 + 00111 0xDC00 + 01001 0xD900 + 01011 0xD800 + 01101 0xD500 + 01111 0xD400 + 10001 0xD100 + 10011 0xD000 + 10101 0xCD00 + 10111 0xCC00 + 11001 0xC900 (guessed - crashes tested system) + 11011 0xC800 (guessed - crashes tested system) + 11101 0xC500 (guessed - crashes tested system) + 11111 0xC400 (guessed - crashes tested system) + ============= ============================================ + +CNet Technology Inc. +==================== + +120 Series (8-bit cards) +------------------------ + - from Juergen Seifert + +This description has been written by Juergen Seifert +using information from the following Original CNet Manual + + "ARCNET USER'S MANUAL for + CN120A + CN120AB + CN120TP + CN120ST + CN120SBT + P/N:12-01-0007 + Revision 3.00" + +ARCNET is a registered trademark of the Datapoint Corporation + +- P/N 120A ARCNET 8 bit XT/AT Star +- P/N 120AB ARCNET 8 bit XT/AT Bus +- P/N 120TP ARCNET 8 bit XT/AT Twisted Pair +- P/N 120ST ARCNET 8 bit XT/AT Star, Twisted Pair +- P/N 120SBT ARCNET 8 bit XT/AT Star, Bus, Twisted Pair + +:: + + __________________________________________________________________ + | | + | ___| + | LED |___| + | ___| + | N | | ID7 + | o | | ID6 + | d | S | ID5 + | e | W | ID4 + | ___________________ A | 2 | ID3 + | | | d | | ID2 + | | | 1 2 3 4 5 6 7 8 d | | ID1 + | | | _________________ r |___| ID0 + | | 90C65 || SW1 | ____| + | JP 8 7 | ||_________________| | | + | |o|o| JP1 | | | J2 | + | |o|o| |oo| | | JP 1 1 1 | | + | ______________ | | 0 1 2 |____| + | | PROM | |___________________| |o|o|o| _____| + | > SOCKET | JP 6 5 4 3 2 |o|o|o| | J1 | + | |______________| |o|o|o|o|o| |o|o|o| |_____| + |_____ |o|o|o|o|o| ______________| + | | + |_____________________________________________| + +Legend:: + + 90C65 ARCNET Probe + S1 1-5: Base Memory Address Select + 6-8: Base I/O Address Select + S2 1-8: Node ID Select (ID0-ID7) + JP1 ROM Enable Select + JP2 IRQ2 + JP3 IRQ3 + JP4 IRQ4 + JP5 IRQ5 + JP6 IRQ7 + JP7/JP8 ET1, ET2 Timeout Parameters + JP10/JP11 Coax / Twisted Pair Select (CN120ST/SBT only) + JP12 Terminator Select (CN120AB/ST/SBT only) + J1 BNC RG62/U Connector (all except CN120TP) + J2 Two 6-position Telephone Jack (CN120TP/ST/SBT only) + +Setting one of the switches to Off means "1", On means "0". + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in SW2 are used to set the node ID. Each node attached +to the network must have an unique node ID which must be different from 0. +Switch 1 (ID0) serves as the least significant bit (LSB). + +The node ID is the sum of the values of all switches set to "1" +These values are: + + ======= ====== ===== + Switch Label Value + ======= ====== ===== + 1 ID0 1 + 2 ID1 2 + 3 ID2 4 + 4 ID3 8 + 5 ID4 16 + 6 ID5 32 + 7 ID6 64 + 8 ID7 128 + ======= ====== ===== + +Some Examples:: + + Switch | Hex | Decimal + 8 7 6 5 4 3 2 1 | Node ID | Node ID + ----------------|---------|--------- + 0 0 0 0 0 0 0 0 | not allowed + 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 1 0 | 2 | 2 + 0 0 0 0 0 0 1 1 | 3 | 3 + . . . | | + 0 1 0 1 0 1 0 1 | 55 | 85 + . . . | | + 1 0 1 0 1 0 1 0 | AA | 170 + . . . | | + 1 1 1 1 1 1 0 1 | FD | 253 + 1 1 1 1 1 1 1 0 | FE | 254 + 1 1 1 1 1 1 1 1 | FF | 255 + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The last three switches in switch block SW1 are used to select one +of eight possible I/O Base addresses using the following table:: + + + Switch | Hex I/O + 6 7 8 | Address + ------------|-------- + ON ON ON | 260 + OFF ON ON | 290 + ON OFF ON | 2E0 (Manufacturer's default) + OFF OFF ON | 2F0 + ON ON OFF | 300 + OFF ON OFF | 350 + ON OFF OFF | 380 + OFF OFF OFF | 3E0 + + +Setting the Base Memory (RAM) buffer Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The memory buffer (RAM) requires 2K. The base of this buffer can be +located in any of eight positions. The address of the Boot Prom is +memory base + 8K or memory base + 0x2000. +Switches 1-5 of switch block SW1 select the Memory Base address. + +:: + + Switch | Hex RAM | Hex ROM + 1 2 3 4 5 | Address | Address *) + --------------------|---------|----------- + ON ON ON ON ON | C0000 | C2000 + ON ON OFF ON ON | C4000 | C6000 + ON ON ON OFF ON | CC000 | CE000 + ON ON OFF OFF ON | D0000 | D2000 (Manufacturer's default) + ON ON ON ON OFF | D4000 | D6000 + ON ON OFF ON OFF | D8000 | DA000 + ON ON ON OFF OFF | DC000 | DE000 + ON ON OFF OFF OFF | E0000 | E2000 + + *) To enable the Boot ROM install the jumper JP1 + +.. note:: + + Since the switches 1 and 2 are always set to ON it may be possible + that they can be used to add an offset of 2K, 4K or 6K to the base + address, but this feature is not documented in the manual and I + haven't tested it yet. + + +Setting the Interrupt Line +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To select a hardware interrupt level install one (only one!) of the jumpers +JP2, JP3, JP4, JP5, JP6. JP2 is the default:: + + Jumper | IRQ + -------|----- + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 7 + + +Setting the Internal Terminator on CN120AB/TP/SBT +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The jumper JP12 is used to enable the internal terminator:: + + ----- + 0 | 0 | + ----- ON | | ON + | 0 | | 0 | + | | OFF ----- OFF + | 0 | 0 + ----- + Terminator Terminator + disabled enabled + + +Selecting the Connector Type on CN120ST/SBT +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + JP10 JP11 JP10 JP11 + ----- ----- + 0 0 | 0 | | 0 | + ----- ----- | | | | + | 0 | | 0 | | 0 | | 0 | + | | | | ----- ----- + | 0 | | 0 | 0 0 + ----- ----- + Coaxial Cable Twisted Pair Cable + (Default) + + +Setting the Timeout Parameters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The jumpers labeled EXT1 and EXT2 are used to determine the timeout +parameters. These two jumpers are normally left open. + + +CNet Technology Inc. +==================== + +160 Series (16-bit cards) +------------------------- + - from Juergen Seifert + +This description has been written by Juergen Seifert +using information from the following Original CNet Manual + + "ARCNET USER'S MANUAL for + CN160A CN160AB CN160TP + P/N:12-01-0006 Revision 3.00" + +ARCNET is a registered trademark of the Datapoint Corporation + +- P/N 160A ARCNET 16 bit XT/AT Star +- P/N 160AB ARCNET 16 bit XT/AT Bus +- P/N 160TP ARCNET 16 bit XT/AT Twisted Pair + +:: + + ___________________________________________________________________ + < _________________________ ___| + > |oo| JP2 | | LED |___| + < |oo| JP1 | 9026 | LED |___| + > |_________________________| ___| + < N | | ID7 + > 1 o | | ID6 + < 1 2 3 4 5 6 7 8 9 0 d | S | ID5 + > _______________ _____________________ e | W | ID4 + < | PROM | | SW1 | A | 2 | ID3 + > > SOCKET | |_____________________| d | | ID2 + < |_______________| | IO-Base | MEM | d | | ID1 + > r |___| ID0 + < ____| + > | | + < | J1 | + > | | + < |____| + > 1 1 1 1 | + < 3 4 5 6 7 JP 8 9 0 1 2 3 | + > |o|o|o|o|o| |o|o|o|o|o|o| | + < |o|o|o|o|o| __ |o|o|o|o|o|o| ___________| + > | | | + <____________| |_______________________________________| + +Legend:: + + 9026 ARCNET Probe + SW1 1-6: Base I/O Address Select + 7-10: Base Memory Address Select + SW2 1-8: Node ID Select (ID0-ID7) + JP1/JP2 ET1, ET2 Timeout Parameters + JP3-JP13 Interrupt Select + J1 BNC RG62/U Connector (CN160A/AB only) + J1 Two 6-position Telephone Jack (CN160TP only) + LED + +Setting one of the switches to Off means "1", On means "0". + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in SW2 are used to set the node ID. Each node attached +to the network must have an unique node ID which must be different from 0. +Switch 1 (ID0) serves as the least significant bit (LSB). + +The node ID is the sum of the values of all switches set to "1" +These values are:: + + Switch | Label | Value + -------|-------|------- + 1 | ID0 | 1 + 2 | ID1 | 2 + 3 | ID2 | 4 + 4 | ID3 | 8 + 5 | ID4 | 16 + 6 | ID5 | 32 + 7 | ID6 | 64 + 8 | ID7 | 128 + +Some Examples:: + + Switch | Hex | Decimal + 8 7 6 5 4 3 2 1 | Node ID | Node ID + ----------------|---------|--------- + 0 0 0 0 0 0 0 0 | not allowed + 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 1 0 | 2 | 2 + 0 0 0 0 0 0 1 1 | 3 | 3 + . . . | | + 0 1 0 1 0 1 0 1 | 55 | 85 + . . . | | + 1 0 1 0 1 0 1 0 | AA | 170 + . . . | | + 1 1 1 1 1 1 0 1 | FD | 253 + 1 1 1 1 1 1 1 0 | FE | 254 + 1 1 1 1 1 1 1 1 | FF | 255 + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The first six switches in switch block SW1 are used to select the I/O Base +address using the following table:: + + Switch | Hex I/O + 1 2 3 4 5 6 | Address + ------------------------|-------- + OFF ON ON OFF OFF ON | 260 + OFF ON OFF ON ON OFF | 290 + OFF ON OFF OFF OFF ON | 2E0 (Manufacturer's default) + OFF ON OFF OFF OFF OFF | 2F0 + OFF OFF ON ON ON ON | 300 + OFF OFF ON OFF ON OFF | 350 + OFF OFF OFF ON ON ON | 380 + OFF OFF OFF OFF OFF ON | 3E0 + +Note: Other IO-Base addresses seem to be selectable, but only the above + combinations are documented. + + +Setting the Base Memory (RAM) buffer Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The switches 7-10 of switch block SW1 are used to select the Memory +Base address of the RAM (2K) and the PROM:: + + Switch | Hex RAM | Hex ROM + 7 8 9 10 | Address | Address + ----------------|---------|----------- + OFF OFF ON ON | C0000 | C8000 + OFF OFF ON OFF | D0000 | D8000 (Default) + OFF OFF OFF ON | E0000 | E8000 + +.. note:: + + Other MEM-Base addresses seem to be selectable, but only the above + combinations are documented. + + +Setting the Interrupt Line +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To select a hardware interrupt level install one (only one!) of the jumpers +JP3 through JP13 using the following table:: + + Jumper | IRQ + -------|----------------- + 3 | 14 + 4 | 15 + 5 | 12 + 6 | 11 + 7 | 10 + 8 | 3 + 9 | 4 + 10 | 5 + 11 | 6 + 12 | 7 + 13 | 2 (=9) Default! + +.. note:: + + - Do not use JP11=IRQ6, it may conflict with your Floppy Disk + Controller + - Use JP3=IRQ14 only, if you don't have an IDE-, MFM-, or RLL- + Hard Disk, it may conflict with their controllers + + +Setting the Timeout Parameters +------------------------------ + +The jumpers labeled JP1 and JP2 are used to determine the timeout +parameters. These two jumpers are normally left open. + + +Lantech +======= + +8-bit card, unknown model +------------------------- + - from Vlad Lungu - his e-mail address seemed broken at + the time I tried to reach him. Sorry Vlad, if you didn't get my reply. + +:: + + ________________________________________________________________ + | 1 8 | + | ___________ __| + | | SW1 | LED |__| + | |__________| | + | ___| + | _____________________ |S | 8 + | | | |W | + | | | |2 | + | | | |__| 1 + | | UM9065L | |o| JP4 ____|____ + | | | |o| | CN | + | | | |________| + | | | | + | |___________________| | + | | + | | + | _____________ | + | | | | + | | PROM | |ooooo| JP6 | + | |____________| |ooooo| | + |_____________ _ _| + |____________________________________________| |__| + + +UM9065L : ARCnet Controller + +SW 1 : Shared Memory Address and I/O Base + +:: + + ON=0 + + 12345|Memory Address + -----|-------------- + 00001| D4000 + 00010| CC000 + 00110| D0000 + 01110| D1000 + 01101| D9000 + 10010| CC800 + 10011| DC800 + 11110| D1800 + +It seems that the bits are considered in reverse order. Also, you must +observe that some of those addresses are unusual and I didn't probe them; I +used a memory dump in DOS to identify them. For the 00000 configuration and +some others that I didn't write here the card seems to conflict with the +video card (an S3 GENDAC). I leave the full decoding of those addresses to +you. + +:: + + 678| I/O Address + ---|------------ + 000| 260 + 001| failed probe + 010| 2E0 + 011| 380 + 100| 290 + 101| 350 + 110| failed probe + 111| 3E0 + + SW 2 : Node ID (binary coded) + + JP 4 : Boot PROM enable CLOSE - enabled + OPEN - disabled + + JP 6 : IRQ set (ONLY ONE jumper on 1-5 for IRQ 2-6) + + +Acer +==== + +8-bit card, Model 5210-003 +-------------------------- + + - from Vojtech Pavlik using portions of the existing + arcnet-hardware file. + +This is a 90C26 based card. Its configuration seems similar to the SMC +PC100, but has some additional jumpers I don't know the meaning of. + +:: + + __ + | | + ___________|__|_________________________ + | | | | + | | BNC | | + | |______| ___| + | _____________________ |___ + | | | | + | | Hybrid IC | | + | | | o|o J1 | + | |_____________________| 8|8 | + | 8|8 J5 | + | o|o | + | 8|8 | + |__ 8|8 | + (|__| LED o|o | + | 8|8 | + | 8|8 J15 | + | | + | _____ | + | | | _____ | + | | | | | ___| + | | | | | | + | _____ | ROM | | UFS | | + | | | | | | | | + | | | ___ | | | | | + | | | | | |__.__| |__.__| | + | | NCR | |XTL| _____ _____ | + | | | |___| | | | | | + | |90C26| | | | | | + | | | | RAM | | UFS | | + | | | J17 o|o | | | | | + | | | J16 o|o | | | | | + | |__.__| |__.__| |__.__| | + | ___ | + | | |8 | + | |SW2| | + | | | | + | |___|1 | + | ___ | + | | |10 J18 o|o | + | | | o|o | + | |SW1| o|o | + | | | J21 o|o | + | |___|1 | + | | + |____________________________________| + + +Legend:: + + 90C26 ARCNET Chip + XTL 20 MHz Crystal + SW1 1-6 Base I/O Address Select + 7-10 Memory Address Select + SW2 1-8 Node ID Select (ID0-ID7) + J1-J5 IRQ Select + J6-J21 Unknown (Probably extra timeouts & ROM enable ...) + LED1 Activity LED + BNC Coax connector (STAR ARCnet) + RAM 2k of SRAM + ROM Boot ROM socket + UFS Unidentified Flying Sockets + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in SW2 are used to set the node ID. Each node attached +to the network must have an unique node ID which must not be 0. +Switch 1 (ID0) serves as the least significant bit (LSB). + +Setting one of the switches to OFF means "1", ON means "0". + +The node ID is the sum of the values of all switches set to "1" +These values are:: + + Switch | Value + -------|------- + 1 | 1 + 2 | 2 + 3 | 4 + 4 | 8 + 5 | 16 + 6 | 32 + 7 | 64 + 8 | 128 + +Don't set this to 0 or 255; these values are reserved. + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The switches 1 to 6 of switch block SW1 are used to select one +of 32 possible I/O Base addresses using the following tables:: + + | Hex + Switch | Value + -------|------- + 1 | 200 + 2 | 100 + 3 | 80 + 4 | 40 + 5 | 20 + 6 | 10 + +The I/O address is sum of all switches set to "1". Remember that +the I/O address space bellow 0x200 is RESERVED for mainboard, so +switch 1 should be ALWAYS SET TO OFF. + + +Setting the Base Memory (RAM) buffer Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The memory buffer (RAM) requires 2K. The base of this buffer can be +located in any of sixteen positions. However, the addresses below +A0000 are likely to cause system hang because there's main RAM. + +Jumpers 7-10 of switch block SW1 select the Memory Base address:: + + Switch | Hex RAM + 7 8 9 10 | Address + ----------------|--------- + OFF OFF OFF OFF | F0000 (conflicts with main BIOS) + OFF OFF OFF ON | E0000 + OFF OFF ON OFF | D0000 + OFF OFF ON ON | C0000 (conflicts with video BIOS) + OFF ON OFF OFF | B0000 (conflicts with mono video) + OFF ON OFF ON | A0000 (conflicts with graphics) + + +Setting the Interrupt Line +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Jumpers 1-5 of the jumper block J1 control the IRQ level. ON means +shorted, OFF means open:: + + Jumper | IRQ + 1 2 3 4 5 | + ---------------------------- + ON OFF OFF OFF OFF | 7 + OFF ON OFF OFF OFF | 5 + OFF OFF ON OFF OFF | 4 + OFF OFF OFF ON OFF | 3 + OFF OFF OFF OFF ON | 2 + + +Unknown jumpers & sockets +^^^^^^^^^^^^^^^^^^^^^^^^^ + +I know nothing about these. I just guess that J16&J17 are timeout +jumpers and maybe one of J18-J21 selects ROM. Also J6-J10 and +J11-J15 are connecting IRQ2-7 to some pins on the UFSs. I can't +guess the purpose. + +Datapoint? +========== + +LAN-ARC-8, an 8-bit card +------------------------ + + - from Vojtech Pavlik + +This is another SMC 90C65-based ARCnet card. I couldn't identify the +manufacturer, but it might be DataPoint, because the card has the +original arcNet logo in its upper right corner. + +:: + + _______________________________________________________ + | _________ | + | | SW2 | ON arcNet | + | |_________| OFF ___| + | _____________ 1 ______ 8 | | 8 + | | | SW1 | XTAL | ____________ | S | + | > RAM (2k) | |______|| | | W | + | |_____________| | H | | 3 | + | _________|_____ y | |___| 1 + | _________ | | |b | | + | |_________| | | |r | | + | | SMC | |i | | + | | 90C65| |d | | + | _________ | | | | | + | | SW1 | ON | | |I | | + | |_________| OFF |_________|_____/C | _____| + | 1 8 | | | |___ + | ______________ | | | BNC |___| + | | | |____________| |_____| + | > EPROM SOCKET | _____________ | + | |______________| |_____________| | + | ______________| + | | + |________________________________________| + +Legend:: + + 90C65 ARCNET Chip + SW1 1-5: Base Memory Address Select + 6-8: Base I/O Address Select + SW2 1-8: Node ID Select + SW3 1-5: IRQ Select + 6-7: Extra Timeout + 8 : ROM Enable + BNC Coax connector + XTAL 20 MHz Crystal + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in SW3 are used to set the node ID. Each node attached +to the network must have an unique node ID which must not be 0. +Switch 1 serves as the least significant bit (LSB). + +Setting one of the switches to Off means "1", On means "0". + +The node ID is the sum of the values of all switches set to "1" +These values are:: + + Switch | Value + -------|------- + 1 | 1 + 2 | 2 + 3 | 4 + 4 | 8 + 5 | 16 + 6 | 32 + 7 | 64 + 8 | 128 + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The last three switches in switch block SW1 are used to select one +of eight possible I/O Base addresses using the following table:: + + + Switch | Hex I/O + 6 7 8 | Address + ------------|-------- + ON ON ON | 260 + OFF ON ON | 290 + ON OFF ON | 2E0 (Manufacturer's default) + OFF OFF ON | 2F0 + ON ON OFF | 300 + OFF ON OFF | 350 + ON OFF OFF | 380 + OFF OFF OFF | 3E0 + + +Setting the Base Memory (RAM) buffer Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The memory buffer (RAM) requires 2K. The base of this buffer can be +located in any of eight positions. The address of the Boot Prom is +memory base + 0x2000. + +Jumpers 3-5 of switch block SW1 select the Memory Base address. + +:: + + Switch | Hex RAM | Hex ROM + 1 2 3 4 5 | Address | Address *) + --------------------|---------|----------- + ON ON ON ON ON | C0000 | C2000 + ON ON OFF ON ON | C4000 | C6000 + ON ON ON OFF ON | CC000 | CE000 + ON ON OFF OFF ON | D0000 | D2000 (Manufacturer's default) + ON ON ON ON OFF | D4000 | D6000 + ON ON OFF ON OFF | D8000 | DA000 + ON ON ON OFF OFF | DC000 | DE000 + ON ON OFF OFF OFF | E0000 | E2000 + + *) To enable the Boot ROM set the switch 8 of switch block SW3 to position ON. + +The switches 1 and 2 probably add 0x0800 and 0x1000 to RAM base address. + + +Setting the Interrupt Line +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Switches 1-5 of the switch block SW3 control the IRQ level:: + + Jumper | IRQ + 1 2 3 4 5 | + ---------------------------- + ON OFF OFF OFF OFF | 3 + OFF ON OFF OFF OFF | 4 + OFF OFF ON OFF OFF | 5 + OFF OFF OFF ON OFF | 7 + OFF OFF OFF OFF ON | 2 + + +Setting the Timeout Parameters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The switches 6-7 of the switch block SW3 are used to determine the timeout +parameters. These two switches are normally left in the OFF position. + + +Topware +======= + +8-bit card, TA-ARC/10 +--------------------- + + - from Vojtech Pavlik + +This is another very similar 90C65 card. Most of the switches and jumpers +are the same as on other clones. + +:: + + _____________________________________________________________________ + | ___________ | | ______ | + | |SW2 NODE ID| | | | XTAL | | + | |___________| | Hybrid IC | |______| | + | ___________ | | __| + | |SW1 MEM+I/O| |_________________________| LED1|__|) + | |___________| 1 2 | + | J3 |o|o| TIMEOUT ______| + | ______________ |o|o| | | + | | | ___________________ | RJ | + | > EPROM SOCKET | | \ |------| + |J2 |______________| | | | | + ||o| | | |______| + ||o| ROM ENABLE | SMC | _________ | + | _____________ | 90C65 | |_________| _____| + | | | | | | |___ + | > RAM (2k) | | | | BNC |___| + | |_____________| | | |_____| + | |____________________| | + | ________ IRQ 2 3 4 5 7 ___________ | + ||________| |o|o|o|o|o| |___________| | + |________ J1|o|o|o|o|o| ______________| + | | + |_____________________________________________| + +Legend:: + + 90C65 ARCNET Chip + XTAL 20 MHz Crystal + SW1 1-5 Base Memory Address Select + 6-8 Base I/O Address Select + SW2 1-8 Node ID Select (ID0-ID7) + J1 IRQ Select + J2 ROM Enable + J3 Extra Timeout + LED1 Activity LED + BNC Coax connector (BUS ARCnet) + RJ Twisted Pair Connector (daisy chain) + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in SW2 are used to set the node ID. Each node attached to +the network must have an unique node ID which must not be 0. Switch 1 (ID0) +serves as the least significant bit (LSB). + +Setting one of the switches to Off means "1", On means "0". + +The node ID is the sum of the values of all switches set to "1" +These values are:: + + Switch | Label | Value + -------|-------|------- + 1 | ID0 | 1 + 2 | ID1 | 2 + 3 | ID2 | 4 + 4 | ID3 | 8 + 5 | ID4 | 16 + 6 | ID5 | 32 + 7 | ID6 | 64 + 8 | ID7 | 128 + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The last three switches in switch block SW1 are used to select one +of eight possible I/O Base addresses using the following table:: + + + Switch | Hex I/O + 6 7 8 | Address + ------------|-------- + ON ON ON | 260 (Manufacturer's default) + OFF ON ON | 290 + ON OFF ON | 2E0 + OFF OFF ON | 2F0 + ON ON OFF | 300 + OFF ON OFF | 350 + ON OFF OFF | 380 + OFF OFF OFF | 3E0 + + +Setting the Base Memory (RAM) buffer Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The memory buffer (RAM) requires 2K. The base of this buffer can be +located in any of eight positions. The address of the Boot Prom is +memory base + 0x2000. + +Jumpers 3-5 of switch block SW1 select the Memory Base address. + +:: + + Switch | Hex RAM | Hex ROM + 1 2 3 4 5 | Address | Address *) + --------------------|---------|----------- + ON ON ON ON ON | C0000 | C2000 + ON ON OFF ON ON | C4000 | C6000 (Manufacturer's default) + ON ON ON OFF ON | CC000 | CE000 + ON ON OFF OFF ON | D0000 | D2000 + ON ON ON ON OFF | D4000 | D6000 + ON ON OFF ON OFF | D8000 | DA000 + ON ON ON OFF OFF | DC000 | DE000 + ON ON OFF OFF OFF | E0000 | E2000 + + *) To enable the Boot ROM short the jumper J2. + +The jumpers 1 and 2 probably add 0x0800 and 0x1000 to RAM address. + + +Setting the Interrupt Line +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Jumpers 1-5 of the jumper block J1 control the IRQ level. ON means +shorted, OFF means open:: + + Jumper | IRQ + 1 2 3 4 5 | + ---------------------------- + ON OFF OFF OFF OFF | 2 + OFF ON OFF OFF OFF | 3 + OFF OFF ON OFF OFF | 4 + OFF OFF OFF ON OFF | 5 + OFF OFF OFF OFF ON | 7 + + +Setting the Timeout Parameters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The jumpers J3 are used to set the timeout parameters. These two +jumpers are normally left open. + +Thomas-Conrad +============= + +Model #500-6242-0097 REV A (8-bit card) +--------------------------------------- + + - from Lars Karlsson <100617.3473@compuserve.com> + +:: + + ________________________________________________________ + | ________ ________ |_____ + | |........| |........| | + | |________| |________| ___| + | SW 3 SW 1 | | + | Base I/O Base Addr. Station | | + | address | | + | ______ switch | | + | | | | | + | | | |___| + | | | ______ |___._ + | |______| |______| ____| BNC + | Jumper- _____| Connector + | Main chip block _ __| ' + | | | | RJ Connector + | |_| | with 110 Ohm + | |__ Terminator + | ___________ __| + | |...........| | RJ-jack + | |...........| _____ | (unused) + | |___________| |_____| |__ + | Boot PROM socket IRQ-jumpers |_ Diagnostic + |________ __ _| LED (red) + | | | | | | | | | | | | | | | | | | | | | | + | | | | | | | | | | | | | | | | | | | | |________| + | + | + +And here are the settings for some of the switches and jumpers on the cards. + +:: + + I/O + + 1 2 3 4 5 6 7 8 + + 2E0----- 0 0 0 1 0 0 0 1 + 2F0----- 0 0 0 1 0 0 0 0 + 300----- 0 0 0 0 1 1 1 1 + 350----- 0 0 0 0 1 1 1 0 + +"0" in the above example means switch is off "1" means that it is on. + +:: + + ShMem address. + + 1 2 3 4 5 6 7 8 + + CX00--0 0 1 1 | | | + DX00--0 0 1 0 | + X000--------- 1 1 | + X400--------- 1 0 | + X800--------- 0 1 | + XC00--------- 0 0 + ENHANCED----------- 1 + COMPATIBLE--------- 0 + +:: + + IRQ + + + 3 4 5 7 2 + . . . . . + . . . . . + + +There is a DIP-switch with 8 switches, used to set the shared memory address +to be used. The first 6 switches set the address, the 7th doesn't have any +function, and the 8th switch is used to select "compatible" or "enhanced". +When I got my two cards, one of them had this switch set to "enhanced". That +card didn't work at all, it wasn't even recognized by the driver. The other +card had this switch set to "compatible" and it behaved absolutely normally. I +guess that the switch on one of the cards, must have been changed accidentally +when the card was taken out of its former host. The question remains +unanswered, what is the purpose of the "enhanced" position? + +[Avery's note: "enhanced" probably either disables shared memory (use IO +ports instead) or disables IO ports (use memory addresses instead). This +varies by the type of card involved. I fail to see how either of these +enhance anything. Send me more detailed information about this mode, or +just use "compatible" mode instead.] + +Waterloo Microsystems Inc. ?? +============================= + +8-bit card (C) 1985 +------------------- + - from Robert Michael Best + +[Avery's note: these don't work with my driver for some reason. These cards +SEEM to have settings similar to the PDI508Plus, which is +software-configured and doesn't work with my driver either. The "Waterloo +chip" is a boot PROM, probably designed specifically for the University of +Waterloo. If you have any further information about this card, please +e-mail me.] + +The probe has not been able to detect the card on any of the J2 settings, +and I tried them again with the "Waterloo" chip removed. + +:: + + _____________________________________________________________________ + | \/ \/ ___ __ __ | + | C4 C4 |^| | M || ^ ||^| | + | -- -- |_| | 5 || || | C3 | + | \/ \/ C10 |___|| ||_| | + | C4 C4 _ _ | | ?? | + | -- -- | \/ || | | + | | || | | + | | || C1 | | + | | || | \/ _____| + | | C6 || | C9 | |___ + | | || | -- | BNC |___| + | | || | >C7| |_____| + | | || | | + | __ __ |____||_____| 1 2 3 6 | + || ^ | >C4| |o|o|o|o|o|o| J2 >C4| | + || | |o|o|o|o|o|o| | + || C2 | >C4| >C4| | + || | >C8| | + || | 2 3 4 5 6 7 IRQ >C4| | + ||_____| |o|o|o|o|o|o| J3 | + |_______ |o|o|o|o|o|o| _______________| + | | + |_____________________________________________| + + C1 -- "COM9026 + SMC 8638" + In a chip socket. + + C2 -- "@Copyright + Waterloo Microsystems Inc. + 1985" + In a chip Socket with info printed on a label covering a round window + showing the circuit inside. (The window indicates it is an EPROM chip.) + + C3 -- "COM9032 + SMC 8643" + In a chip socket. + + C4 -- "74LS" + 9 total no sockets. + + M5 -- "50006-136 + 20.000000 MHZ + MTQ-T1-S3 + 0 M-TRON 86-40" + Metallic case with 4 pins, no socket. + + C6 -- "MOSTEK@TC8643 + MK6116N-20 + MALAYSIA" + No socket. + + C7 -- No stamp or label but in a 20 pin chip socket. + + C8 -- "PAL10L8CN + 8623" + In a 20 pin socket. + + C9 -- "PAl16R4A-2CN + 8641" + In a 20 pin socket. + + C10 -- "M8640 + NMC + 9306N" + In an 8 pin socket. + + ?? -- Some components on a smaller board and attached with 20 pins all + along the side closest to the BNC connector. The are coated in a dark + resin. + +On the board there are two jumper banks labeled J2 and J3. The +manufacturer didn't put a J1 on the board. The two boards I have both +came with a jumper box for each bank. + +:: + + J2 -- Numbered 1 2 3 4 5 6. + 4 and 5 are not stamped due to solder points. + + J3 -- IRQ 2 3 4 5 6 7 + +The board itself has a maple leaf stamped just above the irq jumpers +and "-2 46-86" beside C2. Between C1 and C6 "ASS 'Y 300163" and "@1986 +CORMAN CUSTOM ELECTRONICS CORP." stamped just below the BNC connector. +Below that "MADE IN CANADA" + +No Name +======= + +8-bit cards, 16-bit cards +------------------------- + + - from Juergen Seifert + +I have named this ARCnet card "NONAME", since there is no name of any +manufacturer on the Installation manual nor on the shipping box. The only +hint to the existence of a manufacturer at all is written in copper, +it is "Made in Taiwan" + +This description has been written by Juergen Seifert +using information from the Original + + "ARCnet Installation Manual" + +:: + + ________________________________________________________________ + | |STAR| BUS| T/P| | + | |____|____|____| | + | _____________________ | + | | | | + | | | | + | | | | + | | SMC | | + | | | | + | | COM90C65 | | + | | | | + | | | | + | |__________-__________| | + | _____| + | _______________ | CN | + | | PROM | |_____| + | > SOCKET | | + | |_______________| 1 2 3 4 5 6 7 8 1 2 3 4 5 6 7 8 | + | _______________ _______________ | + | |o|o|o|o|o|o|o|o| | SW1 || SW2 || + | |o|o|o|o|o|o|o|o| |_______________||_______________|| + |___ 2 3 4 5 7 E E R Node ID IOB__|__MEM____| + | \ IRQ / T T O | + |__________________1_2_M______________________| + +Legend:: + + COM90C65: ARCnet Probe + S1 1-8: Node ID Select + S2 1-3: I/O Base Address Select + 4-6: Memory Base Address Select + 7-8: RAM Offset Select + ET1, ET2 Extended Timeout Select + ROM ROM Enable Select + CN RG62 Coax Connector + STAR| BUS | T/P Three fields for placing a sign (colored circle) + indicating the topology of the card + +Setting one of the switches to Off means "1", On means "0". + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in group SW1 are used to set the node ID. +Each node attached to the network must have an unique node ID which +must be different from 0. +Switch 8 serves as the least significant bit (LSB). + +The node ID is the sum of the values of all switches set to "1" +These values are:: + + Switch | Value + -------|------- + 8 | 1 + 7 | 2 + 6 | 4 + 5 | 8 + 4 | 16 + 3 | 32 + 2 | 64 + 1 | 128 + +Some Examples:: + + Switch | Hex | Decimal + 1 2 3 4 5 6 7 8 | Node ID | Node ID + ----------------|---------|--------- + 0 0 0 0 0 0 0 0 | not allowed + 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 1 0 | 2 | 2 + 0 0 0 0 0 0 1 1 | 3 | 3 + . . . | | + 0 1 0 1 0 1 0 1 | 55 | 85 + . . . | | + 1 0 1 0 1 0 1 0 | AA | 170 + . . . | | + 1 1 1 1 1 1 0 1 | FD | 253 + 1 1 1 1 1 1 1 0 | FE | 254 + 1 1 1 1 1 1 1 1 | FF | 255 + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The first three switches in switch group SW2 are used to select one +of eight possible I/O Base addresses using the following table:: + + Switch | Hex I/O + 1 2 3 | Address + ------------|-------- + ON ON ON | 260 + ON ON OFF | 290 + ON OFF ON | 2E0 (Manufacturer's default) + ON OFF OFF | 2F0 + OFF ON ON | 300 + OFF ON OFF | 350 + OFF OFF ON | 380 + OFF OFF OFF | 3E0 + + +Setting the Base Memory (RAM) buffer Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The memory buffer requires 2K of a 16K block of RAM. The base of this +16K block can be located in any of eight positions. +Switches 4-6 of switch group SW2 select the Base of the 16K block. +Within that 16K address space, the buffer may be assigned any one of four +positions, determined by the offset, switches 7 and 8 of group SW2. + +:: + + Switch | Hex RAM | Hex ROM + 4 5 6 7 8 | Address | Address *) + -----------|---------|----------- + 0 0 0 0 0 | C0000 | C2000 + 0 0 0 0 1 | C0800 | C2000 + 0 0 0 1 0 | C1000 | C2000 + 0 0 0 1 1 | C1800 | C2000 + | | + 0 0 1 0 0 | C4000 | C6000 + 0 0 1 0 1 | C4800 | C6000 + 0 0 1 1 0 | C5000 | C6000 + 0 0 1 1 1 | C5800 | C6000 + | | + 0 1 0 0 0 | CC000 | CE000 + 0 1 0 0 1 | CC800 | CE000 + 0 1 0 1 0 | CD000 | CE000 + 0 1 0 1 1 | CD800 | CE000 + | | + 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default) + 0 1 1 0 1 | D0800 | D2000 + 0 1 1 1 0 | D1000 | D2000 + 0 1 1 1 1 | D1800 | D2000 + | | + 1 0 0 0 0 | D4000 | D6000 + 1 0 0 0 1 | D4800 | D6000 + 1 0 0 1 0 | D5000 | D6000 + 1 0 0 1 1 | D5800 | D6000 + | | + 1 0 1 0 0 | D8000 | DA000 + 1 0 1 0 1 | D8800 | DA000 + 1 0 1 1 0 | D9000 | DA000 + 1 0 1 1 1 | D9800 | DA000 + | | + 1 1 0 0 0 | DC000 | DE000 + 1 1 0 0 1 | DC800 | DE000 + 1 1 0 1 0 | DD000 | DE000 + 1 1 0 1 1 | DD800 | DE000 + | | + 1 1 1 0 0 | E0000 | E2000 + 1 1 1 0 1 | E0800 | E2000 + 1 1 1 1 0 | E1000 | E2000 + 1 1 1 1 1 | E1800 | E2000 + + *) To enable the 8K Boot PROM install the jumper ROM. + The default is jumper ROM not installed. + + +Setting Interrupt Request Lines (IRQ) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To select a hardware interrupt level set one (only one!) of the jumpers +IRQ2, IRQ3, IRQ4, IRQ5 or IRQ7. The manufacturer's default is IRQ2. + + +Setting the Timeouts +^^^^^^^^^^^^^^^^^^^^ + +The two jumpers labeled ET1 and ET2 are used to determine the timeout +parameters (response and reconfiguration time). Every node in a network +must be set to the same timeout values. + +:: + + ET1 ET2 | Response Time (us) | Reconfiguration Time (ms) + --------|--------------------|-------------------------- + Off Off | 78 | 840 (Default) + Off On | 285 | 1680 + On Off | 563 | 1680 + On On | 1130 | 1680 + +On means jumper installed, Off means jumper not installed + + +16-BIT ARCNET +------------- + +The manual of my 8-Bit NONAME ARCnet Card contains another description +of a 16-Bit Coax / Twisted Pair Card. This description is incomplete, +because there are missing two pages in the manual booklet. (The table +of contents reports pages ... 2-9, 2-11, 2-12, 3-1, ... but inside +the booklet there is a different way of counting ... 2-9, 2-10, A-1, +(empty page), 3-1, ..., 3-18, A-1 (again), A-2) +Also the picture of the board layout is not as good as the picture of +8-Bit card, because there isn't any letter like "SW1" written to the +picture. + +Should somebody have such a board, please feel free to complete this +description or to send a mail to me! + +This description has been written by Juergen Seifert +using information from the Original + + "ARCnet Installation Manual" + +:: + + ___________________________________________________________________ + < _________________ _________________ | + > | SW? || SW? | | + < |_________________||_________________| | + > ____________________ | + < | | | + > | | | + < | | | + > | | | + < | | | + > | | | + < | | | + > |____________________| | + < ____| + > ____________________ | | + < | | | J1 | + > | < | | + < |____________________| ? ? ? ? ? ? |____| + > |o|o|o|o|o|o| | + < |o|o|o|o|o|o| | + > | + < __ ___________| + > | | | + <____________| |_______________________________________| + + +Setting one of the switches to Off means "1", On means "0". + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in group SW2 are used to set the node ID. +Each node attached to the network must have an unique node ID which +must be different from 0. +Switch 8 serves as the least significant bit (LSB). + +The node ID is the sum of the values of all switches set to "1" +These values are:: + + Switch | Value + -------|------- + 8 | 1 + 7 | 2 + 6 | 4 + 5 | 8 + 4 | 16 + 3 | 32 + 2 | 64 + 1 | 128 + +Some Examples:: + + Switch | Hex | Decimal + 1 2 3 4 5 6 7 8 | Node ID | Node ID + ----------------|---------|--------- + 0 0 0 0 0 0 0 0 | not allowed + 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 1 0 | 2 | 2 + 0 0 0 0 0 0 1 1 | 3 | 3 + . . . | | + 0 1 0 1 0 1 0 1 | 55 | 85 + . . . | | + 1 0 1 0 1 0 1 0 | AA | 170 + . . . | | + 1 1 1 1 1 1 0 1 | FD | 253 + 1 1 1 1 1 1 1 0 | FE | 254 + 1 1 1 1 1 1 1 1 | FF | 255 + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The first three switches in switch group SW1 are used to select one +of eight possible I/O Base addresses using the following table:: + + Switch | Hex I/O + 3 2 1 | Address + ------------|-------- + ON ON ON | 260 + ON ON OFF | 290 + ON OFF ON | 2E0 (Manufacturer's default) + ON OFF OFF | 2F0 + OFF ON ON | 300 + OFF ON OFF | 350 + OFF OFF ON | 380 + OFF OFF OFF | 3E0 + + +Setting the Base Memory (RAM) buffer Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The memory buffer requires 2K of a 16K block of RAM. The base of this +16K block can be located in any of eight positions. +Switches 6-8 of switch group SW1 select the Base of the 16K block. +Within that 16K address space, the buffer may be assigned any one of four +positions, determined by the offset, switches 4 and 5 of group SW1:: + + Switch | Hex RAM | Hex ROM + 8 7 6 5 4 | Address | Address + -----------|---------|----------- + 0 0 0 0 0 | C0000 | C2000 + 0 0 0 0 1 | C0800 | C2000 + 0 0 0 1 0 | C1000 | C2000 + 0 0 0 1 1 | C1800 | C2000 + | | + 0 0 1 0 0 | C4000 | C6000 + 0 0 1 0 1 | C4800 | C6000 + 0 0 1 1 0 | C5000 | C6000 + 0 0 1 1 1 | C5800 | C6000 + | | + 0 1 0 0 0 | CC000 | CE000 + 0 1 0 0 1 | CC800 | CE000 + 0 1 0 1 0 | CD000 | CE000 + 0 1 0 1 1 | CD800 | CE000 + | | + 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default) + 0 1 1 0 1 | D0800 | D2000 + 0 1 1 1 0 | D1000 | D2000 + 0 1 1 1 1 | D1800 | D2000 + | | + 1 0 0 0 0 | D4000 | D6000 + 1 0 0 0 1 | D4800 | D6000 + 1 0 0 1 0 | D5000 | D6000 + 1 0 0 1 1 | D5800 | D6000 + | | + 1 0 1 0 0 | D8000 | DA000 + 1 0 1 0 1 | D8800 | DA000 + 1 0 1 1 0 | D9000 | DA000 + 1 0 1 1 1 | D9800 | DA000 + | | + 1 1 0 0 0 | DC000 | DE000 + 1 1 0 0 1 | DC800 | DE000 + 1 1 0 1 0 | DD000 | DE000 + 1 1 0 1 1 | DD800 | DE000 + | | + 1 1 1 0 0 | E0000 | E2000 + 1 1 1 0 1 | E0800 | E2000 + 1 1 1 1 0 | E1000 | E2000 + 1 1 1 1 1 | E1800 | E2000 + + +Setting Interrupt Request Lines (IRQ) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +?????????????????????????????????????? + + +Setting the Timeouts +^^^^^^^^^^^^^^^^^^^^ + +?????????????????????????????????????? + + +8-bit cards ("Made in Taiwan R.O.C.") +------------------------------------- + + - from Vojtech Pavlik + +I have named this ARCnet card "NONAME", since I got only the card with +no manual at all and the only text identifying the manufacturer is +"MADE IN TAIWAN R.O.C" printed on the card. + +:: + + ____________________________________________________________ + | 1 2 3 4 5 6 7 8 | + | |o|o| JP1 o|o|o|o|o|o|o|o| ON | + | + o|o|o|o|o|o|o|o| ___| + | _____________ o|o|o|o|o|o|o|o| OFF _____ | | ID7 + | | | SW1 | | | | ID6 + | > RAM (2k) | ____________________ | H | | S | ID5 + | |_____________| | || y | | W | ID4 + | | || b | | 2 | ID3 + | | || r | | | ID2 + | | || i | | | ID1 + | | 90C65 || d | |___| ID0 + | SW3 | || | | + | |o|o|o|o|o|o|o|o| ON | || I | | + | |o|o|o|o|o|o|o|o| | || C | | + | |o|o|o|o|o|o|o|o| OFF |____________________|| | _____| + | 1 2 3 4 5 6 7 8 | | | |___ + | ______________ | | | BNC |___| + | | | |_____| |_____| + | > EPROM SOCKET | | + | |______________| | + | ______________| + | | + |_____________________________________________| + +Legend:: + + 90C65 ARCNET Chip + SW1 1-5: Base Memory Address Select + 6-8: Base I/O Address Select + SW2 1-8: Node ID Select (ID0-ID7) + SW3 1-5: IRQ Select + 6-7: Extra Timeout + 8 : ROM Enable + JP1 Led connector + BNC Coax connector + +Although the jumpers SW1 and SW3 are marked SW, not JP, they are jumpers, not +switches. + +Setting the jumpers to ON means connecting the upper two pins, off the bottom +two - or - in case of IRQ setting, connecting none of them at all. + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in SW2 are used to set the node ID. Each node attached +to the network must have an unique node ID which must not be 0. +Switch 1 (ID0) serves as the least significant bit (LSB). + +Setting one of the switches to Off means "1", On means "0". + +The node ID is the sum of the values of all switches set to "1" +These values are:: + + Switch | Label | Value + -------|-------|------- + 1 | ID0 | 1 + 2 | ID1 | 2 + 3 | ID2 | 4 + 4 | ID3 | 8 + 5 | ID4 | 16 + 6 | ID5 | 32 + 7 | ID6 | 64 + 8 | ID7 | 128 + +Some Examples:: + + Switch | Hex | Decimal + 8 7 6 5 4 3 2 1 | Node ID | Node ID + ----------------|---------|--------- + 0 0 0 0 0 0 0 0 | not allowed + 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 1 0 | 2 | 2 + 0 0 0 0 0 0 1 1 | 3 | 3 + . . . | | + 0 1 0 1 0 1 0 1 | 55 | 85 + . . . | | + 1 0 1 0 1 0 1 0 | AA | 170 + . . . | | + 1 1 1 1 1 1 0 1 | FD | 253 + 1 1 1 1 1 1 1 0 | FE | 254 + 1 1 1 1 1 1 1 1 | FF | 255 + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The last three switches in switch block SW1 are used to select one +of eight possible I/O Base addresses using the following table:: + + + Switch | Hex I/O + 6 7 8 | Address + ------------|-------- + ON ON ON | 260 + OFF ON ON | 290 + ON OFF ON | 2E0 (Manufacturer's default) + OFF OFF ON | 2F0 + ON ON OFF | 300 + OFF ON OFF | 350 + ON OFF OFF | 380 + OFF OFF OFF | 3E0 + + +Setting the Base Memory (RAM) buffer Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The memory buffer (RAM) requires 2K. The base of this buffer can be +located in any of eight positions. The address of the Boot Prom is +memory base + 0x2000. + +Jumpers 3-5 of jumper block SW1 select the Memory Base address. + +:: + + Switch | Hex RAM | Hex ROM + 1 2 3 4 5 | Address | Address *) + --------------------|---------|----------- + ON ON ON ON ON | C0000 | C2000 + ON ON OFF ON ON | C4000 | C6000 + ON ON ON OFF ON | CC000 | CE000 + ON ON OFF OFF ON | D0000 | D2000 (Manufacturer's default) + ON ON ON ON OFF | D4000 | D6000 + ON ON OFF ON OFF | D8000 | DA000 + ON ON ON OFF OFF | DC000 | DE000 + ON ON OFF OFF OFF | E0000 | E2000 + + *) To enable the Boot ROM set the jumper 8 of jumper block SW3 to position ON. + +The jumpers 1 and 2 probably add 0x0800, 0x1000 and 0x1800 to RAM adders. + +Setting the Interrupt Line +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Jumpers 1-5 of the jumper block SW3 control the IRQ level:: + + Jumper | IRQ + 1 2 3 4 5 | + ---------------------------- + ON OFF OFF OFF OFF | 2 + OFF ON OFF OFF OFF | 3 + OFF OFF ON OFF OFF | 4 + OFF OFF OFF ON OFF | 5 + OFF OFF OFF OFF ON | 7 + + +Setting the Timeout Parameters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The jumpers 6-7 of the jumper block SW3 are used to determine the timeout +parameters. These two jumpers are normally left in the OFF position. + + + +(Generic Model 9058) +-------------------- + - from Andrew J. Kroll + - Sorry this sat in my to-do box for so long, Andrew! (yikes - over a + year!) + +:: + + _____ + | < + | .---' + ________________________________________________________________ | | + | | SW2 | | | + | ___________ |_____________| | | + | | | 1 2 3 4 5 6 ___| | + | > 6116 RAM | _________ 8 | | | + | |___________| |20MHzXtal| 7 | | | + | |_________| __________ 6 | S | | + | 74LS373 | |- 5 | W | | + | _________ | E |- 4 | | | + | >_______| ______________|..... P |- 3 | 3 | | + | | | : O |- 2 | | | + | | | : X |- 1 |___| | + | ________________ | | : Y |- | | + | | SW1 | | SL90C65 | : |- | | + | |________________| | | : B |- | | + | 1 2 3 4 5 6 7 8 | | : O |- | | + | |_________o____|..../ A |- _______| | + | ____________________ | R |- | |------, + | | | | D |- | BNC | # | + | > 2764 PROM SOCKET | |__________|- |_______|------' + | |____________________| _________ | | + | >________| <- 74LS245 | | + | | | + |___ ______________| | + |H H H H H H H H H H H H H H H H H H H H H H H| | | + |U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U| | | + \| + +Legend:: + + SL90C65 ARCNET Controller / Transceiver /Logic + SW1 1-5: IRQ Select + 6: ET1 + 7: ET2 + 8: ROM ENABLE + SW2 1-3: Memory Buffer/PROM Address + 3-6: I/O Address Map + SW3 1-8: Node ID Select + BNC BNC RG62/U Connection + *I* have had success using RG59B/U with *NO* terminators! + What gives?! + +SW1: Timeouts, Interrupt and ROM +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To select a hardware interrupt level set one (only one!) of the dip switches +up (on) SW1...(switches 1-5) +IRQ3, IRQ4, IRQ5, IRQ7, IRQ2. The Manufacturer's default is IRQ2. + +The switches on SW1 labeled EXT1 (switch 6) and EXT2 (switch 7) +are used to determine the timeout parameters. These two dip switches +are normally left off (down). + + To enable the 8K Boot PROM position SW1 switch 8 on (UP) labeled ROM. + The default is jumper ROM not installed. + + +Setting the I/O Base Address +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The last three switches in switch group SW2 are used to select one +of eight possible I/O Base addresses using the following table:: + + + Switch | Hex I/O + 4 5 6 | Address + -------|-------- + 0 0 0 | 260 + 0 0 1 | 290 + 0 1 0 | 2E0 (Manufacturer's default) + 0 1 1 | 2F0 + 1 0 0 | 300 + 1 0 1 | 350 + 1 1 0 | 380 + 1 1 1 | 3E0 + + +Setting the Base Memory Address (RAM & ROM) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The memory buffer requires 2K of a 16K block of RAM. The base of this +16K block can be located in any of eight positions. +Switches 1-3 of switch group SW2 select the Base of the 16K block. +(0 = DOWN, 1 = UP) +I could, however, only verify two settings... + + +:: + + Switch| Hex RAM | Hex ROM + 1 2 3 | Address | Address + ------|---------|----------- + 0 0 0 | E0000 | E2000 + 0 0 1 | D0000 | D2000 (Manufacturer's default) + 0 1 0 | ????? | ????? + 0 1 1 | ????? | ????? + 1 0 0 | ????? | ????? + 1 0 1 | ????? | ????? + 1 1 0 | ????? | ????? + 1 1 1 | ????? | ????? + + +Setting the Node ID +^^^^^^^^^^^^^^^^^^^ + +The eight switches in group SW3 are used to set the node ID. +Each node attached to the network must have an unique node ID which +must be different from 0. +Switch 1 serves as the least significant bit (LSB). +switches in the DOWN position are OFF (0) and in the UP position are ON (1) + +The node ID is the sum of the values of all switches set to "1" +These values are:: + + Switch | Value + -------|------- + 1 | 1 + 2 | 2 + 3 | 4 + 4 | 8 + 5 | 16 + 6 | 32 + 7 | 64 + 8 | 128 + +Some Examples:: + + Switch# | Hex | Decimal + 8 7 6 5 4 3 2 1 | Node ID | Node ID + ----------------|---------|--------- + 0 0 0 0 0 0 0 0 | not allowed <-. + 0 0 0 0 0 0 0 1 | 1 | 1 | + 0 0 0 0 0 0 1 0 | 2 | 2 | + 0 0 0 0 0 0 1 1 | 3 | 3 | + . . . | | | + 0 1 0 1 0 1 0 1 | 55 | 85 | + . . . | | + Don't use 0 or 255! + 1 0 1 0 1 0 1 0 | AA | 170 | + . . . | | | + 1 1 1 1 1 1 0 1 | FD | 253 | + 1 1 1 1 1 1 1 0 | FE | 254 | + 1 1 1 1 1 1 1 1 | FF | 255 <-' + + +Tiara +===== + +(model unknown) +--------------- + + - from Christoph Lameter + + +Here is information about my card as far as I could figure it out:: + + + ----------------------------------------------- tiara + Tiara LanCard of Tiara Computer Systems. + + +----------------------------------------------+ + ! ! Transmitter Unit ! ! + ! +------------------+ ------- + ! MEM Coax Connector + ! ROM 7654321 <- I/O ------- + ! : : +--------+ ! + ! : : ! 90C66LJ! +++ + ! : : ! ! !D Switch to set + ! : : ! ! !I the Nodenumber + ! : : +--------+ !P + ! !++ + ! 234567 <- IRQ ! + +------------!!!!!!!!!!!!!!!!!!!!!!!!--------+ + !!!!!!!!!!!!!!!!!!!!!!!! + +- 0 = Jumper Installed +- 1 = Open + +Top Jumper line Bit 7 = ROM Enable 654=Memory location 321=I/O + +Settings for Memory Location (Top Jumper Line) + +=== ================ +456 Address selected +=== ================ +000 C0000 +001 C4000 +010 CC000 +011 D0000 +100 D4000 +101 D8000 +110 DC000 +111 E0000 +=== ================ + +Settings for I/O Address (Top Jumper Line) + +=== ==== +123 Port +=== ==== +000 260 +001 290 +010 2E0 +011 2F0 +100 300 +101 350 +110 380 +111 3E0 +=== ==== + +Settings for IRQ Selection (Lower Jumper Line) + +====== ===== +234567 +====== ===== +011111 IRQ 2 +101111 IRQ 3 +110111 IRQ 4 +111011 IRQ 5 +111110 IRQ 7 +====== ===== + +Other Cards +=========== + +I have no information on other models of ARCnet cards at the moment. Please +send any and all info to: + + apenwarr@worldvisions.ca + +Thanks. diff --git a/Documentation/networking/arcnet-hardware.txt b/Documentation/networking/arcnet-hardware.txt deleted file mode 100644 index 731de411513c..000000000000 --- a/Documentation/networking/arcnet-hardware.txt +++ /dev/null @@ -1,3133 +0,0 @@ - ------------------------------------------------------------------------------ -1) This file is a supplement to arcnet.txt. Please read that for general - driver configuration help. ------------------------------------------------------------------------------ -2) This file is no longer Linux-specific. It should probably be moved out of - the kernel sources. Ideas? ------------------------------------------------------------------------------ - -Because so many people (myself included) seem to have obtained ARCnet cards -without manuals, this file contains a quick introduction to ARCnet hardware, -some cabling tips, and a listing of all jumper settings I can find. Please -e-mail apenwarr@worldvisions.ca with any settings for your particular card, -or any other information you have! - - -INTRODUCTION TO ARCNET ----------------------- - -ARCnet is a network type which works in a way similar to popular Ethernet -networks but which is also different in some very important ways. - -First of all, you can get ARCnet cards in at least two speeds: 2.5 Mbps -(slower than Ethernet) and 100 Mbps (faster than normal Ethernet). In fact, -there are others as well, but these are less common. The different hardware -types, as far as I'm aware, are not compatible and so you cannot wire a -100 Mbps card to a 2.5 Mbps card, and so on. From what I hear, my driver does -work with 100 Mbps cards, but I haven't been able to verify this myself, -since I only have the 2.5 Mbps variety. It is probably not going to saturate -your 100 Mbps card. Stop complaining. :) - -You also cannot connect an ARCnet card to any kind of Ethernet card and -expect it to work. - -There are two "types" of ARCnet - STAR topology and BUS topology. This -refers to how the cards are meant to be wired together. According to most -available documentation, you can only connect STAR cards to STAR cards and -BUS cards to BUS cards. That makes sense, right? Well, it's not quite -true; see below under "Cabling." - -Once you get past these little stumbling blocks, ARCnet is actually quite a -well-designed standard. It uses something called "modified token passing" -which makes it completely incompatible with so-called "Token Ring" cards, -but which makes transfers much more reliable than Ethernet does. In fact, -ARCnet will guarantee that a packet arrives safely at the destination, and -even if it can't possibly be delivered properly (ie. because of a cable -break, or because the destination computer does not exist) it will at least -tell the sender about it. - -Because of the carefully defined action of the "token", it will always make -a pass around the "ring" within a maximum length of time. This makes it -useful for realtime networks. - -In addition, all known ARCnet cards have an (almost) identical programming -interface. This means that with one ARCnet driver you can support any -card, whereas with Ethernet each manufacturer uses what is sometimes a -completely different programming interface, leading to a lot of different, -sometimes very similar, Ethernet drivers. Of course, always using the same -programming interface also means that when high-performance hardware -facilities like PCI bus mastering DMA appear, it's hard to take advantage of -them. Let's not go into that. - -One thing that makes ARCnet cards difficult to program for, however, is the -limit on their packet sizes; standard ARCnet can only send packets that are -up to 508 bytes in length. This is smaller than the Internet "bare minimum" -of 576 bytes, let alone the Ethernet MTU of 1500. To compensate, an extra -level of encapsulation is defined by RFC1201, which I call "packet -splitting," that allows "virtual packets" to grow as large as 64K each, -although they are generally kept down to the Ethernet-style 1500 bytes. - -For more information on the advantages and disadvantages (mostly the -advantages) of ARCnet networks, you might try the "ARCnet Trade Association" -WWW page: - http://www.arcnet.com - - -CABLING ARCNET NETWORKS ------------------------ - -This section was rewritten by - Vojtech Pavlik -using information from several people, including: - Avery Pennraun - Stephen A. Wood - John Paul Morrison - Joachim Koenig -and Avery touched it up a bit, at Vojtech's request. - -ARCnet (the classic 2.5 Mbps version) can be connected by two different -types of cabling: coax and twisted pair. The other ARCnet-type networks -(100 Mbps TCNS and 320 kbps - 32 Mbps ARCnet Plus) use different types of -cabling (Type1, Fiber, C1, C4, C5). - -For a coax network, you "should" use 93 Ohm RG-62 cable. But other cables -also work fine, because ARCnet is a very stable network. I personally use 75 -Ohm TV antenna cable. - -Cards for coax cabling are shipped in two different variants: for BUS and -STAR network topologies. They are mostly the same. The only difference -lies in the hybrid chip installed. BUS cards use high impedance output, -while STAR use low impedance. Low impedance card (STAR) is electrically -equal to a high impedance one with a terminator installed. - -Usually, the ARCnet networks are built up from STAR cards and hubs. There -are two types of hubs - active and passive. Passive hubs are small boxes -with four BNC connectors containing four 47 Ohm resistors: - - | | wires - R + junction --R-+-R- R 47 Ohm resistors - R - | - -The shielding is connected together. Active hubs are much more complicated; -they are powered and contain electronics to amplify the signal and send it -to other segments of the net. They usually have eight connectors. Active -hubs come in two variants - dumb and smart. The dumb variant just -amplifies, but the smart one decodes to digital and encodes back all packets -coming through. This is much better if you have several hubs in the net, -since many dumb active hubs may worsen the signal quality. - -And now to the cabling. What you can connect together: - -1. A card to a card. This is the simplest way of creating a 2-computer - network. - -2. A card to a passive hub. Remember that all unused connectors on the hub - must be properly terminated with 93 Ohm (or something else if you don't - have the right ones) terminators. - (Avery's note: oops, I didn't know that. Mine (TV cable) works - anyway, though.) - -3. A card to an active hub. Here is no need to terminate the unused - connectors except some kind of aesthetic feeling. But, there may not be - more than eleven active hubs between any two computers. That of course - doesn't limit the number of active hubs on the network. - -4. An active hub to another. - -5. An active hub to passive hub. - -Remember that you cannot connect two passive hubs together. The power loss -implied by such a connection is too high for the net to operate reliably. - -An example of a typical ARCnet network: - - R S - STAR type card - S------H--------A-------S R - Terminator - | | H - Hub - | | A - Active hub - | S----H----S - S | - | - S - -The BUS topology is very similar to the one used by Ethernet. The only -difference is in cable and terminators: they should be 93 Ohm. Ethernet -uses 50 Ohm impedance. You use T connectors to put the computers on a single -line of cable, the bus. You have to put terminators at both ends of the -cable. A typical BUS ARCnet network looks like: - - RT----T------T------T------T------TR - B B B B B B - - B - BUS type card - R - Terminator - T - T connector - -But that is not all! The two types can be connected together. According to -the official documentation the only way of connecting them is using an active -hub: - - A------T------T------TR - | B B B - S---H---S - | - S - -The official docs also state that you can use STAR cards at the ends of -BUS network in place of a BUS card and a terminator: - - S------T------T------S - B B - -But, according to my own experiments, you can simply hang a BUS type card -anywhere in middle of a cable in a STAR topology network. And more - you -can use the bus card in place of any star card if you use a terminator. Then -you can build very complicated networks fulfilling all your needs! An -example: - - S - | - RT------T-------T------H------S - B B B | - | R - S------A------T-------T-------A-------H------TR - | B B | | B - | S BT | - | | | S----A-----S - S------H---A----S | | - | | S------T----H---S | - S S B R S - -A basically different cabling scheme is used with Twisted Pair cabling. Each -of the TP cards has two RJ (phone-cord style) connectors. The cards are -then daisy-chained together using a cable connecting every two neighboring -cards. The ends are terminated with RJ 93 Ohm terminators which plug into -the empty connectors of cards on the ends of the chain. An example: - - ___________ ___________ - _R_|_ _|_|_ _|_R_ - | | | | | | - |Card | |Card | |Card | - |_____| |_____| |_____| - - -There are also hubs for the TP topology. There is nothing difficult -involved in using them; you just connect a TP chain to a hub on any end or -even at both. This way you can create almost any network configuration. -The maximum of 11 hubs between any two computers on the net applies here as -well. An example: - - RP-------P--------P--------H-----P------P-----PR - | - RP-----H--------P--------H-----P------PR - | | - PR PR - - R - RJ Terminator - P - TP Card - H - TP Hub - -Like any network, ARCnet has a limited cable length. These are the maximum -cable lengths between two active ends (an active end being an active hub or -a STAR card). - - RG-62 93 Ohm up to 650 m - RG-59/U 75 Ohm up to 457 m - RG-11/U 75 Ohm up to 533 m - IBM Type 1 150 Ohm up to 200 m - IBM Type 3 100 Ohm up to 100 m - -The maximum length of all cables connected to a passive hub is limited to 65 -meters for RG-62 cabling; less for others. You can see that using passive -hubs in a large network is a bad idea. The maximum length of a single "BUS -Trunk" is about 300 meters for RG-62. The maximum distance between the two -most distant points of the net is limited to 3000 meters. The maximum length -of a TP cable between two cards/hubs is 650 meters. - - -SETTING THE JUMPERS -------------------- - -All ARCnet cards should have a total of four or five different settings: - - - the I/O address: this is the "port" your ARCnet card is on. Probed - values in the Linux ARCnet driver are only from 0x200 through 0x3F0. (If - your card has additional ones, which is possible, please tell me.) This - should not be the same as any other device on your system. According to - a doc I got from Novell, MS Windows prefers values of 0x300 or more, - eating net connections on my system (at least) otherwise. My guess is - this may be because, if your card is at 0x2E0, probing for a serial port - at 0x2E8 will reset the card and probably mess things up royally. - - Avery's favourite: 0x300. - - - the IRQ: on 8-bit cards, it might be 2 (9), 3, 4, 5, or 7. - on 16-bit cards, it might be 2 (9), 3, 4, 5, 7, or 10-15. - - Make sure this is different from any other card on your system. Note - that IRQ2 is the same as IRQ9, as far as Linux is concerned. You can - "cat /proc/interrupts" for a somewhat complete list of which ones are in - use at any given time. Here is a list of common usages from Vojtech - Pavlik : - ("Not on bus" means there is no way for a card to generate this - interrupt) - IRQ 0 - Timer 0 (Not on bus) - IRQ 1 - Keyboard (Not on bus) - IRQ 2 - IRQ Controller 2 (Not on bus, nor does interrupt the CPU) - IRQ 3 - COM2 - IRQ 4 - COM1 - IRQ 5 - FREE (LPT2 if you have it; sometimes COM3; maybe PLIP) - IRQ 6 - Floppy disk controller - IRQ 7 - FREE (LPT1 if you don't use the polling driver; PLIP) - IRQ 8 - Realtime Clock Interrupt (Not on bus) - IRQ 9 - FREE (VGA vertical sync interrupt if enabled) - IRQ 10 - FREE - IRQ 11 - FREE - IRQ 12 - FREE - IRQ 13 - Numeric Coprocessor (Not on bus) - IRQ 14 - Fixed Disk Controller - IRQ 15 - FREE (Fixed Disk Controller 2 if you have it) - - Note: IRQ 9 is used on some video cards for the "vertical retrace" - interrupt. This interrupt would have been handy for things like - video games, as it occurs exactly once per screen refresh, but - unfortunately IBM cancelled this feature starting with the original - VGA and thus many VGA/SVGA cards do not support it. For this - reason, no modern software uses this interrupt and it can almost - always be safely disabled, if your video card supports it at all. - - If your card for some reason CANNOT disable this IRQ (usually there - is a jumper), one solution would be to clip the printed circuit - contact on the board: it's the fourth contact from the left on the - back side. I take no responsibility if you try this. - - - Avery's favourite: IRQ2 (actually IRQ9). Watch that VGA, though. - - - the memory address: Unlike most cards, ARCnets use "shared memory" for - copying buffers around. Make SURE it doesn't conflict with any other - used memory in your system! - A0000 - VGA graphics memory (ok if you don't have VGA) - B0000 - Monochrome text mode - C0000 \ One of these is your VGA BIOS - usually C0000. - E0000 / - F0000 - System BIOS - - Anything less than 0xA0000 is, well, a BAD idea since it isn't above - 640k. - - Avery's favourite: 0xD0000 - - - the station address: Every ARCnet card has its own "unique" network - address from 0 to 255. Unlike Ethernet, you can set this address - yourself with a jumper or switch (or on some cards, with special - software). Since it's only 8 bits, you can only have 254 ARCnet cards - on a network. DON'T use 0 or 255, since these are reserved (although - neat stuff will probably happen if you DO use them). By the way, if you - haven't already guessed, don't set this the same as any other ARCnet on - your network! - - Avery's favourite: 3 and 4. Not that it matters. - - - There may be ETS1 and ETS2 settings. These may or may not make a - difference on your card (many manuals call them "reserved"), but are - used to change the delays used when powering up a computer on the - network. This is only necessary when wiring VERY long range ARCnet - networks, on the order of 4km or so; in any case, the only real - requirement here is that all cards on the network with ETS1 and ETS2 - jumpers have them in the same position. Chris Hindy - sent in a chart with actual values for this: - ET1 ET2 Response Time Reconfiguration Time - --- --- ------------- -------------------- - open open 74.7us 840us - open closed 283.4us 1680us - closed open 561.8us 1680us - closed closed 1118.6us 1680us - - Make sure you set ETS1 and ETS2 to the SAME VALUE for all cards on your - network. - -Also, on many cards (not mine, though) there are red and green LED's. -Vojtech Pavlik tells me this is what they mean: - GREEN RED Status - ----- --- ------ - OFF OFF Power off - OFF Short flashes Cabling problems (broken cable or not - terminated) - OFF (short) ON Card init - ON ON Normal state - everything OK, nothing - happens - ON Long flashes Data transfer - ON OFF Never happens (maybe when wrong ID) - - -The following is all the specific information people have sent me about -their own particular ARCnet cards. It is officially a mess, and contains -huge amounts of duplicated information. I have no time to fix it. If you -want to, PLEASE DO! Just send me a 'diff -u' of all your changes. - -The model # is listed right above specifics for that card, so you should be -able to use your text viewer's "search" function to find the entry you want. -If you don't KNOW what kind of card you have, try looking through the -various diagrams to see if you can tell. - -If your model isn't listed and/or has different settings, PLEASE PLEASE -tell me. I had to figure mine out without the manual, and it WASN'T FUN! - -Even if your ARCnet model isn't listed, but has the same jumpers as another -model that is, please e-mail me to say so. - -Cards Listed in this file (in this order, mostly): - - Manufacturer Model # Bits - ------------ ------- ---- - SMC PC100 8 - SMC PC110 8 - SMC PC120 8 - SMC PC130 8 - SMC PC270E 8 - SMC PC500 16 - SMC PC500Longboard 16 - SMC PC550Longboard 16 - SMC PC600 16 - SMC PC710 8 - SMC? LCS-8830(-T) 8/16 - Puredata PDI507 8 - CNet Tech CN120-Series 8 - CNet Tech CN160-Series 16 - Lantech? UM9065L chipset 8 - Acer 5210-003 8 - Datapoint? LAN-ARC-8 8 - Topware TA-ARC/10 8 - Thomas-Conrad 500-6242-0097 REV A 8 - Waterloo? (C)1985 Waterloo Micro. 8 - No Name -- 8/16 - No Name Taiwan R.O.C? 8 - No Name Model 9058 8 - Tiara Tiara Lancard? 8 - - -** SMC = Standard Microsystems Corp. -** CNet Tech = CNet Technology, Inc. - - -Unclassified Stuff ------------------- - - Please send any other information you can find. - - - And some other stuff (more info is welcome!): - From: root@ultraworld.xs4all.nl (Timo Hilbrink) - To: apenwarr@foxnet.net (Avery Pennarun) - Date: Wed, 26 Oct 1994 02:10:32 +0000 (GMT) - Reply-To: timoh@xs4all.nl - - [...parts deleted...] - - About the jumpers: On my PC130 there is one more jumper, located near the - cable-connector and it's for changing to star or bus topology; - closed: star - open: bus - On the PC500 are some more jumper-pins, one block labeled with RX,PDN,TXI - and another with ALE,LA17,LA18,LA19 these are undocumented.. - - [...more parts deleted...] - - --- CUT --- - - -** Standard Microsystems Corp (SMC) ** -PC100, PC110, PC120, PC130 (8-bit cards) -PC500, PC600 (16-bit cards) ---------------------------------- - - mainly from Avery Pennarun . Values depicted - are from Avery's setup. - - special thanks to Timo Hilbrink for noting that PC120, - 130, 500, and 600 all have the same switches as Avery's PC100. - PC500/600 have several extra, undocumented pins though. (?) - - PC110 settings were verified by Stephen A. Wood - - Also, the JP- and S-numbers probably don't match your card exactly. Try - to find jumpers/switches with the same number of settings - it's - probably more reliable. - - - JP5 [|] : : : : -(IRQ Setting) IRQ2 IRQ3 IRQ4 IRQ5 IRQ7 - Put exactly one jumper on exactly one set of pins. - - - 1 2 3 4 5 6 7 8 9 10 - S1 /----------------------------------\ -(I/O and Memory | 1 1 * 0 0 0 0 * 1 1 0 1 | - addresses) \----------------------------------/ - |--| |--------| |--------| - (a) (b) (m) - - WARNING. It's very important when setting these which way - you're holding the card, and which way you think is '1'! - - If you suspect that your settings are not being made - correctly, try reversing the direction or inverting the - switch positions. - - a: The first digit of the I/O address. - Setting Value - ------- ----- - 00 0 - 01 1 - 10 2 - 11 3 - - b: The second digit of the I/O address. - Setting Value - ------- ----- - 0000 0 - 0001 1 - 0010 2 - ... ... - 1110 E - 1111 F - - The I/O address is in the form ab0. For example, if - a is 0x2 and b is 0xE, the address will be 0x2E0. - - DO NOT SET THIS LESS THAN 0x200!!!!! - - - m: The first digit of the memory address. - Setting Value - ------- ----- - 0000 0 - 0001 1 - 0010 2 - ... ... - 1110 E - 1111 F - - The memory address is in the form m0000. For example, if - m is D, the address will be 0xD0000. - - DO NOT SET THIS TO C0000, F0000, OR LESS THAN A0000! - - 1 2 3 4 5 6 7 8 - S2 /--------------------------\ -(Station Address) | 1 1 0 0 0 0 0 0 | - \--------------------------/ - - Setting Value - ------- ----- - 00000000 00 - 10000000 01 - 01000000 02 - ... - 01111111 FE - 11111111 FF - - Note that this is binary with the digits reversed! - - DO NOT SET THIS TO 0 OR 255 (0xFF)! - - -***************************************************************************** - -** Standard Microsystems Corp (SMC) ** -PC130E/PC270E (8-bit cards) ---------------------------- - - from Juergen Seifert - - -STANDARD MICROSYSTEMS CORPORATION (SMC) ARCNET(R)-PC130E/PC270E -=============================================================== - -This description has been written by Juergen Seifert -using information from the following Original SMC Manual - - "Configuration Guide for - ARCNET(R)-PC130E/PC270 - Network Controller Boards - Pub. # 900.044A - June, 1989" - -ARCNET is a registered trademark of the Datapoint Corporation -SMC is a registered trademark of the Standard Microsystems Corporation - -The PC130E is an enhanced version of the PC130 board, is equipped with a -standard BNC female connector for connection to RG-62/U coax cable. -Since this board is designed both for point-to-point connection in star -networks and for connection to bus networks, it is downwardly compatible -with all the other standard boards designed for coax networks (that is, -the PC120, PC110 and PC100 star topology boards and the PC220, PC210 and -PC200 bus topology boards). - -The PC270E is an enhanced version of the PC260 board, is equipped with two -modular RJ11-type jacks for connection to twisted pair wiring. -It can be used in a star or a daisy-chained network. - - - 8 7 6 5 4 3 2 1 - ________________________________________________________________ - | | S1 | | - | |_________________| | - | Offs|Base |I/O Addr | - | RAM Addr | ___| - | ___ ___ CR3 |___| - | | \/ | CR4 |___| - | | PROM | ___| - | | | N | | 8 - | | SOCKET | o | | 7 - | |________| d | | 6 - | ___________________ e | | 5 - | | | A | S | 4 - | |oo| EXT2 | | d | 2 | 3 - | |oo| EXT1 | SMC | d | | 2 - | |oo| ROM | 90C63 | r |___| 1 - | |oo| IRQ7 | | |o| _____| - | |oo| IRQ5 | | |o| | J1 | - | |oo| IRQ4 | | STAR |_____| - | |oo| IRQ3 | | | J2 | - | |oo| IRQ2 |___________________| |_____| - |___ ______________| - | | - |_____________________________________________| - -Legend: - -SMC 90C63 ARCNET Controller / Transceiver /Logic -S1 1-3: I/O Base Address Select - 4-6: Memory Base Address Select - 7-8: RAM Offset Select -S2 1-8: Node ID Select -EXT Extended Timeout Select -ROM ROM Enable Select -STAR Selected - Star Topology (PC130E only) - Deselected - Bus Topology (PC130E only) -CR3/CR4 Diagnostic LEDs -J1 BNC RG62/U Connector (PC130E only) -J1 6-position Telephone Jack (PC270E only) -J2 6-position Telephone Jack (PC270E only) - -Setting one of the switches to Off/Open means "1", On/Closed means "0". - - -Setting the Node ID -------------------- - -The eight switches in group S2 are used to set the node ID. -These switches work in a way similar to the PC100-series cards; see that -entry for more information. - - -Setting the I/O Base Address ----------------------------- - -The first three switches in switch group S1 are used to select one -of eight possible I/O Base addresses using the following table - - - Switch | Hex I/O - 1 2 3 | Address - -------|-------- - 0 0 0 | 260 - 0 0 1 | 290 - 0 1 0 | 2E0 (Manufacturer's default) - 0 1 1 | 2F0 - 1 0 0 | 300 - 1 0 1 | 350 - 1 1 0 | 380 - 1 1 1 | 3E0 - - -Setting the Base Memory (RAM) buffer Address --------------------------------------------- - -The memory buffer requires 2K of a 16K block of RAM. The base of this -16K block can be located in any of eight positions. -Switches 4-6 of switch group S1 select the Base of the 16K block. -Within that 16K address space, the buffer may be assigned any one of four -positions, determined by the offset, switches 7 and 8 of group S1. - - Switch | Hex RAM | Hex ROM - 4 5 6 7 8 | Address | Address *) - -----------|---------|----------- - 0 0 0 0 0 | C0000 | C2000 - 0 0 0 0 1 | C0800 | C2000 - 0 0 0 1 0 | C1000 | C2000 - 0 0 0 1 1 | C1800 | C2000 - | | - 0 0 1 0 0 | C4000 | C6000 - 0 0 1 0 1 | C4800 | C6000 - 0 0 1 1 0 | C5000 | C6000 - 0 0 1 1 1 | C5800 | C6000 - | | - 0 1 0 0 0 | CC000 | CE000 - 0 1 0 0 1 | CC800 | CE000 - 0 1 0 1 0 | CD000 | CE000 - 0 1 0 1 1 | CD800 | CE000 - | | - 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default) - 0 1 1 0 1 | D0800 | D2000 - 0 1 1 1 0 | D1000 | D2000 - 0 1 1 1 1 | D1800 | D2000 - | | - 1 0 0 0 0 | D4000 | D6000 - 1 0 0 0 1 | D4800 | D6000 - 1 0 0 1 0 | D5000 | D6000 - 1 0 0 1 1 | D5800 | D6000 - | | - 1 0 1 0 0 | D8000 | DA000 - 1 0 1 0 1 | D8800 | DA000 - 1 0 1 1 0 | D9000 | DA000 - 1 0 1 1 1 | D9800 | DA000 - | | - 1 1 0 0 0 | DC000 | DE000 - 1 1 0 0 1 | DC800 | DE000 - 1 1 0 1 0 | DD000 | DE000 - 1 1 0 1 1 | DD800 | DE000 - | | - 1 1 1 0 0 | E0000 | E2000 - 1 1 1 0 1 | E0800 | E2000 - 1 1 1 1 0 | E1000 | E2000 - 1 1 1 1 1 | E1800 | E2000 - -*) To enable the 8K Boot PROM install the jumper ROM. - The default is jumper ROM not installed. - - -Setting the Timeouts and Interrupt ----------------------------------- - -The jumpers labeled EXT1 and EXT2 are used to determine the timeout -parameters. These two jumpers are normally left open. - -To select a hardware interrupt level set one (only one!) of the jumpers -IRQ2, IRQ3, IRQ4, IRQ5, IRQ7. The Manufacturer's default is IRQ2. - - -Configuring the PC130E for Star or Bus Topology ------------------------------------------------ - -The single jumper labeled STAR is used to configure the PC130E board for -star or bus topology. -When the jumper is installed, the board may be used in a star network, when -it is removed, the board can be used in a bus topology. - - -Diagnostic LEDs ---------------- - -Two diagnostic LEDs are visible on the rear bracket of the board. -The green LED monitors the network activity: the red one shows the -board activity: - - Green | Status Red | Status - -------|------------------- ---------|------------------- - on | normal activity flash/on | data transfer - blink | reconfiguration off | no data transfer; - off | defective board or | incorrect memory or - | node ID is zero | I/O address - - -***************************************************************************** - -** Standard Microsystems Corp (SMC) ** -PC500/PC550 Longboard (16-bit cards) -------------------------------------- - - from Juergen Seifert - - -STANDARD MICROSYSTEMS CORPORATION (SMC) ARCNET-PC500/PC550 Long Board -===================================================================== - -Note: There is another Version of the PC500 called Short Version, which - is different in hard- and software! The most important differences - are: - - The long board has no Shared memory. - - On the long board the selection of the interrupt is done by binary - coded switch, on the short board directly by jumper. - -[Avery's note: pay special attention to that: the long board HAS NO SHARED -MEMORY. This means the current Linux-ARCnet driver can't use these cards. -I have obtained a PC500Longboard and will be doing some experiments on it in -the future, but don't hold your breath. Thanks again to Juergen Seifert for -his advice about this!] - -This description has been written by Juergen Seifert -using information from the following Original SMC Manual - - "Configuration Guide for - SMC ARCNET-PC500/PC550 - Series Network Controller Boards - Pub. # 900.033 Rev. A - November, 1989" - -ARCNET is a registered trademark of the Datapoint Corporation -SMC is a registered trademark of the Standard Microsystems Corporation - -The PC500 is equipped with a standard BNC female connector for connection -to RG-62/U coax cable. -The board is designed both for point-to-point connection in star networks -and for connection to bus networks. - -The PC550 is equipped with two modular RJ11-type jacks for connection -to twisted pair wiring. -It can be used in a star or a daisy-chained (BUS) network. - - 1 - 0 9 8 7 6 5 4 3 2 1 6 5 4 3 2 1 - ____________________________________________________________________ - < | SW1 | | SW2 | | - > |_____________________| |_____________| | - < IRQ |I/O Addr | - > ___| - < CR4 |___| - > CR3 |___| - < ___| - > N | | 8 - < o | | 7 - > d | S | 6 - < e | W | 5 - > A | 3 | 4 - < d | | 3 - > d | | 2 - < r |___| 1 - > |o| _____| - < |o| | J1 | - > 3 1 JP6 |_____| - < |o|o| JP2 | J2 | - > |o|o| |_____| - < 4 2__ ______________| - > | | | - <____| |_____________________________________________| - -Legend: - -SW1 1-6: I/O Base Address Select - 7-10: Interrupt Select -SW2 1-6: Reserved for Future Use -SW3 1-8: Node ID Select -JP2 1-4: Extended Timeout Select -JP6 Selected - Star Topology (PC500 only) - Deselected - Bus Topology (PC500 only) -CR3 Green Monitors Network Activity -CR4 Red Monitors Board Activity -J1 BNC RG62/U Connector (PC500 only) -J1 6-position Telephone Jack (PC550 only) -J2 6-position Telephone Jack (PC550 only) - -Setting one of the switches to Off/Open means "1", On/Closed means "0". - - -Setting the Node ID -------------------- - -The eight switches in group SW3 are used to set the node ID. Each node -attached to the network must have an unique node ID which must be -different from 0. -Switch 1 serves as the least significant bit (LSB). - -The node ID is the sum of the values of all switches set to "1" -These values are: - - Switch | Value - -------|------- - 1 | 1 - 2 | 2 - 3 | 4 - 4 | 8 - 5 | 16 - 6 | 32 - 7 | 64 - 8 | 128 - -Some Examples: - - Switch | Hex | Decimal - 8 7 6 5 4 3 2 1 | Node ID | Node ID - ----------------|---------|--------- - 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 - 0 0 0 0 0 0 1 0 | 2 | 2 - 0 0 0 0 0 0 1 1 | 3 | 3 - . . . | | - 0 1 0 1 0 1 0 1 | 55 | 85 - . . . | | - 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | - 1 1 1 1 1 1 0 1 | FD | 253 - 1 1 1 1 1 1 1 0 | FE | 254 - 1 1 1 1 1 1 1 1 | FF | 255 - - -Setting the I/O Base Address ----------------------------- - -The first six switches in switch group SW1 are used to select one -of 32 possible I/O Base addresses using the following table - - Switch | Hex I/O - 6 5 4 3 2 1 | Address - -------------|-------- - 0 1 0 0 0 0 | 200 - 0 1 0 0 0 1 | 210 - 0 1 0 0 1 0 | 220 - 0 1 0 0 1 1 | 230 - 0 1 0 1 0 0 | 240 - 0 1 0 1 0 1 | 250 - 0 1 0 1 1 0 | 260 - 0 1 0 1 1 1 | 270 - 0 1 1 0 0 0 | 280 - 0 1 1 0 0 1 | 290 - 0 1 1 0 1 0 | 2A0 - 0 1 1 0 1 1 | 2B0 - 0 1 1 1 0 0 | 2C0 - 0 1 1 1 0 1 | 2D0 - 0 1 1 1 1 0 | 2E0 (Manufacturer's default) - 0 1 1 1 1 1 | 2F0 - 1 1 0 0 0 0 | 300 - 1 1 0 0 0 1 | 310 - 1 1 0 0 1 0 | 320 - 1 1 0 0 1 1 | 330 - 1 1 0 1 0 0 | 340 - 1 1 0 1 0 1 | 350 - 1 1 0 1 1 0 | 360 - 1 1 0 1 1 1 | 370 - 1 1 1 0 0 0 | 380 - 1 1 1 0 0 1 | 390 - 1 1 1 0 1 0 | 3A0 - 1 1 1 0 1 1 | 3B0 - 1 1 1 1 0 0 | 3C0 - 1 1 1 1 0 1 | 3D0 - 1 1 1 1 1 0 | 3E0 - 1 1 1 1 1 1 | 3F0 - - -Setting the Interrupt ---------------------- - -Switches seven through ten of switch group SW1 are used to select the -interrupt level. The interrupt level is binary coded, so selections -from 0 to 15 would be possible, but only the following eight values will -be supported: 3, 4, 5, 7, 9, 10, 11, 12. - - Switch | IRQ - 10 9 8 7 | - ---------|-------- - 0 0 1 1 | 3 - 0 1 0 0 | 4 - 0 1 0 1 | 5 - 0 1 1 1 | 7 - 1 0 0 1 | 9 (=2) (default) - 1 0 1 0 | 10 - 1 0 1 1 | 11 - 1 1 0 0 | 12 - - -Setting the Timeouts --------------------- - -The two jumpers JP2 (1-4) are used to determine the timeout parameters. -These two jumpers are normally left open. -Refer to the COM9026 Data Sheet for alternate configurations. - - -Configuring the PC500 for Star or Bus Topology ----------------------------------------------- - -The single jumper labeled JP6 is used to configure the PC500 board for -star or bus topology. -When the jumper is installed, the board may be used in a star network, when -it is removed, the board can be used in a bus topology. - - -Diagnostic LEDs ---------------- - -Two diagnostic LEDs are visible on the rear bracket of the board. -The green LED monitors the network activity: the red one shows the -board activity: - - Green | Status Red | Status - -------|------------------- ---------|------------------- - on | normal activity flash/on | data transfer - blink | reconfiguration off | no data transfer; - off | defective board or | incorrect memory or - | node ID is zero | I/O address - - -***************************************************************************** - -** SMC ** -PC710 (8-bit card) ------------------- - - from J.S. van Oosten - -Note: this data is gathered by experimenting and looking at info of other -cards. However, I'm sure I got 99% of the settings right. - -The SMC710 card resembles the PC270 card, but is much more basic (i.e. no -LEDs, RJ11 jacks, etc.) and 8 bit. Here's a little drawing: - - _______________________________________ - | +---------+ +---------+ |____ - | | S2 | | S1 | | - | +---------+ +---------+ | - | | - | +===+ __ | - | | R | | | X-tal ###___ - | | O | |__| ####__'| - | | M | || ### - | +===+ | - | | - | .. JP1 +----------+ | - | .. | big chip | | - | .. | 90C63 | | - | .. | | | - | .. +----------+ | - ------- ----------- - ||||||||||||||||||||| - -The row of jumpers at JP1 actually consists of 8 jumpers, (sometimes -labelled) the same as on the PC270, from top to bottom: EXT2, EXT1, ROM, -IRQ7, IRQ5, IRQ4, IRQ3, IRQ2 (gee, wonder what they would do? :-) ) - -S1 and S2 perform the same function as on the PC270, only their numbers -are swapped (S1 is the nodeaddress, S2 sets IO- and RAM-address). - -I know it works when connected to a PC110 type ARCnet board. - - -***************************************************************************** - -** Possibly SMC ** -LCS-8830(-T) (8 and 16-bit cards) ---------------------------------- - - from Mathias Katzer - - Marek Michalkiewicz says the - LCS-8830 is slightly different from LCS-8830-T. These are 8 bit, BUS - only (the JP0 jumper is hardwired), and BNC only. - -This is a LCS-8830-T made by SMC, I think ('SMC' only appears on one PLCC, -nowhere else, not even on the few Xeroxed sheets from the manual). - -SMC ARCnet Board Type LCS-8830-T - - ------------------------------------ - | | - | JP3 88 8 JP2 | - | ##### | \ | - | ##### ET1 ET2 ###| - | 8 ###| - | U3 SW 1 JP0 ###| Phone Jacks - | -- ###| - | | | | - | | | SW2 | - | | | | - | | | ##### | - | -- ##### #### BNC Connector - | #### - | 888888 JP1 | - | 234567 | - -- ------- - ||||||||||||||||||||||||||| - -------------------------- - - -SW1: DIP-Switches for Station Address -SW2: DIP-Switches for Memory Base and I/O Base addresses - -JP0: If closed, internal termination on (default open) -JP1: IRQ Jumpers -JP2: Boot-ROM enabled if closed -JP3: Jumpers for response timeout - -U3: Boot-ROM Socket - - -ET1 ET2 Response Time Idle Time Reconfiguration Time - - 78 86 840 - X 285 316 1680 - X 563 624 1680 - X X 1130 1237 1680 - -(X means closed jumper) - -(DIP-Switch downwards means "0") - -The station address is binary-coded with SW1. - -The I/O base address is coded with DIP-Switches 6,7 and 8 of SW2: - -Switches Base -678 Address -000 260-26f -100 290-29f -010 2e0-2ef -110 2f0-2ff -001 300-30f -101 350-35f -011 380-38f -111 3e0-3ef - - -DIP Switches 1-5 of SW2 encode the RAM and ROM Address Range: - -Switches RAM ROM -12345 Address Range Address Range -00000 C:0000-C:07ff C:2000-C:3fff -10000 C:0800-C:0fff -01000 C:1000-C:17ff -11000 C:1800-C:1fff -00100 C:4000-C:47ff C:6000-C:7fff -10100 C:4800-C:4fff -01100 C:5000-C:57ff -11100 C:5800-C:5fff -00010 C:C000-C:C7ff C:E000-C:ffff -10010 C:C800-C:Cfff -01010 C:D000-C:D7ff -11010 C:D800-C:Dfff -00110 D:0000-D:07ff D:2000-D:3fff -10110 D:0800-D:0fff -01110 D:1000-D:17ff -11110 D:1800-D:1fff -00001 D:4000-D:47ff D:6000-D:7fff -10001 D:4800-D:4fff -01001 D:5000-D:57ff -11001 D:5800-D:5fff -00101 D:8000-D:87ff D:A000-D:bfff -10101 D:8800-D:8fff -01101 D:9000-D:97ff -11101 D:9800-D:9fff -00011 D:C000-D:c7ff D:E000-D:ffff -10011 D:C800-D:cfff -01011 D:D000-D:d7ff -11011 D:D800-D:dfff -00111 E:0000-E:07ff E:2000-E:3fff -10111 E:0800-E:0fff -01111 E:1000-E:17ff -11111 E:1800-E:1fff - - -***************************************************************************** - -** PureData Corp ** -PDI507 (8-bit card) --------------------- - - from Mark Rejhon (slight modifications by Avery) - - Avery's note: I think PDI508 cards (but definitely NOT PDI508Plus cards) - are mostly the same as this. PDI508Plus cards appear to be mainly - software-configured. - -Jumpers: - There is a jumper array at the bottom of the card, near the edge - connector. This array is labelled J1. They control the IRQs and - something else. Put only one jumper on the IRQ pins. - - ETS1, ETS2 are for timing on very long distance networks. See the - more general information near the top of this file. - - There is a J2 jumper on two pins. A jumper should be put on them, - since it was already there when I got the card. I don't know what - this jumper is for though. - - There is a two-jumper array for J3. I don't know what it is for, - but there were already two jumpers on it when I got the card. It's - a six pin grid in a two-by-three fashion. The jumpers were - configured as follows: - - .-------. - o | o o | - :-------: ------> Accessible end of card with connectors - o | o o | in this direction -------> - `-------' - -Carl de Billy explains J3 and J4: - - J3 Diagram: - - .-------. - o | o o | - :-------: TWIST Technology - o | o o | - `-------' - .-------. - | o o | o - :-------: COAX Technology - | o o | o - `-------' - - - If using coax cable in a bus topology the J4 jumper must be removed; - place it on one pin. - - - If using bus topology with twisted pair wiring move the J3 - jumpers so they connect the middle pin and the pins closest to the RJ11 - Connectors. Also the J4 jumper must be removed; place it on one pin of - J4 jumper for storage. - - - If using star topology with twisted pair wiring move the J3 - jumpers so they connect the middle pin and the pins closest to the RJ11 - connectors. - - -DIP Switches: - - The DIP switches accessible on the accessible end of the card while - it is installed, is used to set the ARCnet address. There are 8 - switches. Use an address from 1 to 254. - - Switch No. - 12345678 ARCnet address - ----------------------------------------- - 00000000 FF (Don't use this!) - 00000001 FE - 00000010 FD - .... - 11111101 2 - 11111110 1 - 11111111 0 (Don't use this!) - - There is another array of eight DIP switches at the top of the - card. There are five labelled MS0-MS4 which seem to control the - memory address, and another three labelled IO0-IO2 which seem to - control the base I/O address of the card. - - This was difficult to test by trial and error, and the I/O addresses - are in a weird order. This was tested by setting the DIP switches, - rebooting the computer, and attempting to load ARCETHER at various - addresses (mostly between 0x200 and 0x400). The address that caused - the red transmit LED to blink, is the one that I thought works. - - Also, the address 0x3D0 seem to have a special meaning, since the - ARCETHER packet driver loaded fine, but without the red LED - blinking. I don't know what 0x3D0 is for though. I recommend using - an address of 0x300 since Windows may not like addresses below - 0x300. - - IO Switch No. - 210 I/O address - ------------------------------- - 111 0x260 - 110 0x290 - 101 0x2E0 - 100 0x2F0 - 011 0x300 - 010 0x350 - 001 0x380 - 000 0x3E0 - - The memory switches set a reserved address space of 0x1000 bytes - (0x100 segment units, or 4k). For example if I set an address of - 0xD000, it will use up addresses 0xD000 to 0xD100. - - The memory switches were tested by booting using QEMM386 stealth, - and using LOADHI to see what address automatically became excluded - from the upper memory regions, and then attempting to load ARCETHER - using these addresses. - - I recommend using an ARCnet memory address of 0xD000, and putting - the EMS page frame at 0xC000 while using QEMM stealth mode. That - way, you get contiguous high memory from 0xD100 almost all the way - the end of the megabyte. - - Memory Switch 0 (MS0) didn't seem to work properly when set to OFF - on my card. It could be malfunctioning on my card. Experiment with - it ON first, and if it doesn't work, set it to OFF. (It may be a - modifier for the 0x200 bit?) - - MS Switch No. - 43210 Memory address - -------------------------------- - 00001 0xE100 (guessed - was not detected by QEMM) - 00011 0xE000 (guessed - was not detected by QEMM) - 00101 0xDD00 - 00111 0xDC00 - 01001 0xD900 - 01011 0xD800 - 01101 0xD500 - 01111 0xD400 - 10001 0xD100 - 10011 0xD000 - 10101 0xCD00 - 10111 0xCC00 - 11001 0xC900 (guessed - crashes tested system) - 11011 0xC800 (guessed - crashes tested system) - 11101 0xC500 (guessed - crashes tested system) - 11111 0xC400 (guessed - crashes tested system) - - -***************************************************************************** - -** CNet Technology Inc. ** -120 Series (8-bit cards) ------------------------- - - from Juergen Seifert - - -CNET TECHNOLOGY INC. (CNet) ARCNET 120A SERIES -============================================== - -This description has been written by Juergen Seifert -using information from the following Original CNet Manual - - "ARCNET - USER'S MANUAL - for - CN120A - CN120AB - CN120TP - CN120ST - CN120SBT - P/N:12-01-0007 - Revision 3.00" - -ARCNET is a registered trademark of the Datapoint Corporation - -P/N 120A ARCNET 8 bit XT/AT Star -P/N 120AB ARCNET 8 bit XT/AT Bus -P/N 120TP ARCNET 8 bit XT/AT Twisted Pair -P/N 120ST ARCNET 8 bit XT/AT Star, Twisted Pair -P/N 120SBT ARCNET 8 bit XT/AT Star, Bus, Twisted Pair - - __________________________________________________________________ - | | - | ___| - | LED |___| - | ___| - | N | | ID7 - | o | | ID6 - | d | S | ID5 - | e | W | ID4 - | ___________________ A | 2 | ID3 - | | | d | | ID2 - | | | 1 2 3 4 5 6 7 8 d | | ID1 - | | | _________________ r |___| ID0 - | | 90C65 || SW1 | ____| - | JP 8 7 | ||_________________| | | - | |o|o| JP1 | | | J2 | - | |o|o| |oo| | | JP 1 1 1 | | - | ______________ | | 0 1 2 |____| - | | PROM | |___________________| |o|o|o| _____| - | > SOCKET | JP 6 5 4 3 2 |o|o|o| | J1 | - | |______________| |o|o|o|o|o| |o|o|o| |_____| - |_____ |o|o|o|o|o| ______________| - | | - |_____________________________________________| - -Legend: - -90C65 ARCNET Probe -S1 1-5: Base Memory Address Select - 6-8: Base I/O Address Select -S2 1-8: Node ID Select (ID0-ID7) -JP1 ROM Enable Select -JP2 IRQ2 -JP3 IRQ3 -JP4 IRQ4 -JP5 IRQ5 -JP6 IRQ7 -JP7/JP8 ET1, ET2 Timeout Parameters -JP10/JP11 Coax / Twisted Pair Select (CN120ST/SBT only) -JP12 Terminator Select (CN120AB/ST/SBT only) -J1 BNC RG62/U Connector (all except CN120TP) -J2 Two 6-position Telephone Jack (CN120TP/ST/SBT only) - -Setting one of the switches to Off means "1", On means "0". - - -Setting the Node ID -------------------- - -The eight switches in SW2 are used to set the node ID. Each node attached -to the network must have an unique node ID which must be different from 0. -Switch 1 (ID0) serves as the least significant bit (LSB). - -The node ID is the sum of the values of all switches set to "1" -These values are: - - Switch | Label | Value - -------|-------|------- - 1 | ID0 | 1 - 2 | ID1 | 2 - 3 | ID2 | 4 - 4 | ID3 | 8 - 5 | ID4 | 16 - 6 | ID5 | 32 - 7 | ID6 | 64 - 8 | ID7 | 128 - -Some Examples: - - Switch | Hex | Decimal - 8 7 6 5 4 3 2 1 | Node ID | Node ID - ----------------|---------|--------- - 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 - 0 0 0 0 0 0 1 0 | 2 | 2 - 0 0 0 0 0 0 1 1 | 3 | 3 - . . . | | - 0 1 0 1 0 1 0 1 | 55 | 85 - . . . | | - 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | - 1 1 1 1 1 1 0 1 | FD | 253 - 1 1 1 1 1 1 1 0 | FE | 254 - 1 1 1 1 1 1 1 1 | FF | 255 - - -Setting the I/O Base Address ----------------------------- - -The last three switches in switch block SW1 are used to select one -of eight possible I/O Base addresses using the following table - - - Switch | Hex I/O - 6 7 8 | Address - ------------|-------- - ON ON ON | 260 - OFF ON ON | 290 - ON OFF ON | 2E0 (Manufacturer's default) - OFF OFF ON | 2F0 - ON ON OFF | 300 - OFF ON OFF | 350 - ON OFF OFF | 380 - OFF OFF OFF | 3E0 - - -Setting the Base Memory (RAM) buffer Address --------------------------------------------- - -The memory buffer (RAM) requires 2K. The base of this buffer can be -located in any of eight positions. The address of the Boot Prom is -memory base + 8K or memory base + 0x2000. -Switches 1-5 of switch block SW1 select the Memory Base address. - - Switch | Hex RAM | Hex ROM - 1 2 3 4 5 | Address | Address *) - --------------------|---------|----------- - ON ON ON ON ON | C0000 | C2000 - ON ON OFF ON ON | C4000 | C6000 - ON ON ON OFF ON | CC000 | CE000 - ON ON OFF OFF ON | D0000 | D2000 (Manufacturer's default) - ON ON ON ON OFF | D4000 | D6000 - ON ON OFF ON OFF | D8000 | DA000 - ON ON ON OFF OFF | DC000 | DE000 - ON ON OFF OFF OFF | E0000 | E2000 - -*) To enable the Boot ROM install the jumper JP1 - -Note: Since the switches 1 and 2 are always set to ON it may be possible - that they can be used to add an offset of 2K, 4K or 6K to the base - address, but this feature is not documented in the manual and I - haven't tested it yet. - - -Setting the Interrupt Line --------------------------- - -To select a hardware interrupt level install one (only one!) of the jumpers -JP2, JP3, JP4, JP5, JP6. JP2 is the default. - - Jumper | IRQ - -------|----- - 2 | 2 - 3 | 3 - 4 | 4 - 5 | 5 - 6 | 7 - - -Setting the Internal Terminator on CN120AB/TP/SBT --------------------------------------------------- - -The jumper JP12 is used to enable the internal terminator. - - ----- - 0 | 0 | - ----- ON | | ON - | 0 | | 0 | - | | OFF ----- OFF - | 0 | 0 - ----- - Terminator Terminator - disabled enabled - - -Selecting the Connector Type on CN120ST/SBT -------------------------------------------- - - JP10 JP11 JP10 JP11 - ----- ----- - 0 0 | 0 | | 0 | - ----- ----- | | | | - | 0 | | 0 | | 0 | | 0 | - | | | | ----- ----- - | 0 | | 0 | 0 0 - ----- ----- - Coaxial Cable Twisted Pair Cable - (Default) - - -Setting the Timeout Parameters ------------------------------- - -The jumpers labeled EXT1 and EXT2 are used to determine the timeout -parameters. These two jumpers are normally left open. - - - -***************************************************************************** - -** CNet Technology Inc. ** -160 Series (16-bit cards) -------------------------- - - from Juergen Seifert - -CNET TECHNOLOGY INC. (CNet) ARCNET 160A SERIES -============================================== - -This description has been written by Juergen Seifert -using information from the following Original CNet Manual - - "ARCNET - USER'S MANUAL - for - CN160A - CN160AB - CN160TP - P/N:12-01-0006 - Revision 3.00" - -ARCNET is a registered trademark of the Datapoint Corporation - -P/N 160A ARCNET 16 bit XT/AT Star -P/N 160AB ARCNET 16 bit XT/AT Bus -P/N 160TP ARCNET 16 bit XT/AT Twisted Pair - - ___________________________________________________________________ - < _________________________ ___| - > |oo| JP2 | | LED |___| - < |oo| JP1 | 9026 | LED |___| - > |_________________________| ___| - < N | | ID7 - > 1 o | | ID6 - < 1 2 3 4 5 6 7 8 9 0 d | S | ID5 - > _______________ _____________________ e | W | ID4 - < | PROM | | SW1 | A | 2 | ID3 - > > SOCKET | |_____________________| d | | ID2 - < |_______________| | IO-Base | MEM | d | | ID1 - > r |___| ID0 - < ____| - > | | - < | J1 | - > | | - < |____| - > 1 1 1 1 | - < 3 4 5 6 7 JP 8 9 0 1 2 3 | - > |o|o|o|o|o| |o|o|o|o|o|o| | - < |o|o|o|o|o| __ |o|o|o|o|o|o| ___________| - > | | | - <____________| |_______________________________________| - -Legend: - -9026 ARCNET Probe -SW1 1-6: Base I/O Address Select - 7-10: Base Memory Address Select -SW2 1-8: Node ID Select (ID0-ID7) -JP1/JP2 ET1, ET2 Timeout Parameters -JP3-JP13 Interrupt Select -J1 BNC RG62/U Connector (CN160A/AB only) -J1 Two 6-position Telephone Jack (CN160TP only) -LED - -Setting one of the switches to Off means "1", On means "0". - - -Setting the Node ID -------------------- - -The eight switches in SW2 are used to set the node ID. Each node attached -to the network must have an unique node ID which must be different from 0. -Switch 1 (ID0) serves as the least significant bit (LSB). - -The node ID is the sum of the values of all switches set to "1" -These values are: - - Switch | Label | Value - -------|-------|------- - 1 | ID0 | 1 - 2 | ID1 | 2 - 3 | ID2 | 4 - 4 | ID3 | 8 - 5 | ID4 | 16 - 6 | ID5 | 32 - 7 | ID6 | 64 - 8 | ID7 | 128 - -Some Examples: - - Switch | Hex | Decimal - 8 7 6 5 4 3 2 1 | Node ID | Node ID - ----------------|---------|--------- - 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 - 0 0 0 0 0 0 1 0 | 2 | 2 - 0 0 0 0 0 0 1 1 | 3 | 3 - . . . | | - 0 1 0 1 0 1 0 1 | 55 | 85 - . . . | | - 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | - 1 1 1 1 1 1 0 1 | FD | 253 - 1 1 1 1 1 1 1 0 | FE | 254 - 1 1 1 1 1 1 1 1 | FF | 255 - - -Setting the I/O Base Address ----------------------------- - -The first six switches in switch block SW1 are used to select the I/O Base -address using the following table: - - Switch | Hex I/O - 1 2 3 4 5 6 | Address - ------------------------|-------- - OFF ON ON OFF OFF ON | 260 - OFF ON OFF ON ON OFF | 290 - OFF ON OFF OFF OFF ON | 2E0 (Manufacturer's default) - OFF ON OFF OFF OFF OFF | 2F0 - OFF OFF ON ON ON ON | 300 - OFF OFF ON OFF ON OFF | 350 - OFF OFF OFF ON ON ON | 380 - OFF OFF OFF OFF OFF ON | 3E0 - -Note: Other IO-Base addresses seem to be selectable, but only the above - combinations are documented. - - -Setting the Base Memory (RAM) buffer Address --------------------------------------------- - -The switches 7-10 of switch block SW1 are used to select the Memory -Base address of the RAM (2K) and the PROM. - - Switch | Hex RAM | Hex ROM - 7 8 9 10 | Address | Address - ----------------|---------|----------- - OFF OFF ON ON | C0000 | C8000 - OFF OFF ON OFF | D0000 | D8000 (Default) - OFF OFF OFF ON | E0000 | E8000 - -Note: Other MEM-Base addresses seem to be selectable, but only the above - combinations are documented. - - -Setting the Interrupt Line --------------------------- - -To select a hardware interrupt level install one (only one!) of the jumpers -JP3 through JP13 using the following table: - - Jumper | IRQ - -------|----------------- - 3 | 14 - 4 | 15 - 5 | 12 - 6 | 11 - 7 | 10 - 8 | 3 - 9 | 4 - 10 | 5 - 11 | 6 - 12 | 7 - 13 | 2 (=9) Default! - -Note: - Do not use JP11=IRQ6, it may conflict with your Floppy Disk - Controller - - Use JP3=IRQ14 only, if you don't have an IDE-, MFM-, or RLL- - Hard Disk, it may conflict with their controllers - - -Setting the Timeout Parameters ------------------------------- - -The jumpers labeled JP1 and JP2 are used to determine the timeout -parameters. These two jumpers are normally left open. - - -***************************************************************************** - -** Lantech ** -8-bit card, unknown model -------------------------- - - from Vlad Lungu - his e-mail address seemed broken at - the time I tried to reach him. Sorry Vlad, if you didn't get my reply. - - ________________________________________________________________ - | 1 8 | - | ___________ __| - | | SW1 | LED |__| - | |__________| | - | ___| - | _____________________ |S | 8 - | | | |W | - | | | |2 | - | | | |__| 1 - | | UM9065L | |o| JP4 ____|____ - | | | |o| | CN | - | | | |________| - | | | | - | |___________________| | - | | - | | - | _____________ | - | | | | - | | PROM | |ooooo| JP6 | - | |____________| |ooooo| | - |_____________ _ _| - |____________________________________________| |__| - - -UM9065L : ARCnet Controller - -SW 1 : Shared Memory Address and I/O Base - - ON=0 - - 12345|Memory Address - -----|-------------- - 00001| D4000 - 00010| CC000 - 00110| D0000 - 01110| D1000 - 01101| D9000 - 10010| CC800 - 10011| DC800 - 11110| D1800 - -It seems that the bits are considered in reverse order. Also, you must -observe that some of those addresses are unusual and I didn't probe them; I -used a memory dump in DOS to identify them. For the 00000 configuration and -some others that I didn't write here the card seems to conflict with the -video card (an S3 GENDAC). I leave the full decoding of those addresses to -you. - - 678| I/O Address - ---|------------ - 000| 260 - 001| failed probe - 010| 2E0 - 011| 380 - 100| 290 - 101| 350 - 110| failed probe - 111| 3E0 - -SW 2 : Node ID (binary coded) - -JP 4 : Boot PROM enable CLOSE - enabled - OPEN - disabled - -JP 6 : IRQ set (ONLY ONE jumper on 1-5 for IRQ 2-6) - - -***************************************************************************** - -** Acer ** -8-bit card, Model 5210-003 --------------------------- - - from Vojtech Pavlik using portions of the existing - arcnet-hardware file. - -This is a 90C26 based card. Its configuration seems similar to the SMC -PC100, but has some additional jumpers I don't know the meaning of. - - __ - | | - ___________|__|_________________________ - | | | | - | | BNC | | - | |______| ___| - | _____________________ |___ - | | | | - | | Hybrid IC | | - | | | o|o J1 | - | |_____________________| 8|8 | - | 8|8 J5 | - | o|o | - | 8|8 | - |__ 8|8 | - (|__| LED o|o | - | 8|8 | - | 8|8 J15 | - | | - | _____ | - | | | _____ | - | | | | | ___| - | | | | | | - | _____ | ROM | | UFS | | - | | | | | | | | - | | | ___ | | | | | - | | | | | |__.__| |__.__| | - | | NCR | |XTL| _____ _____ | - | | | |___| | | | | | - | |90C26| | | | | | - | | | | RAM | | UFS | | - | | | J17 o|o | | | | | - | | | J16 o|o | | | | | - | |__.__| |__.__| |__.__| | - | ___ | - | | |8 | - | |SW2| | - | | | | - | |___|1 | - | ___ | - | | |10 J18 o|o | - | | | o|o | - | |SW1| o|o | - | | | J21 o|o | - | |___|1 | - | | - |____________________________________| - - -Legend: - -90C26 ARCNET Chip -XTL 20 MHz Crystal -SW1 1-6 Base I/O Address Select - 7-10 Memory Address Select -SW2 1-8 Node ID Select (ID0-ID7) -J1-J5 IRQ Select -J6-J21 Unknown (Probably extra timeouts & ROM enable ...) -LED1 Activity LED -BNC Coax connector (STAR ARCnet) -RAM 2k of SRAM -ROM Boot ROM socket -UFS Unidentified Flying Sockets - - -Setting the Node ID -------------------- - -The eight switches in SW2 are used to set the node ID. Each node attached -to the network must have an unique node ID which must not be 0. -Switch 1 (ID0) serves as the least significant bit (LSB). - -Setting one of the switches to OFF means "1", ON means "0". - -The node ID is the sum of the values of all switches set to "1" -These values are: - - Switch | Value - -------|------- - 1 | 1 - 2 | 2 - 3 | 4 - 4 | 8 - 5 | 16 - 6 | 32 - 7 | 64 - 8 | 128 - -Don't set this to 0 or 255; these values are reserved. - - -Setting the I/O Base Address ----------------------------- - -The switches 1 to 6 of switch block SW1 are used to select one -of 32 possible I/O Base addresses using the following tables - - | Hex - Switch | Value - -------|------- - 1 | 200 - 2 | 100 - 3 | 80 - 4 | 40 - 5 | 20 - 6 | 10 - -The I/O address is sum of all switches set to "1". Remember that -the I/O address space bellow 0x200 is RESERVED for mainboard, so -switch 1 should be ALWAYS SET TO OFF. - - -Setting the Base Memory (RAM) buffer Address --------------------------------------------- - -The memory buffer (RAM) requires 2K. The base of this buffer can be -located in any of sixteen positions. However, the addresses below -A0000 are likely to cause system hang because there's main RAM. - -Jumpers 7-10 of switch block SW1 select the Memory Base address. - - Switch | Hex RAM - 7 8 9 10 | Address - ----------------|--------- - OFF OFF OFF OFF | F0000 (conflicts with main BIOS) - OFF OFF OFF ON | E0000 - OFF OFF ON OFF | D0000 - OFF OFF ON ON | C0000 (conflicts with video BIOS) - OFF ON OFF OFF | B0000 (conflicts with mono video) - OFF ON OFF ON | A0000 (conflicts with graphics) - - -Setting the Interrupt Line --------------------------- - -Jumpers 1-5 of the jumper block J1 control the IRQ level. ON means -shorted, OFF means open. - - Jumper | IRQ - 1 2 3 4 5 | - ---------------------------- - ON OFF OFF OFF OFF | 7 - OFF ON OFF OFF OFF | 5 - OFF OFF ON OFF OFF | 4 - OFF OFF OFF ON OFF | 3 - OFF OFF OFF OFF ON | 2 - - -Unknown jumpers & sockets -------------------------- - -I know nothing about these. I just guess that J16&J17 are timeout -jumpers and maybe one of J18-J21 selects ROM. Also J6-J10 and -J11-J15 are connecting IRQ2-7 to some pins on the UFSs. I can't -guess the purpose. - - -***************************************************************************** - -** Datapoint? ** -LAN-ARC-8, an 8-bit card ------------------------- - - from Vojtech Pavlik - -This is another SMC 90C65-based ARCnet card. I couldn't identify the -manufacturer, but it might be DataPoint, because the card has the -original arcNet logo in its upper right corner. - - _______________________________________________________ - | _________ | - | | SW2 | ON arcNet | - | |_________| OFF ___| - | _____________ 1 ______ 8 | | 8 - | | | SW1 | XTAL | ____________ | S | - | > RAM (2k) | |______|| | | W | - | |_____________| | H | | 3 | - | _________|_____ y | |___| 1 - | _________ | | |b | | - | |_________| | | |r | | - | | SMC | |i | | - | | 90C65| |d | | - | _________ | | | | | - | | SW1 | ON | | |I | | - | |_________| OFF |_________|_____/C | _____| - | 1 8 | | | |___ - | ______________ | | | BNC |___| - | | | |____________| |_____| - | > EPROM SOCKET | _____________ | - | |______________| |_____________| | - | ______________| - | | - |________________________________________| - -Legend: - -90C65 ARCNET Chip -SW1 1-5: Base Memory Address Select - 6-8: Base I/O Address Select -SW2 1-8: Node ID Select -SW3 1-5: IRQ Select - 6-7: Extra Timeout - 8 : ROM Enable -BNC Coax connector -XTAL 20 MHz Crystal - - -Setting the Node ID -------------------- - -The eight switches in SW3 are used to set the node ID. Each node attached -to the network must have an unique node ID which must not be 0. -Switch 1 serves as the least significant bit (LSB). - -Setting one of the switches to Off means "1", On means "0". - -The node ID is the sum of the values of all switches set to "1" -These values are: - - Switch | Value - -------|------- - 1 | 1 - 2 | 2 - 3 | 4 - 4 | 8 - 5 | 16 - 6 | 32 - 7 | 64 - 8 | 128 - - -Setting the I/O Base Address ----------------------------- - -The last three switches in switch block SW1 are used to select one -of eight possible I/O Base addresses using the following table - - - Switch | Hex I/O - 6 7 8 | Address - ------------|-------- - ON ON ON | 260 - OFF ON ON | 290 - ON OFF ON | 2E0 (Manufacturer's default) - OFF OFF ON | 2F0 - ON ON OFF | 300 - OFF ON OFF | 350 - ON OFF OFF | 380 - OFF OFF OFF | 3E0 - - -Setting the Base Memory (RAM) buffer Address --------------------------------------------- - -The memory buffer (RAM) requires 2K. The base of this buffer can be -located in any of eight positions. The address of the Boot Prom is -memory base + 0x2000. -Jumpers 3-5 of switch block SW1 select the Memory Base address. - - Switch | Hex RAM | Hex ROM - 1 2 3 4 5 | Address | Address *) - --------------------|---------|----------- - ON ON ON ON ON | C0000 | C2000 - ON ON OFF ON ON | C4000 | C6000 - ON ON ON OFF ON | CC000 | CE000 - ON ON OFF OFF ON | D0000 | D2000 (Manufacturer's default) - ON ON ON ON OFF | D4000 | D6000 - ON ON OFF ON OFF | D8000 | DA000 - ON ON ON OFF OFF | DC000 | DE000 - ON ON OFF OFF OFF | E0000 | E2000 - -*) To enable the Boot ROM set the switch 8 of switch block SW3 to position ON. - -The switches 1 and 2 probably add 0x0800 and 0x1000 to RAM base address. - - -Setting the Interrupt Line --------------------------- - -Switches 1-5 of the switch block SW3 control the IRQ level. - - Jumper | IRQ - 1 2 3 4 5 | - ---------------------------- - ON OFF OFF OFF OFF | 3 - OFF ON OFF OFF OFF | 4 - OFF OFF ON OFF OFF | 5 - OFF OFF OFF ON OFF | 7 - OFF OFF OFF OFF ON | 2 - - -Setting the Timeout Parameters ------------------------------- - -The switches 6-7 of the switch block SW3 are used to determine the timeout -parameters. These two switches are normally left in the OFF position. - - -***************************************************************************** - -** Topware ** -8-bit card, TA-ARC/10 -------------------------- - - from Vojtech Pavlik - -This is another very similar 90C65 card. Most of the switches and jumpers -are the same as on other clones. - - _____________________________________________________________________ -| ___________ | | ______ | -| |SW2 NODE ID| | | | XTAL | | -| |___________| | Hybrid IC | |______| | -| ___________ | | __| -| |SW1 MEM+I/O| |_________________________| LED1|__|) -| |___________| 1 2 | -| J3 |o|o| TIMEOUT ______| -| ______________ |o|o| | | -| | | ___________________ | RJ | -| > EPROM SOCKET | | \ |------| -|J2 |______________| | | | | -||o| | | |______| -||o| ROM ENABLE | SMC | _________ | -| _____________ | 90C65 | |_________| _____| -| | | | | | |___ -| > RAM (2k) | | | | BNC |___| -| |_____________| | | |_____| -| |____________________| | -| ________ IRQ 2 3 4 5 7 ___________ | -||________| |o|o|o|o|o| |___________| | -|________ J1|o|o|o|o|o| ______________| - | | - |_____________________________________________| - -Legend: - -90C65 ARCNET Chip -XTAL 20 MHz Crystal -SW1 1-5 Base Memory Address Select - 6-8 Base I/O Address Select -SW2 1-8 Node ID Select (ID0-ID7) -J1 IRQ Select -J2 ROM Enable -J3 Extra Timeout -LED1 Activity LED -BNC Coax connector (BUS ARCnet) -RJ Twisted Pair Connector (daisy chain) - - -Setting the Node ID -------------------- - -The eight switches in SW2 are used to set the node ID. Each node attached to -the network must have an unique node ID which must not be 0. Switch 1 (ID0) -serves as the least significant bit (LSB). - -Setting one of the switches to Off means "1", On means "0". - -The node ID is the sum of the values of all switches set to "1" -These values are: - - Switch | Label | Value - -------|-------|------- - 1 | ID0 | 1 - 2 | ID1 | 2 - 3 | ID2 | 4 - 4 | ID3 | 8 - 5 | ID4 | 16 - 6 | ID5 | 32 - 7 | ID6 | 64 - 8 | ID7 | 128 - -Setting the I/O Base Address ----------------------------- - -The last three switches in switch block SW1 are used to select one -of eight possible I/O Base addresses using the following table: - - - Switch | Hex I/O - 6 7 8 | Address - ------------|-------- - ON ON ON | 260 (Manufacturer's default) - OFF ON ON | 290 - ON OFF ON | 2E0 - OFF OFF ON | 2F0 - ON ON OFF | 300 - OFF ON OFF | 350 - ON OFF OFF | 380 - OFF OFF OFF | 3E0 - - -Setting the Base Memory (RAM) buffer Address --------------------------------------------- - -The memory buffer (RAM) requires 2K. The base of this buffer can be -located in any of eight positions. The address of the Boot Prom is -memory base + 0x2000. -Jumpers 3-5 of switch block SW1 select the Memory Base address. - - Switch | Hex RAM | Hex ROM - 1 2 3 4 5 | Address | Address *) - --------------------|---------|----------- - ON ON ON ON ON | C0000 | C2000 - ON ON OFF ON ON | C4000 | C6000 (Manufacturer's default) - ON ON ON OFF ON | CC000 | CE000 - ON ON OFF OFF ON | D0000 | D2000 - ON ON ON ON OFF | D4000 | D6000 - ON ON OFF ON OFF | D8000 | DA000 - ON ON ON OFF OFF | DC000 | DE000 - ON ON OFF OFF OFF | E0000 | E2000 - -*) To enable the Boot ROM short the jumper J2. - -The jumpers 1 and 2 probably add 0x0800 and 0x1000 to RAM address. - - -Setting the Interrupt Line --------------------------- - -Jumpers 1-5 of the jumper block J1 control the IRQ level. ON means -shorted, OFF means open. - - Jumper | IRQ - 1 2 3 4 5 | - ---------------------------- - ON OFF OFF OFF OFF | 2 - OFF ON OFF OFF OFF | 3 - OFF OFF ON OFF OFF | 4 - OFF OFF OFF ON OFF | 5 - OFF OFF OFF OFF ON | 7 - - -Setting the Timeout Parameters ------------------------------- - -The jumpers J3 are used to set the timeout parameters. These two -jumpers are normally left open. - - -***************************************************************************** - -** Thomas-Conrad ** -Model #500-6242-0097 REV A (8-bit card) ---------------------------------------- - - from Lars Karlsson <100617.3473@compuserve.com> - - ________________________________________________________ - | ________ ________ |_____ - | |........| |........| | - | |________| |________| ___| - | SW 3 SW 1 | | - | Base I/O Base Addr. Station | | - | address | | - | ______ switch | | - | | | | | - | | | |___| - | | | ______ |___._ - | |______| |______| ____| BNC - | Jumper- _____| Connector - | Main chip block _ __| ' - | | | | RJ Connector - | |_| | with 110 Ohm - | |__ Terminator - | ___________ __| - | |...........| | RJ-jack - | |...........| _____ | (unused) - | |___________| |_____| |__ - | Boot PROM socket IRQ-jumpers |_ Diagnostic - |________ __ _| LED (red) - | | | | | | | | | | | | | | | | | | | | | | - | | | | | | | | | | | | | | | | | | | | |________| - | - | - -And here are the settings for some of the switches and jumpers on the cards. - - - I/O - - 1 2 3 4 5 6 7 8 - -2E0----- 0 0 0 1 0 0 0 1 -2F0----- 0 0 0 1 0 0 0 0 -300----- 0 0 0 0 1 1 1 1 -350----- 0 0 0 0 1 1 1 0 - -"0" in the above example means switch is off "1" means that it is on. - - - ShMem address. - - 1 2 3 4 5 6 7 8 - -CX00--0 0 1 1 | | | -DX00--0 0 1 0 | -X000--------- 1 1 | -X400--------- 1 0 | -X800--------- 0 1 | -XC00--------- 0 0 -ENHANCED----------- 1 -COMPATIBLE--------- 0 - - - IRQ - - - 3 4 5 7 2 - . . . . . - . . . . . - - -There is a DIP-switch with 8 switches, used to set the shared memory address -to be used. The first 6 switches set the address, the 7th doesn't have any -function, and the 8th switch is used to select "compatible" or "enhanced". -When I got my two cards, one of them had this switch set to "enhanced". That -card didn't work at all, it wasn't even recognized by the driver. The other -card had this switch set to "compatible" and it behaved absolutely normally. I -guess that the switch on one of the cards, must have been changed accidentally -when the card was taken out of its former host. The question remains -unanswered, what is the purpose of the "enhanced" position? - -[Avery's note: "enhanced" probably either disables shared memory (use IO -ports instead) or disables IO ports (use memory addresses instead). This -varies by the type of card involved. I fail to see how either of these -enhance anything. Send me more detailed information about this mode, or -just use "compatible" mode instead.] - - -***************************************************************************** - -** Waterloo Microsystems Inc. ?? ** -8-bit card (C) 1985 -------------------- - - from Robert Michael Best - -[Avery's note: these don't work with my driver for some reason. These cards -SEEM to have settings similar to the PDI508Plus, which is -software-configured and doesn't work with my driver either. The "Waterloo -chip" is a boot PROM, probably designed specifically for the University of -Waterloo. If you have any further information about this card, please -e-mail me.] - -The probe has not been able to detect the card on any of the J2 settings, -and I tried them again with the "Waterloo" chip removed. - - _____________________________________________________________________ -| \/ \/ ___ __ __ | -| C4 C4 |^| | M || ^ ||^| | -| -- -- |_| | 5 || || | C3 | -| \/ \/ C10 |___|| ||_| | -| C4 C4 _ _ | | ?? | -| -- -- | \/ || | | -| | || | | -| | || C1 | | -| | || | \/ _____| -| | C6 || | C9 | |___ -| | || | -- | BNC |___| -| | || | >C7| |_____| -| | || | | -| __ __ |____||_____| 1 2 3 6 | -|| ^ | >C4| |o|o|o|o|o|o| J2 >C4| | -|| | |o|o|o|o|o|o| | -|| C2 | >C4| >C4| | -|| | >C8| | -|| | 2 3 4 5 6 7 IRQ >C4| | -||_____| |o|o|o|o|o|o| J3 | -|_______ |o|o|o|o|o|o| _______________| - | | - |_____________________________________________| - -C1 -- "COM9026 - SMC 8638" - In a chip socket. - -C2 -- "@Copyright - Waterloo Microsystems Inc. - 1985" - In a chip Socket with info printed on a label covering a round window - showing the circuit inside. (The window indicates it is an EPROM chip.) - -C3 -- "COM9032 - SMC 8643" - In a chip socket. - -C4 -- "74LS" - 9 total no sockets. - -M5 -- "50006-136 - 20.000000 MHZ - MTQ-T1-S3 - 0 M-TRON 86-40" - Metallic case with 4 pins, no socket. - -C6 -- "MOSTEK@TC8643 - MK6116N-20 - MALAYSIA" - No socket. - -C7 -- No stamp or label but in a 20 pin chip socket. - -C8 -- "PAL10L8CN - 8623" - In a 20 pin socket. - -C9 -- "PAl16R4A-2CN - 8641" - In a 20 pin socket. - -C10 -- "M8640 - NMC - 9306N" - In an 8 pin socket. - -?? -- Some components on a smaller board and attached with 20 pins all - along the side closest to the BNC connector. The are coated in a dark - resin. - -On the board there are two jumper banks labeled J2 and J3. The -manufacturer didn't put a J1 on the board. The two boards I have both -came with a jumper box for each bank. - -J2 -- Numbered 1 2 3 4 5 6. - 4 and 5 are not stamped due to solder points. - -J3 -- IRQ 2 3 4 5 6 7 - -The board itself has a maple leaf stamped just above the irq jumpers -and "-2 46-86" beside C2. Between C1 and C6 "ASS 'Y 300163" and "@1986 -CORMAN CUSTOM ELECTRONICS CORP." stamped just below the BNC connector. -Below that "MADE IN CANADA" - - -***************************************************************************** - -** No Name ** -8-bit cards, 16-bit cards -------------------------- - - from Juergen Seifert - -NONAME 8-BIT ARCNET -=================== - -I have named this ARCnet card "NONAME", since there is no name of any -manufacturer on the Installation manual nor on the shipping box. The only -hint to the existence of a manufacturer at all is written in copper, -it is "Made in Taiwan" - -This description has been written by Juergen Seifert -using information from the Original - "ARCnet Installation Manual" - - - ________________________________________________________________ - | |STAR| BUS| T/P| | - | |____|____|____| | - | _____________________ | - | | | | - | | | | - | | | | - | | SMC | | - | | | | - | | COM90C65 | | - | | | | - | | | | - | |__________-__________| | - | _____| - | _______________ | CN | - | | PROM | |_____| - | > SOCKET | | - | |_______________| 1 2 3 4 5 6 7 8 1 2 3 4 5 6 7 8 | - | _______________ _______________ | - | |o|o|o|o|o|o|o|o| | SW1 || SW2 || - | |o|o|o|o|o|o|o|o| |_______________||_______________|| - |___ 2 3 4 5 7 E E R Node ID IOB__|__MEM____| - | \ IRQ / T T O | - |__________________1_2_M______________________| - -Legend: - -COM90C65: ARCnet Probe -S1 1-8: Node ID Select -S2 1-3: I/O Base Address Select - 4-6: Memory Base Address Select - 7-8: RAM Offset Select -ET1, ET2 Extended Timeout Select -ROM ROM Enable Select -CN RG62 Coax Connector -STAR| BUS | T/P Three fields for placing a sign (colored circle) - indicating the topology of the card - -Setting one of the switches to Off means "1", On means "0". - - -Setting the Node ID -------------------- - -The eight switches in group SW1 are used to set the node ID. -Each node attached to the network must have an unique node ID which -must be different from 0. -Switch 8 serves as the least significant bit (LSB). - -The node ID is the sum of the values of all switches set to "1" -These values are: - - Switch | Value - -------|------- - 8 | 1 - 7 | 2 - 6 | 4 - 5 | 8 - 4 | 16 - 3 | 32 - 2 | 64 - 1 | 128 - -Some Examples: - - Switch | Hex | Decimal - 1 2 3 4 5 6 7 8 | Node ID | Node ID - ----------------|---------|--------- - 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 - 0 0 0 0 0 0 1 0 | 2 | 2 - 0 0 0 0 0 0 1 1 | 3 | 3 - . . . | | - 0 1 0 1 0 1 0 1 | 55 | 85 - . . . | | - 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | - 1 1 1 1 1 1 0 1 | FD | 253 - 1 1 1 1 1 1 1 0 | FE | 254 - 1 1 1 1 1 1 1 1 | FF | 255 - - -Setting the I/O Base Address ----------------------------- - -The first three switches in switch group SW2 are used to select one -of eight possible I/O Base addresses using the following table - - Switch | Hex I/O - 1 2 3 | Address - ------------|-------- - ON ON ON | 260 - ON ON OFF | 290 - ON OFF ON | 2E0 (Manufacturer's default) - ON OFF OFF | 2F0 - OFF ON ON | 300 - OFF ON OFF | 350 - OFF OFF ON | 380 - OFF OFF OFF | 3E0 - - -Setting the Base Memory (RAM) buffer Address --------------------------------------------- - -The memory buffer requires 2K of a 16K block of RAM. The base of this -16K block can be located in any of eight positions. -Switches 4-6 of switch group SW2 select the Base of the 16K block. -Within that 16K address space, the buffer may be assigned any one of four -positions, determined by the offset, switches 7 and 8 of group SW2. - - Switch | Hex RAM | Hex ROM - 4 5 6 7 8 | Address | Address *) - -----------|---------|----------- - 0 0 0 0 0 | C0000 | C2000 - 0 0 0 0 1 | C0800 | C2000 - 0 0 0 1 0 | C1000 | C2000 - 0 0 0 1 1 | C1800 | C2000 - | | - 0 0 1 0 0 | C4000 | C6000 - 0 0 1 0 1 | C4800 | C6000 - 0 0 1 1 0 | C5000 | C6000 - 0 0 1 1 1 | C5800 | C6000 - | | - 0 1 0 0 0 | CC000 | CE000 - 0 1 0 0 1 | CC800 | CE000 - 0 1 0 1 0 | CD000 | CE000 - 0 1 0 1 1 | CD800 | CE000 - | | - 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default) - 0 1 1 0 1 | D0800 | D2000 - 0 1 1 1 0 | D1000 | D2000 - 0 1 1 1 1 | D1800 | D2000 - | | - 1 0 0 0 0 | D4000 | D6000 - 1 0 0 0 1 | D4800 | D6000 - 1 0 0 1 0 | D5000 | D6000 - 1 0 0 1 1 | D5800 | D6000 - | | - 1 0 1 0 0 | D8000 | DA000 - 1 0 1 0 1 | D8800 | DA000 - 1 0 1 1 0 | D9000 | DA000 - 1 0 1 1 1 | D9800 | DA000 - | | - 1 1 0 0 0 | DC000 | DE000 - 1 1 0 0 1 | DC800 | DE000 - 1 1 0 1 0 | DD000 | DE000 - 1 1 0 1 1 | DD800 | DE000 - | | - 1 1 1 0 0 | E0000 | E2000 - 1 1 1 0 1 | E0800 | E2000 - 1 1 1 1 0 | E1000 | E2000 - 1 1 1 1 1 | E1800 | E2000 - -*) To enable the 8K Boot PROM install the jumper ROM. - The default is jumper ROM not installed. - - -Setting Interrupt Request Lines (IRQ) -------------------------------------- - -To select a hardware interrupt level set one (only one!) of the jumpers -IRQ2, IRQ3, IRQ4, IRQ5 or IRQ7. The manufacturer's default is IRQ2. - - -Setting the Timeouts --------------------- - -The two jumpers labeled ET1 and ET2 are used to determine the timeout -parameters (response and reconfiguration time). Every node in a network -must be set to the same timeout values. - - ET1 ET2 | Response Time (us) | Reconfiguration Time (ms) - --------|--------------------|-------------------------- - Off Off | 78 | 840 (Default) - Off On | 285 | 1680 - On Off | 563 | 1680 - On On | 1130 | 1680 - -On means jumper installed, Off means jumper not installed - - -NONAME 16-BIT ARCNET -==================== - -The manual of my 8-Bit NONAME ARCnet Card contains another description -of a 16-Bit Coax / Twisted Pair Card. This description is incomplete, -because there are missing two pages in the manual booklet. (The table -of contents reports pages ... 2-9, 2-11, 2-12, 3-1, ... but inside -the booklet there is a different way of counting ... 2-9, 2-10, A-1, -(empty page), 3-1, ..., 3-18, A-1 (again), A-2) -Also the picture of the board layout is not as good as the picture of -8-Bit card, because there isn't any letter like "SW1" written to the -picture. -Should somebody have such a board, please feel free to complete this -description or to send a mail to me! - -This description has been written by Juergen Seifert -using information from the Original - "ARCnet Installation Manual" - - - ___________________________________________________________________ - < _________________ _________________ | - > | SW? || SW? | | - < |_________________||_________________| | - > ____________________ | - < | | | - > | | | - < | | | - > | | | - < | | | - > | | | - < | | | - > |____________________| | - < ____| - > ____________________ | | - < | | | J1 | - > | < | | - < |____________________| ? ? ? ? ? ? |____| - > |o|o|o|o|o|o| | - < |o|o|o|o|o|o| | - > | - < __ ___________| - > | | | - <____________| |_______________________________________| - - -Setting one of the switches to Off means "1", On means "0". - - -Setting the Node ID -------------------- - -The eight switches in group SW2 are used to set the node ID. -Each node attached to the network must have an unique node ID which -must be different from 0. -Switch 8 serves as the least significant bit (LSB). - -The node ID is the sum of the values of all switches set to "1" -These values are: - - Switch | Value - -------|------- - 8 | 1 - 7 | 2 - 6 | 4 - 5 | 8 - 4 | 16 - 3 | 32 - 2 | 64 - 1 | 128 - -Some Examples: - - Switch | Hex | Decimal - 1 2 3 4 5 6 7 8 | Node ID | Node ID - ----------------|---------|--------- - 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 - 0 0 0 0 0 0 1 0 | 2 | 2 - 0 0 0 0 0 0 1 1 | 3 | 3 - . . . | | - 0 1 0 1 0 1 0 1 | 55 | 85 - . . . | | - 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | - 1 1 1 1 1 1 0 1 | FD | 253 - 1 1 1 1 1 1 1 0 | FE | 254 - 1 1 1 1 1 1 1 1 | FF | 255 - - -Setting the I/O Base Address ----------------------------- - -The first three switches in switch group SW1 are used to select one -of eight possible I/O Base addresses using the following table - - Switch | Hex I/O - 3 2 1 | Address - ------------|-------- - ON ON ON | 260 - ON ON OFF | 290 - ON OFF ON | 2E0 (Manufacturer's default) - ON OFF OFF | 2F0 - OFF ON ON | 300 - OFF ON OFF | 350 - OFF OFF ON | 380 - OFF OFF OFF | 3E0 - - -Setting the Base Memory (RAM) buffer Address --------------------------------------------- - -The memory buffer requires 2K of a 16K block of RAM. The base of this -16K block can be located in any of eight positions. -Switches 6-8 of switch group SW1 select the Base of the 16K block. -Within that 16K address space, the buffer may be assigned any one of four -positions, determined by the offset, switches 4 and 5 of group SW1. - - Switch | Hex RAM | Hex ROM - 8 7 6 5 4 | Address | Address - -----------|---------|----------- - 0 0 0 0 0 | C0000 | C2000 - 0 0 0 0 1 | C0800 | C2000 - 0 0 0 1 0 | C1000 | C2000 - 0 0 0 1 1 | C1800 | C2000 - | | - 0 0 1 0 0 | C4000 | C6000 - 0 0 1 0 1 | C4800 | C6000 - 0 0 1 1 0 | C5000 | C6000 - 0 0 1 1 1 | C5800 | C6000 - | | - 0 1 0 0 0 | CC000 | CE000 - 0 1 0 0 1 | CC800 | CE000 - 0 1 0 1 0 | CD000 | CE000 - 0 1 0 1 1 | CD800 | CE000 - | | - 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default) - 0 1 1 0 1 | D0800 | D2000 - 0 1 1 1 0 | D1000 | D2000 - 0 1 1 1 1 | D1800 | D2000 - | | - 1 0 0 0 0 | D4000 | D6000 - 1 0 0 0 1 | D4800 | D6000 - 1 0 0 1 0 | D5000 | D6000 - 1 0 0 1 1 | D5800 | D6000 - | | - 1 0 1 0 0 | D8000 | DA000 - 1 0 1 0 1 | D8800 | DA000 - 1 0 1 1 0 | D9000 | DA000 - 1 0 1 1 1 | D9800 | DA000 - | | - 1 1 0 0 0 | DC000 | DE000 - 1 1 0 0 1 | DC800 | DE000 - 1 1 0 1 0 | DD000 | DE000 - 1 1 0 1 1 | DD800 | DE000 - | | - 1 1 1 0 0 | E0000 | E2000 - 1 1 1 0 1 | E0800 | E2000 - 1 1 1 1 0 | E1000 | E2000 - 1 1 1 1 1 | E1800 | E2000 - - -Setting Interrupt Request Lines (IRQ) -------------------------------------- - -?????????????????????????????????????? - - -Setting the Timeouts --------------------- - -?????????????????????????????????????? - - -***************************************************************************** - -** No Name ** -8-bit cards ("Made in Taiwan R.O.C.") ------------ - - from Vojtech Pavlik - -I have named this ARCnet card "NONAME", since I got only the card with -no manual at all and the only text identifying the manufacturer is -"MADE IN TAIWAN R.O.C" printed on the card. - - ____________________________________________________________ - | 1 2 3 4 5 6 7 8 | - | |o|o| JP1 o|o|o|o|o|o|o|o| ON | - | + o|o|o|o|o|o|o|o| ___| - | _____________ o|o|o|o|o|o|o|o| OFF _____ | | ID7 - | | | SW1 | | | | ID6 - | > RAM (2k) | ____________________ | H | | S | ID5 - | |_____________| | || y | | W | ID4 - | | || b | | 2 | ID3 - | | || r | | | ID2 - | | || i | | | ID1 - | | 90C65 || d | |___| ID0 - | SW3 | || | | - | |o|o|o|o|o|o|o|o| ON | || I | | - | |o|o|o|o|o|o|o|o| | || C | | - | |o|o|o|o|o|o|o|o| OFF |____________________|| | _____| - | 1 2 3 4 5 6 7 8 | | | |___ - | ______________ | | | BNC |___| - | | | |_____| |_____| - | > EPROM SOCKET | | - | |______________| | - | ______________| - | | - |_____________________________________________| - -Legend: - -90C65 ARCNET Chip -SW1 1-5: Base Memory Address Select - 6-8: Base I/O Address Select -SW2 1-8: Node ID Select (ID0-ID7) -SW3 1-5: IRQ Select - 6-7: Extra Timeout - 8 : ROM Enable -JP1 Led connector -BNC Coax connector - -Although the jumpers SW1 and SW3 are marked SW, not JP, they are jumpers, not -switches. - -Setting the jumpers to ON means connecting the upper two pins, off the bottom -two - or - in case of IRQ setting, connecting none of them at all. - -Setting the Node ID -------------------- - -The eight switches in SW2 are used to set the node ID. Each node attached -to the network must have an unique node ID which must not be 0. -Switch 1 (ID0) serves as the least significant bit (LSB). - -Setting one of the switches to Off means "1", On means "0". - -The node ID is the sum of the values of all switches set to "1" -These values are: - - Switch | Label | Value - -------|-------|------- - 1 | ID0 | 1 - 2 | ID1 | 2 - 3 | ID2 | 4 - 4 | ID3 | 8 - 5 | ID4 | 16 - 6 | ID5 | 32 - 7 | ID6 | 64 - 8 | ID7 | 128 - -Some Examples: - - Switch | Hex | Decimal - 8 7 6 5 4 3 2 1 | Node ID | Node ID - ----------------|---------|--------- - 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 - 0 0 0 0 0 0 1 0 | 2 | 2 - 0 0 0 0 0 0 1 1 | 3 | 3 - . . . | | - 0 1 0 1 0 1 0 1 | 55 | 85 - . . . | | - 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | - 1 1 1 1 1 1 0 1 | FD | 253 - 1 1 1 1 1 1 1 0 | FE | 254 - 1 1 1 1 1 1 1 1 | FF | 255 - - -Setting the I/O Base Address ----------------------------- - -The last three switches in switch block SW1 are used to select one -of eight possible I/O Base addresses using the following table - - - Switch | Hex I/O - 6 7 8 | Address - ------------|-------- - ON ON ON | 260 - OFF ON ON | 290 - ON OFF ON | 2E0 (Manufacturer's default) - OFF OFF ON | 2F0 - ON ON OFF | 300 - OFF ON OFF | 350 - ON OFF OFF | 380 - OFF OFF OFF | 3E0 - - -Setting the Base Memory (RAM) buffer Address --------------------------------------------- - -The memory buffer (RAM) requires 2K. The base of this buffer can be -located in any of eight positions. The address of the Boot Prom is -memory base + 0x2000. -Jumpers 3-5 of jumper block SW1 select the Memory Base address. - - Switch | Hex RAM | Hex ROM - 1 2 3 4 5 | Address | Address *) - --------------------|---------|----------- - ON ON ON ON ON | C0000 | C2000 - ON ON OFF ON ON | C4000 | C6000 - ON ON ON OFF ON | CC000 | CE000 - ON ON OFF OFF ON | D0000 | D2000 (Manufacturer's default) - ON ON ON ON OFF | D4000 | D6000 - ON ON OFF ON OFF | D8000 | DA000 - ON ON ON OFF OFF | DC000 | DE000 - ON ON OFF OFF OFF | E0000 | E2000 - -*) To enable the Boot ROM set the jumper 8 of jumper block SW3 to position ON. - -The jumpers 1 and 2 probably add 0x0800, 0x1000 and 0x1800 to RAM adders. - -Setting the Interrupt Line --------------------------- - -Jumpers 1-5 of the jumper block SW3 control the IRQ level. - - Jumper | IRQ - 1 2 3 4 5 | - ---------------------------- - ON OFF OFF OFF OFF | 2 - OFF ON OFF OFF OFF | 3 - OFF OFF ON OFF OFF | 4 - OFF OFF OFF ON OFF | 5 - OFF OFF OFF OFF ON | 7 - - -Setting the Timeout Parameters ------------------------------- - -The jumpers 6-7 of the jumper block SW3 are used to determine the timeout -parameters. These two jumpers are normally left in the OFF position. - - -***************************************************************************** - -** No Name ** -(Generic Model 9058) --------------------- - - from Andrew J. Kroll - - Sorry this sat in my to-do box for so long, Andrew! (yikes - over a - year!) - _____ - | < - | .---' - ________________________________________________________________ | | - | | SW2 | | | - | ___________ |_____________| | | - | | | 1 2 3 4 5 6 ___| | - | > 6116 RAM | _________ 8 | | | - | |___________| |20MHzXtal| 7 | | | - | |_________| __________ 6 | S | | - | 74LS373 | |- 5 | W | | - | _________ | E |- 4 | | | - | >_______| ______________|..... P |- 3 | 3 | | - | | | : O |- 2 | | | - | | | : X |- 1 |___| | - | ________________ | | : Y |- | | - | | SW1 | | SL90C65 | : |- | | - | |________________| | | : B |- | | - | 1 2 3 4 5 6 7 8 | | : O |- | | - | |_________o____|..../ A |- _______| | - | ____________________ | R |- | |------, - | | | | D |- | BNC | # | - | > 2764 PROM SOCKET | |__________|- |_______|------' - | |____________________| _________ | | - | >________| <- 74LS245 | | - | | | - |___ ______________| | - |H H H H H H H H H H H H H H H H H H H H H H H| | | - |U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U| | | - \| -Legend: - -SL90C65 ARCNET Controller / Transceiver /Logic -SW1 1-5: IRQ Select - 6: ET1 - 7: ET2 - 8: ROM ENABLE -SW2 1-3: Memory Buffer/PROM Address - 3-6: I/O Address Map -SW3 1-8: Node ID Select -BNC BNC RG62/U Connection - *I* have had success using RG59B/U with *NO* terminators! - What gives?! - -SW1: Timeouts, Interrupt and ROM ---------------------------------- - -To select a hardware interrupt level set one (only one!) of the dip switches -up (on) SW1...(switches 1-5) -IRQ3, IRQ4, IRQ5, IRQ7, IRQ2. The Manufacturer's default is IRQ2. - -The switches on SW1 labeled EXT1 (switch 6) and EXT2 (switch 7) -are used to determine the timeout parameters. These two dip switches -are normally left off (down). - - To enable the 8K Boot PROM position SW1 switch 8 on (UP) labeled ROM. - The default is jumper ROM not installed. - - -Setting the I/O Base Address ----------------------------- - -The last three switches in switch group SW2 are used to select one -of eight possible I/O Base addresses using the following table - - - Switch | Hex I/O - 4 5 6 | Address - -------|-------- - 0 0 0 | 260 - 0 0 1 | 290 - 0 1 0 | 2E0 (Manufacturer's default) - 0 1 1 | 2F0 - 1 0 0 | 300 - 1 0 1 | 350 - 1 1 0 | 380 - 1 1 1 | 3E0 - - -Setting the Base Memory Address (RAM & ROM) -------------------------------------------- - -The memory buffer requires 2K of a 16K block of RAM. The base of this -16K block can be located in any of eight positions. -Switches 1-3 of switch group SW2 select the Base of the 16K block. -(0 = DOWN, 1 = UP) -I could, however, only verify two settings... - - Switch| Hex RAM | Hex ROM - 1 2 3 | Address | Address - ------|---------|----------- - 0 0 0 | E0000 | E2000 - 0 0 1 | D0000 | D2000 (Manufacturer's default) - 0 1 0 | ????? | ????? - 0 1 1 | ????? | ????? - 1 0 0 | ????? | ????? - 1 0 1 | ????? | ????? - 1 1 0 | ????? | ????? - 1 1 1 | ????? | ????? - - -Setting the Node ID -------------------- - -The eight switches in group SW3 are used to set the node ID. -Each node attached to the network must have an unique node ID which -must be different from 0. -Switch 1 serves as the least significant bit (LSB). -switches in the DOWN position are OFF (0) and in the UP position are ON (1) - -The node ID is the sum of the values of all switches set to "1" -These values are: - Switch | Value - -------|------- - 1 | 1 - 2 | 2 - 3 | 4 - 4 | 8 - 5 | 16 - 6 | 32 - 7 | 64 - 8 | 128 - -Some Examples: - - Switch# | Hex | Decimal -8 7 6 5 4 3 2 1 | Node ID | Node ID -----------------|---------|--------- -0 0 0 0 0 0 0 0 | not allowed <-. -0 0 0 0 0 0 0 1 | 1 | 1 | -0 0 0 0 0 0 1 0 | 2 | 2 | -0 0 0 0 0 0 1 1 | 3 | 3 | - . . . | | | -0 1 0 1 0 1 0 1 | 55 | 85 | - . . . | | + Don't use 0 or 255! -1 0 1 0 1 0 1 0 | AA | 170 | - . . . | | | -1 1 1 1 1 1 0 1 | FD | 253 | -1 1 1 1 1 1 1 0 | FE | 254 | -1 1 1 1 1 1 1 1 | FF | 255 <-' - - -***************************************************************************** - -** Tiara ** -(model unknown) -------------------------- - - from Christoph Lameter - - -Here is information about my card as far as I could figure it out: ------------------------------------------------ tiara -Tiara LanCard of Tiara Computer Systems. - -+----------------------------------------------+ -! ! Transmitter Unit ! ! -! +------------------+ ------- -! MEM Coax Connector -! ROM 7654321 <- I/O ------- -! : : +--------+ ! -! : : ! 90C66LJ! +++ -! : : ! ! !D Switch to set -! : : ! ! !I the Nodenumber -! : : +--------+ !P -! !++ -! 234567 <- IRQ ! -+------------!!!!!!!!!!!!!!!!!!!!!!!!--------+ - !!!!!!!!!!!!!!!!!!!!!!!! - -0 = Jumper Installed -1 = Open - -Top Jumper line Bit 7 = ROM Enable 654=Memory location 321=I/O - -Settings for Memory Location (Top Jumper Line) -456 Address selected -000 C0000 -001 C4000 -010 CC000 -011 D0000 -100 D4000 -101 D8000 -110 DC000 -111 E0000 - -Settings for I/O Address (Top Jumper Line) -123 Port -000 260 -001 290 -010 2E0 -011 2F0 -100 300 -101 350 -110 380 -111 3E0 - -Settings for IRQ Selection (Lower Jumper Line) -234567 -011111 IRQ 2 -101111 IRQ 3 -110111 IRQ 4 -111011 IRQ 5 -111110 IRQ 7 - -***************************************************************************** - - -Other Cards ------------ - -I have no information on other models of ARCnet cards at the moment. Please -send any and all info to: - apenwarr@worldvisions.ca - -Thanks. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 96ffad845fd9..5da18e024fcb 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -39,6 +39,7 @@ Contents: 6lowpan 6pack altera_tse + arcnet-hardware .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 08bab46f00d0f0fe9709a05b7cdfe909a4258b01 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:20 +0200 Subject: docs: networking: convert arcnet.txt to ReST - add SPDX header; - use document title markup; - add notes markups; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/arcnet.rst | 594 ++++++++++++++++++++++++++++++++++++ Documentation/networking/arcnet.txt | 556 --------------------------------- Documentation/networking/index.rst | 1 + drivers/net/arcnet/Kconfig | 6 +- 4 files changed, 598 insertions(+), 559 deletions(-) create mode 100644 Documentation/networking/arcnet.rst delete mode 100644 Documentation/networking/arcnet.txt diff --git a/Documentation/networking/arcnet.rst b/Documentation/networking/arcnet.rst new file mode 100644 index 000000000000..e93d9820f0f1 --- /dev/null +++ b/Documentation/networking/arcnet.rst @@ -0,0 +1,594 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +ARCnet +====== + +.. note:: + + See also arcnet-hardware.txt in this directory for jumper-setting + and cabling information if you're like many of us and didn't happen to get a + manual with your ARCnet card. + +Since no one seems to listen to me otherwise, perhaps a poem will get your +attention:: + + This driver's getting fat and beefy, + But my cat is still named Fifi. + +Hmm, I think I'm allowed to call that a poem, even though it's only two +lines. Hey, I'm in Computer Science, not English. Give me a break. + +The point is: I REALLY REALLY REALLY REALLY REALLY want to hear from you if +you test this and get it working. Or if you don't. Or anything. + +ARCnet 0.32 ALPHA first made it into the Linux kernel 1.1.80 - this was +nice, but after that even FEWER people started writing to me because they +didn't even have to install the patch. + +Come on, be a sport! Send me a success report! + +(hey, that was even better than my original poem... this is getting bad!) + + +.. warning:: + + If you don't e-mail me about your success/failure soon, I may be forced to + start SINGING. And we don't want that, do we? + + (You know, it might be argued that I'm pushing this point a little too much. + If you think so, why not flame me in a quick little e-mail? Please also + include the type of card(s) you're using, software, size of network, and + whether it's working or not.) + + My e-mail address is: apenwarr@worldvisions.ca + +These are the ARCnet drivers for Linux. + +This new release (2.91) has been put together by David Woodhouse +, in an attempt to tidy up the driver after adding support +for yet another chipset. Now the generic support has been separated from the +individual chipset drivers, and the source files aren't quite so packed with +#ifdefs! I've changed this file a bit, but kept it in the first person from +Avery, because I didn't want to completely rewrite it. + +The previous release resulted from many months of on-and-off effort from me +(Avery Pennarun), many bug reports/fixes and suggestions from others, and in +particular a lot of input and coding from Tomasz Motylewski. Starting with +ARCnet 2.10 ALPHA, Tomasz's all-new-and-improved RFC1051 support has been +included and seems to be working fine! + + +Where do I discuss these drivers? +--------------------------------- + +Tomasz has been so kind as to set up a new and improved mailing list. +Subscribe by sending a message with the BODY "subscribe linux-arcnet YOUR +REAL NAME" to listserv@tichy.ch.uj.edu.pl. Then, to submit messages to the +list, mail to linux-arcnet@tichy.ch.uj.edu.pl. + +There are archives of the mailing list at: + + http://epistolary.org/mailman/listinfo.cgi/arcnet + +The people on linux-net@vger.kernel.org (now defunct, replaced by +netdev@vger.kernel.org) have also been known to be very helpful, especially +when we're talking about ALPHA Linux kernels that may or may not work right +in the first place. + + +Other Drivers and Info +---------------------- + +You can try my ARCNET page on the World Wide Web at: + + http://www.qis.net/~jschmitz/arcnet/ + +Also, SMC (one of the companies that makes ARCnet cards) has a WWW site you +might be interested in, which includes several drivers for various cards +including ARCnet. Try: + + http://www.smc.com/ + +Performance Technologies makes various network software that supports +ARCnet: + + http://www.perftech.com/ or ftp to ftp.perftech.com. + +Novell makes a networking stack for DOS which includes ARCnet drivers. Try +FTPing to ftp.novell.com. + +You can get the Crynwr packet driver collection (including arcether.com, the +one you'll want to use with ARCnet cards) from +oak.oakland.edu:/simtel/msdos/pktdrvr. It won't work perfectly on a 386+ +without patches, though, and also doesn't like several cards. Fixed +versions are available on my WWW page, or via e-mail if you don't have WWW +access. + + +Installing the Driver +--------------------- + +All you will need to do in order to install the driver is:: + + make config + (be sure to choose ARCnet in the network devices + and at least one chipset driver.) + make clean + make zImage + +If you obtained this ARCnet package as an upgrade to the ARCnet driver in +your current kernel, you will need to first copy arcnet.c over the one in +the linux/drivers/net directory. + +You will know the driver is installed properly if you get some ARCnet +messages when you reboot into the new Linux kernel. + +There are four chipset options: + + 1. Standard ARCnet COM90xx chipset. + +This is the normal ARCnet card, which you've probably got. This is the only +chipset driver which will autoprobe if not told where the card is. +It following options on the command line:: + + com90xx=[[,[,]]][,] | + +If you load the chipset support as a module, the options are:: + + io= irq= shmem= device= + +To disable the autoprobe, just specify "com90xx=" on the kernel command line. +To specify the name alone, but allow autoprobe, just put "com90xx=" + + 2. ARCnet COM20020 chipset. + +This is the new chipset from SMC with support for promiscuous mode (packet +sniffing), extra diagnostic information, etc. Unfortunately, there is no +sensible method of autoprobing for these cards. You must specify the I/O +address on the kernel command line. + +The command line options are:: + + com20020=[,[,[,backplane[,CKP[,timeout]]]]][,name] + +If you load the chipset support as a module, the options are:: + + io= irq= node= backplane= clock= + timeout= device= + +The COM20020 chipset allows you to set the node ID in software, overriding the +default which is still set in DIP switches on the card. If you don't have the +COM20020 data sheets, and you don't know what the other three options refer +to, then they won't interest you - forget them. + + 3. ARCnet COM90xx chipset in IO-mapped mode. + +This will also work with the normal ARCnet cards, but doesn't use the shared +memory. It performs less well than the above driver, but is provided in case +you have a card which doesn't support shared memory, or (strangely) in case +you have so many ARCnet cards in your machine that you run out of shmem slots. +If you don't give the IO address on the kernel command line, then the driver +will not find the card. + +The command line options are:: + + com90io=[,][,] + +If you load the chipset support as a module, the options are: + io= irq= device= + + 4. ARCnet RIM I cards. + +These are COM90xx chips which are _completely_ memory mapped. The support for +these is not tested. If you have one, please mail the author with a success +report. All options must be specified, except the device name. +Command line options:: + + arcrimi=,,[,] + +If you load the chipset support as a module, the options are:: + + shmem= irq= node= device= + + +Loadable Module Support +----------------------- + +Configure and rebuild Linux. When asked, answer 'm' to "Generic ARCnet +support" and to support for your ARCnet chipset if you want to use the +loadable module. You can also say 'y' to "Generic ARCnet support" and 'm' +to the chipset support if you wish. + +:: + + make config + make clean + make zImage + make modules + +If you're using a loadable module, you need to use insmod to load it, and +you can specify various characteristics of your card on the command +line. (In recent versions of the driver, autoprobing is much more reliable +and works as a module, so most of this is now unnecessary.) + +For example:: + + cd /usr/src/linux/modules + insmod arcnet.o + insmod com90xx.o + insmod com20020.o io=0x2e0 device=eth1 + + +Using the Driver +---------------- + +If you build your kernel with ARCnet COM90xx support included, it should +probe for your card automatically when you boot. If you use a different +chipset driver complied into the kernel, you must give the necessary options +on the kernel command line, as detailed above. + +Go read the NET-2-HOWTO and ETHERNET-HOWTO for Linux; they should be +available where you picked up this driver. Think of your ARCnet as a +souped-up (or down, as the case may be) Ethernet card. + +By the way, be sure to change all references from "eth0" to "arc0" in the +HOWTOs. Remember that ARCnet isn't a "true" Ethernet, and the device name +is DIFFERENT. + + +Multiple Cards in One Computer +------------------------------ + +Linux has pretty good support for this now, but since I've been busy, the +ARCnet driver has somewhat suffered in this respect. COM90xx support, if +compiled into the kernel, will (try to) autodetect all the installed cards. + +If you have other cards, with support compiled into the kernel, then you can +just repeat the options on the kernel command line, e.g.:: + + LILO: linux com20020=0x2e0 com20020=0x380 com90io=0x260 + +If you have the chipset support built as a loadable module, then you need to +do something like this:: + + insmod -o arc0 com90xx + insmod -o arc1 com20020 io=0x2e0 + insmod -o arc2 com90xx + +The ARCnet drivers will now sort out their names automatically. + + +How do I get it to work with...? +-------------------------------- + +NFS: + Should be fine linux->linux, just pretend you're using Ethernet cards. + oak.oakland.edu:/simtel/msdos/nfs has some nice DOS clients. There + is also a DOS-based NFS server called SOSS. It doesn't multitask + quite the way Linux does (actually, it doesn't multitask AT ALL) but + you never know what you might need. + + With AmiTCP (and possibly others), you may need to set the following + options in your Amiga nfstab: MD 1024 MR 1024 MW 1024 + (Thanks to Christian Gottschling + for this.) + + Probably these refer to maximum NFS data/read/write block sizes. I + don't know why the defaults on the Amiga didn't work; write to me if + you know more. + +DOS: + If you're using the freeware arcether.com, you might want to install + the driver patch from my web page. It helps with PC/TCP, and also + can get arcether to load if it timed out too quickly during + initialization. In fact, if you use it on a 386+ you REALLY need + the patch, really. + +Windows: + See DOS :) Trumpet Winsock works fine with either the Novell or + Arcether client, assuming you remember to load winpkt of course. + +LAN Manager and Windows for Workgroups: + These programs use protocols that + are incompatible with the Internet standard. They try to pretend + the cards are Ethernet, and confuse everyone else on the network. + + However, v2.00 and higher of the Linux ARCnet driver supports this + protocol via the 'arc0e' device. See the section on "Multiprotocol + Support" for more information. + + Using the freeware Samba server and clients for Linux, you can now + interface quite nicely with TCP/IP-based WfWg or Lan Manager + networks. + +Windows 95: + Tools are included with Win95 that let you use either the LANMAN + style network drivers (NDIS) or Novell drivers (ODI) to handle your + ARCnet packets. If you use ODI, you'll need to use the 'arc0' + device with Linux. If you use NDIS, then try the 'arc0e' device. + See the "Multiprotocol Support" section below if you need arc0e, + you're completely insane, and/or you need to build some kind of + hybrid network that uses both encapsulation types. + +OS/2: + I've been told it works under Warp Connect with an ARCnet driver from + SMC. You need to use the 'arc0e' interface for this. If you get + the SMC driver to work with the TCP/IP stuff included in the + "normal" Warp Bonus Pack, let me know. + + ftp.microsoft.com also has a freeware "Lan Manager for OS/2" client + which should use the same protocol as WfWg does. I had no luck + installing it under Warp, however. Please mail me with any results. + +NetBSD/AmiTCP: + These use an old version of the Internet standard ARCnet + protocol (RFC1051) which is compatible with the Linux driver v2.10 + ALPHA and above using the arc0s device. (See "Multiprotocol ARCnet" + below.) ** Newer versions of NetBSD apparently support RFC1201. + + +Using Multiprotocol ARCnet +-------------------------- + +The ARCnet driver v2.10 ALPHA supports three protocols, each on its own +"virtual network device": + + ====== =============================================================== + arc0 RFC1201 protocol, the official Internet standard which just + happens to be 100% compatible with Novell's TRXNET driver. + Version 1.00 of the ARCnet driver supported _only_ this + protocol. arc0 is the fastest of the three protocols (for + whatever reason), and allows larger packets to be used + because it supports RFC1201 "packet splitting" operations. + Unless you have a specific need to use a different protocol, + I strongly suggest that you stick with this one. + + arc0e "Ethernet-Encapsulation" which sends packets over ARCnet + that are actually a lot like Ethernet packets, including the + 6-byte hardware addresses. This protocol is compatible with + Microsoft's NDIS ARCnet driver, like the one in WfWg and + LANMAN. Because the MTU of 493 is actually smaller than the + one "required" by TCP/IP (576), there is a chance that some + network operations will not function properly. The Linux + TCP/IP layer can compensate in most cases, however, by + automatically fragmenting the TCP/IP packets to make them + fit. arc0e also works slightly more slowly than arc0, for + reasons yet to be determined. (Probably it's the smaller + MTU that does it.) + + arc0s The "[s]imple" RFC1051 protocol is the "previous" Internet + standard that is completely incompatible with the new + standard. Some software today, however, continues to + support the old standard (and only the old standard) + including NetBSD and AmiTCP. RFC1051 also does not support + RFC1201's packet splitting, and the MTU of 507 is still + smaller than the Internet "requirement," so it's quite + possible that you may run into problems. It's also slower + than RFC1201 by about 25%, for the same reason as arc0e. + + The arc0s support was contributed by Tomasz Motylewski + and modified somewhat by me. Bugs are probably my fault. + ====== =============================================================== + +You can choose not to compile arc0e and arc0s into the driver if you want - +this will save you a bit of memory and avoid confusion when eg. trying to +use the "NFS-root" stuff in recent Linux kernels. + +The arc0e and arc0s devices are created automatically when you first +ifconfig the arc0 device. To actually use them, though, you need to also +ifconfig the other virtual devices you need. There are a number of ways you +can set up your network then: + + +1. Single Protocol. + + This is the simplest way to configure your network: use just one of the + two available protocols. As mentioned above, it's a good idea to use + only arc0 unless you have a good reason (like some other software, ie. + WfWg, that only works with arc0e). + + If you need only arc0, then the following commands should get you going:: + + ifconfig arc0 MY.IP.ADD.RESS + route add MY.IP.ADD.RESS arc0 + route add -net SUB.NET.ADD.RESS arc0 + [add other local routes here] + + If you need arc0e (and only arc0e), it's a little different:: + + ifconfig arc0 MY.IP.ADD.RESS + ifconfig arc0e MY.IP.ADD.RESS + route add MY.IP.ADD.RESS arc0e + route add -net SUB.NET.ADD.RESS arc0e + + arc0s works much the same way as arc0e. + + +2. More than one protocol on the same wire. + + Now things start getting confusing. To even try it, you may need to be + partly crazy. Here's what *I* did. :) Note that I don't include arc0s in + my home network; I don't have any NetBSD or AmiTCP computers, so I only + use arc0s during limited testing. + + I have three computers on my home network; two Linux boxes (which prefer + RFC1201 protocol, for reasons listed above), and one XT that can't run + Linux but runs the free Microsoft LANMAN Client instead. + + Worse, one of the Linux computers (freedom) also has a modem and acts as + a router to my Internet provider. The other Linux box (insight) also has + its own IP address and needs to use freedom as its default gateway. The + XT (patience), however, does not have its own Internet IP address and so + I assigned it one on a "private subnet" (as defined by RFC1597). + + To start with, take a simple network with just insight and freedom. + Insight needs to: + + - talk to freedom via RFC1201 (arc0) protocol, because I like it + more and it's faster. + - use freedom as its Internet gateway. + + That's pretty easy to do. Set up insight like this:: + + ifconfig arc0 insight + route add insight arc0 + route add freedom arc0 /* I would use the subnet here (like I said + to to in "single protocol" above), + but the rest of the subnet + unfortunately lies across the PPP + link on freedom, which confuses + things. */ + route add default gw freedom + + And freedom gets configured like so:: + + ifconfig arc0 freedom + route add freedom arc0 + route add insight arc0 + /* and default gateway is configured by pppd */ + + Great, now insight talks to freedom directly on arc0, and sends packets + to the Internet through freedom. If you didn't know how to do the above, + you should probably stop reading this section now because it only gets + worse. + + Now, how do I add patience into the network? It will be using LANMAN + Client, which means I need the arc0e device. It needs to be able to talk + to both insight and freedom, and also use freedom as a gateway to the + Internet. (Recall that patience has a "private IP address" which won't + work on the Internet; that's okay, I configured Linux IP masquerading on + freedom for this subnet). + + So patience (necessarily; I don't have another IP number from my + provider) has an IP address on a different subnet than freedom and + insight, but needs to use freedom as an Internet gateway. Worse, most + DOS networking programs, including LANMAN, have braindead networking + schemes that rely completely on the netmask and a 'default gateway' to + determine how to route packets. This means that to get to freedom or + insight, patience WILL send through its default gateway, regardless of + the fact that both freedom and insight (courtesy of the arc0e device) + could understand a direct transmission. + + I compensate by giving freedom an extra IP address - aliased 'gatekeeper' - + that is on my private subnet, the same subnet that patience is on. I + then define gatekeeper to be the default gateway for patience. + + To configure freedom (in addition to the commands above):: + + ifconfig arc0e gatekeeper + route add gatekeeper arc0e + route add patience arc0e + + This way, freedom will send all packets for patience through arc0e, + giving its IP address as gatekeeper (on the private subnet). When it + talks to insight or the Internet, it will use its "freedom" Internet IP + address. + + You will notice that we haven't configured the arc0e device on insight. + This would work, but is not really necessary, and would require me to + assign insight another special IP number from my private subnet. Since + both insight and patience are using freedom as their default gateway, the + two can already talk to each other. + + It's quite fortunate that I set things up like this the first time (cough + cough) because it's really handy when I boot insight into DOS. There, it + runs the Novell ODI protocol stack, which only works with RFC1201 ARCnet. + In this mode it would be impossible for insight to communicate directly + with patience, since the Novell stack is incompatible with Microsoft's + Ethernet-Encap. Without changing any settings on freedom or patience, I + simply set freedom as the default gateway for insight (now in DOS, + remember) and all the forwarding happens "automagically" between the two + hosts that would normally not be able to communicate at all. + + For those who like diagrams, I have created two "virtual subnets" on the + same physical ARCnet wire. You can picture it like this:: + + + [RFC1201 NETWORK] [ETHER-ENCAP NETWORK] + (registered Internet subnet) (RFC1597 private subnet) + + (IP Masquerade) + /---------------\ * /---------------\ + | | * | | + | +-Freedom-*-Gatekeeper-+ | + | | | * | | + \-------+-------/ | * \-------+-------/ + | | | + Insight | Patience + (Internet) + + + +It works: what now? +------------------- + +Send mail describing your setup, preferably including driver version, kernel +version, ARCnet card model, CPU type, number of systems on your network, and +list of software in use to me at the following address: + + apenwarr@worldvisions.ca + +I do send (sometimes automated) replies to all messages I receive. My email +can be weird (and also usually gets forwarded all over the place along the +way to me), so if you don't get a reply within a reasonable time, please +resend. + + +It doesn't work: what now? +-------------------------- + +Do the same as above, but also include the output of the ifconfig and route +commands, as well as any pertinent log entries (ie. anything that starts +with "arcnet:" and has shown up since the last reboot) in your mail. + +If you want to try fixing it yourself (I strongly recommend that you mail me +about the problem first, since it might already have been solved) you may +want to try some of the debug levels available. For heavy testing on +D_DURING or more, it would be a REALLY good idea to kill your klogd daemon +first! D_DURING displays 4-5 lines for each packet sent or received. D_TX, +D_RX, and D_SKB actually DISPLAY each packet as it is sent or received, +which is obviously quite big. + +Starting with v2.40 ALPHA, the autoprobe routines have changed +significantly. In particular, they won't tell you why the card was not +found unless you turn on the D_INIT_REASONS debugging flag. + +Once the driver is running, you can run the arcdump shell script (available +from me or in the full ARCnet package, if you have it) as root to list the +contents of the arcnet buffers at any time. To make any sense at all out of +this, you should grab the pertinent RFCs. (some are listed near the top of +arcnet.c). arcdump assumes your card is at 0xD0000. If it isn't, edit the +script. + +Buffers 0 and 1 are used for receiving, and Buffers 2 and 3 are for sending. +Ping-pong buffers are implemented both ways. + +If your debug level includes D_DURING and you did NOT define SLOW_XMIT_COPY, +the buffers are cleared to a constant value of 0x42 every time the card is +reset (which should only happen when you do an ifconfig up, or when Linux +decides that the driver is broken). During a transmit, unused parts of the +buffer will be cleared to 0x42 as well. This is to make it easier to figure +out which bytes are being used by a packet. + +You can change the debug level without recompiling the kernel by typing:: + + ifconfig arc0 down metric 1xxx + /etc/rc.d/rc.inet1 + +where "xxx" is the debug level you want. For example, "metric 1015" would put +you at debug level 15. Debug level 7 is currently the default. + +Note that the debug level is (starting with v1.90 ALPHA) a binary +combination of different debug flags; so debug level 7 is really 1+2+4 or +D_NORMAL+D_EXTRA+D_INIT. To include D_DURING, you would add 16 to this, +resulting in debug level 23. + +If you don't understand that, you probably don't want to know anyway. +E-mail me about your problem. + + +I want to send money: what now? +------------------------------- + +Go take a nap or something. You'll feel better in the morning. diff --git a/Documentation/networking/arcnet.txt b/Documentation/networking/arcnet.txt deleted file mode 100644 index aff97f47c05c..000000000000 --- a/Documentation/networking/arcnet.txt +++ /dev/null @@ -1,556 +0,0 @@ ----------------------------------------------------------------------------- -NOTE: See also arcnet-hardware.txt in this directory for jumper-setting -and cabling information if you're like many of us and didn't happen to get a -manual with your ARCnet card. ----------------------------------------------------------------------------- - -Since no one seems to listen to me otherwise, perhaps a poem will get your -attention: - This driver's getting fat and beefy, - But my cat is still named Fifi. - -Hmm, I think I'm allowed to call that a poem, even though it's only two -lines. Hey, I'm in Computer Science, not English. Give me a break. - -The point is: I REALLY REALLY REALLY REALLY REALLY want to hear from you if -you test this and get it working. Or if you don't. Or anything. - -ARCnet 0.32 ALPHA first made it into the Linux kernel 1.1.80 - this was -nice, but after that even FEWER people started writing to me because they -didn't even have to install the patch. - -Come on, be a sport! Send me a success report! - -(hey, that was even better than my original poem... this is getting bad!) - - --------- -WARNING: --------- - -If you don't e-mail me about your success/failure soon, I may be forced to -start SINGING. And we don't want that, do we? - -(You know, it might be argued that I'm pushing this point a little too much. -If you think so, why not flame me in a quick little e-mail? Please also -include the type of card(s) you're using, software, size of network, and -whether it's working or not.) - -My e-mail address is: apenwarr@worldvisions.ca - - ---------------------------------------------------------------------------- - - -These are the ARCnet drivers for Linux. - - -This new release (2.91) has been put together by David Woodhouse -, in an attempt to tidy up the driver after adding support -for yet another chipset. Now the generic support has been separated from the -individual chipset drivers, and the source files aren't quite so packed with -#ifdefs! I've changed this file a bit, but kept it in the first person from -Avery, because I didn't want to completely rewrite it. - -The previous release resulted from many months of on-and-off effort from me -(Avery Pennarun), many bug reports/fixes and suggestions from others, and in -particular a lot of input and coding from Tomasz Motylewski. Starting with -ARCnet 2.10 ALPHA, Tomasz's all-new-and-improved RFC1051 support has been -included and seems to be working fine! - - -Where do I discuss these drivers? ---------------------------------- - -Tomasz has been so kind as to set up a new and improved mailing list. -Subscribe by sending a message with the BODY "subscribe linux-arcnet YOUR -REAL NAME" to listserv@tichy.ch.uj.edu.pl. Then, to submit messages to the -list, mail to linux-arcnet@tichy.ch.uj.edu.pl. - -There are archives of the mailing list at: - http://epistolary.org/mailman/listinfo.cgi/arcnet - -The people on linux-net@vger.kernel.org (now defunct, replaced by -netdev@vger.kernel.org) have also been known to be very helpful, especially -when we're talking about ALPHA Linux kernels that may or may not work right -in the first place. - - -Other Drivers and Info ----------------------- - -You can try my ARCNET page on the World Wide Web at: - http://www.qis.net/~jschmitz/arcnet/ - -Also, SMC (one of the companies that makes ARCnet cards) has a WWW site you -might be interested in, which includes several drivers for various cards -including ARCnet. Try: - http://www.smc.com/ - -Performance Technologies makes various network software that supports -ARCnet: - http://www.perftech.com/ or ftp to ftp.perftech.com. - -Novell makes a networking stack for DOS which includes ARCnet drivers. Try -FTPing to ftp.novell.com. - -You can get the Crynwr packet driver collection (including arcether.com, the -one you'll want to use with ARCnet cards) from -oak.oakland.edu:/simtel/msdos/pktdrvr. It won't work perfectly on a 386+ -without patches, though, and also doesn't like several cards. Fixed -versions are available on my WWW page, or via e-mail if you don't have WWW -access. - - -Installing the Driver ---------------------- - -All you will need to do in order to install the driver is: - make config - (be sure to choose ARCnet in the network devices - and at least one chipset driver.) - make clean - make zImage - -If you obtained this ARCnet package as an upgrade to the ARCnet driver in -your current kernel, you will need to first copy arcnet.c over the one in -the linux/drivers/net directory. - -You will know the driver is installed properly if you get some ARCnet -messages when you reboot into the new Linux kernel. - -There are four chipset options: - - 1. Standard ARCnet COM90xx chipset. - -This is the normal ARCnet card, which you've probably got. This is the only -chipset driver which will autoprobe if not told where the card is. -It following options on the command line: - com90xx=[[,[,]]][,] | - -If you load the chipset support as a module, the options are: - io= irq= shmem= device= - -To disable the autoprobe, just specify "com90xx=" on the kernel command line. -To specify the name alone, but allow autoprobe, just put "com90xx=" - - 2. ARCnet COM20020 chipset. - -This is the new chipset from SMC with support for promiscuous mode (packet -sniffing), extra diagnostic information, etc. Unfortunately, there is no -sensible method of autoprobing for these cards. You must specify the I/O -address on the kernel command line. -The command line options are: - com20020=[,[,[,backplane[,CKP[,timeout]]]]][,name] - -If you load the chipset support as a module, the options are: - io= irq= node= backplane= clock= - timeout= device= - -The COM20020 chipset allows you to set the node ID in software, overriding the -default which is still set in DIP switches on the card. If you don't have the -COM20020 data sheets, and you don't know what the other three options refer -to, then they won't interest you - forget them. - - 3. ARCnet COM90xx chipset in IO-mapped mode. - -This will also work with the normal ARCnet cards, but doesn't use the shared -memory. It performs less well than the above driver, but is provided in case -you have a card which doesn't support shared memory, or (strangely) in case -you have so many ARCnet cards in your machine that you run out of shmem slots. -If you don't give the IO address on the kernel command line, then the driver -will not find the card. -The command line options are: - com90io=[,][,] - -If you load the chipset support as a module, the options are: - io= irq= device= - - 4. ARCnet RIM I cards. - -These are COM90xx chips which are _completely_ memory mapped. The support for -these is not tested. If you have one, please mail the author with a success -report. All options must be specified, except the device name. -Command line options: - arcrimi=,,[,] - -If you load the chipset support as a module, the options are: - shmem= irq= node= device= - - -Loadable Module Support ------------------------ - -Configure and rebuild Linux. When asked, answer 'm' to "Generic ARCnet -support" and to support for your ARCnet chipset if you want to use the -loadable module. You can also say 'y' to "Generic ARCnet support" and 'm' -to the chipset support if you wish. - - make config - make clean - make zImage - make modules - -If you're using a loadable module, you need to use insmod to load it, and -you can specify various characteristics of your card on the command -line. (In recent versions of the driver, autoprobing is much more reliable -and works as a module, so most of this is now unnecessary.) - -For example: - cd /usr/src/linux/modules - insmod arcnet.o - insmod com90xx.o - insmod com20020.o io=0x2e0 device=eth1 - - -Using the Driver ----------------- - -If you build your kernel with ARCnet COM90xx support included, it should -probe for your card automatically when you boot. If you use a different -chipset driver complied into the kernel, you must give the necessary options -on the kernel command line, as detailed above. - -Go read the NET-2-HOWTO and ETHERNET-HOWTO for Linux; they should be -available where you picked up this driver. Think of your ARCnet as a -souped-up (or down, as the case may be) Ethernet card. - -By the way, be sure to change all references from "eth0" to "arc0" in the -HOWTOs. Remember that ARCnet isn't a "true" Ethernet, and the device name -is DIFFERENT. - - -Multiple Cards in One Computer ------------------------------- - -Linux has pretty good support for this now, but since I've been busy, the -ARCnet driver has somewhat suffered in this respect. COM90xx support, if -compiled into the kernel, will (try to) autodetect all the installed cards. - -If you have other cards, with support compiled into the kernel, then you can -just repeat the options on the kernel command line, e.g.: -LILO: linux com20020=0x2e0 com20020=0x380 com90io=0x260 - -If you have the chipset support built as a loadable module, then you need to -do something like this: - insmod -o arc0 com90xx - insmod -o arc1 com20020 io=0x2e0 - insmod -o arc2 com90xx -The ARCnet drivers will now sort out their names automatically. - - -How do I get it to work with...? --------------------------------- - -NFS: Should be fine linux->linux, just pretend you're using Ethernet cards. - oak.oakland.edu:/simtel/msdos/nfs has some nice DOS clients. There - is also a DOS-based NFS server called SOSS. It doesn't multitask - quite the way Linux does (actually, it doesn't multitask AT ALL) but - you never know what you might need. - - With AmiTCP (and possibly others), you may need to set the following - options in your Amiga nfstab: MD 1024 MR 1024 MW 1024 - (Thanks to Christian Gottschling - for this.) - - Probably these refer to maximum NFS data/read/write block sizes. I - don't know why the defaults on the Amiga didn't work; write to me if - you know more. - -DOS: If you're using the freeware arcether.com, you might want to install - the driver patch from my web page. It helps with PC/TCP, and also - can get arcether to load if it timed out too quickly during - initialization. In fact, if you use it on a 386+ you REALLY need - the patch, really. - -Windows: See DOS :) Trumpet Winsock works fine with either the Novell or - Arcether client, assuming you remember to load winpkt of course. - -LAN Manager and Windows for Workgroups: These programs use protocols that - are incompatible with the Internet standard. They try to pretend - the cards are Ethernet, and confuse everyone else on the network. - - However, v2.00 and higher of the Linux ARCnet driver supports this - protocol via the 'arc0e' device. See the section on "Multiprotocol - Support" for more information. - - Using the freeware Samba server and clients for Linux, you can now - interface quite nicely with TCP/IP-based WfWg or Lan Manager - networks. - -Windows 95: Tools are included with Win95 that let you use either the LANMAN - style network drivers (NDIS) or Novell drivers (ODI) to handle your - ARCnet packets. If you use ODI, you'll need to use the 'arc0' - device with Linux. If you use NDIS, then try the 'arc0e' device. - See the "Multiprotocol Support" section below if you need arc0e, - you're completely insane, and/or you need to build some kind of - hybrid network that uses both encapsulation types. - -OS/2: I've been told it works under Warp Connect with an ARCnet driver from - SMC. You need to use the 'arc0e' interface for this. If you get - the SMC driver to work with the TCP/IP stuff included in the - "normal" Warp Bonus Pack, let me know. - - ftp.microsoft.com also has a freeware "Lan Manager for OS/2" client - which should use the same protocol as WfWg does. I had no luck - installing it under Warp, however. Please mail me with any results. - -NetBSD/AmiTCP: These use an old version of the Internet standard ARCnet - protocol (RFC1051) which is compatible with the Linux driver v2.10 - ALPHA and above using the arc0s device. (See "Multiprotocol ARCnet" - below.) ** Newer versions of NetBSD apparently support RFC1201. - - -Using Multiprotocol ARCnet --------------------------- - -The ARCnet driver v2.10 ALPHA supports three protocols, each on its own -"virtual network device": - - arc0 - RFC1201 protocol, the official Internet standard which just - happens to be 100% compatible with Novell's TRXNET driver. - Version 1.00 of the ARCnet driver supported _only_ this - protocol. arc0 is the fastest of the three protocols (for - whatever reason), and allows larger packets to be used - because it supports RFC1201 "packet splitting" operations. - Unless you have a specific need to use a different protocol, - I strongly suggest that you stick with this one. - - arc0e - "Ethernet-Encapsulation" which sends packets over ARCnet - that are actually a lot like Ethernet packets, including the - 6-byte hardware addresses. This protocol is compatible with - Microsoft's NDIS ARCnet driver, like the one in WfWg and - LANMAN. Because the MTU of 493 is actually smaller than the - one "required" by TCP/IP (576), there is a chance that some - network operations will not function properly. The Linux - TCP/IP layer can compensate in most cases, however, by - automatically fragmenting the TCP/IP packets to make them - fit. arc0e also works slightly more slowly than arc0, for - reasons yet to be determined. (Probably it's the smaller - MTU that does it.) - - arc0s - The "[s]imple" RFC1051 protocol is the "previous" Internet - standard that is completely incompatible with the new - standard. Some software today, however, continues to - support the old standard (and only the old standard) - including NetBSD and AmiTCP. RFC1051 also does not support - RFC1201's packet splitting, and the MTU of 507 is still - smaller than the Internet "requirement," so it's quite - possible that you may run into problems. It's also slower - than RFC1201 by about 25%, for the same reason as arc0e. - - The arc0s support was contributed by Tomasz Motylewski - and modified somewhat by me. Bugs are probably my fault. - -You can choose not to compile arc0e and arc0s into the driver if you want - -this will save you a bit of memory and avoid confusion when eg. trying to -use the "NFS-root" stuff in recent Linux kernels. - -The arc0e and arc0s devices are created automatically when you first -ifconfig the arc0 device. To actually use them, though, you need to also -ifconfig the other virtual devices you need. There are a number of ways you -can set up your network then: - - -1. Single Protocol. - - This is the simplest way to configure your network: use just one of the - two available protocols. As mentioned above, it's a good idea to use - only arc0 unless you have a good reason (like some other software, ie. - WfWg, that only works with arc0e). - - If you need only arc0, then the following commands should get you going: - ifconfig arc0 MY.IP.ADD.RESS - route add MY.IP.ADD.RESS arc0 - route add -net SUB.NET.ADD.RESS arc0 - [add other local routes here] - - If you need arc0e (and only arc0e), it's a little different: - ifconfig arc0 MY.IP.ADD.RESS - ifconfig arc0e MY.IP.ADD.RESS - route add MY.IP.ADD.RESS arc0e - route add -net SUB.NET.ADD.RESS arc0e - - arc0s works much the same way as arc0e. - - -2. More than one protocol on the same wire. - - Now things start getting confusing. To even try it, you may need to be - partly crazy. Here's what *I* did. :) Note that I don't include arc0s in - my home network; I don't have any NetBSD or AmiTCP computers, so I only - use arc0s during limited testing. - - I have three computers on my home network; two Linux boxes (which prefer - RFC1201 protocol, for reasons listed above), and one XT that can't run - Linux but runs the free Microsoft LANMAN Client instead. - - Worse, one of the Linux computers (freedom) also has a modem and acts as - a router to my Internet provider. The other Linux box (insight) also has - its own IP address and needs to use freedom as its default gateway. The - XT (patience), however, does not have its own Internet IP address and so - I assigned it one on a "private subnet" (as defined by RFC1597). - - To start with, take a simple network with just insight and freedom. - Insight needs to: - - talk to freedom via RFC1201 (arc0) protocol, because I like it - more and it's faster. - - use freedom as its Internet gateway. - - That's pretty easy to do. Set up insight like this: - ifconfig arc0 insight - route add insight arc0 - route add freedom arc0 /* I would use the subnet here (like I said - to to in "single protocol" above), - but the rest of the subnet - unfortunately lies across the PPP - link on freedom, which confuses - things. */ - route add default gw freedom - - And freedom gets configured like so: - ifconfig arc0 freedom - route add freedom arc0 - route add insight arc0 - /* and default gateway is configured by pppd */ - - Great, now insight talks to freedom directly on arc0, and sends packets - to the Internet through freedom. If you didn't know how to do the above, - you should probably stop reading this section now because it only gets - worse. - - Now, how do I add patience into the network? It will be using LANMAN - Client, which means I need the arc0e device. It needs to be able to talk - to both insight and freedom, and also use freedom as a gateway to the - Internet. (Recall that patience has a "private IP address" which won't - work on the Internet; that's okay, I configured Linux IP masquerading on - freedom for this subnet). - - So patience (necessarily; I don't have another IP number from my - provider) has an IP address on a different subnet than freedom and - insight, but needs to use freedom as an Internet gateway. Worse, most - DOS networking programs, including LANMAN, have braindead networking - schemes that rely completely on the netmask and a 'default gateway' to - determine how to route packets. This means that to get to freedom or - insight, patience WILL send through its default gateway, regardless of - the fact that both freedom and insight (courtesy of the arc0e device) - could understand a direct transmission. - - I compensate by giving freedom an extra IP address - aliased 'gatekeeper' - - that is on my private subnet, the same subnet that patience is on. I - then define gatekeeper to be the default gateway for patience. - - To configure freedom (in addition to the commands above): - ifconfig arc0e gatekeeper - route add gatekeeper arc0e - route add patience arc0e - - This way, freedom will send all packets for patience through arc0e, - giving its IP address as gatekeeper (on the private subnet). When it - talks to insight or the Internet, it will use its "freedom" Internet IP - address. - - You will notice that we haven't configured the arc0e device on insight. - This would work, but is not really necessary, and would require me to - assign insight another special IP number from my private subnet. Since - both insight and patience are using freedom as their default gateway, the - two can already talk to each other. - - It's quite fortunate that I set things up like this the first time (cough - cough) because it's really handy when I boot insight into DOS. There, it - runs the Novell ODI protocol stack, which only works with RFC1201 ARCnet. - In this mode it would be impossible for insight to communicate directly - with patience, since the Novell stack is incompatible with Microsoft's - Ethernet-Encap. Without changing any settings on freedom or patience, I - simply set freedom as the default gateway for insight (now in DOS, - remember) and all the forwarding happens "automagically" between the two - hosts that would normally not be able to communicate at all. - - For those who like diagrams, I have created two "virtual subnets" on the - same physical ARCnet wire. You can picture it like this: - - - [RFC1201 NETWORK] [ETHER-ENCAP NETWORK] - (registered Internet subnet) (RFC1597 private subnet) - - (IP Masquerade) - /---------------\ * /---------------\ - | | * | | - | +-Freedom-*-Gatekeeper-+ | - | | | * | | - \-------+-------/ | * \-------+-------/ - | | | - Insight | Patience - (Internet) - - - -It works: what now? -------------------- - -Send mail describing your setup, preferably including driver version, kernel -version, ARCnet card model, CPU type, number of systems on your network, and -list of software in use to me at the following address: - apenwarr@worldvisions.ca - -I do send (sometimes automated) replies to all messages I receive. My email -can be weird (and also usually gets forwarded all over the place along the -way to me), so if you don't get a reply within a reasonable time, please -resend. - - -It doesn't work: what now? --------------------------- - -Do the same as above, but also include the output of the ifconfig and route -commands, as well as any pertinent log entries (ie. anything that starts -with "arcnet:" and has shown up since the last reboot) in your mail. - -If you want to try fixing it yourself (I strongly recommend that you mail me -about the problem first, since it might already have been solved) you may -want to try some of the debug levels available. For heavy testing on -D_DURING or more, it would be a REALLY good idea to kill your klogd daemon -first! D_DURING displays 4-5 lines for each packet sent or received. D_TX, -D_RX, and D_SKB actually DISPLAY each packet as it is sent or received, -which is obviously quite big. - -Starting with v2.40 ALPHA, the autoprobe routines have changed -significantly. In particular, they won't tell you why the card was not -found unless you turn on the D_INIT_REASONS debugging flag. - -Once the driver is running, you can run the arcdump shell script (available -from me or in the full ARCnet package, if you have it) as root to list the -contents of the arcnet buffers at any time. To make any sense at all out of -this, you should grab the pertinent RFCs. (some are listed near the top of -arcnet.c). arcdump assumes your card is at 0xD0000. If it isn't, edit the -script. - -Buffers 0 and 1 are used for receiving, and Buffers 2 and 3 are for sending. -Ping-pong buffers are implemented both ways. - -If your debug level includes D_DURING and you did NOT define SLOW_XMIT_COPY, -the buffers are cleared to a constant value of 0x42 every time the card is -reset (which should only happen when you do an ifconfig up, or when Linux -decides that the driver is broken). During a transmit, unused parts of the -buffer will be cleared to 0x42 as well. This is to make it easier to figure -out which bytes are being used by a packet. - -You can change the debug level without recompiling the kernel by typing: - ifconfig arc0 down metric 1xxx - /etc/rc.d/rc.inet1 -where "xxx" is the debug level you want. For example, "metric 1015" would put -you at debug level 15. Debug level 7 is currently the default. - -Note that the debug level is (starting with v1.90 ALPHA) a binary -combination of different debug flags; so debug level 7 is really 1+2+4 or -D_NORMAL+D_EXTRA+D_INIT. To include D_DURING, you would add 16 to this, -resulting in debug level 23. - -If you don't understand that, you probably don't want to know anyway. -E-mail me about your problem. - - -I want to send money: what now? -------------------------------- - -Go take a nap or something. You'll feel better in the morning. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5da18e024fcb..3e0a4bb23ef9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -40,6 +40,7 @@ Contents: 6pack altera_tse arcnet-hardware + arcnet .. only:: subproject and html diff --git a/drivers/net/arcnet/Kconfig b/drivers/net/arcnet/Kconfig index 27551bf3d7e4..43eef60653b2 100644 --- a/drivers/net/arcnet/Kconfig +++ b/drivers/net/arcnet/Kconfig @@ -9,7 +9,7 @@ menuconfig ARCNET ---help--- If you have a network card of this type, say Y and check out the (arguably) beautiful poetry in - . + . You need both this driver, and the driver for the particular ARCnet chipset of your card. If you don't know, then it's probably a @@ -28,7 +28,7 @@ config ARCNET_1201 arc0 device. You need to say Y here to communicate with industry-standard RFC1201 implementations, like the arcether.com packet driver or most DOS/Windows ODI drivers. Please read the - ARCnet documentation in + ARCnet documentation in for more information about using arc0. config ARCNET_1051 @@ -42,7 +42,7 @@ config ARCNET_1051 industry-standard RFC1201 implementations, like the arcether.com packet driver or most DOS/Windows ODI drivers. RFC1201 is included automatically as the arc0 device. Please read the ARCnet - documentation in for more + documentation in for more information about using arc0e and arc0s. config ARCNET_RAW -- cgit v1.2.3-59-g8ed1b From ff2269f16a1e1a7f8bbe72920d3d285ba3943572 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:21 +0200 Subject: docs: networking: convert atm.txt to ReST There isn't much to be done here. Just: - add SPDX header; - add a document title. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/atm.rst | 14 ++++++++++++++ Documentation/networking/atm.txt | 8 -------- Documentation/networking/index.rst | 1 + net/atm/Kconfig | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) create mode 100644 Documentation/networking/atm.rst delete mode 100644 Documentation/networking/atm.txt diff --git a/Documentation/networking/atm.rst b/Documentation/networking/atm.rst new file mode 100644 index 000000000000..c1df8c038525 --- /dev/null +++ b/Documentation/networking/atm.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=== +ATM +=== + +In order to use anything but the most primitive functions of ATM, +several user-mode programs are required to assist the kernel. These +programs and related material can be found via the ATM on Linux Web +page at http://linux-atm.sourceforge.net/ + +If you encounter problems with ATM, please report them on the ATM +on Linux mailing list. Subscription information, archives, etc., +can be found on http://linux-atm.sourceforge.net/ diff --git a/Documentation/networking/atm.txt b/Documentation/networking/atm.txt deleted file mode 100644 index 82921cee77fe..000000000000 --- a/Documentation/networking/atm.txt +++ /dev/null @@ -1,8 +0,0 @@ -In order to use anything but the most primitive functions of ATM, -several user-mode programs are required to assist the kernel. These -programs and related material can be found via the ATM on Linux Web -page at http://linux-atm.sourceforge.net/ - -If you encounter problems with ATM, please report them on the ATM -on Linux mailing list. Subscription information, archives, etc., -can be found on http://linux-atm.sourceforge.net/ diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 3e0a4bb23ef9..841f3c3905d5 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -41,6 +41,7 @@ Contents: altera_tse arcnet-hardware arcnet + atm .. only:: subproject and html diff --git a/net/atm/Kconfig b/net/atm/Kconfig index 271f682e8438..e61dcc9f85b2 100644 --- a/net/atm/Kconfig +++ b/net/atm/Kconfig @@ -16,7 +16,7 @@ config ATM of your ATM card below. Note that you need a set of user-space programs to actually make use - of ATM. See the file for + of ATM. See the file for further details. config ATM_CLIP -- cgit v1.2.3-59-g8ed1b From 20b943f075574c233de51fa2f0124a97f0298be1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:22 +0200 Subject: docs: networking: convert ax25.txt to ReST There isn't much to be done here. Just: - add SPDX header; - add a document title. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/ax25.rst | 16 ++++++++++++++++ Documentation/networking/ax25.txt | 10 ---------- Documentation/networking/index.rst | 1 + net/ax25/Kconfig | 6 +++--- 4 files changed, 20 insertions(+), 13 deletions(-) create mode 100644 Documentation/networking/ax25.rst delete mode 100644 Documentation/networking/ax25.txt diff --git a/Documentation/networking/ax25.rst b/Documentation/networking/ax25.rst new file mode 100644 index 000000000000..824afd7002db --- /dev/null +++ b/Documentation/networking/ax25.rst @@ -0,0 +1,16 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===== +AX.25 +===== + +To use the amateur radio protocols within Linux you will need to get a +suitable copy of the AX.25 Utilities. More detailed information about +AX.25, NET/ROM and ROSE, associated programs and and utilities can be +found on http://www.linux-ax25.org. + +There is an active mailing list for discussing Linux amateur radio matters +called linux-hams@vger.kernel.org. To subscribe to it, send a message to +majordomo@vger.kernel.org with the words "subscribe linux-hams" in the body +of the message, the subject field is ignored. You don't need to be +subscribed to post but of course that means you might miss an answer. diff --git a/Documentation/networking/ax25.txt b/Documentation/networking/ax25.txt deleted file mode 100644 index 8257dbf9be57..000000000000 --- a/Documentation/networking/ax25.txt +++ /dev/null @@ -1,10 +0,0 @@ -To use the amateur radio protocols within Linux you will need to get a -suitable copy of the AX.25 Utilities. More detailed information about -AX.25, NET/ROM and ROSE, associated programs and and utilities can be -found on http://www.linux-ax25.org. - -There is an active mailing list for discussing Linux amateur radio matters -called linux-hams@vger.kernel.org. To subscribe to it, send a message to -majordomo@vger.kernel.org with the words "subscribe linux-hams" in the body -of the message, the subject field is ignored. You don't need to be -subscribed to post but of course that means you might miss an answer. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 841f3c3905d5..6a5858b27cf6 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -42,6 +42,7 @@ Contents: arcnet-hardware arcnet atm + ax25 .. only:: subproject and html diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index 043fd5437809..97d686d115c0 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -40,7 +40,7 @@ config AX25 radio as well as information about how to configure an AX.25 port is contained in the AX25-HOWTO, available from . You might also want to - check out the file in the + check out the file in the kernel source. More information about digital amateur radio in general is on the WWW at . @@ -88,7 +88,7 @@ config NETROM users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from . You also might want to check out the - file . More information about + file . More information about digital amateur radio in general is on the WWW at . @@ -107,7 +107,7 @@ config ROSE users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from . You also might want to check out the - file . More information about + file . More information about digital amateur radio in general is on the WWW at . -- cgit v1.2.3-59-g8ed1b From b5fcf32d7d4b647c0f3aa612d91d25996a49bcd9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:23 +0200 Subject: docs: networking: convert baycom.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/baycom.rst | 174 ++++++++++++++++++++++++++++++++++++ Documentation/networking/baycom.txt | 158 -------------------------------- Documentation/networking/index.rst | 1 + drivers/net/hamradio/Kconfig | 8 +- 4 files changed, 179 insertions(+), 162 deletions(-) create mode 100644 Documentation/networking/baycom.rst delete mode 100644 Documentation/networking/baycom.txt diff --git a/Documentation/networking/baycom.rst b/Documentation/networking/baycom.rst new file mode 100644 index 000000000000..fe2d010f0e86 --- /dev/null +++ b/Documentation/networking/baycom.rst @@ -0,0 +1,174 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================== +Linux Drivers for Baycom Modems +=============================== + +Thomas M. Sailer, HB9JNX/AE4WA, + +The drivers for the baycom modems have been split into +separate drivers as they did not share any code, and the driver +and device names have changed. + +This document describes the Linux Kernel Drivers for simple Baycom style +amateur radio modems. + +The following drivers are available: +==================================== + +baycom_ser_fdx: + This driver supports the SER12 modems either full or half duplex. + Its baud rate may be changed via the ``baud`` module parameter, + therefore it supports just about every bit bang modem on a + serial port. Its devices are called bcsf0 through bcsf3. + This is the recommended driver for SER12 type modems, + however if you have a broken UART clone that does not have working + delta status bits, you may try baycom_ser_hdx. + +baycom_ser_hdx: + This is an alternative driver for SER12 type modems. + It only supports half duplex, and only 1200 baud. Its devices + are called bcsh0 through bcsh3. Use this driver only if baycom_ser_fdx + does not work with your UART. + +baycom_par: + This driver supports the par96 and picpar modems. + Its devices are called bcp0 through bcp3. + +baycom_epp: + This driver supports the EPP modem. + Its devices are called bce0 through bce3. + This driver is work-in-progress. + +The following modems are supported: + +======= ======================================================================== +ser12 This is a very simple 1200 baud AFSK modem. The modem consists only + of a modulator/demodulator chip, usually a TI TCM3105. The computer + is responsible for regenerating the receiver bit clock, as well as + for handling the HDLC protocol. The modem connects to a serial port, + hence the name. Since the serial port is not used as an async serial + port, the kernel driver for serial ports cannot be used, and this + driver only supports standard serial hardware (8250, 16450, 16550) + +par96 This is a modem for 9600 baud FSK compatible to the G3RUH standard. + The modem does all the filtering and regenerates the receiver clock. + Data is transferred from and to the PC via a shift register. + The shift register is filled with 16 bits and an interrupt is signalled. + The PC then empties the shift register in a burst. This modem connects + to the parallel port, hence the name. The modem leaves the + implementation of the HDLC protocol and the scrambler polynomial to + the PC. + +picpar This is a redesign of the par96 modem by Henning Rech, DF9IC. The modem + is protocol compatible to par96, but uses only three low power ICs + and can therefore be fed from the parallel port and does not require + an additional power supply. Furthermore, it incorporates a carrier + detect circuitry. + +EPP This is a high-speed modem adaptor that connects to an enhanced parallel + port. + + Its target audience is users working over a high speed hub (76.8kbit/s). + +eppfpga This is a redesign of the EPP adaptor. +======= ======================================================================== + +All of the above modems only support half duplex communications. However, +the driver supports the KISS (see below) fullduplex command. It then simply +starts to send as soon as there's a packet to transmit and does not care +about DCD, i.e. it starts to send even if there's someone else on the channel. +This command is required by some implementations of the DAMA channel +access protocol. + + +The Interface of the drivers +============================ + +Unlike previous drivers, these drivers are no longer character devices, +but they are now true kernel network interfaces. Installation is therefore +simple. Once installed, four interfaces named bc{sf,sh,p,e}[0-3] are available. +sethdlc from the ax25 utilities may be used to set driver states etc. +Users of userland AX.25 stacks may use the net2kiss utility (also available +in the ax25 utilities package) to convert packets of a network interface +to a KISS stream on a pseudo tty. There's also a patch available from +me for WAMPES which allows attaching a kernel network interface directly. + + +Configuring the driver +====================== + +Every time a driver is inserted into the kernel, it has to know which +modems it should access at which ports. This can be done with the setbaycom +utility. If you are only using one modem, you can also configure the +driver from the insmod command line (or by means of an option line in +``/etc/modprobe.d/*.conf``). + +Examples:: + + modprobe baycom_ser_fdx mode="ser12*" iobase=0x3f8 irq=4 + sethdlc -i bcsf0 -p mode "ser12*" io 0x3f8 irq 4 + +Both lines configure the first port to drive a ser12 modem at the first +serial port (COM1 under DOS). The * in the mode parameter instructs the driver +to use the software DCD algorithm (see below):: + + insmod baycom_par mode="picpar" iobase=0x378 + sethdlc -i bcp0 -p mode "picpar" io 0x378 + +Both lines configure the first port to drive a picpar modem at the +first parallel port (LPT1 under DOS). (Note: picpar implies +hardware DCD, par96 implies software DCD). + +The channel access parameters can be set with sethdlc -a or kissparms. +Note that both utilities interpret the values slightly differently. + + +Hardware DCD versus Software DCD +================================ + +To avoid collisions on the air, the driver must know when the channel is +busy. This is the task of the DCD circuitry/software. The driver may either +utilise a software DCD algorithm (options=1) or use a DCD signal from +the hardware (options=0). + +======= ================================================================= +ser12 if software DCD is utilised, the radio's squelch should always be + open. It is highly recommended to use the software DCD algorithm, + as it is much faster than most hardware squelch circuitry. The + disadvantage is a slightly higher load on the system. + +par96 the software DCD algorithm for this type of modem is rather poor. + The modem simply does not provide enough information to implement + a reasonable DCD algorithm in software. Therefore, if your radio + feeds the DCD input of the PAR96 modem, the use of the hardware + DCD circuitry is recommended. + +picpar the picpar modem features a builtin DCD hardware, which is highly + recommended. +======= ================================================================= + + + +Compatibility with the rest of the Linux kernel +=============================================== + +The serial driver and the baycom serial drivers compete +for the same hardware resources. Of course only one driver can access a given +interface at a time. The serial driver grabs all interfaces it can find at +startup time. Therefore the baycom drivers subsequently won't be able to +access a serial port. You might therefore find it necessary to release +a port owned by the serial driver with 'setserial /dev/ttyS# uart none', where +# is the number of the interface. The baycom drivers do not reserve any +ports at startup, unless one is specified on the 'insmod' command line. Another +method to solve the problem is to compile all drivers as modules and +leave it to kmod to load the correct driver depending on the application. + +The parallel port drivers (baycom_par, baycom_epp) now use the parport subsystem +to arbitrate the ports between different client drivers. + +vy 73s de + +Tom Sailer, sailer@ife.ee.ethz.ch + +hb9jnx @ hb9w.ampr.org diff --git a/Documentation/networking/baycom.txt b/Documentation/networking/baycom.txt deleted file mode 100644 index 688f18fd4467..000000000000 --- a/Documentation/networking/baycom.txt +++ /dev/null @@ -1,158 +0,0 @@ - LINUX DRIVERS FOR BAYCOM MODEMS - - Thomas M. Sailer, HB9JNX/AE4WA, - -!!NEW!! (04/98) The drivers for the baycom modems have been split into -separate drivers as they did not share any code, and the driver -and device names have changed. - -This document describes the Linux Kernel Drivers for simple Baycom style -amateur radio modems. - -The following drivers are available: - -baycom_ser_fdx: - This driver supports the SER12 modems either full or half duplex. - Its baud rate may be changed via the `baud' module parameter, - therefore it supports just about every bit bang modem on a - serial port. Its devices are called bcsf0 through bcsf3. - This is the recommended driver for SER12 type modems, - however if you have a broken UART clone that does not have working - delta status bits, you may try baycom_ser_hdx. - -baycom_ser_hdx: - This is an alternative driver for SER12 type modems. - It only supports half duplex, and only 1200 baud. Its devices - are called bcsh0 through bcsh3. Use this driver only if baycom_ser_fdx - does not work with your UART. - -baycom_par: - This driver supports the par96 and picpar modems. - Its devices are called bcp0 through bcp3. - -baycom_epp: - This driver supports the EPP modem. - Its devices are called bce0 through bce3. - This driver is work-in-progress. - -The following modems are supported: - -ser12: This is a very simple 1200 baud AFSK modem. The modem consists only - of a modulator/demodulator chip, usually a TI TCM3105. The computer - is responsible for regenerating the receiver bit clock, as well as - for handling the HDLC protocol. The modem connects to a serial port, - hence the name. Since the serial port is not used as an async serial - port, the kernel driver for serial ports cannot be used, and this - driver only supports standard serial hardware (8250, 16450, 16550) - -par96: This is a modem for 9600 baud FSK compatible to the G3RUH standard. - The modem does all the filtering and regenerates the receiver clock. - Data is transferred from and to the PC via a shift register. - The shift register is filled with 16 bits and an interrupt is signalled. - The PC then empties the shift register in a burst. This modem connects - to the parallel port, hence the name. The modem leaves the - implementation of the HDLC protocol and the scrambler polynomial to - the PC. - -picpar: This is a redesign of the par96 modem by Henning Rech, DF9IC. The modem - is protocol compatible to par96, but uses only three low power ICs - and can therefore be fed from the parallel port and does not require - an additional power supply. Furthermore, it incorporates a carrier - detect circuitry. - -EPP: This is a high-speed modem adaptor that connects to an enhanced parallel port. - Its target audience is users working over a high speed hub (76.8kbit/s). - -eppfpga: This is a redesign of the EPP adaptor. - - - -All of the above modems only support half duplex communications. However, -the driver supports the KISS (see below) fullduplex command. It then simply -starts to send as soon as there's a packet to transmit and does not care -about DCD, i.e. it starts to send even if there's someone else on the channel. -This command is required by some implementations of the DAMA channel -access protocol. - - -The Interface of the drivers - -Unlike previous drivers, these drivers are no longer character devices, -but they are now true kernel network interfaces. Installation is therefore -simple. Once installed, four interfaces named bc{sf,sh,p,e}[0-3] are available. -sethdlc from the ax25 utilities may be used to set driver states etc. -Users of userland AX.25 stacks may use the net2kiss utility (also available -in the ax25 utilities package) to convert packets of a network interface -to a KISS stream on a pseudo tty. There's also a patch available from -me for WAMPES which allows attaching a kernel network interface directly. - - -Configuring the driver - -Every time a driver is inserted into the kernel, it has to know which -modems it should access at which ports. This can be done with the setbaycom -utility. If you are only using one modem, you can also configure the -driver from the insmod command line (or by means of an option line in -/etc/modprobe.d/*.conf). - -Examples: - modprobe baycom_ser_fdx mode="ser12*" iobase=0x3f8 irq=4 - sethdlc -i bcsf0 -p mode "ser12*" io 0x3f8 irq 4 - -Both lines configure the first port to drive a ser12 modem at the first -serial port (COM1 under DOS). The * in the mode parameter instructs the driver to use -the software DCD algorithm (see below). - - insmod baycom_par mode="picpar" iobase=0x378 - sethdlc -i bcp0 -p mode "picpar" io 0x378 - -Both lines configure the first port to drive a picpar modem at the -first parallel port (LPT1 under DOS). (Note: picpar implies -hardware DCD, par96 implies software DCD). - -The channel access parameters can be set with sethdlc -a or kissparms. -Note that both utilities interpret the values slightly differently. - - -Hardware DCD versus Software DCD - -To avoid collisions on the air, the driver must know when the channel is -busy. This is the task of the DCD circuitry/software. The driver may either -utilise a software DCD algorithm (options=1) or use a DCD signal from -the hardware (options=0). - -ser12: if software DCD is utilised, the radio's squelch should always be - open. It is highly recommended to use the software DCD algorithm, - as it is much faster than most hardware squelch circuitry. The - disadvantage is a slightly higher load on the system. - -par96: the software DCD algorithm for this type of modem is rather poor. - The modem simply does not provide enough information to implement - a reasonable DCD algorithm in software. Therefore, if your radio - feeds the DCD input of the PAR96 modem, the use of the hardware - DCD circuitry is recommended. - -picpar: the picpar modem features a builtin DCD hardware, which is highly - recommended. - - - -Compatibility with the rest of the Linux kernel - -The serial driver and the baycom serial drivers compete -for the same hardware resources. Of course only one driver can access a given -interface at a time. The serial driver grabs all interfaces it can find at -startup time. Therefore the baycom drivers subsequently won't be able to -access a serial port. You might therefore find it necessary to release -a port owned by the serial driver with 'setserial /dev/ttyS# uart none', where -# is the number of the interface. The baycom drivers do not reserve any -ports at startup, unless one is specified on the 'insmod' command line. Another -method to solve the problem is to compile all drivers as modules and -leave it to kmod to load the correct driver depending on the application. - -The parallel port drivers (baycom_par, baycom_epp) now use the parport subsystem -to arbitrate the ports between different client drivers. - -vy 73s de -Tom Sailer, sailer@ife.ee.ethz.ch -hb9jnx @ hb9w.ampr.org diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 6a5858b27cf6..fbf845fbaff7 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -43,6 +43,7 @@ Contents: arcnet atm ax25 + baycom .. only:: subproject and html diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig index bf306fed04cc..fe409819b56d 100644 --- a/drivers/net/hamradio/Kconfig +++ b/drivers/net/hamradio/Kconfig @@ -127,7 +127,7 @@ config BAYCOM_SER_FDX your serial interface chip. To configure the driver, use the sethdlc utility available in the standard ax25 utilities package. For information on the modems, see and - . + . To compile this driver as a module, choose M here: the module will be called baycom_ser_fdx. This is recommended. @@ -145,7 +145,7 @@ config BAYCOM_SER_HDX the driver, use the sethdlc utility available in the standard ax25 utilities package. For information on the modems, see and - . + . To compile this driver as a module, choose M here: the module will be called baycom_ser_hdx. This is recommended. @@ -160,7 +160,7 @@ config BAYCOM_PAR par96 designs. To configure the driver, use the sethdlc utility available in the standard ax25 utilities package. For information on the modems, see and the file - . + . To compile this driver as a module, choose M here: the module will be called baycom_par. This is recommended. @@ -175,7 +175,7 @@ config BAYCOM_EPP designs. To configure the driver, use the sethdlc utility available in the standard ax25 utilities package. For information on the modems, see and the file - . + . To compile this driver as a module, choose M here: the module will be called baycom_epp. This is recommended. -- cgit v1.2.3-59-g8ed1b From a362032eca22d03071c4613f6ca503be982bf375 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:24 +0200 Subject: docs: networking: convert bonding.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - comment out text-only TOC from html/pdf output; - mark code blocks and literals as such; - mark tables as such; - add notes markups; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/bonding.rst | 2890 ++++++++++++++++++++ Documentation/networking/bonding.txt | 2837 ------------------- .../networking/device_drivers/intel/e100.rst | 2 +- .../networking/device_drivers/intel/ixgb.rst | 2 +- Documentation/networking/index.rst | 1 + drivers/net/Kconfig | 2 +- 6 files changed, 2894 insertions(+), 2840 deletions(-) create mode 100644 Documentation/networking/bonding.rst delete mode 100644 Documentation/networking/bonding.txt diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst new file mode 100644 index 000000000000..dd49f95d28d3 --- /dev/null +++ b/Documentation/networking/bonding.rst @@ -0,0 +1,2890 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== +Linux Ethernet Bonding Driver HOWTO +=================================== + +Latest update: 27 April 2011 + +Initial release: Thomas Davis + +Corrections, HA extensions: 2000/10/03-15: + + - Willy Tarreau + - Constantine Gavrilov + - Chad N. Tindel + - Janice Girouard + - Jay Vosburgh + +Reorganized and updated Feb 2005 by Jay Vosburgh +Added Sysfs information: 2006/04/24 + + - Mitch Williams + +Introduction +============ + +The Linux bonding driver provides a method for aggregating +multiple network interfaces into a single logical "bonded" interface. +The behavior of the bonded interfaces depends upon the mode; generally +speaking, modes provide either hot standby or load balancing services. +Additionally, link integrity monitoring may be performed. + +The bonding driver originally came from Donald Becker's +beowulf patches for kernel 2.0. It has changed quite a bit since, and +the original tools from extreme-linux and beowulf sites will not work +with this version of the driver. + +For new versions of the driver, updated userspace tools, and +who to ask for help, please follow the links at the end of this file. + +.. Table of Contents + + 1. Bonding Driver Installation + + 2. Bonding Driver Options + + 3. Configuring Bonding Devices + 3.1 Configuration with Sysconfig Support + 3.1.1 Using DHCP with Sysconfig + 3.1.2 Configuring Multiple Bonds with Sysconfig + 3.2 Configuration with Initscripts Support + 3.2.1 Using DHCP with Initscripts + 3.2.2 Configuring Multiple Bonds with Initscripts + 3.3 Configuring Bonding Manually with Ifenslave + 3.3.1 Configuring Multiple Bonds Manually + 3.4 Configuring Bonding Manually via Sysfs + 3.5 Configuration with Interfaces Support + 3.6 Overriding Configuration for Special Cases + 3.7 Configuring LACP for 802.3ad mode in a more secure way + + 4. Querying Bonding Configuration + 4.1 Bonding Configuration + 4.2 Network Configuration + + 5. Switch Configuration + + 6. 802.1q VLAN Support + + 7. Link Monitoring + 7.1 ARP Monitor Operation + 7.2 Configuring Multiple ARP Targets + 7.3 MII Monitor Operation + + 8. Potential Trouble Sources + 8.1 Adventures in Routing + 8.2 Ethernet Device Renaming + 8.3 Painfully Slow Or No Failed Link Detection By Miimon + + 9. SNMP agents + + 10. Promiscuous mode + + 11. Configuring Bonding for High Availability + 11.1 High Availability in a Single Switch Topology + 11.2 High Availability in a Multiple Switch Topology + 11.2.1 HA Bonding Mode Selection for Multiple Switch Topology + 11.2.2 HA Link Monitoring for Multiple Switch Topology + + 12. Configuring Bonding for Maximum Throughput + 12.1 Maximum Throughput in a Single Switch Topology + 12.1.1 MT Bonding Mode Selection for Single Switch Topology + 12.1.2 MT Link Monitoring for Single Switch Topology + 12.2 Maximum Throughput in a Multiple Switch Topology + 12.2.1 MT Bonding Mode Selection for Multiple Switch Topology + 12.2.2 MT Link Monitoring for Multiple Switch Topology + + 13. Switch Behavior Issues + 13.1 Link Establishment and Failover Delays + 13.2 Duplicated Incoming Packets + + 14. Hardware Specific Considerations + 14.1 IBM BladeCenter + + 15. Frequently Asked Questions + + 16. Resources and Links + + +1. Bonding Driver Installation +============================== + +Most popular distro kernels ship with the bonding driver +already available as a module. If your distro does not, or you +have need to compile bonding from source (e.g., configuring and +installing a mainline kernel from kernel.org), you'll need to perform +the following steps: + +1.1 Configure and build the kernel with bonding +----------------------------------------------- + +The current version of the bonding driver is available in the +drivers/net/bonding subdirectory of the most recent kernel source +(which is available on http://kernel.org). Most users "rolling their +own" will want to use the most recent kernel from kernel.org. + +Configure kernel with "make menuconfig" (or "make xconfig" or +"make config"), then select "Bonding driver support" in the "Network +device support" section. It is recommended that you configure the +driver as module since it is currently the only way to pass parameters +to the driver or configure more than one bonding device. + +Build and install the new kernel and modules. + +1.2 Bonding Control Utility +--------------------------- + +It is recommended to configure bonding via iproute2 (netlink) +or sysfs, the old ifenslave control utility is obsolete. + +2. Bonding Driver Options +========================= + +Options for the bonding driver are supplied as parameters to the +bonding module at load time, or are specified via sysfs. + +Module options may be given as command line arguments to the +insmod or modprobe command, but are usually specified in either the +``/etc/modprobe.d/*.conf`` configuration files, or in a distro-specific +configuration file (some of which are detailed in the next section). + +Details on bonding support for sysfs is provided in the +"Configuring Bonding Manually via Sysfs" section, below. + +The available bonding driver parameters are listed below. If a +parameter is not specified the default value is used. When initially +configuring a bond, it is recommended "tail -f /var/log/messages" be +run in a separate window to watch for bonding driver error messages. + +It is critical that either the miimon or arp_interval and +arp_ip_target parameters be specified, otherwise serious network +degradation will occur during link failures. Very few devices do not +support at least miimon, so there is really no reason not to use it. + +Options with textual values will accept either the text name +or, for backwards compatibility, the option value. E.g., +"mode=802.3ad" and "mode=4" set the same mode. + +The parameters are as follows: + +active_slave + + Specifies the new active slave for modes that support it + (active-backup, balance-alb and balance-tlb). Possible values + are the name of any currently enslaved interface, or an empty + string. If a name is given, the slave and its link must be up in order + to be selected as the new active slave. If an empty string is + specified, the current active slave is cleared, and a new active + slave is selected automatically. + + Note that this is only available through the sysfs interface. No module + parameter by this name exists. + + The normal value of this option is the name of the currently + active slave, or the empty string if there is no active slave or + the current mode does not use an active slave. + +ad_actor_sys_prio + + In an AD system, this specifies the system priority. The allowed range + is 1 - 65535. If the value is not specified, it takes 65535 as the + default value. + + This parameter has effect only in 802.3ad mode and is available through + SysFs interface. + +ad_actor_system + + In an AD system, this specifies the mac-address for the actor in + protocol packet exchanges (LACPDUs). The value cannot be NULL or + multicast. It is preferred to have the local-admin bit set for this + mac but driver does not enforce it. If the value is not given then + system defaults to using the masters' mac address as actors' system + address. + + This parameter has effect only in 802.3ad mode and is available through + SysFs interface. + +ad_select + + Specifies the 802.3ad aggregation selection logic to use. The + possible values and their effects are: + + stable or 0 + + The active aggregator is chosen by largest aggregate + bandwidth. + + Reselection of the active aggregator occurs only when all + slaves of the active aggregator are down or the active + aggregator has no slaves. + + This is the default value. + + bandwidth or 1 + + The active aggregator is chosen by largest aggregate + bandwidth. Reselection occurs if: + + - A slave is added to or removed from the bond + + - Any slave's link state changes + + - Any slave's 802.3ad association state changes + + - The bond's administrative state changes to up + + count or 2 + + The active aggregator is chosen by the largest number of + ports (slaves). Reselection occurs as described under the + "bandwidth" setting, above. + + The bandwidth and count selection policies permit failover of + 802.3ad aggregations when partial failure of the active aggregator + occurs. This keeps the aggregator with the highest availability + (either in bandwidth or in number of ports) active at all times. + + This option was added in bonding version 3.4.0. + +ad_user_port_key + + In an AD system, the port-key has three parts as shown below - + + ===== ============ + Bits Use + ===== ============ + 00 Duplex + 01-05 Speed + 06-15 User-defined + ===== ============ + + This defines the upper 10 bits of the port key. The values can be + from 0 - 1023. If not given, the system defaults to 0. + + This parameter has effect only in 802.3ad mode and is available through + SysFs interface. + +all_slaves_active + + Specifies that duplicate frames (received on inactive ports) should be + dropped (0) or delivered (1). + + Normally, bonding will drop duplicate frames (received on inactive + ports), which is desirable for most users. But there are some times + it is nice to allow duplicate frames to be delivered. + + The default value is 0 (drop duplicate frames received on inactive + ports). + +arp_interval + + Specifies the ARP link monitoring frequency in milliseconds. + + The ARP monitor works by periodically checking the slave + devices to determine whether they have sent or received + traffic recently (the precise criteria depends upon the + bonding mode, and the state of the slave). Regular traffic is + generated via ARP probes issued for the addresses specified by + the arp_ip_target option. + + This behavior can be modified by the arp_validate option, + below. + + If ARP monitoring is used in an etherchannel compatible mode + (modes 0 and 2), the switch should be configured in a mode + that evenly distributes packets across all links. If the + switch is configured to distribute the packets in an XOR + fashion, all replies from the ARP targets will be received on + the same link which could cause the other team members to + fail. ARP monitoring should not be used in conjunction with + miimon. A value of 0 disables ARP monitoring. The default + value is 0. + +arp_ip_target + + Specifies the IP addresses to use as ARP monitoring peers when + arp_interval is > 0. These are the targets of the ARP request + sent to determine the health of the link to the targets. + Specify these values in ddd.ddd.ddd.ddd format. Multiple IP + addresses must be separated by a comma. At least one IP + address must be given for ARP monitoring to function. The + maximum number of targets that can be specified is 16. The + default value is no IP addresses. + +arp_validate + + Specifies whether or not ARP probes and replies should be + validated in any mode that supports arp monitoring, or whether + non-ARP traffic should be filtered (disregarded) for link + monitoring purposes. + + Possible values are: + + none or 0 + + No validation or filtering is performed. + + active or 1 + + Validation is performed only for the active slave. + + backup or 2 + + Validation is performed only for backup slaves. + + all or 3 + + Validation is performed for all slaves. + + filter or 4 + + Filtering is applied to all slaves. No validation is + performed. + + filter_active or 5 + + Filtering is applied to all slaves, validation is performed + only for the active slave. + + filter_backup or 6 + + Filtering is applied to all slaves, validation is performed + only for backup slaves. + + Validation: + + Enabling validation causes the ARP monitor to examine the incoming + ARP requests and replies, and only consider a slave to be up if it + is receiving the appropriate ARP traffic. + + For an active slave, the validation checks ARP replies to confirm + that they were generated by an arp_ip_target. Since backup slaves + do not typically receive these replies, the validation performed + for backup slaves is on the broadcast ARP request sent out via the + active slave. It is possible that some switch or network + configurations may result in situations wherein the backup slaves + do not receive the ARP requests; in such a situation, validation + of backup slaves must be disabled. + + The validation of ARP requests on backup slaves is mainly helping + bonding to decide which slaves are more likely to work in case of + the active slave failure, it doesn't really guarantee that the + backup slave will work if it's selected as the next active slave. + + Validation is useful in network configurations in which multiple + bonding hosts are concurrently issuing ARPs to one or more targets + beyond a common switch. Should the link between the switch and + target fail (but not the switch itself), the probe traffic + generated by the multiple bonding instances will fool the standard + ARP monitor into considering the links as still up. Use of + validation can resolve this, as the ARP monitor will only consider + ARP requests and replies associated with its own instance of + bonding. + + Filtering: + + Enabling filtering causes the ARP monitor to only use incoming ARP + packets for link availability purposes. Arriving packets that are + not ARPs are delivered normally, but do not count when determining + if a slave is available. + + Filtering operates by only considering the reception of ARP + packets (any ARP packet, regardless of source or destination) when + determining if a slave has received traffic for link availability + purposes. + + Filtering is useful in network configurations in which significant + levels of third party broadcast traffic would fool the standard + ARP monitor into considering the links as still up. Use of + filtering can resolve this, as only ARP traffic is considered for + link availability purposes. + + This option was added in bonding version 3.1.0. + +arp_all_targets + + Specifies the quantity of arp_ip_targets that must be reachable + in order for the ARP monitor to consider a slave as being up. + This option affects only active-backup mode for slaves with + arp_validation enabled. + + Possible values are: + + any or 0 + + consider the slave up only when any of the arp_ip_targets + is reachable + + all or 1 + + consider the slave up only when all of the arp_ip_targets + are reachable + +downdelay + + Specifies the time, in milliseconds, to wait before disabling + a slave after a link failure has been detected. This option + is only valid for the miimon link monitor. The downdelay + value should be a multiple of the miimon value; if not, it + will be rounded down to the nearest multiple. The default + value is 0. + +fail_over_mac + + Specifies whether active-backup mode should set all slaves to + the same MAC address at enslavement (the traditional + behavior), or, when enabled, perform special handling of the + bond's MAC address in accordance with the selected policy. + + Possible values are: + + none or 0 + + This setting disables fail_over_mac, and causes + bonding to set all slaves of an active-backup bond to + the same MAC address at enslavement time. This is the + default. + + active or 1 + + The "active" fail_over_mac policy indicates that the + MAC address of the bond should always be the MAC + address of the currently active slave. The MAC + address of the slaves is not changed; instead, the MAC + address of the bond changes during a failover. + + This policy is useful for devices that cannot ever + alter their MAC address, or for devices that refuse + incoming broadcasts with their own source MAC (which + interferes with the ARP monitor). + + The down side of this policy is that every device on + the network must be updated via gratuitous ARP, + vs. just updating a switch or set of switches (which + often takes place for any traffic, not just ARP + traffic, if the switch snoops incoming traffic to + update its tables) for the traditional method. If the + gratuitous ARP is lost, communication may be + disrupted. + + When this policy is used in conjunction with the mii + monitor, devices which assert link up prior to being + able to actually transmit and receive are particularly + susceptible to loss of the gratuitous ARP, and an + appropriate updelay setting may be required. + + follow or 2 + + The "follow" fail_over_mac policy causes the MAC + address of the bond to be selected normally (normally + the MAC address of the first slave added to the bond). + However, the second and subsequent slaves are not set + to this MAC address while they are in a backup role; a + slave is programmed with the bond's MAC address at + failover time (and the formerly active slave receives + the newly active slave's MAC address). + + This policy is useful for multiport devices that + either become confused or incur a performance penalty + when multiple ports are programmed with the same MAC + address. + + + The default policy is none, unless the first slave cannot + change its MAC address, in which case the active policy is + selected by default. + + This option may be modified via sysfs only when no slaves are + present in the bond. + + This option was added in bonding version 3.2.0. The "follow" + policy was added in bonding version 3.3.0. + +lacp_rate + + Option specifying the rate in which we'll ask our link partner + to transmit LACPDU packets in 802.3ad mode. Possible values + are: + + slow or 0 + Request partner to transmit LACPDUs every 30 seconds + + fast or 1 + Request partner to transmit LACPDUs every 1 second + + The default is slow. + +max_bonds + + Specifies the number of bonding devices to create for this + instance of the bonding driver. E.g., if max_bonds is 3, and + the bonding driver is not already loaded, then bond0, bond1 + and bond2 will be created. The default value is 1. Specifying + a value of 0 will load bonding, but will not create any devices. + +miimon + + Specifies the MII link monitoring frequency in milliseconds. + This determines how often the link state of each slave is + inspected for link failures. A value of zero disables MII + link monitoring. A value of 100 is a good starting point. + The use_carrier option, below, affects how the link state is + determined. See the High Availability section for additional + information. The default value is 0. + +min_links + + Specifies the minimum number of links that must be active before + asserting carrier. It is similar to the Cisco EtherChannel min-links + feature. This allows setting the minimum number of member ports that + must be up (link-up state) before marking the bond device as up + (carrier on). This is useful for situations where higher level services + such as clustering want to ensure a minimum number of low bandwidth + links are active before switchover. This option only affect 802.3ad + mode. + + The default value is 0. This will cause carrier to be asserted (for + 802.3ad mode) whenever there is an active aggregator, regardless of the + number of available links in that aggregator. Note that, because an + aggregator cannot be active without at least one available link, + setting this option to 0 or to 1 has the exact same effect. + +mode + + Specifies one of the bonding policies. The default is + balance-rr (round robin). Possible values are: + + balance-rr or 0 + + Round-robin policy: Transmit packets in sequential + order from the first available slave through the + last. This mode provides load balancing and fault + tolerance. + + active-backup or 1 + + Active-backup policy: Only one slave in the bond is + active. A different slave becomes active if, and only + if, the active slave fails. The bond's MAC address is + externally visible on only one port (network adapter) + to avoid confusing the switch. + + In bonding version 2.6.2 or later, when a failover + occurs in active-backup mode, bonding will issue one + or more gratuitous ARPs on the newly active slave. + One gratuitous ARP is issued for the bonding master + interface and each VLAN interfaces configured above + it, provided that the interface has at least one IP + address configured. Gratuitous ARPs issued for VLAN + interfaces are tagged with the appropriate VLAN id. + + This mode provides fault tolerance. The primary + option, documented below, affects the behavior of this + mode. + + balance-xor or 2 + + XOR policy: Transmit based on the selected transmit + hash policy. The default policy is a simple [(source + MAC address XOR'd with destination MAC address XOR + packet type ID) modulo slave count]. Alternate transmit + policies may be selected via the xmit_hash_policy option, + described below. + + This mode provides load balancing and fault tolerance. + + broadcast or 3 + + Broadcast policy: transmits everything on all slave + interfaces. This mode provides fault tolerance. + + 802.3ad or 4 + + IEEE 802.3ad Dynamic link aggregation. Creates + aggregation groups that share the same speed and + duplex settings. Utilizes all slaves in the active + aggregator according to the 802.3ad specification. + + Slave selection for outgoing traffic is done according + to the transmit hash policy, which may be changed from + the default simple XOR policy via the xmit_hash_policy + option, documented below. Note that not all transmit + policies may be 802.3ad compliant, particularly in + regards to the packet mis-ordering requirements of + section 43.2.4 of the 802.3ad standard. Differing + peer implementations will have varying tolerances for + noncompliance. + + Prerequisites: + + 1. Ethtool support in the base drivers for retrieving + the speed and duplex of each slave. + + 2. A switch that supports IEEE 802.3ad Dynamic link + aggregation. + + Most switches will require some type of configuration + to enable 802.3ad mode. + + balance-tlb or 5 + + Adaptive transmit load balancing: channel bonding that + does not require any special switch support. + + In tlb_dynamic_lb=1 mode; the outgoing traffic is + distributed according to the current load (computed + relative to the speed) on each slave. + + In tlb_dynamic_lb=0 mode; the load balancing based on + current load is disabled and the load is distributed + only using the hash distribution. + + Incoming traffic is received by the current slave. + If the receiving slave fails, another slave takes over + the MAC address of the failed receiving slave. + + Prerequisite: + + Ethtool support in the base drivers for retrieving the + speed of each slave. + + balance-alb or 6 + + Adaptive load balancing: includes balance-tlb plus + receive load balancing (rlb) for IPV4 traffic, and + does not require any special switch support. The + receive load balancing is achieved by ARP negotiation. + The bonding driver intercepts the ARP Replies sent by + the local system on their way out and overwrites the + source hardware address with the unique hardware + address of one of the slaves in the bond such that + different peers use different hardware addresses for + the server. + + Receive traffic from connections created by the server + is also balanced. When the local system sends an ARP + Request the bonding driver copies and saves the peer's + IP information from the ARP packet. When the ARP + Reply arrives from the peer, its hardware address is + retrieved and the bonding driver initiates an ARP + reply to this peer assigning it to one of the slaves + in the bond. A problematic outcome of using ARP + negotiation for balancing is that each time that an + ARP request is broadcast it uses the hardware address + of the bond. Hence, peers learn the hardware address + of the bond and the balancing of receive traffic + collapses to the current slave. This is handled by + sending updates (ARP Replies) to all the peers with + their individually assigned hardware address such that + the traffic is redistributed. Receive traffic is also + redistributed when a new slave is added to the bond + and when an inactive slave is re-activated. The + receive load is distributed sequentially (round robin) + among the group of highest speed slaves in the bond. + + When a link is reconnected or a new slave joins the + bond the receive traffic is redistributed among all + active slaves in the bond by initiating ARP Replies + with the selected MAC address to each of the + clients. The updelay parameter (detailed below) must + be set to a value equal or greater than the switch's + forwarding delay so that the ARP Replies sent to the + peers will not be blocked by the switch. + + Prerequisites: + + 1. Ethtool support in the base drivers for retrieving + the speed of each slave. + + 2. Base driver support for setting the hardware + address of a device while it is open. This is + required so that there will always be one slave in the + team using the bond hardware address (the + curr_active_slave) while having a unique hardware + address for each slave in the bond. If the + curr_active_slave fails its hardware address is + swapped with the new curr_active_slave that was + chosen. + +num_grat_arp, +num_unsol_na + + Specify the number of peer notifications (gratuitous ARPs and + unsolicited IPv6 Neighbor Advertisements) to be issued after a + failover event. As soon as the link is up on the new slave + (possibly immediately) a peer notification is sent on the + bonding device and each VLAN sub-device. This is repeated at + the rate specified by peer_notif_delay if the number is + greater than 1. + + The valid range is 0 - 255; the default value is 1. These options + affect only the active-backup mode. These options were added for + bonding versions 3.3.0 and 3.4.0 respectively. + + From Linux 3.0 and bonding version 3.7.1, these notifications + are generated by the ipv4 and ipv6 code and the numbers of + repetitions cannot be set independently. + +packets_per_slave + + Specify the number of packets to transmit through a slave before + moving to the next one. When set to 0 then a slave is chosen at + random. + + The valid range is 0 - 65535; the default value is 1. This option + has effect only in balance-rr mode. + +peer_notif_delay + + Specify the delay, in milliseconds, between each peer + notification (gratuitous ARP and unsolicited IPv6 Neighbor + Advertisement) when they are issued after a failover event. + This delay should be a multiple of the link monitor interval + (arp_interval or miimon, whichever is active). The default + value is 0 which means to match the value of the link monitor + interval. + +primary + + A string (eth0, eth2, etc) specifying which slave is the + primary device. The specified device will always be the + active slave while it is available. Only when the primary is + off-line will alternate devices be used. This is useful when + one slave is preferred over another, e.g., when one slave has + higher throughput than another. + + The primary option is only valid for active-backup(1), + balance-tlb (5) and balance-alb (6) mode. + +primary_reselect + + Specifies the reselection policy for the primary slave. This + affects how the primary slave is chosen to become the active slave + when failure of the active slave or recovery of the primary slave + occurs. This option is designed to prevent flip-flopping between + the primary slave and other slaves. Possible values are: + + always or 0 (default) + + The primary slave becomes the active slave whenever it + comes back up. + + better or 1 + + The primary slave becomes the active slave when it comes + back up, if the speed and duplex of the primary slave is + better than the speed and duplex of the current active + slave. + + failure or 2 + + The primary slave becomes the active slave only if the + current active slave fails and the primary slave is up. + + The primary_reselect setting is ignored in two cases: + + If no slaves are active, the first slave to recover is + made the active slave. + + When initially enslaved, the primary slave is always made + the active slave. + + Changing the primary_reselect policy via sysfs will cause an + immediate selection of the best active slave according to the new + policy. This may or may not result in a change of the active + slave, depending upon the circumstances. + + This option was added for bonding version 3.6.0. + +tlb_dynamic_lb + + Specifies if dynamic shuffling of flows is enabled in tlb + mode. The value has no effect on any other modes. + + The default behavior of tlb mode is to shuffle active flows across + slaves based on the load in that interval. This gives nice lb + characteristics but can cause packet reordering. If re-ordering is + a concern use this variable to disable flow shuffling and rely on + load balancing provided solely by the hash distribution. + xmit-hash-policy can be used to select the appropriate hashing for + the setup. + + The sysfs entry can be used to change the setting per bond device + and the initial value is derived from the module parameter. The + sysfs entry is allowed to be changed only if the bond device is + down. + + The default value is "1" that enables flow shuffling while value "0" + disables it. This option was added in bonding driver 3.7.1 + + +updelay + + Specifies the time, in milliseconds, to wait before enabling a + slave after a link recovery has been detected. This option is + only valid for the miimon link monitor. The updelay value + should be a multiple of the miimon value; if not, it will be + rounded down to the nearest multiple. The default value is 0. + +use_carrier + + Specifies whether or not miimon should use MII or ETHTOOL + ioctls vs. netif_carrier_ok() to determine the link + status. The MII or ETHTOOL ioctls are less efficient and + utilize a deprecated calling sequence within the kernel. The + netif_carrier_ok() relies on the device driver to maintain its + state with netif_carrier_on/off; at this writing, most, but + not all, device drivers support this facility. + + If bonding insists that the link is up when it should not be, + it may be that your network device driver does not support + netif_carrier_on/off. The default state for netif_carrier is + "carrier on," so if a driver does not support netif_carrier, + it will appear as if the link is always up. In this case, + setting use_carrier to 0 will cause bonding to revert to the + MII / ETHTOOL ioctl method to determine the link state. + + A value of 1 enables the use of netif_carrier_ok(), a value of + 0 will use the deprecated MII / ETHTOOL ioctls. The default + value is 1. + +xmit_hash_policy + + Selects the transmit hash policy to use for slave selection in + balance-xor, 802.3ad, and tlb modes. Possible values are: + + layer2 + + Uses XOR of hardware MAC addresses and packet type ID + field to generate the hash. The formula is + + hash = source MAC XOR destination MAC XOR packet type ID + slave number = hash modulo slave count + + This algorithm will place all traffic to a particular + network peer on the same slave. + + This algorithm is 802.3ad compliant. + + layer2+3 + + This policy uses a combination of layer2 and layer3 + protocol information to generate the hash. + + Uses XOR of hardware MAC addresses and IP addresses to + generate the hash. The formula is + + hash = source MAC XOR destination MAC XOR packet type ID + hash = hash XOR source IP XOR destination IP + hash = hash XOR (hash RSHIFT 16) + hash = hash XOR (hash RSHIFT 8) + And then hash is reduced modulo slave count. + + If the protocol is IPv6 then the source and destination + addresses are first hashed using ipv6_addr_hash. + + This algorithm will place all traffic to a particular + network peer on the same slave. For non-IP traffic, + the formula is the same as for the layer2 transmit + hash policy. + + This policy is intended to provide a more balanced + distribution of traffic than layer2 alone, especially + in environments where a layer3 gateway device is + required to reach most destinations. + + This algorithm is 802.3ad compliant. + + layer3+4 + + This policy uses upper layer protocol information, + when available, to generate the hash. This allows for + traffic to a particular network peer to span multiple + slaves, although a single connection will not span + multiple slaves. + + The formula for unfragmented TCP and UDP packets is + + hash = source port, destination port (as in the header) + hash = hash XOR source IP XOR destination IP + hash = hash XOR (hash RSHIFT 16) + hash = hash XOR (hash RSHIFT 8) + And then hash is reduced modulo slave count. + + If the protocol is IPv6 then the source and destination + addresses are first hashed using ipv6_addr_hash. + + For fragmented TCP or UDP packets and all other IPv4 and + IPv6 protocol traffic, the source and destination port + information is omitted. For non-IP traffic, the + formula is the same as for the layer2 transmit hash + policy. + + This algorithm is not fully 802.3ad compliant. A + single TCP or UDP conversation containing both + fragmented and unfragmented packets will see packets + striped across two interfaces. This may result in out + of order delivery. Most traffic types will not meet + this criteria, as TCP rarely fragments traffic, and + most UDP traffic is not involved in extended + conversations. Other implementations of 802.3ad may + or may not tolerate this noncompliance. + + encap2+3 + + This policy uses the same formula as layer2+3 but it + relies on skb_flow_dissect to obtain the header fields + which might result in the use of inner headers if an + encapsulation protocol is used. For example this will + improve the performance for tunnel users because the + packets will be distributed according to the encapsulated + flows. + + encap3+4 + + This policy uses the same formula as layer3+4 but it + relies on skb_flow_dissect to obtain the header fields + which might result in the use of inner headers if an + encapsulation protocol is used. For example this will + improve the performance for tunnel users because the + packets will be distributed according to the encapsulated + flows. + + The default value is layer2. This option was added in bonding + version 2.6.3. In earlier versions of bonding, this parameter + does not exist, and the layer2 policy is the only policy. The + layer2+3 value was added for bonding version 3.2.2. + +resend_igmp + + Specifies the number of IGMP membership reports to be issued after + a failover event. One membership report is issued immediately after + the failover, subsequent packets are sent in each 200ms interval. + + The valid range is 0 - 255; the default value is 1. A value of 0 + prevents the IGMP membership report from being issued in response + to the failover event. + + This option is useful for bonding modes balance-rr (0), active-backup + (1), balance-tlb (5) and balance-alb (6), in which a failover can + switch the IGMP traffic from one slave to another. Therefore a fresh + IGMP report must be issued to cause the switch to forward the incoming + IGMP traffic over the newly selected slave. + + This option was added for bonding version 3.7.0. + +lp_interval + + Specifies the number of seconds between instances where the bonding + driver sends learning packets to each slaves peer switch. + + The valid range is 1 - 0x7fffffff; the default value is 1. This Option + has effect only in balance-tlb and balance-alb modes. + +3. Configuring Bonding Devices +============================== + +You can configure bonding using either your distro's network +initialization scripts, or manually using either iproute2 or the +sysfs interface. Distros generally use one of three packages for the +network initialization scripts: initscripts, sysconfig or interfaces. +Recent versions of these packages have support for bonding, while older +versions do not. + +We will first describe the options for configuring bonding for +distros using versions of initscripts, sysconfig and interfaces with full +or partial support for bonding, then provide information on enabling +bonding without support from the network initialization scripts (i.e., +older versions of initscripts or sysconfig). + +If you're unsure whether your distro uses sysconfig, +initscripts or interfaces, or don't know if it's new enough, have no fear. +Determining this is fairly straightforward. + +First, look for a file called interfaces in /etc/network directory. +If this file is present in your system, then your system use interfaces. See +Configuration with Interfaces Support. + +Else, issue the command:: + + $ rpm -qf /sbin/ifup + +It will respond with a line of text starting with either +"initscripts" or "sysconfig," followed by some numbers. This is the +package that provides your network initialization scripts. + +Next, to determine if your installation supports bonding, +issue the command:: + + $ grep ifenslave /sbin/ifup + +If this returns any matches, then your initscripts or +sysconfig has support for bonding. + +3.1 Configuration with Sysconfig Support +---------------------------------------- + +This section applies to distros using a version of sysconfig +with bonding support, for example, SuSE Linux Enterprise Server 9. + +SuSE SLES 9's networking configuration system does support +bonding, however, at this writing, the YaST system configuration +front end does not provide any means to work with bonding devices. +Bonding devices can be managed by hand, however, as follows. + +First, if they have not already been configured, configure the +slave devices. On SLES 9, this is most easily done by running the +yast2 sysconfig configuration utility. The goal is for to create an +ifcfg-id file for each slave device. The simplest way to accomplish +this is to configure the devices for DHCP (this is only to get the +file ifcfg-id file created; see below for some issues with DHCP). The +name of the configuration file for each device will be of the form:: + + ifcfg-id-xx:xx:xx:xx:xx:xx + +Where the "xx" portion will be replaced with the digits from +the device's permanent MAC address. + +Once the set of ifcfg-id-xx:xx:xx:xx:xx:xx files has been +created, it is necessary to edit the configuration files for the slave +devices (the MAC addresses correspond to those of the slave devices). +Before editing, the file will contain multiple lines, and will look +something like this:: + + BOOTPROTO='dhcp' + STARTMODE='on' + USERCTL='no' + UNIQUE='XNzu.WeZGOGF+4wE' + _nm_name='bus-pci-0001:61:01.0' + +Change the BOOTPROTO and STARTMODE lines to the following:: + + BOOTPROTO='none' + STARTMODE='off' + +Do not alter the UNIQUE or _nm_name lines. Remove any other +lines (USERCTL, etc). + +Once the ifcfg-id-xx:xx:xx:xx:xx:xx files have been modified, +it's time to create the configuration file for the bonding device +itself. This file is named ifcfg-bondX, where X is the number of the +bonding device to create, starting at 0. The first such file is +ifcfg-bond0, the second is ifcfg-bond1, and so on. The sysconfig +network configuration system will correctly start multiple instances +of bonding. + +The contents of the ifcfg-bondX file is as follows:: + + BOOTPROTO="static" + BROADCAST="10.0.2.255" + IPADDR="10.0.2.10" + NETMASK="255.255.0.0" + NETWORK="10.0.2.0" + REMOTE_IPADDR="" + STARTMODE="onboot" + BONDING_MASTER="yes" + BONDING_MODULE_OPTS="mode=active-backup miimon=100" + BONDING_SLAVE0="eth0" + BONDING_SLAVE1="bus-pci-0000:06:08.1" + +Replace the sample BROADCAST, IPADDR, NETMASK and NETWORK +values with the appropriate values for your network. + +The STARTMODE specifies when the device is brought online. +The possible values are: + + ======== ====================================================== + onboot The device is started at boot time. If you're not + sure, this is probably what you want. + + manual The device is started only when ifup is called + manually. Bonding devices may be configured this + way if you do not wish them to start automatically + at boot for some reason. + + hotplug The device is started by a hotplug event. This is not + a valid choice for a bonding device. + + off or The device configuration is ignored. + ignore + ======== ====================================================== + +The line BONDING_MASTER='yes' indicates that the device is a +bonding master device. The only useful value is "yes." + +The contents of BONDING_MODULE_OPTS are supplied to the +instance of the bonding module for this device. Specify the options +for the bonding mode, link monitoring, and so on here. Do not include +the max_bonds bonding parameter; this will confuse the configuration +system if you have multiple bonding devices. + +Finally, supply one BONDING_SLAVEn="slave device" for each +slave. where "n" is an increasing value, one for each slave. The +"slave device" is either an interface name, e.g., "eth0", or a device +specifier for the network device. The interface name is easier to +find, but the ethN names are subject to change at boot time if, e.g., +a device early in the sequence has failed. The device specifiers +(bus-pci-0000:06:08.1 in the example above) specify the physical +network device, and will not change unless the device's bus location +changes (for example, it is moved from one PCI slot to another). The +example above uses one of each type for demonstration purposes; most +configurations will choose one or the other for all slave devices. + +When all configuration files have been modified or created, +networking must be restarted for the configuration changes to take +effect. This can be accomplished via the following:: + + # /etc/init.d/network restart + +Note that the network control script (/sbin/ifdown) will +remove the bonding module as part of the network shutdown processing, +so it is not necessary to remove the module by hand if, e.g., the +module parameters have changed. + +Also, at this writing, YaST/YaST2 will not manage bonding +devices (they do not show bonding interfaces on its list of network +devices). It is necessary to edit the configuration file by hand to +change the bonding configuration. + +Additional general options and details of the ifcfg file +format can be found in an example ifcfg template file:: + + /etc/sysconfig/network/ifcfg.template + +Note that the template does not document the various ``BONDING_*`` +settings described above, but does describe many of the other options. + +3.1.1 Using DHCP with Sysconfig +------------------------------- + +Under sysconfig, configuring a device with BOOTPROTO='dhcp' +will cause it to query DHCP for its IP address information. At this +writing, this does not function for bonding devices; the scripts +attempt to obtain the device address from DHCP prior to adding any of +the slave devices. Without active slaves, the DHCP requests are not +sent to the network. + +3.1.2 Configuring Multiple Bonds with Sysconfig +----------------------------------------------- + +The sysconfig network initialization system is capable of +handling multiple bonding devices. All that is necessary is for each +bonding instance to have an appropriately configured ifcfg-bondX file +(as described above). Do not specify the "max_bonds" parameter to any +instance of bonding, as this will confuse sysconfig. If you require +multiple bonding devices with identical parameters, create multiple +ifcfg-bondX files. + +Because the sysconfig scripts supply the bonding module +options in the ifcfg-bondX file, it is not necessary to add them to +the system ``/etc/modules.d/*.conf`` configuration files. + +3.2 Configuration with Initscripts Support +------------------------------------------ + +This section applies to distros using a recent version of +initscripts with bonding support, for example, Red Hat Enterprise Linux +version 3 or later, Fedora, etc. On these systems, the network +initialization scripts have knowledge of bonding, and can be configured to +control bonding devices. Note that older versions of the initscripts +package have lower levels of support for bonding; this will be noted where +applicable. + +These distros will not automatically load the network adapter +driver unless the ethX device is configured with an IP address. +Because of this constraint, users must manually configure a +network-script file for all physical adapters that will be members of +a bondX link. Network script files are located in the directory: + +/etc/sysconfig/network-scripts + +The file name must be prefixed with "ifcfg-eth" and suffixed +with the adapter's physical adapter number. For example, the script +for eth0 would be named /etc/sysconfig/network-scripts/ifcfg-eth0. +Place the following text in the file:: + + DEVICE=eth0 + USERCTL=no + ONBOOT=yes + MASTER=bond0 + SLAVE=yes + BOOTPROTO=none + +The DEVICE= line will be different for every ethX device and +must correspond with the name of the file, i.e., ifcfg-eth1 must have +a device line of DEVICE=eth1. The setting of the MASTER= line will +also depend on the final bonding interface name chosen for your bond. +As with other network devices, these typically start at 0, and go up +one for each device, i.e., the first bonding instance is bond0, the +second is bond1, and so on. + +Next, create a bond network script. The file name for this +script will be /etc/sysconfig/network-scripts/ifcfg-bondX where X is +the number of the bond. For bond0 the file is named "ifcfg-bond0", +for bond1 it is named "ifcfg-bond1", and so on. Within that file, +place the following text:: + + DEVICE=bond0 + IPADDR=192.168.1.1 + NETMASK=255.255.255.0 + NETWORK=192.168.1.0 + BROADCAST=192.168.1.255 + ONBOOT=yes + BOOTPROTO=none + USERCTL=no + +Be sure to change the networking specific lines (IPADDR, +NETMASK, NETWORK and BROADCAST) to match your network configuration. + +For later versions of initscripts, such as that found with Fedora +7 (or later) and Red Hat Enterprise Linux version 5 (or later), it is possible, +and, indeed, preferable, to specify the bonding options in the ifcfg-bond0 +file, e.g. a line of the format:: + + BONDING_OPTS="mode=active-backup arp_interval=60 arp_ip_target=192.168.1.254" + +will configure the bond with the specified options. The options +specified in BONDING_OPTS are identical to the bonding module parameters +except for the arp_ip_target field when using versions of initscripts older +than and 8.57 (Fedora 8) and 8.45.19 (Red Hat Enterprise Linux 5.2). When +using older versions each target should be included as a separate option and +should be preceded by a '+' to indicate it should be added to the list of +queried targets, e.g.,:: + + arp_ip_target=+192.168.1.1 arp_ip_target=+192.168.1.2 + +is the proper syntax to specify multiple targets. When specifying +options via BONDING_OPTS, it is not necessary to edit +``/etc/modprobe.d/*.conf``. + +For even older versions of initscripts that do not support +BONDING_OPTS, it is necessary to edit /etc/modprobe.d/*.conf, depending upon +your distro) to load the bonding module with your desired options when the +bond0 interface is brought up. The following lines in /etc/modprobe.d/*.conf +will load the bonding module, and select its options: + + alias bond0 bonding + options bond0 mode=balance-alb miimon=100 + +Replace the sample parameters with the appropriate set of +options for your configuration. + +Finally run "/etc/rc.d/init.d/network restart" as root. This +will restart the networking subsystem and your bond link should be now +up and running. + +3.2.1 Using DHCP with Initscripts +--------------------------------- + +Recent versions of initscripts (the versions supplied with Fedora +Core 3 and Red Hat Enterprise Linux 4, or later versions, are reported to +work) have support for assigning IP information to bonding devices via +DHCP. + +To configure bonding for DHCP, configure it as described +above, except replace the line "BOOTPROTO=none" with "BOOTPROTO=dhcp" +and add a line consisting of "TYPE=Bonding". Note that the TYPE value +is case sensitive. + +3.2.2 Configuring Multiple Bonds with Initscripts +------------------------------------------------- + +Initscripts packages that are included with Fedora 7 and Red Hat +Enterprise Linux 5 support multiple bonding interfaces by simply +specifying the appropriate BONDING_OPTS= in ifcfg-bondX where X is the +number of the bond. This support requires sysfs support in the kernel, +and a bonding driver of version 3.0.0 or later. Other configurations may +not support this method for specifying multiple bonding interfaces; for +those instances, see the "Configuring Multiple Bonds Manually" section, +below. + +3.3 Configuring Bonding Manually with iproute2 +----------------------------------------------- + +This section applies to distros whose network initialization +scripts (the sysconfig or initscripts package) do not have specific +knowledge of bonding. One such distro is SuSE Linux Enterprise Server +version 8. + +The general method for these systems is to place the bonding +module parameters into a config file in /etc/modprobe.d/ (as +appropriate for the installed distro), then add modprobe and/or +`ip link` commands to the system's global init script. The name of +the global init script differs; for sysconfig, it is +/etc/init.d/boot.local and for initscripts it is /etc/rc.d/rc.local. + +For example, if you wanted to make a simple bond of two e100 +devices (presumed to be eth0 and eth1), and have it persist across +reboots, edit the appropriate file (/etc/init.d/boot.local or +/etc/rc.d/rc.local), and add the following:: + + modprobe bonding mode=balance-alb miimon=100 + modprobe e100 + ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up + ip link set eth0 master bond0 + ip link set eth1 master bond0 + +Replace the example bonding module parameters and bond0 +network configuration (IP address, netmask, etc) with the appropriate +values for your configuration. + +Unfortunately, this method will not provide support for the +ifup and ifdown scripts on the bond devices. To reload the bonding +configuration, it is necessary to run the initialization script, e.g.,:: + + # /etc/init.d/boot.local + +or:: + + # /etc/rc.d/rc.local + +It may be desirable in such a case to create a separate script +which only initializes the bonding configuration, then call that +separate script from within boot.local. This allows for bonding to be +enabled without re-running the entire global init script. + +To shut down the bonding devices, it is necessary to first +mark the bonding device itself as being down, then remove the +appropriate device driver modules. For our example above, you can do +the following:: + + # ifconfig bond0 down + # rmmod bonding + # rmmod e100 + +Again, for convenience, it may be desirable to create a script +with these commands. + + +3.3.1 Configuring Multiple Bonds Manually +----------------------------------------- + +This section contains information on configuring multiple +bonding devices with differing options for those systems whose network +initialization scripts lack support for configuring multiple bonds. + +If you require multiple bonding devices, but all with the same +options, you may wish to use the "max_bonds" module parameter, +documented above. + +To create multiple bonding devices with differing options, it is +preferable to use bonding parameters exported by sysfs, documented in the +section below. + +For versions of bonding without sysfs support, the only means to +provide multiple instances of bonding with differing options is to load +the bonding driver multiple times. Note that current versions of the +sysconfig network initialization scripts handle this automatically; if +your distro uses these scripts, no special action is needed. See the +section Configuring Bonding Devices, above, if you're not sure about your +network initialization scripts. + +To load multiple instances of the module, it is necessary to +specify a different name for each instance (the module loading system +requires that every loaded module, even multiple instances of the same +module, have a unique name). This is accomplished by supplying multiple +sets of bonding options in ``/etc/modprobe.d/*.conf``, for example:: + + alias bond0 bonding + options bond0 -o bond0 mode=balance-rr miimon=100 + + alias bond1 bonding + options bond1 -o bond1 mode=balance-alb miimon=50 + +will load the bonding module two times. The first instance is +named "bond0" and creates the bond0 device in balance-rr mode with an +miimon of 100. The second instance is named "bond1" and creates the +bond1 device in balance-alb mode with an miimon of 50. + +In some circumstances (typically with older distributions), +the above does not work, and the second bonding instance never sees +its options. In that case, the second options line can be substituted +as follows:: + + install bond1 /sbin/modprobe --ignore-install bonding -o bond1 \ + mode=balance-alb miimon=50 + +This may be repeated any number of times, specifying a new and +unique name in place of bond1 for each subsequent instance. + +It has been observed that some Red Hat supplied kernels are unable +to rename modules at load time (the "-o bond1" part). Attempts to pass +that option to modprobe will produce an "Operation not permitted" error. +This has been reported on some Fedora Core kernels, and has been seen on +RHEL 4 as well. On kernels exhibiting this problem, it will be impossible +to configure multiple bonds with differing parameters (as they are older +kernels, and also lack sysfs support). + +3.4 Configuring Bonding Manually via Sysfs +------------------------------------------ + +Starting with version 3.0.0, Channel Bonding may be configured +via the sysfs interface. This interface allows dynamic configuration +of all bonds in the system without unloading the module. It also +allows for adding and removing bonds at runtime. Ifenslave is no +longer required, though it is still supported. + +Use of the sysfs interface allows you to use multiple bonds +with different configurations without having to reload the module. +It also allows you to use multiple, differently configured bonds when +bonding is compiled into the kernel. + +You must have the sysfs filesystem mounted to configure +bonding this way. The examples in this document assume that you +are using the standard mount point for sysfs, e.g. /sys. If your +sysfs filesystem is mounted elsewhere, you will need to adjust the +example paths accordingly. + +Creating and Destroying Bonds +----------------------------- +To add a new bond foo:: + + # echo +foo > /sys/class/net/bonding_masters + +To remove an existing bond bar:: + + # echo -bar > /sys/class/net/bonding_masters + +To show all existing bonds:: + + # cat /sys/class/net/bonding_masters + +.. note:: + + due to 4K size limitation of sysfs files, this list may be + truncated if you have more than a few hundred bonds. This is unlikely + to occur under normal operating conditions. + +Adding and Removing Slaves +-------------------------- +Interfaces may be enslaved to a bond using the file +/sys/class/net//bonding/slaves. The semantics for this file +are the same as for the bonding_masters file. + +To enslave interface eth0 to bond bond0:: + + # ifconfig bond0 up + # echo +eth0 > /sys/class/net/bond0/bonding/slaves + +To free slave eth0 from bond bond0:: + + # echo -eth0 > /sys/class/net/bond0/bonding/slaves + +When an interface is enslaved to a bond, symlinks between the +two are created in the sysfs filesystem. In this case, you would get +/sys/class/net/bond0/slave_eth0 pointing to /sys/class/net/eth0, and +/sys/class/net/eth0/master pointing to /sys/class/net/bond0. + +This means that you can tell quickly whether or not an +interface is enslaved by looking for the master symlink. Thus: +# echo -eth0 > /sys/class/net/eth0/master/bonding/slaves +will free eth0 from whatever bond it is enslaved to, regardless of +the name of the bond interface. + +Changing a Bond's Configuration +------------------------------- +Each bond may be configured individually by manipulating the +files located in /sys/class/net//bonding + +The names of these files correspond directly with the command- +line parameters described elsewhere in this file, and, with the +exception of arp_ip_target, they accept the same values. To see the +current setting, simply cat the appropriate file. + +A few examples will be given here; for specific usage +guidelines for each parameter, see the appropriate section in this +document. + +To configure bond0 for balance-alb mode:: + + # ifconfig bond0 down + # echo 6 > /sys/class/net/bond0/bonding/mode + - or - + # echo balance-alb > /sys/class/net/bond0/bonding/mode + +.. note:: + + The bond interface must be down before the mode can be changed. + +To enable MII monitoring on bond0 with a 1 second interval:: + + # echo 1000 > /sys/class/net/bond0/bonding/miimon + +.. note:: + + If ARP monitoring is enabled, it will disabled when MII + monitoring is enabled, and vice-versa. + +To add ARP targets:: + + # echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target + # echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target + +.. note:: + + up to 16 target addresses may be specified. + +To remove an ARP target:: + + # echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target + +To configure the interval between learning packet transmits:: + + # echo 12 > /sys/class/net/bond0/bonding/lp_interval + +.. note:: + + the lp_interval is the number of seconds between instances where + the bonding driver sends learning packets to each slaves peer switch. The + default interval is 1 second. + +Example Configuration +--------------------- +We begin with the same example that is shown in section 3.3, +executed with sysfs, and without using ifenslave. + +To make a simple bond of two e100 devices (presumed to be eth0 +and eth1), and have it persist across reboots, edit the appropriate +file (/etc/init.d/boot.local or /etc/rc.d/rc.local), and add the +following:: + + modprobe bonding + modprobe e100 + echo balance-alb > /sys/class/net/bond0/bonding/mode + ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up + echo 100 > /sys/class/net/bond0/bonding/miimon + echo +eth0 > /sys/class/net/bond0/bonding/slaves + echo +eth1 > /sys/class/net/bond0/bonding/slaves + +To add a second bond, with two e1000 interfaces in +active-backup mode, using ARP monitoring, add the following lines to +your init script:: + + modprobe e1000 + echo +bond1 > /sys/class/net/bonding_masters + echo active-backup > /sys/class/net/bond1/bonding/mode + ifconfig bond1 192.168.2.1 netmask 255.255.255.0 up + echo +192.168.2.100 /sys/class/net/bond1/bonding/arp_ip_target + echo 2000 > /sys/class/net/bond1/bonding/arp_interval + echo +eth2 > /sys/class/net/bond1/bonding/slaves + echo +eth3 > /sys/class/net/bond1/bonding/slaves + +3.5 Configuration with Interfaces Support +----------------------------------------- + +This section applies to distros which use /etc/network/interfaces file +to describe network interface configuration, most notably Debian and it's +derivatives. + +The ifup and ifdown commands on Debian don't support bonding out of +the box. The ifenslave-2.6 package should be installed to provide bonding +support. Once installed, this package will provide ``bond-*`` options +to be used into /etc/network/interfaces. + +Note that ifenslave-2.6 package will load the bonding module and use +the ifenslave command when appropriate. + +Example Configurations +---------------------- + +In /etc/network/interfaces, the following stanza will configure bond0, in +active-backup mode, with eth0 and eth1 as slaves:: + + auto bond0 + iface bond0 inet dhcp + bond-slaves eth0 eth1 + bond-mode active-backup + bond-miimon 100 + bond-primary eth0 eth1 + +If the above configuration doesn't work, you might have a system using +upstart for system startup. This is most notably true for recent +Ubuntu versions. The following stanza in /etc/network/interfaces will +produce the same result on those systems:: + + auto bond0 + iface bond0 inet dhcp + bond-slaves none + bond-mode active-backup + bond-miimon 100 + + auto eth0 + iface eth0 inet manual + bond-master bond0 + bond-primary eth0 eth1 + + auto eth1 + iface eth1 inet manual + bond-master bond0 + bond-primary eth0 eth1 + +For a full list of ``bond-*`` supported options in /etc/network/interfaces and +some more advanced examples tailored to you particular distros, see the files in +/usr/share/doc/ifenslave-2.6. + +3.6 Overriding Configuration for Special Cases +---------------------------------------------- + +When using the bonding driver, the physical port which transmits a frame is +typically selected by the bonding driver, and is not relevant to the user or +system administrator. The output port is simply selected using the policies of +the selected bonding mode. On occasion however, it is helpful to direct certain +classes of traffic to certain physical interfaces on output to implement +slightly more complex policies. For example, to reach a web server over a +bonded interface in which eth0 connects to a private network, while eth1 +connects via a public network, it may be desirous to bias the bond to send said +traffic over eth0 first, using eth1 only as a fall back, while all other traffic +can safely be sent over either interface. Such configurations may be achieved +using the traffic control utilities inherent in linux. + +By default the bonding driver is multiqueue aware and 16 queues are created +when the driver initializes (see Documentation/networking/multiqueue.txt +for details). If more or less queues are desired the module parameter +tx_queues can be used to change this value. There is no sysfs parameter +available as the allocation is done at module init time. + +The output of the file /proc/net/bonding/bondX has changed so the output Queue +ID is now printed for each slave:: + + Bonding Mode: fault-tolerance (active-backup) + Primary Slave: None + Currently Active Slave: eth0 + MII Status: up + MII Polling Interval (ms): 0 + Up Delay (ms): 0 + Down Delay (ms): 0 + + Slave Interface: eth0 + MII Status: up + Link Failure Count: 0 + Permanent HW addr: 00:1a:a0:12:8f:cb + Slave queue ID: 0 + + Slave Interface: eth1 + MII Status: up + Link Failure Count: 0 + Permanent HW addr: 00:1a:a0:12:8f:cc + Slave queue ID: 2 + +The queue_id for a slave can be set using the command:: + + # echo "eth1:2" > /sys/class/net/bond0/bonding/queue_id + +Any interface that needs a queue_id set should set it with multiple calls +like the one above until proper priorities are set for all interfaces. On +distributions that allow configuration via initscripts, multiple 'queue_id' +arguments can be added to BONDING_OPTS to set all needed slave queues. + +These queue id's can be used in conjunction with the tc utility to configure +a multiqueue qdisc and filters to bias certain traffic to transmit on certain +slave devices. For instance, say we wanted, in the above configuration to +force all traffic bound to 192.168.1.100 to use eth1 in the bond as its output +device. The following commands would accomplish this:: + + # tc qdisc add dev bond0 handle 1 root multiq + + # tc filter add dev bond0 protocol ip parent 1: prio 1 u32 match ip \ + dst 192.168.1.100 action skbedit queue_mapping 2 + +These commands tell the kernel to attach a multiqueue queue discipline to the +bond0 interface and filter traffic enqueued to it, such that packets with a dst +ip of 192.168.1.100 have their output queue mapping value overwritten to 2. +This value is then passed into the driver, causing the normal output path +selection policy to be overridden, selecting instead qid 2, which maps to eth1. + +Note that qid values begin at 1. Qid 0 is reserved to initiate to the driver +that normal output policy selection should take place. One benefit to simply +leaving the qid for a slave to 0 is the multiqueue awareness in the bonding +driver that is now present. This awareness allows tc filters to be placed on +slave devices as well as bond devices and the bonding driver will simply act as +a pass-through for selecting output queues on the slave device rather than +output port selection. + +This feature first appeared in bonding driver version 3.7.0 and support for +output slave selection was limited to round-robin and active-backup modes. + +3.7 Configuring LACP for 802.3ad mode in a more secure way +---------------------------------------------------------- + +When using 802.3ad bonding mode, the Actor (host) and Partner (switch) +exchange LACPDUs. These LACPDUs cannot be sniffed, because they are +destined to link local mac addresses (which switches/bridges are not +supposed to forward). However, most of the values are easily predictable +or are simply the machine's MAC address (which is trivially known to all +other hosts in the same L2). This implies that other machines in the L2 +domain can spoof LACPDU packets from other hosts to the switch and potentially +cause mayhem by joining (from the point of view of the switch) another +machine's aggregate, thus receiving a portion of that hosts incoming +traffic and / or spoofing traffic from that machine themselves (potentially +even successfully terminating some portion of flows). Though this is not +a likely scenario, one could avoid this possibility by simply configuring +few bonding parameters: + + (a) ad_actor_system : You can set a random mac-address that can be used for + these LACPDU exchanges. The value can not be either NULL or Multicast. + Also it's preferable to set the local-admin bit. Following shell code + generates a random mac-address as described above:: + + # sys_mac_addr=$(printf '%02x:%02x:%02x:%02x:%02x:%02x' \ + $(( (RANDOM & 0xFE) | 0x02 )) \ + $(( RANDOM & 0xFF )) \ + $(( RANDOM & 0xFF )) \ + $(( RANDOM & 0xFF )) \ + $(( RANDOM & 0xFF )) \ + $(( RANDOM & 0xFF ))) + # echo $sys_mac_addr > /sys/class/net/bond0/bonding/ad_actor_system + + (b) ad_actor_sys_prio : Randomize the system priority. The default value + is 65535, but system can take the value from 1 - 65535. Following shell + code generates random priority and sets it:: + + # sys_prio=$(( 1 + RANDOM + RANDOM )) + # echo $sys_prio > /sys/class/net/bond0/bonding/ad_actor_sys_prio + + (c) ad_user_port_key : Use the user portion of the port-key. The default + keeps this empty. These are the upper 10 bits of the port-key and value + ranges from 0 - 1023. Following shell code generates these 10 bits and + sets it:: + + # usr_port_key=$(( RANDOM & 0x3FF )) + # echo $usr_port_key > /sys/class/net/bond0/bonding/ad_user_port_key + + +4 Querying Bonding Configuration +================================= + +4.1 Bonding Configuration +------------------------- + +Each bonding device has a read-only file residing in the +/proc/net/bonding directory. The file contents include information +about the bonding configuration, options and state of each slave. + +For example, the contents of /proc/net/bonding/bond0 after the +driver is loaded with parameters of mode=0 and miimon=1000 is +generally as follows:: + + Ethernet Channel Bonding Driver: 2.6.1 (October 29, 2004) + Bonding Mode: load balancing (round-robin) + Currently Active Slave: eth0 + MII Status: up + MII Polling Interval (ms): 1000 + Up Delay (ms): 0 + Down Delay (ms): 0 + + Slave Interface: eth1 + MII Status: up + Link Failure Count: 1 + + Slave Interface: eth0 + MII Status: up + Link Failure Count: 1 + +The precise format and contents will change depending upon the +bonding configuration, state, and version of the bonding driver. + +4.2 Network configuration +------------------------- + +The network configuration can be inspected using the ifconfig +command. Bonding devices will have the MASTER flag set; Bonding slave +devices will have the SLAVE flag set. The ifconfig output does not +contain information on which slaves are associated with which masters. + +In the example below, the bond0 interface is the master +(MASTER) while eth0 and eth1 are slaves (SLAVE). Notice all slaves of +bond0 have the same MAC address (HWaddr) as bond0 for all modes except +TLB and ALB that require a unique MAC address for each slave:: + + # /sbin/ifconfig + bond0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 + inet addr:XXX.XXX.XXX.YYY Bcast:XXX.XXX.XXX.255 Mask:255.255.252.0 + UP BROADCAST RUNNING MASTER MULTICAST MTU:1500 Metric:1 + RX packets:7224794 errors:0 dropped:0 overruns:0 frame:0 + TX packets:3286647 errors:1 dropped:0 overruns:1 carrier:0 + collisions:0 txqueuelen:0 + + eth0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 + UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1 + RX packets:3573025 errors:0 dropped:0 overruns:0 frame:0 + TX packets:1643167 errors:1 dropped:0 overruns:1 carrier:0 + collisions:0 txqueuelen:100 + Interrupt:10 Base address:0x1080 + + eth1 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 + UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1 + RX packets:3651769 errors:0 dropped:0 overruns:0 frame:0 + TX packets:1643480 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:100 + Interrupt:9 Base address:0x1400 + +5. Switch Configuration +======================= + +For this section, "switch" refers to whatever system the +bonded devices are directly connected to (i.e., where the other end of +the cable plugs into). This may be an actual dedicated switch device, +or it may be another regular system (e.g., another computer running +Linux), + +The active-backup, balance-tlb and balance-alb modes do not +require any specific configuration of the switch. + +The 802.3ad mode requires that the switch have the appropriate +ports configured as an 802.3ad aggregation. The precise method used +to configure this varies from switch to switch, but, for example, a +Cisco 3550 series switch requires that the appropriate ports first be +grouped together in a single etherchannel instance, then that +etherchannel is set to mode "lacp" to enable 802.3ad (instead of +standard EtherChannel). + +The balance-rr, balance-xor and broadcast modes generally +require that the switch have the appropriate ports grouped together. +The nomenclature for such a group differs between switches, it may be +called an "etherchannel" (as in the Cisco example, above), a "trunk +group" or some other similar variation. For these modes, each switch +will also have its own configuration options for the switch's transmit +policy to the bond. Typical choices include XOR of either the MAC or +IP addresses. The transmit policy of the two peers does not need to +match. For these three modes, the bonding mode really selects a +transmit policy for an EtherChannel group; all three will interoperate +with another EtherChannel group. + + +6. 802.1q VLAN Support +====================== + +It is possible to configure VLAN devices over a bond interface +using the 8021q driver. However, only packets coming from the 8021q +driver and passing through bonding will be tagged by default. Self +generated packets, for example, bonding's learning packets or ARP +packets generated by either ALB mode or the ARP monitor mechanism, are +tagged internally by bonding itself. As a result, bonding must +"learn" the VLAN IDs configured above it, and use those IDs to tag +self generated packets. + +For reasons of simplicity, and to support the use of adapters +that can do VLAN hardware acceleration offloading, the bonding +interface declares itself as fully hardware offloading capable, it gets +the add_vid/kill_vid notifications to gather the necessary +information, and it propagates those actions to the slaves. In case +of mixed adapter types, hardware accelerated tagged packets that +should go through an adapter that is not offloading capable are +"un-accelerated" by the bonding driver so the VLAN tag sits in the +regular location. + +VLAN interfaces *must* be added on top of a bonding interface +only after enslaving at least one slave. The bonding interface has a +hardware address of 00:00:00:00:00:00 until the first slave is added. +If the VLAN interface is created prior to the first enslavement, it +would pick up the all-zeroes hardware address. Once the first slave +is attached to the bond, the bond device itself will pick up the +slave's hardware address, which is then available for the VLAN device. + +Also, be aware that a similar problem can occur if all slaves +are released from a bond that still has one or more VLAN interfaces on +top of it. When a new slave is added, the bonding interface will +obtain its hardware address from the first slave, which might not +match the hardware address of the VLAN interfaces (which was +ultimately copied from an earlier slave). + +There are two methods to insure that the VLAN device operates +with the correct hardware address if all slaves are removed from a +bond interface: + +1. Remove all VLAN interfaces then recreate them + +2. Set the bonding interface's hardware address so that it +matches the hardware address of the VLAN interfaces. + +Note that changing a VLAN interface's HW address would set the +underlying device -- i.e. the bonding interface -- to promiscuous +mode, which might not be what you want. + + +7. Link Monitoring +================== + +The bonding driver at present supports two schemes for +monitoring a slave device's link state: the ARP monitor and the MII +monitor. + +At the present time, due to implementation restrictions in the +bonding driver itself, it is not possible to enable both ARP and MII +monitoring simultaneously. + +7.1 ARP Monitor Operation +------------------------- + +The ARP monitor operates as its name suggests: it sends ARP +queries to one or more designated peer systems on the network, and +uses the response as an indication that the link is operating. This +gives some assurance that traffic is actually flowing to and from one +or more peers on the local network. + +The ARP monitor relies on the device driver itself to verify +that traffic is flowing. In particular, the driver must keep up to +date the last receive time, dev->last_rx. Drivers that use NETIF_F_LLTX +flag must also update netdev_queue->trans_start. If they do not, then the +ARP monitor will immediately fail any slaves using that driver, and +those slaves will stay down. If networking monitoring (tcpdump, etc) +shows the ARP requests and replies on the network, then it may be that +your device driver is not updating last_rx and trans_start. + +7.2 Configuring Multiple ARP Targets +------------------------------------ + +While ARP monitoring can be done with just one target, it can +be useful in a High Availability setup to have several targets to +monitor. In the case of just one target, the target itself may go +down or have a problem making it unresponsive to ARP requests. Having +an additional target (or several) increases the reliability of the ARP +monitoring. + +Multiple ARP targets must be separated by commas as follows:: + + # example options for ARP monitoring with three targets + alias bond0 bonding + options bond0 arp_interval=60 arp_ip_target=192.168.0.1,192.168.0.3,192.168.0.9 + +For just a single target the options would resemble:: + + # example options for ARP monitoring with one target + alias bond0 bonding + options bond0 arp_interval=60 arp_ip_target=192.168.0.100 + + +7.3 MII Monitor Operation +------------------------- + +The MII monitor monitors only the carrier state of the local +network interface. It accomplishes this in one of three ways: by +depending upon the device driver to maintain its carrier state, by +querying the device's MII registers, or by making an ethtool query to +the device. + +If the use_carrier module parameter is 1 (the default value), +then the MII monitor will rely on the driver for carrier state +information (via the netif_carrier subsystem). As explained in the +use_carrier parameter information, above, if the MII monitor fails to +detect carrier loss on the device (e.g., when the cable is physically +disconnected), it may be that the driver does not support +netif_carrier. + +If use_carrier is 0, then the MII monitor will first query the +device's (via ioctl) MII registers and check the link state. If that +request fails (not just that it returns carrier down), then the MII +monitor will make an ethtool ETHOOL_GLINK request to attempt to obtain +the same information. If both methods fail (i.e., the driver either +does not support or had some error in processing both the MII register +and ethtool requests), then the MII monitor will assume the link is +up. + +8. Potential Sources of Trouble +=============================== + +8.1 Adventures in Routing +------------------------- + +When bonding is configured, it is important that the slave +devices not have routes that supersede routes of the master (or, +generally, not have routes at all). For example, suppose the bonding +device bond0 has two slaves, eth0 and eth1, and the routing table is +as follows:: + + Kernel IP routing table + Destination Gateway Genmask Flags MSS Window irtt Iface + 10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth0 + 10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth1 + 10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 bond0 + 127.0.0.0 0.0.0.0 255.0.0.0 U 40 0 0 lo + +This routing configuration will likely still update the +receive/transmit times in the driver (needed by the ARP monitor), but +may bypass the bonding driver (because outgoing traffic to, in this +case, another host on network 10 would use eth0 or eth1 before bond0). + +The ARP monitor (and ARP itself) may become confused by this +configuration, because ARP requests (generated by the ARP monitor) +will be sent on one interface (bond0), but the corresponding reply +will arrive on a different interface (eth0). This reply looks to ARP +as an unsolicited ARP reply (because ARP matches replies on an +interface basis), and is discarded. The MII monitor is not affected +by the state of the routing table. + +The solution here is simply to insure that slaves do not have +routes of their own, and if for some reason they must, those routes do +not supersede routes of their master. This should generally be the +case, but unusual configurations or errant manual or automatic static +route additions may cause trouble. + +8.2 Ethernet Device Renaming +---------------------------- + +On systems with network configuration scripts that do not +associate physical devices directly with network interface names (so +that the same physical device always has the same "ethX" name), it may +be necessary to add some special logic to config files in +/etc/modprobe.d/. + +For example, given a modules.conf containing the following:: + + alias bond0 bonding + options bond0 mode=some-mode miimon=50 + alias eth0 tg3 + alias eth1 tg3 + alias eth2 e1000 + alias eth3 e1000 + +If neither eth0 and eth1 are slaves to bond0, then when the +bond0 interface comes up, the devices may end up reordered. This +happens because bonding is loaded first, then its slave device's +drivers are loaded next. Since no other drivers have been loaded, +when the e1000 driver loads, it will receive eth0 and eth1 for its +devices, but the bonding configuration tries to enslave eth2 and eth3 +(which may later be assigned to the tg3 devices). + +Adding the following:: + + add above bonding e1000 tg3 + +causes modprobe to load e1000 then tg3, in that order, when +bonding is loaded. This command is fully documented in the +modules.conf manual page. + +On systems utilizing modprobe an equivalent problem can occur. +In this case, the following can be added to config files in +/etc/modprobe.d/ as:: + + softdep bonding pre: tg3 e1000 + +This will load tg3 and e1000 modules before loading the bonding one. +Full documentation on this can be found in the modprobe.d and modprobe +manual pages. + +8.3. Painfully Slow Or No Failed Link Detection By Miimon +--------------------------------------------------------- + +By default, bonding enables the use_carrier option, which +instructs bonding to trust the driver to maintain carrier state. + +As discussed in the options section, above, some drivers do +not support the netif_carrier_on/_off link state tracking system. +With use_carrier enabled, bonding will always see these links as up, +regardless of their actual state. + +Additionally, other drivers do support netif_carrier, but do +not maintain it in real time, e.g., only polling the link state at +some fixed interval. In this case, miimon will detect failures, but +only after some long period of time has expired. If it appears that +miimon is very slow in detecting link failures, try specifying +use_carrier=0 to see if that improves the failure detection time. If +it does, then it may be that the driver checks the carrier state at a +fixed interval, but does not cache the MII register values (so the +use_carrier=0 method of querying the registers directly works). If +use_carrier=0 does not improve the failover, then the driver may cache +the registers, or the problem may be elsewhere. + +Also, remember that miimon only checks for the device's +carrier state. It has no way to determine the state of devices on or +beyond other ports of a switch, or if a switch is refusing to pass +traffic while still maintaining carrier on. + +9. SNMP agents +=============== + +If running SNMP agents, the bonding driver should be loaded +before any network drivers participating in a bond. This requirement +is due to the interface index (ipAdEntIfIndex) being associated to +the first interface found with a given IP address. That is, there is +only one ipAdEntIfIndex for each IP address. For example, if eth0 and +eth1 are slaves of bond0 and the driver for eth0 is loaded before the +bonding driver, the interface for the IP address will be associated +with the eth0 interface. This configuration is shown below, the IP +address 192.168.1.1 has an interface index of 2 which indexes to eth0 +in the ifDescr table (ifDescr.2). + +:: + + interfaces.ifTable.ifEntry.ifDescr.1 = lo + interfaces.ifTable.ifEntry.ifDescr.2 = eth0 + interfaces.ifTable.ifEntry.ifDescr.3 = eth1 + interfaces.ifTable.ifEntry.ifDescr.4 = eth2 + interfaces.ifTable.ifEntry.ifDescr.5 = eth3 + interfaces.ifTable.ifEntry.ifDescr.6 = bond0 + ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.10.10.10 = 5 + ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.192.168.1.1 = 2 + ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.74.20.94 = 4 + ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.127.0.0.1 = 1 + +This problem is avoided by loading the bonding driver before +any network drivers participating in a bond. Below is an example of +loading the bonding driver first, the IP address 192.168.1.1 is +correctly associated with ifDescr.2. + + interfaces.ifTable.ifEntry.ifDescr.1 = lo + interfaces.ifTable.ifEntry.ifDescr.2 = bond0 + interfaces.ifTable.ifEntry.ifDescr.3 = eth0 + interfaces.ifTable.ifEntry.ifDescr.4 = eth1 + interfaces.ifTable.ifEntry.ifDescr.5 = eth2 + interfaces.ifTable.ifEntry.ifDescr.6 = eth3 + ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.10.10.10 = 6 + ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.192.168.1.1 = 2 + ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.74.20.94 = 5 + ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.127.0.0.1 = 1 + +While some distributions may not report the interface name in +ifDescr, the association between the IP address and IfIndex remains +and SNMP functions such as Interface_Scan_Next will report that +association. + +10. Promiscuous mode +==================== + +When running network monitoring tools, e.g., tcpdump, it is +common to enable promiscuous mode on the device, so that all traffic +is seen (instead of seeing only traffic destined for the local host). +The bonding driver handles promiscuous mode changes to the bonding +master device (e.g., bond0), and propagates the setting to the slave +devices. + +For the balance-rr, balance-xor, broadcast, and 802.3ad modes, +the promiscuous mode setting is propagated to all slaves. + +For the active-backup, balance-tlb and balance-alb modes, the +promiscuous mode setting is propagated only to the active slave. + +For balance-tlb mode, the active slave is the slave currently +receiving inbound traffic. + +For balance-alb mode, the active slave is the slave used as a +"primary." This slave is used for mode-specific control traffic, for +sending to peers that are unassigned or if the load is unbalanced. + +For the active-backup, balance-tlb and balance-alb modes, when +the active slave changes (e.g., due to a link failure), the +promiscuous setting will be propagated to the new active slave. + +11. Configuring Bonding for High Availability +============================================= + +High Availability refers to configurations that provide +maximum network availability by having redundant or backup devices, +links or switches between the host and the rest of the world. The +goal is to provide the maximum availability of network connectivity +(i.e., the network always works), even though other configurations +could provide higher throughput. + +11.1 High Availability in a Single Switch Topology +-------------------------------------------------- + +If two hosts (or a host and a single switch) are directly +connected via multiple physical links, then there is no availability +penalty to optimizing for maximum bandwidth. In this case, there is +only one switch (or peer), so if it fails, there is no alternative +access to fail over to. Additionally, the bonding load balance modes +support link monitoring of their members, so if individual links fail, +the load will be rebalanced across the remaining devices. + +See Section 12, "Configuring Bonding for Maximum Throughput" +for information on configuring bonding with one peer device. + +11.2 High Availability in a Multiple Switch Topology +---------------------------------------------------- + +With multiple switches, the configuration of bonding and the +network changes dramatically. In multiple switch topologies, there is +a trade off between network availability and usable bandwidth. + +Below is a sample network, configured to maximize the +availability of the network:: + + | | + |port3 port3| + +-----+----+ +-----+----+ + | |port2 ISL port2| | + | switch A +--------------------------+ switch B | + | | | | + +-----+----+ +-----++---+ + |port1 port1| + | +-------+ | + +-------------+ host1 +---------------+ + eth0 +-------+ eth1 + +In this configuration, there is a link between the two +switches (ISL, or inter switch link), and multiple ports connecting to +the outside world ("port3" on each switch). There is no technical +reason that this could not be extended to a third switch. + +11.2.1 HA Bonding Mode Selection for Multiple Switch Topology +------------------------------------------------------------- + +In a topology such as the example above, the active-backup and +broadcast modes are the only useful bonding modes when optimizing for +availability; the other modes require all links to terminate on the +same peer for them to behave rationally. + +active-backup: + This is generally the preferred mode, particularly if + the switches have an ISL and play together well. If the + network configuration is such that one switch is specifically + a backup switch (e.g., has lower capacity, higher cost, etc), + then the primary option can be used to insure that the + preferred link is always used when it is available. + +broadcast: + This mode is really a special purpose mode, and is suitable + only for very specific needs. For example, if the two + switches are not connected (no ISL), and the networks beyond + them are totally independent. In this case, if it is + necessary for some specific one-way traffic to reach both + independent networks, then the broadcast mode may be suitable. + +11.2.2 HA Link Monitoring Selection for Multiple Switch Topology +---------------------------------------------------------------- + +The choice of link monitoring ultimately depends upon your +switch. If the switch can reliably fail ports in response to other +failures, then either the MII or ARP monitors should work. For +example, in the above example, if the "port3" link fails at the remote +end, the MII monitor has no direct means to detect this. The ARP +monitor could be configured with a target at the remote end of port3, +thus detecting that failure without switch support. + +In general, however, in a multiple switch topology, the ARP +monitor can provide a higher level of reliability in detecting end to +end connectivity failures (which may be caused by the failure of any +individual component to pass traffic for any reason). Additionally, +the ARP monitor should be configured with multiple targets (at least +one for each switch in the network). This will insure that, +regardless of which switch is active, the ARP monitor has a suitable +target to query. + +Note, also, that of late many switches now support a functionality +generally referred to as "trunk failover." This is a feature of the +switch that causes the link state of a particular switch port to be set +down (or up) when the state of another switch port goes down (or up). +Its purpose is to propagate link failures from logically "exterior" ports +to the logically "interior" ports that bonding is able to monitor via +miimon. Availability and configuration for trunk failover varies by +switch, but this can be a viable alternative to the ARP monitor when using +suitable switches. + +12. Configuring Bonding for Maximum Throughput +============================================== + +12.1 Maximizing Throughput in a Single Switch Topology +------------------------------------------------------ + +In a single switch configuration, the best method to maximize +throughput depends upon the application and network environment. The +various load balancing modes each have strengths and weaknesses in +different environments, as detailed below. + +For this discussion, we will break down the topologies into +two categories. Depending upon the destination of most traffic, we +categorize them into either "gatewayed" or "local" configurations. + +In a gatewayed configuration, the "switch" is acting primarily +as a router, and the majority of traffic passes through this router to +other networks. An example would be the following:: + + + +----------+ +----------+ + | |eth0 port1| | to other networks + | Host A +---------------------+ router +-------------------> + | +---------------------+ | Hosts B and C are out + | |eth1 port2| | here somewhere + +----------+ +----------+ + +The router may be a dedicated router device, or another host +acting as a gateway. For our discussion, the important point is that +the majority of traffic from Host A will pass through the router to +some other network before reaching its final destination. + +In a gatewayed network configuration, although Host A may +communicate with many other systems, all of its traffic will be sent +and received via one other peer on the local network, the router. + +Note that the case of two systems connected directly via +multiple physical links is, for purposes of configuring bonding, the +same as a gatewayed configuration. In that case, it happens that all +traffic is destined for the "gateway" itself, not some other network +beyond the gateway. + +In a local configuration, the "switch" is acting primarily as +a switch, and the majority of traffic passes through this switch to +reach other stations on the same network. An example would be the +following:: + + +----------+ +----------+ +--------+ + | |eth0 port1| +-------+ Host B | + | Host A +------------+ switch |port3 +--------+ + | +------------+ | +--------+ + | |eth1 port2| +------------------+ Host C | + +----------+ +----------+port4 +--------+ + + +Again, the switch may be a dedicated switch device, or another +host acting as a gateway. For our discussion, the important point is +that the majority of traffic from Host A is destined for other hosts +on the same local network (Hosts B and C in the above example). + +In summary, in a gatewayed configuration, traffic to and from +the bonded device will be to the same MAC level peer on the network +(the gateway itself, i.e., the router), regardless of its final +destination. In a local configuration, traffic flows directly to and +from the final destinations, thus, each destination (Host B, Host C) +will be addressed directly by their individual MAC addresses. + +This distinction between a gatewayed and a local network +configuration is important because many of the load balancing modes +available use the MAC addresses of the local network source and +destination to make load balancing decisions. The behavior of each +mode is described below. + + +12.1.1 MT Bonding Mode Selection for Single Switch Topology +----------------------------------------------------------- + +This configuration is the easiest to set up and to understand, +although you will have to decide which bonding mode best suits your +needs. The trade offs for each mode are detailed below: + +balance-rr: + This mode is the only mode that will permit a single + TCP/IP connection to stripe traffic across multiple + interfaces. It is therefore the only mode that will allow a + single TCP/IP stream to utilize more than one interface's + worth of throughput. This comes at a cost, however: the + striping generally results in peer systems receiving packets out + of order, causing TCP/IP's congestion control system to kick + in, often by retransmitting segments. + + It is possible to adjust TCP/IP's congestion limits by + altering the net.ipv4.tcp_reordering sysctl parameter. The + usual default value is 3. But keep in mind TCP stack is able + to automatically increase this when it detects reorders. + + Note that the fraction of packets that will be delivered out of + order is highly variable, and is unlikely to be zero. The level + of reordering depends upon a variety of factors, including the + networking interfaces, the switch, and the topology of the + configuration. Speaking in general terms, higher speed network + cards produce more reordering (due to factors such as packet + coalescing), and a "many to many" topology will reorder at a + higher rate than a "many slow to one fast" configuration. + + Many switches do not support any modes that stripe traffic + (instead choosing a port based upon IP or MAC level addresses); + for those devices, traffic for a particular connection flowing + through the switch to a balance-rr bond will not utilize greater + than one interface's worth of bandwidth. + + If you are utilizing protocols other than TCP/IP, UDP for + example, and your application can tolerate out of order + delivery, then this mode can allow for single stream datagram + performance that scales near linearly as interfaces are added + to the bond. + + This mode requires the switch to have the appropriate ports + configured for "etherchannel" or "trunking." + +active-backup: + There is not much advantage in this network topology to + the active-backup mode, as the inactive backup devices are all + connected to the same peer as the primary. In this case, a + load balancing mode (with link monitoring) will provide the + same level of network availability, but with increased + available bandwidth. On the plus side, active-backup mode + does not require any configuration of the switch, so it may + have value if the hardware available does not support any of + the load balance modes. + +balance-xor: + This mode will limit traffic such that packets destined + for specific peers will always be sent over the same + interface. Since the destination is determined by the MAC + addresses involved, this mode works best in a "local" network + configuration (as described above), with destinations all on + the same local network. This mode is likely to be suboptimal + if all your traffic is passed through a single router (i.e., a + "gatewayed" network configuration, as described above). + + As with balance-rr, the switch ports need to be configured for + "etherchannel" or "trunking." + +broadcast: + Like active-backup, there is not much advantage to this + mode in this type of network topology. + +802.3ad: + This mode can be a good choice for this type of network + topology. The 802.3ad mode is an IEEE standard, so all peers + that implement 802.3ad should interoperate well. The 802.3ad + protocol includes automatic configuration of the aggregates, + so minimal manual configuration of the switch is needed + (typically only to designate that some set of devices is + available for 802.3ad). The 802.3ad standard also mandates + that frames be delivered in order (within certain limits), so + in general single connections will not see misordering of + packets. The 802.3ad mode does have some drawbacks: the + standard mandates that all devices in the aggregate operate at + the same speed and duplex. Also, as with all bonding load + balance modes other than balance-rr, no single connection will + be able to utilize more than a single interface's worth of + bandwidth. + + Additionally, the linux bonding 802.3ad implementation + distributes traffic by peer (using an XOR of MAC addresses + and packet type ID), so in a "gatewayed" configuration, all + outgoing traffic will generally use the same device. Incoming + traffic may also end up on a single device, but that is + dependent upon the balancing policy of the peer's 802.3ad + implementation. In a "local" configuration, traffic will be + distributed across the devices in the bond. + + Finally, the 802.3ad mode mandates the use of the MII monitor, + therefore, the ARP monitor is not available in this mode. + +balance-tlb: + The balance-tlb mode balances outgoing traffic by peer. + Since the balancing is done according to MAC address, in a + "gatewayed" configuration (as described above), this mode will + send all traffic across a single device. However, in a + "local" network configuration, this mode balances multiple + local network peers across devices in a vaguely intelligent + manner (not a simple XOR as in balance-xor or 802.3ad mode), + so that mathematically unlucky MAC addresses (i.e., ones that + XOR to the same value) will not all "bunch up" on a single + interface. + + Unlike 802.3ad, interfaces may be of differing speeds, and no + special switch configuration is required. On the down side, + in this mode all incoming traffic arrives over a single + interface, this mode requires certain ethtool support in the + network device driver of the slave interfaces, and the ARP + monitor is not available. + +balance-alb: + This mode is everything that balance-tlb is, and more. + It has all of the features (and restrictions) of balance-tlb, + and will also balance incoming traffic from local network + peers (as described in the Bonding Module Options section, + above). + + The only additional down side to this mode is that the network + device driver must support changing the hardware address while + the device is open. + +12.1.2 MT Link Monitoring for Single Switch Topology +---------------------------------------------------- + +The choice of link monitoring may largely depend upon which +mode you choose to use. The more advanced load balancing modes do not +support the use of the ARP monitor, and are thus restricted to using +the MII monitor (which does not provide as high a level of end to end +assurance as the ARP monitor). + +12.2 Maximum Throughput in a Multiple Switch Topology +----------------------------------------------------- + +Multiple switches may be utilized to optimize for throughput +when they are configured in parallel as part of an isolated network +between two or more systems, for example:: + + +-----------+ + | Host A | + +-+---+---+-+ + | | | + +--------+ | +---------+ + | | | + +------+---+ +-----+----+ +-----+----+ + | Switch A | | Switch B | | Switch C | + +------+---+ +-----+----+ +-----+----+ + | | | + +--------+ | +---------+ + | | | + +-+---+---+-+ + | Host B | + +-----------+ + +In this configuration, the switches are isolated from one +another. One reason to employ a topology such as this is for an +isolated network with many hosts (a cluster configured for high +performance, for example), using multiple smaller switches can be more +cost effective than a single larger switch, e.g., on a network with 24 +hosts, three 24 port switches can be significantly less expensive than +a single 72 port switch. + +If access beyond the network is required, an individual host +can be equipped with an additional network device connected to an +external network; this host then additionally acts as a gateway. + +12.2.1 MT Bonding Mode Selection for Multiple Switch Topology +------------------------------------------------------------- + +In actual practice, the bonding mode typically employed in +configurations of this type is balance-rr. Historically, in this +network configuration, the usual caveats about out of order packet +delivery are mitigated by the use of network adapters that do not do +any kind of packet coalescing (via the use of NAPI, or because the +device itself does not generate interrupts until some number of +packets has arrived). When employed in this fashion, the balance-rr +mode allows individual connections between two hosts to effectively +utilize greater than one interface's bandwidth. + +12.2.2 MT Link Monitoring for Multiple Switch Topology +------------------------------------------------------ + +Again, in actual practice, the MII monitor is most often used +in this configuration, as performance is given preference over +availability. The ARP monitor will function in this topology, but its +advantages over the MII monitor are mitigated by the volume of probes +needed as the number of systems involved grows (remember that each +host in the network is configured with bonding). + +13. Switch Behavior Issues +========================== + +13.1 Link Establishment and Failover Delays +------------------------------------------- + +Some switches exhibit undesirable behavior with regard to the +timing of link up and down reporting by the switch. + +First, when a link comes up, some switches may indicate that +the link is up (carrier available), but not pass traffic over the +interface for some period of time. This delay is typically due to +some type of autonegotiation or routing protocol, but may also occur +during switch initialization (e.g., during recovery after a switch +failure). If you find this to be a problem, specify an appropriate +value to the updelay bonding module option to delay the use of the +relevant interface(s). + +Second, some switches may "bounce" the link state one or more +times while a link is changing state. This occurs most commonly while +the switch is initializing. Again, an appropriate updelay value may +help. + +Note that when a bonding interface has no active links, the +driver will immediately reuse the first link that goes up, even if the +updelay parameter has been specified (the updelay is ignored in this +case). If there are slave interfaces waiting for the updelay timeout +to expire, the interface that first went into that state will be +immediately reused. This reduces down time of the network if the +value of updelay has been overestimated, and since this occurs only in +cases with no connectivity, there is no additional penalty for +ignoring the updelay. + +In addition to the concerns about switch timings, if your +switches take a long time to go into backup mode, it may be desirable +to not activate a backup interface immediately after a link goes down. +Failover may be delayed via the downdelay bonding module option. + +13.2 Duplicated Incoming Packets +-------------------------------- + +NOTE: Starting with version 3.0.2, the bonding driver has logic to +suppress duplicate packets, which should largely eliminate this problem. +The following description is kept for reference. + +It is not uncommon to observe a short burst of duplicated +traffic when the bonding device is first used, or after it has been +idle for some period of time. This is most easily observed by issuing +a "ping" to some other host on the network, and noticing that the +output from ping flags duplicates (typically one per slave). + +For example, on a bond in active-backup mode with five slaves +all connected to one switch, the output may appear as follows:: + + # ping -n 10.0.4.2 + PING 10.0.4.2 (10.0.4.2) from 10.0.3.10 : 56(84) bytes of data. + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.7 ms + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) + 64 bytes from 10.0.4.2: icmp_seq=2 ttl=64 time=0.216 ms + 64 bytes from 10.0.4.2: icmp_seq=3 ttl=64 time=0.267 ms + 64 bytes from 10.0.4.2: icmp_seq=4 ttl=64 time=0.222 ms + +This is not due to an error in the bonding driver, rather, it +is a side effect of how many switches update their MAC forwarding +tables. Initially, the switch does not associate the MAC address in +the packet with a particular switch port, and so it may send the +traffic to all ports until its MAC forwarding table is updated. Since +the interfaces attached to the bond may occupy multiple ports on a +single switch, when the switch (temporarily) floods the traffic to all +ports, the bond device receives multiple copies of the same packet +(one per slave device). + +The duplicated packet behavior is switch dependent, some +switches exhibit this, and some do not. On switches that display this +behavior, it can be induced by clearing the MAC forwarding table (on +most Cisco switches, the privileged command "clear mac address-table +dynamic" will accomplish this). + +14. Hardware Specific Considerations +==================================== + +This section contains additional information for configuring +bonding on specific hardware platforms, or for interfacing bonding +with particular switches or other devices. + +14.1 IBM BladeCenter +-------------------- + +This applies to the JS20 and similar systems. + +On the JS20 blades, the bonding driver supports only +balance-rr, active-backup, balance-tlb and balance-alb modes. This is +largely due to the network topology inside the BladeCenter, detailed +below. + +JS20 network adapter information +-------------------------------- + +All JS20s come with two Broadcom Gigabit Ethernet ports +integrated on the planar (that's "motherboard" in IBM-speak). In the +BladeCenter chassis, the eth0 port of all JS20 blades is hard wired to +I/O Module #1; similarly, all eth1 ports are wired to I/O Module #2. +An add-on Broadcom daughter card can be installed on a JS20 to provide +two more Gigabit Ethernet ports. These ports, eth2 and eth3, are +wired to I/O Modules 3 and 4, respectively. + +Each I/O Module may contain either a switch or a passthrough +module (which allows ports to be directly connected to an external +switch). Some bonding modes require a specific BladeCenter internal +network topology in order to function; these are detailed below. + +Additional BladeCenter-specific networking information can be +found in two IBM Redbooks (www.ibm.com/redbooks): + +- "IBM eServer BladeCenter Networking Options" +- "IBM eServer BladeCenter Layer 2-7 Network Switching" + +BladeCenter networking configuration +------------------------------------ + +Because a BladeCenter can be configured in a very large number +of ways, this discussion will be confined to describing basic +configurations. + +Normally, Ethernet Switch Modules (ESMs) are used in I/O +modules 1 and 2. In this configuration, the eth0 and eth1 ports of a +JS20 will be connected to different internal switches (in the +respective I/O modules). + +A passthrough module (OPM or CPM, optical or copper, +passthrough module) connects the I/O module directly to an external +switch. By using PMs in I/O module #1 and #2, the eth0 and eth1 +interfaces of a JS20 can be redirected to the outside world and +connected to a common external switch. + +Depending upon the mix of ESMs and PMs, the network will +appear to bonding as either a single switch topology (all PMs) or as a +multiple switch topology (one or more ESMs, zero or more PMs). It is +also possible to connect ESMs together, resulting in a configuration +much like the example in "High Availability in a Multiple Switch +Topology," above. + +Requirements for specific modes +------------------------------- + +The balance-rr mode requires the use of passthrough modules +for devices in the bond, all connected to an common external switch. +That switch must be configured for "etherchannel" or "trunking" on the +appropriate ports, as is usual for balance-rr. + +The balance-alb and balance-tlb modes will function with +either switch modules or passthrough modules (or a mix). The only +specific requirement for these modes is that all network interfaces +must be able to reach all destinations for traffic sent over the +bonding device (i.e., the network must converge at some point outside +the BladeCenter). + +The active-backup mode has no additional requirements. + +Link monitoring issues +---------------------- + +When an Ethernet Switch Module is in place, only the ARP +monitor will reliably detect link loss to an external switch. This is +nothing unusual, but examination of the BladeCenter cabinet would +suggest that the "external" network ports are the ethernet ports for +the system, when it fact there is a switch between these "external" +ports and the devices on the JS20 system itself. The MII monitor is +only able to detect link failures between the ESM and the JS20 system. + +When a passthrough module is in place, the MII monitor does +detect failures to the "external" port, which is then directly +connected to the JS20 system. + +Other concerns +-------------- + +The Serial Over LAN (SoL) link is established over the primary +ethernet (eth0) only, therefore, any loss of link to eth0 will result +in losing your SoL connection. It will not fail over with other +network traffic, as the SoL system is beyond the control of the +bonding driver. + +It may be desirable to disable spanning tree on the switch +(either the internal Ethernet Switch Module, or an external switch) to +avoid fail-over delay issues when using bonding. + + +15. Frequently Asked Questions +============================== + +1. Is it SMP safe? +------------------- + +Yes. The old 2.0.xx channel bonding patch was not SMP safe. +The new driver was designed to be SMP safe from the start. + +2. What type of cards will work with it? +----------------------------------------- + +Any Ethernet type cards (you can even mix cards - a Intel +EtherExpress PRO/100 and a 3com 3c905b, for example). For most modes, +devices need not be of the same speed. + +Starting with version 3.2.1, bonding also supports Infiniband +slaves in active-backup mode. + +3. How many bonding devices can I have? +---------------------------------------- + +There is no limit. + +4. How many slaves can a bonding device have? +---------------------------------------------- + +This is limited only by the number of network interfaces Linux +supports and/or the number of network cards you can place in your +system. + +5. What happens when a slave link dies? +---------------------------------------- + +If link monitoring is enabled, then the failing device will be +disabled. The active-backup mode will fail over to a backup link, and +other modes will ignore the failed link. The link will continue to be +monitored, and should it recover, it will rejoin the bond (in whatever +manner is appropriate for the mode). See the sections on High +Availability and the documentation for each mode for additional +information. + +Link monitoring can be enabled via either the miimon or +arp_interval parameters (described in the module parameters section, +above). In general, miimon monitors the carrier state as sensed by +the underlying network device, and the arp monitor (arp_interval) +monitors connectivity to another host on the local network. + +If no link monitoring is configured, the bonding driver will +be unable to detect link failures, and will assume that all links are +always available. This will likely result in lost packets, and a +resulting degradation of performance. The precise performance loss +depends upon the bonding mode and network configuration. + +6. Can bonding be used for High Availability? +---------------------------------------------- + +Yes. See the section on High Availability for details. + +7. Which switches/systems does it work with? +--------------------------------------------- + +The full answer to this depends upon the desired mode. + +In the basic balance modes (balance-rr and balance-xor), it +works with any system that supports etherchannel (also called +trunking). Most managed switches currently available have such +support, and many unmanaged switches as well. + +The advanced balance modes (balance-tlb and balance-alb) do +not have special switch requirements, but do need device drivers that +support specific features (described in the appropriate section under +module parameters, above). + +In 802.3ad mode, it works with systems that support IEEE +802.3ad Dynamic Link Aggregation. Most managed and many unmanaged +switches currently available support 802.3ad. + +The active-backup mode should work with any Layer-II switch. + +8. Where does a bonding device get its MAC address from? +--------------------------------------------------------- + +When using slave devices that have fixed MAC addresses, or when +the fail_over_mac option is enabled, the bonding device's MAC address is +the MAC address of the active slave. + +For other configurations, if not explicitly configured (with +ifconfig or ip link), the MAC address of the bonding device is taken from +its first slave device. This MAC address is then passed to all following +slaves and remains persistent (even if the first slave is removed) until +the bonding device is brought down or reconfigured. + +If you wish to change the MAC address, you can set it with +ifconfig or ip link:: + + # ifconfig bond0 hw ether 00:11:22:33:44:55 + + # ip link set bond0 address 66:77:88:99:aa:bb + +The MAC address can be also changed by bringing down/up the +device and then changing its slaves (or their order):: + + # ifconfig bond0 down ; modprobe -r bonding + # ifconfig bond0 .... up + # ifenslave bond0 eth... + +This method will automatically take the address from the next +slave that is added. + +To restore your slaves' MAC addresses, you need to detach them +from the bond (``ifenslave -d bond0 eth0``). The bonding driver will +then restore the MAC addresses that the slaves had before they were +enslaved. + +16. Resources and Links +======================= + +The latest version of the bonding driver can be found in the latest +version of the linux kernel, found on http://kernel.org + +The latest version of this document can be found in the latest kernel +source (named Documentation/networking/bonding.rst). + +Discussions regarding the usage of the bonding driver take place on the +bonding-devel mailing list, hosted at sourceforge.net. If you have questions or +problems, post them to the list. The list address is: + +bonding-devel@lists.sourceforge.net + +The administrative interface (to subscribe or unsubscribe) can +be found at: + +https://lists.sourceforge.net/lists/listinfo/bonding-devel + +Discussions regarding the development of the bonding driver take place +on the main Linux network mailing list, hosted at vger.kernel.org. The list +address is: + +netdev@vger.kernel.org + +The administrative interface (to subscribe or unsubscribe) can +be found at: + +http://vger.kernel.org/vger-lists.html#netdev + +Donald Becker's Ethernet Drivers and diag programs may be found at : + + - http://web.archive.org/web/%2E/http://www.scyld.com/network/ + +You will also find a lot of information regarding Ethernet, NWay, MII, +etc. at www.scyld.com. diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt deleted file mode 100644 index e3abfbd32f71..000000000000 --- a/Documentation/networking/bonding.txt +++ /dev/null @@ -1,2837 +0,0 @@ - - Linux Ethernet Bonding Driver HOWTO - - Latest update: 27 April 2011 - -Initial release : Thomas Davis -Corrections, HA extensions : 2000/10/03-15 : - - Willy Tarreau - - Constantine Gavrilov - - Chad N. Tindel - - Janice Girouard - - Jay Vosburgh - -Reorganized and updated Feb 2005 by Jay Vosburgh -Added Sysfs information: 2006/04/24 - - Mitch Williams - -Introduction -============ - - The Linux bonding driver provides a method for aggregating -multiple network interfaces into a single logical "bonded" interface. -The behavior of the bonded interfaces depends upon the mode; generally -speaking, modes provide either hot standby or load balancing services. -Additionally, link integrity monitoring may be performed. - - The bonding driver originally came from Donald Becker's -beowulf patches for kernel 2.0. It has changed quite a bit since, and -the original tools from extreme-linux and beowulf sites will not work -with this version of the driver. - - For new versions of the driver, updated userspace tools, and -who to ask for help, please follow the links at the end of this file. - -Table of Contents -================= - -1. Bonding Driver Installation - -2. Bonding Driver Options - -3. Configuring Bonding Devices -3.1 Configuration with Sysconfig Support -3.1.1 Using DHCP with Sysconfig -3.1.2 Configuring Multiple Bonds with Sysconfig -3.2 Configuration with Initscripts Support -3.2.1 Using DHCP with Initscripts -3.2.2 Configuring Multiple Bonds with Initscripts -3.3 Configuring Bonding Manually with Ifenslave -3.3.1 Configuring Multiple Bonds Manually -3.4 Configuring Bonding Manually via Sysfs -3.5 Configuration with Interfaces Support -3.6 Overriding Configuration for Special Cases -3.7 Configuring LACP for 802.3ad mode in a more secure way - -4. Querying Bonding Configuration -4.1 Bonding Configuration -4.2 Network Configuration - -5. Switch Configuration - -6. 802.1q VLAN Support - -7. Link Monitoring -7.1 ARP Monitor Operation -7.2 Configuring Multiple ARP Targets -7.3 MII Monitor Operation - -8. Potential Trouble Sources -8.1 Adventures in Routing -8.2 Ethernet Device Renaming -8.3 Painfully Slow Or No Failed Link Detection By Miimon - -9. SNMP agents - -10. Promiscuous mode - -11. Configuring Bonding for High Availability -11.1 High Availability in a Single Switch Topology -11.2 High Availability in a Multiple Switch Topology -11.2.1 HA Bonding Mode Selection for Multiple Switch Topology -11.2.2 HA Link Monitoring for Multiple Switch Topology - -12. Configuring Bonding for Maximum Throughput -12.1 Maximum Throughput in a Single Switch Topology -12.1.1 MT Bonding Mode Selection for Single Switch Topology -12.1.2 MT Link Monitoring for Single Switch Topology -12.2 Maximum Throughput in a Multiple Switch Topology -12.2.1 MT Bonding Mode Selection for Multiple Switch Topology -12.2.2 MT Link Monitoring for Multiple Switch Topology - -13. Switch Behavior Issues -13.1 Link Establishment and Failover Delays -13.2 Duplicated Incoming Packets - -14. Hardware Specific Considerations -14.1 IBM BladeCenter - -15. Frequently Asked Questions - -16. Resources and Links - - -1. Bonding Driver Installation -============================== - - Most popular distro kernels ship with the bonding driver -already available as a module. If your distro does not, or you -have need to compile bonding from source (e.g., configuring and -installing a mainline kernel from kernel.org), you'll need to perform -the following steps: - -1.1 Configure and build the kernel with bonding ------------------------------------------------ - - The current version of the bonding driver is available in the -drivers/net/bonding subdirectory of the most recent kernel source -(which is available on http://kernel.org). Most users "rolling their -own" will want to use the most recent kernel from kernel.org. - - Configure kernel with "make menuconfig" (or "make xconfig" or -"make config"), then select "Bonding driver support" in the "Network -device support" section. It is recommended that you configure the -driver as module since it is currently the only way to pass parameters -to the driver or configure more than one bonding device. - - Build and install the new kernel and modules. - -1.2 Bonding Control Utility -------------------------------------- - - It is recommended to configure bonding via iproute2 (netlink) -or sysfs, the old ifenslave control utility is obsolete. - -2. Bonding Driver Options -========================= - - Options for the bonding driver are supplied as parameters to the -bonding module at load time, or are specified via sysfs. - - Module options may be given as command line arguments to the -insmod or modprobe command, but are usually specified in either the -/etc/modprobe.d/*.conf configuration files, or in a distro-specific -configuration file (some of which are detailed in the next section). - - Details on bonding support for sysfs is provided in the -"Configuring Bonding Manually via Sysfs" section, below. - - The available bonding driver parameters are listed below. If a -parameter is not specified the default value is used. When initially -configuring a bond, it is recommended "tail -f /var/log/messages" be -run in a separate window to watch for bonding driver error messages. - - It is critical that either the miimon or arp_interval and -arp_ip_target parameters be specified, otherwise serious network -degradation will occur during link failures. Very few devices do not -support at least miimon, so there is really no reason not to use it. - - Options with textual values will accept either the text name -or, for backwards compatibility, the option value. E.g., -"mode=802.3ad" and "mode=4" set the same mode. - - The parameters are as follows: - -active_slave - - Specifies the new active slave for modes that support it - (active-backup, balance-alb and balance-tlb). Possible values - are the name of any currently enslaved interface, or an empty - string. If a name is given, the slave and its link must be up in order - to be selected as the new active slave. If an empty string is - specified, the current active slave is cleared, and a new active - slave is selected automatically. - - Note that this is only available through the sysfs interface. No module - parameter by this name exists. - - The normal value of this option is the name of the currently - active slave, or the empty string if there is no active slave or - the current mode does not use an active slave. - -ad_actor_sys_prio - - In an AD system, this specifies the system priority. The allowed range - is 1 - 65535. If the value is not specified, it takes 65535 as the - default value. - - This parameter has effect only in 802.3ad mode and is available through - SysFs interface. - -ad_actor_system - - In an AD system, this specifies the mac-address for the actor in - protocol packet exchanges (LACPDUs). The value cannot be NULL or - multicast. It is preferred to have the local-admin bit set for this - mac but driver does not enforce it. If the value is not given then - system defaults to using the masters' mac address as actors' system - address. - - This parameter has effect only in 802.3ad mode and is available through - SysFs interface. - -ad_select - - Specifies the 802.3ad aggregation selection logic to use. The - possible values and their effects are: - - stable or 0 - - The active aggregator is chosen by largest aggregate - bandwidth. - - Reselection of the active aggregator occurs only when all - slaves of the active aggregator are down or the active - aggregator has no slaves. - - This is the default value. - - bandwidth or 1 - - The active aggregator is chosen by largest aggregate - bandwidth. Reselection occurs if: - - - A slave is added to or removed from the bond - - - Any slave's link state changes - - - Any slave's 802.3ad association state changes - - - The bond's administrative state changes to up - - count or 2 - - The active aggregator is chosen by the largest number of - ports (slaves). Reselection occurs as described under the - "bandwidth" setting, above. - - The bandwidth and count selection policies permit failover of - 802.3ad aggregations when partial failure of the active aggregator - occurs. This keeps the aggregator with the highest availability - (either in bandwidth or in number of ports) active at all times. - - This option was added in bonding version 3.4.0. - -ad_user_port_key - - In an AD system, the port-key has three parts as shown below - - - Bits Use - 00 Duplex - 01-05 Speed - 06-15 User-defined - - This defines the upper 10 bits of the port key. The values can be - from 0 - 1023. If not given, the system defaults to 0. - - This parameter has effect only in 802.3ad mode and is available through - SysFs interface. - -all_slaves_active - - Specifies that duplicate frames (received on inactive ports) should be - dropped (0) or delivered (1). - - Normally, bonding will drop duplicate frames (received on inactive - ports), which is desirable for most users. But there are some times - it is nice to allow duplicate frames to be delivered. - - The default value is 0 (drop duplicate frames received on inactive - ports). - -arp_interval - - Specifies the ARP link monitoring frequency in milliseconds. - - The ARP monitor works by periodically checking the slave - devices to determine whether they have sent or received - traffic recently (the precise criteria depends upon the - bonding mode, and the state of the slave). Regular traffic is - generated via ARP probes issued for the addresses specified by - the arp_ip_target option. - - This behavior can be modified by the arp_validate option, - below. - - If ARP monitoring is used in an etherchannel compatible mode - (modes 0 and 2), the switch should be configured in a mode - that evenly distributes packets across all links. If the - switch is configured to distribute the packets in an XOR - fashion, all replies from the ARP targets will be received on - the same link which could cause the other team members to - fail. ARP monitoring should not be used in conjunction with - miimon. A value of 0 disables ARP monitoring. The default - value is 0. - -arp_ip_target - - Specifies the IP addresses to use as ARP monitoring peers when - arp_interval is > 0. These are the targets of the ARP request - sent to determine the health of the link to the targets. - Specify these values in ddd.ddd.ddd.ddd format. Multiple IP - addresses must be separated by a comma. At least one IP - address must be given for ARP monitoring to function. The - maximum number of targets that can be specified is 16. The - default value is no IP addresses. - -arp_validate - - Specifies whether or not ARP probes and replies should be - validated in any mode that supports arp monitoring, or whether - non-ARP traffic should be filtered (disregarded) for link - monitoring purposes. - - Possible values are: - - none or 0 - - No validation or filtering is performed. - - active or 1 - - Validation is performed only for the active slave. - - backup or 2 - - Validation is performed only for backup slaves. - - all or 3 - - Validation is performed for all slaves. - - filter or 4 - - Filtering is applied to all slaves. No validation is - performed. - - filter_active or 5 - - Filtering is applied to all slaves, validation is performed - only for the active slave. - - filter_backup or 6 - - Filtering is applied to all slaves, validation is performed - only for backup slaves. - - Validation: - - Enabling validation causes the ARP monitor to examine the incoming - ARP requests and replies, and only consider a slave to be up if it - is receiving the appropriate ARP traffic. - - For an active slave, the validation checks ARP replies to confirm - that they were generated by an arp_ip_target. Since backup slaves - do not typically receive these replies, the validation performed - for backup slaves is on the broadcast ARP request sent out via the - active slave. It is possible that some switch or network - configurations may result in situations wherein the backup slaves - do not receive the ARP requests; in such a situation, validation - of backup slaves must be disabled. - - The validation of ARP requests on backup slaves is mainly helping - bonding to decide which slaves are more likely to work in case of - the active slave failure, it doesn't really guarantee that the - backup slave will work if it's selected as the next active slave. - - Validation is useful in network configurations in which multiple - bonding hosts are concurrently issuing ARPs to one or more targets - beyond a common switch. Should the link between the switch and - target fail (but not the switch itself), the probe traffic - generated by the multiple bonding instances will fool the standard - ARP monitor into considering the links as still up. Use of - validation can resolve this, as the ARP monitor will only consider - ARP requests and replies associated with its own instance of - bonding. - - Filtering: - - Enabling filtering causes the ARP monitor to only use incoming ARP - packets for link availability purposes. Arriving packets that are - not ARPs are delivered normally, but do not count when determining - if a slave is available. - - Filtering operates by only considering the reception of ARP - packets (any ARP packet, regardless of source or destination) when - determining if a slave has received traffic for link availability - purposes. - - Filtering is useful in network configurations in which significant - levels of third party broadcast traffic would fool the standard - ARP monitor into considering the links as still up. Use of - filtering can resolve this, as only ARP traffic is considered for - link availability purposes. - - This option was added in bonding version 3.1.0. - -arp_all_targets - - Specifies the quantity of arp_ip_targets that must be reachable - in order for the ARP monitor to consider a slave as being up. - This option affects only active-backup mode for slaves with - arp_validation enabled. - - Possible values are: - - any or 0 - - consider the slave up only when any of the arp_ip_targets - is reachable - - all or 1 - - consider the slave up only when all of the arp_ip_targets - are reachable - -downdelay - - Specifies the time, in milliseconds, to wait before disabling - a slave after a link failure has been detected. This option - is only valid for the miimon link monitor. The downdelay - value should be a multiple of the miimon value; if not, it - will be rounded down to the nearest multiple. The default - value is 0. - -fail_over_mac - - Specifies whether active-backup mode should set all slaves to - the same MAC address at enslavement (the traditional - behavior), or, when enabled, perform special handling of the - bond's MAC address in accordance with the selected policy. - - Possible values are: - - none or 0 - - This setting disables fail_over_mac, and causes - bonding to set all slaves of an active-backup bond to - the same MAC address at enslavement time. This is the - default. - - active or 1 - - The "active" fail_over_mac policy indicates that the - MAC address of the bond should always be the MAC - address of the currently active slave. The MAC - address of the slaves is not changed; instead, the MAC - address of the bond changes during a failover. - - This policy is useful for devices that cannot ever - alter their MAC address, or for devices that refuse - incoming broadcasts with their own source MAC (which - interferes with the ARP monitor). - - The down side of this policy is that every device on - the network must be updated via gratuitous ARP, - vs. just updating a switch or set of switches (which - often takes place for any traffic, not just ARP - traffic, if the switch snoops incoming traffic to - update its tables) for the traditional method. If the - gratuitous ARP is lost, communication may be - disrupted. - - When this policy is used in conjunction with the mii - monitor, devices which assert link up prior to being - able to actually transmit and receive are particularly - susceptible to loss of the gratuitous ARP, and an - appropriate updelay setting may be required. - - follow or 2 - - The "follow" fail_over_mac policy causes the MAC - address of the bond to be selected normally (normally - the MAC address of the first slave added to the bond). - However, the second and subsequent slaves are not set - to this MAC address while they are in a backup role; a - slave is programmed with the bond's MAC address at - failover time (and the formerly active slave receives - the newly active slave's MAC address). - - This policy is useful for multiport devices that - either become confused or incur a performance penalty - when multiple ports are programmed with the same MAC - address. - - - The default policy is none, unless the first slave cannot - change its MAC address, in which case the active policy is - selected by default. - - This option may be modified via sysfs only when no slaves are - present in the bond. - - This option was added in bonding version 3.2.0. The "follow" - policy was added in bonding version 3.3.0. - -lacp_rate - - Option specifying the rate in which we'll ask our link partner - to transmit LACPDU packets in 802.3ad mode. Possible values - are: - - slow or 0 - Request partner to transmit LACPDUs every 30 seconds - - fast or 1 - Request partner to transmit LACPDUs every 1 second - - The default is slow. - -max_bonds - - Specifies the number of bonding devices to create for this - instance of the bonding driver. E.g., if max_bonds is 3, and - the bonding driver is not already loaded, then bond0, bond1 - and bond2 will be created. The default value is 1. Specifying - a value of 0 will load bonding, but will not create any devices. - -miimon - - Specifies the MII link monitoring frequency in milliseconds. - This determines how often the link state of each slave is - inspected for link failures. A value of zero disables MII - link monitoring. A value of 100 is a good starting point. - The use_carrier option, below, affects how the link state is - determined. See the High Availability section for additional - information. The default value is 0. - -min_links - - Specifies the minimum number of links that must be active before - asserting carrier. It is similar to the Cisco EtherChannel min-links - feature. This allows setting the minimum number of member ports that - must be up (link-up state) before marking the bond device as up - (carrier on). This is useful for situations where higher level services - such as clustering want to ensure a minimum number of low bandwidth - links are active before switchover. This option only affect 802.3ad - mode. - - The default value is 0. This will cause carrier to be asserted (for - 802.3ad mode) whenever there is an active aggregator, regardless of the - number of available links in that aggregator. Note that, because an - aggregator cannot be active without at least one available link, - setting this option to 0 or to 1 has the exact same effect. - -mode - - Specifies one of the bonding policies. The default is - balance-rr (round robin). Possible values are: - - balance-rr or 0 - - Round-robin policy: Transmit packets in sequential - order from the first available slave through the - last. This mode provides load balancing and fault - tolerance. - - active-backup or 1 - - Active-backup policy: Only one slave in the bond is - active. A different slave becomes active if, and only - if, the active slave fails. The bond's MAC address is - externally visible on only one port (network adapter) - to avoid confusing the switch. - - In bonding version 2.6.2 or later, when a failover - occurs in active-backup mode, bonding will issue one - or more gratuitous ARPs on the newly active slave. - One gratuitous ARP is issued for the bonding master - interface and each VLAN interfaces configured above - it, provided that the interface has at least one IP - address configured. Gratuitous ARPs issued for VLAN - interfaces are tagged with the appropriate VLAN id. - - This mode provides fault tolerance. The primary - option, documented below, affects the behavior of this - mode. - - balance-xor or 2 - - XOR policy: Transmit based on the selected transmit - hash policy. The default policy is a simple [(source - MAC address XOR'd with destination MAC address XOR - packet type ID) modulo slave count]. Alternate transmit - policies may be selected via the xmit_hash_policy option, - described below. - - This mode provides load balancing and fault tolerance. - - broadcast or 3 - - Broadcast policy: transmits everything on all slave - interfaces. This mode provides fault tolerance. - - 802.3ad or 4 - - IEEE 802.3ad Dynamic link aggregation. Creates - aggregation groups that share the same speed and - duplex settings. Utilizes all slaves in the active - aggregator according to the 802.3ad specification. - - Slave selection for outgoing traffic is done according - to the transmit hash policy, which may be changed from - the default simple XOR policy via the xmit_hash_policy - option, documented below. Note that not all transmit - policies may be 802.3ad compliant, particularly in - regards to the packet mis-ordering requirements of - section 43.2.4 of the 802.3ad standard. Differing - peer implementations will have varying tolerances for - noncompliance. - - Prerequisites: - - 1. Ethtool support in the base drivers for retrieving - the speed and duplex of each slave. - - 2. A switch that supports IEEE 802.3ad Dynamic link - aggregation. - - Most switches will require some type of configuration - to enable 802.3ad mode. - - balance-tlb or 5 - - Adaptive transmit load balancing: channel bonding that - does not require any special switch support. - - In tlb_dynamic_lb=1 mode; the outgoing traffic is - distributed according to the current load (computed - relative to the speed) on each slave. - - In tlb_dynamic_lb=0 mode; the load balancing based on - current load is disabled and the load is distributed - only using the hash distribution. - - Incoming traffic is received by the current slave. - If the receiving slave fails, another slave takes over - the MAC address of the failed receiving slave. - - Prerequisite: - - Ethtool support in the base drivers for retrieving the - speed of each slave. - - balance-alb or 6 - - Adaptive load balancing: includes balance-tlb plus - receive load balancing (rlb) for IPV4 traffic, and - does not require any special switch support. The - receive load balancing is achieved by ARP negotiation. - The bonding driver intercepts the ARP Replies sent by - the local system on their way out and overwrites the - source hardware address with the unique hardware - address of one of the slaves in the bond such that - different peers use different hardware addresses for - the server. - - Receive traffic from connections created by the server - is also balanced. When the local system sends an ARP - Request the bonding driver copies and saves the peer's - IP information from the ARP packet. When the ARP - Reply arrives from the peer, its hardware address is - retrieved and the bonding driver initiates an ARP - reply to this peer assigning it to one of the slaves - in the bond. A problematic outcome of using ARP - negotiation for balancing is that each time that an - ARP request is broadcast it uses the hardware address - of the bond. Hence, peers learn the hardware address - of the bond and the balancing of receive traffic - collapses to the current slave. This is handled by - sending updates (ARP Replies) to all the peers with - their individually assigned hardware address such that - the traffic is redistributed. Receive traffic is also - redistributed when a new slave is added to the bond - and when an inactive slave is re-activated. The - receive load is distributed sequentially (round robin) - among the group of highest speed slaves in the bond. - - When a link is reconnected or a new slave joins the - bond the receive traffic is redistributed among all - active slaves in the bond by initiating ARP Replies - with the selected MAC address to each of the - clients. The updelay parameter (detailed below) must - be set to a value equal or greater than the switch's - forwarding delay so that the ARP Replies sent to the - peers will not be blocked by the switch. - - Prerequisites: - - 1. Ethtool support in the base drivers for retrieving - the speed of each slave. - - 2. Base driver support for setting the hardware - address of a device while it is open. This is - required so that there will always be one slave in the - team using the bond hardware address (the - curr_active_slave) while having a unique hardware - address for each slave in the bond. If the - curr_active_slave fails its hardware address is - swapped with the new curr_active_slave that was - chosen. - -num_grat_arp -num_unsol_na - - Specify the number of peer notifications (gratuitous ARPs and - unsolicited IPv6 Neighbor Advertisements) to be issued after a - failover event. As soon as the link is up on the new slave - (possibly immediately) a peer notification is sent on the - bonding device and each VLAN sub-device. This is repeated at - the rate specified by peer_notif_delay if the number is - greater than 1. - - The valid range is 0 - 255; the default value is 1. These options - affect only the active-backup mode. These options were added for - bonding versions 3.3.0 and 3.4.0 respectively. - - From Linux 3.0 and bonding version 3.7.1, these notifications - are generated by the ipv4 and ipv6 code and the numbers of - repetitions cannot be set independently. - -packets_per_slave - - Specify the number of packets to transmit through a slave before - moving to the next one. When set to 0 then a slave is chosen at - random. - - The valid range is 0 - 65535; the default value is 1. This option - has effect only in balance-rr mode. - -peer_notif_delay - - Specify the delay, in milliseconds, between each peer - notification (gratuitous ARP and unsolicited IPv6 Neighbor - Advertisement) when they are issued after a failover event. - This delay should be a multiple of the link monitor interval - (arp_interval or miimon, whichever is active). The default - value is 0 which means to match the value of the link monitor - interval. - -primary - - A string (eth0, eth2, etc) specifying which slave is the - primary device. The specified device will always be the - active slave while it is available. Only when the primary is - off-line will alternate devices be used. This is useful when - one slave is preferred over another, e.g., when one slave has - higher throughput than another. - - The primary option is only valid for active-backup(1), - balance-tlb (5) and balance-alb (6) mode. - -primary_reselect - - Specifies the reselection policy for the primary slave. This - affects how the primary slave is chosen to become the active slave - when failure of the active slave or recovery of the primary slave - occurs. This option is designed to prevent flip-flopping between - the primary slave and other slaves. Possible values are: - - always or 0 (default) - - The primary slave becomes the active slave whenever it - comes back up. - - better or 1 - - The primary slave becomes the active slave when it comes - back up, if the speed and duplex of the primary slave is - better than the speed and duplex of the current active - slave. - - failure or 2 - - The primary slave becomes the active slave only if the - current active slave fails and the primary slave is up. - - The primary_reselect setting is ignored in two cases: - - If no slaves are active, the first slave to recover is - made the active slave. - - When initially enslaved, the primary slave is always made - the active slave. - - Changing the primary_reselect policy via sysfs will cause an - immediate selection of the best active slave according to the new - policy. This may or may not result in a change of the active - slave, depending upon the circumstances. - - This option was added for bonding version 3.6.0. - -tlb_dynamic_lb - - Specifies if dynamic shuffling of flows is enabled in tlb - mode. The value has no effect on any other modes. - - The default behavior of tlb mode is to shuffle active flows across - slaves based on the load in that interval. This gives nice lb - characteristics but can cause packet reordering. If re-ordering is - a concern use this variable to disable flow shuffling and rely on - load balancing provided solely by the hash distribution. - xmit-hash-policy can be used to select the appropriate hashing for - the setup. - - The sysfs entry can be used to change the setting per bond device - and the initial value is derived from the module parameter. The - sysfs entry is allowed to be changed only if the bond device is - down. - - The default value is "1" that enables flow shuffling while value "0" - disables it. This option was added in bonding driver 3.7.1 - - -updelay - - Specifies the time, in milliseconds, to wait before enabling a - slave after a link recovery has been detected. This option is - only valid for the miimon link monitor. The updelay value - should be a multiple of the miimon value; if not, it will be - rounded down to the nearest multiple. The default value is 0. - -use_carrier - - Specifies whether or not miimon should use MII or ETHTOOL - ioctls vs. netif_carrier_ok() to determine the link - status. The MII or ETHTOOL ioctls are less efficient and - utilize a deprecated calling sequence within the kernel. The - netif_carrier_ok() relies on the device driver to maintain its - state with netif_carrier_on/off; at this writing, most, but - not all, device drivers support this facility. - - If bonding insists that the link is up when it should not be, - it may be that your network device driver does not support - netif_carrier_on/off. The default state for netif_carrier is - "carrier on," so if a driver does not support netif_carrier, - it will appear as if the link is always up. In this case, - setting use_carrier to 0 will cause bonding to revert to the - MII / ETHTOOL ioctl method to determine the link state. - - A value of 1 enables the use of netif_carrier_ok(), a value of - 0 will use the deprecated MII / ETHTOOL ioctls. The default - value is 1. - -xmit_hash_policy - - Selects the transmit hash policy to use for slave selection in - balance-xor, 802.3ad, and tlb modes. Possible values are: - - layer2 - - Uses XOR of hardware MAC addresses and packet type ID - field to generate the hash. The formula is - - hash = source MAC XOR destination MAC XOR packet type ID - slave number = hash modulo slave count - - This algorithm will place all traffic to a particular - network peer on the same slave. - - This algorithm is 802.3ad compliant. - - layer2+3 - - This policy uses a combination of layer2 and layer3 - protocol information to generate the hash. - - Uses XOR of hardware MAC addresses and IP addresses to - generate the hash. The formula is - - hash = source MAC XOR destination MAC XOR packet type ID - hash = hash XOR source IP XOR destination IP - hash = hash XOR (hash RSHIFT 16) - hash = hash XOR (hash RSHIFT 8) - And then hash is reduced modulo slave count. - - If the protocol is IPv6 then the source and destination - addresses are first hashed using ipv6_addr_hash. - - This algorithm will place all traffic to a particular - network peer on the same slave. For non-IP traffic, - the formula is the same as for the layer2 transmit - hash policy. - - This policy is intended to provide a more balanced - distribution of traffic than layer2 alone, especially - in environments where a layer3 gateway device is - required to reach most destinations. - - This algorithm is 802.3ad compliant. - - layer3+4 - - This policy uses upper layer protocol information, - when available, to generate the hash. This allows for - traffic to a particular network peer to span multiple - slaves, although a single connection will not span - multiple slaves. - - The formula for unfragmented TCP and UDP packets is - - hash = source port, destination port (as in the header) - hash = hash XOR source IP XOR destination IP - hash = hash XOR (hash RSHIFT 16) - hash = hash XOR (hash RSHIFT 8) - And then hash is reduced modulo slave count. - - If the protocol is IPv6 then the source and destination - addresses are first hashed using ipv6_addr_hash. - - For fragmented TCP or UDP packets and all other IPv4 and - IPv6 protocol traffic, the source and destination port - information is omitted. For non-IP traffic, the - formula is the same as for the layer2 transmit hash - policy. - - This algorithm is not fully 802.3ad compliant. A - single TCP or UDP conversation containing both - fragmented and unfragmented packets will see packets - striped across two interfaces. This may result in out - of order delivery. Most traffic types will not meet - this criteria, as TCP rarely fragments traffic, and - most UDP traffic is not involved in extended - conversations. Other implementations of 802.3ad may - or may not tolerate this noncompliance. - - encap2+3 - - This policy uses the same formula as layer2+3 but it - relies on skb_flow_dissect to obtain the header fields - which might result in the use of inner headers if an - encapsulation protocol is used. For example this will - improve the performance for tunnel users because the - packets will be distributed according to the encapsulated - flows. - - encap3+4 - - This policy uses the same formula as layer3+4 but it - relies on skb_flow_dissect to obtain the header fields - which might result in the use of inner headers if an - encapsulation protocol is used. For example this will - improve the performance for tunnel users because the - packets will be distributed according to the encapsulated - flows. - - The default value is layer2. This option was added in bonding - version 2.6.3. In earlier versions of bonding, this parameter - does not exist, and the layer2 policy is the only policy. The - layer2+3 value was added for bonding version 3.2.2. - -resend_igmp - - Specifies the number of IGMP membership reports to be issued after - a failover event. One membership report is issued immediately after - the failover, subsequent packets are sent in each 200ms interval. - - The valid range is 0 - 255; the default value is 1. A value of 0 - prevents the IGMP membership report from being issued in response - to the failover event. - - This option is useful for bonding modes balance-rr (0), active-backup - (1), balance-tlb (5) and balance-alb (6), in which a failover can - switch the IGMP traffic from one slave to another. Therefore a fresh - IGMP report must be issued to cause the switch to forward the incoming - IGMP traffic over the newly selected slave. - - This option was added for bonding version 3.7.0. - -lp_interval - - Specifies the number of seconds between instances where the bonding - driver sends learning packets to each slaves peer switch. - - The valid range is 1 - 0x7fffffff; the default value is 1. This Option - has effect only in balance-tlb and balance-alb modes. - -3. Configuring Bonding Devices -============================== - - You can configure bonding using either your distro's network -initialization scripts, or manually using either iproute2 or the -sysfs interface. Distros generally use one of three packages for the -network initialization scripts: initscripts, sysconfig or interfaces. -Recent versions of these packages have support for bonding, while older -versions do not. - - We will first describe the options for configuring bonding for -distros using versions of initscripts, sysconfig and interfaces with full -or partial support for bonding, then provide information on enabling -bonding without support from the network initialization scripts (i.e., -older versions of initscripts or sysconfig). - - If you're unsure whether your distro uses sysconfig, -initscripts or interfaces, or don't know if it's new enough, have no fear. -Determining this is fairly straightforward. - - First, look for a file called interfaces in /etc/network directory. -If this file is present in your system, then your system use interfaces. See -Configuration with Interfaces Support. - - Else, issue the command: - -$ rpm -qf /sbin/ifup - - It will respond with a line of text starting with either -"initscripts" or "sysconfig," followed by some numbers. This is the -package that provides your network initialization scripts. - - Next, to determine if your installation supports bonding, -issue the command: - -$ grep ifenslave /sbin/ifup - - If this returns any matches, then your initscripts or -sysconfig has support for bonding. - -3.1 Configuration with Sysconfig Support ----------------------------------------- - - This section applies to distros using a version of sysconfig -with bonding support, for example, SuSE Linux Enterprise Server 9. - - SuSE SLES 9's networking configuration system does support -bonding, however, at this writing, the YaST system configuration -front end does not provide any means to work with bonding devices. -Bonding devices can be managed by hand, however, as follows. - - First, if they have not already been configured, configure the -slave devices. On SLES 9, this is most easily done by running the -yast2 sysconfig configuration utility. The goal is for to create an -ifcfg-id file for each slave device. The simplest way to accomplish -this is to configure the devices for DHCP (this is only to get the -file ifcfg-id file created; see below for some issues with DHCP). The -name of the configuration file for each device will be of the form: - -ifcfg-id-xx:xx:xx:xx:xx:xx - - Where the "xx" portion will be replaced with the digits from -the device's permanent MAC address. - - Once the set of ifcfg-id-xx:xx:xx:xx:xx:xx files has been -created, it is necessary to edit the configuration files for the slave -devices (the MAC addresses correspond to those of the slave devices). -Before editing, the file will contain multiple lines, and will look -something like this: - -BOOTPROTO='dhcp' -STARTMODE='on' -USERCTL='no' -UNIQUE='XNzu.WeZGOGF+4wE' -_nm_name='bus-pci-0001:61:01.0' - - Change the BOOTPROTO and STARTMODE lines to the following: - -BOOTPROTO='none' -STARTMODE='off' - - Do not alter the UNIQUE or _nm_name lines. Remove any other -lines (USERCTL, etc). - - Once the ifcfg-id-xx:xx:xx:xx:xx:xx files have been modified, -it's time to create the configuration file for the bonding device -itself. This file is named ifcfg-bondX, where X is the number of the -bonding device to create, starting at 0. The first such file is -ifcfg-bond0, the second is ifcfg-bond1, and so on. The sysconfig -network configuration system will correctly start multiple instances -of bonding. - - The contents of the ifcfg-bondX file is as follows: - -BOOTPROTO="static" -BROADCAST="10.0.2.255" -IPADDR="10.0.2.10" -NETMASK="255.255.0.0" -NETWORK="10.0.2.0" -REMOTE_IPADDR="" -STARTMODE="onboot" -BONDING_MASTER="yes" -BONDING_MODULE_OPTS="mode=active-backup miimon=100" -BONDING_SLAVE0="eth0" -BONDING_SLAVE1="bus-pci-0000:06:08.1" - - Replace the sample BROADCAST, IPADDR, NETMASK and NETWORK -values with the appropriate values for your network. - - The STARTMODE specifies when the device is brought online. -The possible values are: - - onboot: The device is started at boot time. If you're not - sure, this is probably what you want. - - manual: The device is started only when ifup is called - manually. Bonding devices may be configured this - way if you do not wish them to start automatically - at boot for some reason. - - hotplug: The device is started by a hotplug event. This is not - a valid choice for a bonding device. - - off or ignore: The device configuration is ignored. - - The line BONDING_MASTER='yes' indicates that the device is a -bonding master device. The only useful value is "yes." - - The contents of BONDING_MODULE_OPTS are supplied to the -instance of the bonding module for this device. Specify the options -for the bonding mode, link monitoring, and so on here. Do not include -the max_bonds bonding parameter; this will confuse the configuration -system if you have multiple bonding devices. - - Finally, supply one BONDING_SLAVEn="slave device" for each -slave. where "n" is an increasing value, one for each slave. The -"slave device" is either an interface name, e.g., "eth0", or a device -specifier for the network device. The interface name is easier to -find, but the ethN names are subject to change at boot time if, e.g., -a device early in the sequence has failed. The device specifiers -(bus-pci-0000:06:08.1 in the example above) specify the physical -network device, and will not change unless the device's bus location -changes (for example, it is moved from one PCI slot to another). The -example above uses one of each type for demonstration purposes; most -configurations will choose one or the other for all slave devices. - - When all configuration files have been modified or created, -networking must be restarted for the configuration changes to take -effect. This can be accomplished via the following: - -# /etc/init.d/network restart - - Note that the network control script (/sbin/ifdown) will -remove the bonding module as part of the network shutdown processing, -so it is not necessary to remove the module by hand if, e.g., the -module parameters have changed. - - Also, at this writing, YaST/YaST2 will not manage bonding -devices (they do not show bonding interfaces on its list of network -devices). It is necessary to edit the configuration file by hand to -change the bonding configuration. - - Additional general options and details of the ifcfg file -format can be found in an example ifcfg template file: - -/etc/sysconfig/network/ifcfg.template - - Note that the template does not document the various BONDING_ -settings described above, but does describe many of the other options. - -3.1.1 Using DHCP with Sysconfig -------------------------------- - - Under sysconfig, configuring a device with BOOTPROTO='dhcp' -will cause it to query DHCP for its IP address information. At this -writing, this does not function for bonding devices; the scripts -attempt to obtain the device address from DHCP prior to adding any of -the slave devices. Without active slaves, the DHCP requests are not -sent to the network. - -3.1.2 Configuring Multiple Bonds with Sysconfig ------------------------------------------------ - - The sysconfig network initialization system is capable of -handling multiple bonding devices. All that is necessary is for each -bonding instance to have an appropriately configured ifcfg-bondX file -(as described above). Do not specify the "max_bonds" parameter to any -instance of bonding, as this will confuse sysconfig. If you require -multiple bonding devices with identical parameters, create multiple -ifcfg-bondX files. - - Because the sysconfig scripts supply the bonding module -options in the ifcfg-bondX file, it is not necessary to add them to -the system /etc/modules.d/*.conf configuration files. - -3.2 Configuration with Initscripts Support ------------------------------------------- - - This section applies to distros using a recent version of -initscripts with bonding support, for example, Red Hat Enterprise Linux -version 3 or later, Fedora, etc. On these systems, the network -initialization scripts have knowledge of bonding, and can be configured to -control bonding devices. Note that older versions of the initscripts -package have lower levels of support for bonding; this will be noted where -applicable. - - These distros will not automatically load the network adapter -driver unless the ethX device is configured with an IP address. -Because of this constraint, users must manually configure a -network-script file for all physical adapters that will be members of -a bondX link. Network script files are located in the directory: - -/etc/sysconfig/network-scripts - - The file name must be prefixed with "ifcfg-eth" and suffixed -with the adapter's physical adapter number. For example, the script -for eth0 would be named /etc/sysconfig/network-scripts/ifcfg-eth0. -Place the following text in the file: - -DEVICE=eth0 -USERCTL=no -ONBOOT=yes -MASTER=bond0 -SLAVE=yes -BOOTPROTO=none - - The DEVICE= line will be different for every ethX device and -must correspond with the name of the file, i.e., ifcfg-eth1 must have -a device line of DEVICE=eth1. The setting of the MASTER= line will -also depend on the final bonding interface name chosen for your bond. -As with other network devices, these typically start at 0, and go up -one for each device, i.e., the first bonding instance is bond0, the -second is bond1, and so on. - - Next, create a bond network script. The file name for this -script will be /etc/sysconfig/network-scripts/ifcfg-bondX where X is -the number of the bond. For bond0 the file is named "ifcfg-bond0", -for bond1 it is named "ifcfg-bond1", and so on. Within that file, -place the following text: - -DEVICE=bond0 -IPADDR=192.168.1.1 -NETMASK=255.255.255.0 -NETWORK=192.168.1.0 -BROADCAST=192.168.1.255 -ONBOOT=yes -BOOTPROTO=none -USERCTL=no - - Be sure to change the networking specific lines (IPADDR, -NETMASK, NETWORK and BROADCAST) to match your network configuration. - - For later versions of initscripts, such as that found with Fedora -7 (or later) and Red Hat Enterprise Linux version 5 (or later), it is possible, -and, indeed, preferable, to specify the bonding options in the ifcfg-bond0 -file, e.g. a line of the format: - -BONDING_OPTS="mode=active-backup arp_interval=60 arp_ip_target=192.168.1.254" - - will configure the bond with the specified options. The options -specified in BONDING_OPTS are identical to the bonding module parameters -except for the arp_ip_target field when using versions of initscripts older -than and 8.57 (Fedora 8) and 8.45.19 (Red Hat Enterprise Linux 5.2). When -using older versions each target should be included as a separate option and -should be preceded by a '+' to indicate it should be added to the list of -queried targets, e.g., - - arp_ip_target=+192.168.1.1 arp_ip_target=+192.168.1.2 - - is the proper syntax to specify multiple targets. When specifying -options via BONDING_OPTS, it is not necessary to edit /etc/modprobe.d/*.conf. - - For even older versions of initscripts that do not support -BONDING_OPTS, it is necessary to edit /etc/modprobe.d/*.conf, depending upon -your distro) to load the bonding module with your desired options when the -bond0 interface is brought up. The following lines in /etc/modprobe.d/*.conf -will load the bonding module, and select its options: - -alias bond0 bonding -options bond0 mode=balance-alb miimon=100 - - Replace the sample parameters with the appropriate set of -options for your configuration. - - Finally run "/etc/rc.d/init.d/network restart" as root. This -will restart the networking subsystem and your bond link should be now -up and running. - -3.2.1 Using DHCP with Initscripts ---------------------------------- - - Recent versions of initscripts (the versions supplied with Fedora -Core 3 and Red Hat Enterprise Linux 4, or later versions, are reported to -work) have support for assigning IP information to bonding devices via -DHCP. - - To configure bonding for DHCP, configure it as described -above, except replace the line "BOOTPROTO=none" with "BOOTPROTO=dhcp" -and add a line consisting of "TYPE=Bonding". Note that the TYPE value -is case sensitive. - -3.2.2 Configuring Multiple Bonds with Initscripts -------------------------------------------------- - - Initscripts packages that are included with Fedora 7 and Red Hat -Enterprise Linux 5 support multiple bonding interfaces by simply -specifying the appropriate BONDING_OPTS= in ifcfg-bondX where X is the -number of the bond. This support requires sysfs support in the kernel, -and a bonding driver of version 3.0.0 or later. Other configurations may -not support this method for specifying multiple bonding interfaces; for -those instances, see the "Configuring Multiple Bonds Manually" section, -below. - -3.3 Configuring Bonding Manually with iproute2 ------------------------------------------------ - - This section applies to distros whose network initialization -scripts (the sysconfig or initscripts package) do not have specific -knowledge of bonding. One such distro is SuSE Linux Enterprise Server -version 8. - - The general method for these systems is to place the bonding -module parameters into a config file in /etc/modprobe.d/ (as -appropriate for the installed distro), then add modprobe and/or -`ip link` commands to the system's global init script. The name of -the global init script differs; for sysconfig, it is -/etc/init.d/boot.local and for initscripts it is /etc/rc.d/rc.local. - - For example, if you wanted to make a simple bond of two e100 -devices (presumed to be eth0 and eth1), and have it persist across -reboots, edit the appropriate file (/etc/init.d/boot.local or -/etc/rc.d/rc.local), and add the following: - -modprobe bonding mode=balance-alb miimon=100 -modprobe e100 -ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up -ip link set eth0 master bond0 -ip link set eth1 master bond0 - - Replace the example bonding module parameters and bond0 -network configuration (IP address, netmask, etc) with the appropriate -values for your configuration. - - Unfortunately, this method will not provide support for the -ifup and ifdown scripts on the bond devices. To reload the bonding -configuration, it is necessary to run the initialization script, e.g., - -# /etc/init.d/boot.local - - or - -# /etc/rc.d/rc.local - - It may be desirable in such a case to create a separate script -which only initializes the bonding configuration, then call that -separate script from within boot.local. This allows for bonding to be -enabled without re-running the entire global init script. - - To shut down the bonding devices, it is necessary to first -mark the bonding device itself as being down, then remove the -appropriate device driver modules. For our example above, you can do -the following: - -# ifconfig bond0 down -# rmmod bonding -# rmmod e100 - - Again, for convenience, it may be desirable to create a script -with these commands. - - -3.3.1 Configuring Multiple Bonds Manually ------------------------------------------ - - This section contains information on configuring multiple -bonding devices with differing options for those systems whose network -initialization scripts lack support for configuring multiple bonds. - - If you require multiple bonding devices, but all with the same -options, you may wish to use the "max_bonds" module parameter, -documented above. - - To create multiple bonding devices with differing options, it is -preferable to use bonding parameters exported by sysfs, documented in the -section below. - - For versions of bonding without sysfs support, the only means to -provide multiple instances of bonding with differing options is to load -the bonding driver multiple times. Note that current versions of the -sysconfig network initialization scripts handle this automatically; if -your distro uses these scripts, no special action is needed. See the -section Configuring Bonding Devices, above, if you're not sure about your -network initialization scripts. - - To load multiple instances of the module, it is necessary to -specify a different name for each instance (the module loading system -requires that every loaded module, even multiple instances of the same -module, have a unique name). This is accomplished by supplying multiple -sets of bonding options in /etc/modprobe.d/*.conf, for example: - -alias bond0 bonding -options bond0 -o bond0 mode=balance-rr miimon=100 - -alias bond1 bonding -options bond1 -o bond1 mode=balance-alb miimon=50 - - will load the bonding module two times. The first instance is -named "bond0" and creates the bond0 device in balance-rr mode with an -miimon of 100. The second instance is named "bond1" and creates the -bond1 device in balance-alb mode with an miimon of 50. - - In some circumstances (typically with older distributions), -the above does not work, and the second bonding instance never sees -its options. In that case, the second options line can be substituted -as follows: - -install bond1 /sbin/modprobe --ignore-install bonding -o bond1 \ - mode=balance-alb miimon=50 - - This may be repeated any number of times, specifying a new and -unique name in place of bond1 for each subsequent instance. - - It has been observed that some Red Hat supplied kernels are unable -to rename modules at load time (the "-o bond1" part). Attempts to pass -that option to modprobe will produce an "Operation not permitted" error. -This has been reported on some Fedora Core kernels, and has been seen on -RHEL 4 as well. On kernels exhibiting this problem, it will be impossible -to configure multiple bonds with differing parameters (as they are older -kernels, and also lack sysfs support). - -3.4 Configuring Bonding Manually via Sysfs ------------------------------------------- - - Starting with version 3.0.0, Channel Bonding may be configured -via the sysfs interface. This interface allows dynamic configuration -of all bonds in the system without unloading the module. It also -allows for adding and removing bonds at runtime. Ifenslave is no -longer required, though it is still supported. - - Use of the sysfs interface allows you to use multiple bonds -with different configurations without having to reload the module. -It also allows you to use multiple, differently configured bonds when -bonding is compiled into the kernel. - - You must have the sysfs filesystem mounted to configure -bonding this way. The examples in this document assume that you -are using the standard mount point for sysfs, e.g. /sys. If your -sysfs filesystem is mounted elsewhere, you will need to adjust the -example paths accordingly. - -Creating and Destroying Bonds ------------------------------ -To add a new bond foo: -# echo +foo > /sys/class/net/bonding_masters - -To remove an existing bond bar: -# echo -bar > /sys/class/net/bonding_masters - -To show all existing bonds: -# cat /sys/class/net/bonding_masters - -NOTE: due to 4K size limitation of sysfs files, this list may be -truncated if you have more than a few hundred bonds. This is unlikely -to occur under normal operating conditions. - -Adding and Removing Slaves --------------------------- - Interfaces may be enslaved to a bond using the file -/sys/class/net//bonding/slaves. The semantics for this file -are the same as for the bonding_masters file. - -To enslave interface eth0 to bond bond0: -# ifconfig bond0 up -# echo +eth0 > /sys/class/net/bond0/bonding/slaves - -To free slave eth0 from bond bond0: -# echo -eth0 > /sys/class/net/bond0/bonding/slaves - - When an interface is enslaved to a bond, symlinks between the -two are created in the sysfs filesystem. In this case, you would get -/sys/class/net/bond0/slave_eth0 pointing to /sys/class/net/eth0, and -/sys/class/net/eth0/master pointing to /sys/class/net/bond0. - - This means that you can tell quickly whether or not an -interface is enslaved by looking for the master symlink. Thus: -# echo -eth0 > /sys/class/net/eth0/master/bonding/slaves -will free eth0 from whatever bond it is enslaved to, regardless of -the name of the bond interface. - -Changing a Bond's Configuration -------------------------------- - Each bond may be configured individually by manipulating the -files located in /sys/class/net//bonding - - The names of these files correspond directly with the command- -line parameters described elsewhere in this file, and, with the -exception of arp_ip_target, they accept the same values. To see the -current setting, simply cat the appropriate file. - - A few examples will be given here; for specific usage -guidelines for each parameter, see the appropriate section in this -document. - -To configure bond0 for balance-alb mode: -# ifconfig bond0 down -# echo 6 > /sys/class/net/bond0/bonding/mode - - or - -# echo balance-alb > /sys/class/net/bond0/bonding/mode - NOTE: The bond interface must be down before the mode can be -changed. - -To enable MII monitoring on bond0 with a 1 second interval: -# echo 1000 > /sys/class/net/bond0/bonding/miimon - NOTE: If ARP monitoring is enabled, it will disabled when MII -monitoring is enabled, and vice-versa. - -To add ARP targets: -# echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target -# echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target - NOTE: up to 16 target addresses may be specified. - -To remove an ARP target: -# echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target - -To configure the interval between learning packet transmits: -# echo 12 > /sys/class/net/bond0/bonding/lp_interval - NOTE: the lp_interval is the number of seconds between instances where -the bonding driver sends learning packets to each slaves peer switch. The -default interval is 1 second. - -Example Configuration ---------------------- - We begin with the same example that is shown in section 3.3, -executed with sysfs, and without using ifenslave. - - To make a simple bond of two e100 devices (presumed to be eth0 -and eth1), and have it persist across reboots, edit the appropriate -file (/etc/init.d/boot.local or /etc/rc.d/rc.local), and add the -following: - -modprobe bonding -modprobe e100 -echo balance-alb > /sys/class/net/bond0/bonding/mode -ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up -echo 100 > /sys/class/net/bond0/bonding/miimon -echo +eth0 > /sys/class/net/bond0/bonding/slaves -echo +eth1 > /sys/class/net/bond0/bonding/slaves - - To add a second bond, with two e1000 interfaces in -active-backup mode, using ARP monitoring, add the following lines to -your init script: - -modprobe e1000 -echo +bond1 > /sys/class/net/bonding_masters -echo active-backup > /sys/class/net/bond1/bonding/mode -ifconfig bond1 192.168.2.1 netmask 255.255.255.0 up -echo +192.168.2.100 /sys/class/net/bond1/bonding/arp_ip_target -echo 2000 > /sys/class/net/bond1/bonding/arp_interval -echo +eth2 > /sys/class/net/bond1/bonding/slaves -echo +eth3 > /sys/class/net/bond1/bonding/slaves - -3.5 Configuration with Interfaces Support ------------------------------------------ - - This section applies to distros which use /etc/network/interfaces file -to describe network interface configuration, most notably Debian and it's -derivatives. - - The ifup and ifdown commands on Debian don't support bonding out of -the box. The ifenslave-2.6 package should be installed to provide bonding -support. Once installed, this package will provide bond-* options to be used -into /etc/network/interfaces. - - Note that ifenslave-2.6 package will load the bonding module and use -the ifenslave command when appropriate. - -Example Configurations ----------------------- - -In /etc/network/interfaces, the following stanza will configure bond0, in -active-backup mode, with eth0 and eth1 as slaves. - -auto bond0 -iface bond0 inet dhcp - bond-slaves eth0 eth1 - bond-mode active-backup - bond-miimon 100 - bond-primary eth0 eth1 - -If the above configuration doesn't work, you might have a system using -upstart for system startup. This is most notably true for recent -Ubuntu versions. The following stanza in /etc/network/interfaces will -produce the same result on those systems. - -auto bond0 -iface bond0 inet dhcp - bond-slaves none - bond-mode active-backup - bond-miimon 100 - -auto eth0 -iface eth0 inet manual - bond-master bond0 - bond-primary eth0 eth1 - -auto eth1 -iface eth1 inet manual - bond-master bond0 - bond-primary eth0 eth1 - -For a full list of bond-* supported options in /etc/network/interfaces and some -more advanced examples tailored to you particular distros, see the files in -/usr/share/doc/ifenslave-2.6. - -3.6 Overriding Configuration for Special Cases ----------------------------------------------- - -When using the bonding driver, the physical port which transmits a frame is -typically selected by the bonding driver, and is not relevant to the user or -system administrator. The output port is simply selected using the policies of -the selected bonding mode. On occasion however, it is helpful to direct certain -classes of traffic to certain physical interfaces on output to implement -slightly more complex policies. For example, to reach a web server over a -bonded interface in which eth0 connects to a private network, while eth1 -connects via a public network, it may be desirous to bias the bond to send said -traffic over eth0 first, using eth1 only as a fall back, while all other traffic -can safely be sent over either interface. Such configurations may be achieved -using the traffic control utilities inherent in linux. - -By default the bonding driver is multiqueue aware and 16 queues are created -when the driver initializes (see Documentation/networking/multiqueue.txt -for details). If more or less queues are desired the module parameter -tx_queues can be used to change this value. There is no sysfs parameter -available as the allocation is done at module init time. - -The output of the file /proc/net/bonding/bondX has changed so the output Queue -ID is now printed for each slave: - -Bonding Mode: fault-tolerance (active-backup) -Primary Slave: None -Currently Active Slave: eth0 -MII Status: up -MII Polling Interval (ms): 0 -Up Delay (ms): 0 -Down Delay (ms): 0 - -Slave Interface: eth0 -MII Status: up -Link Failure Count: 0 -Permanent HW addr: 00:1a:a0:12:8f:cb -Slave queue ID: 0 - -Slave Interface: eth1 -MII Status: up -Link Failure Count: 0 -Permanent HW addr: 00:1a:a0:12:8f:cc -Slave queue ID: 2 - -The queue_id for a slave can be set using the command: - -# echo "eth1:2" > /sys/class/net/bond0/bonding/queue_id - -Any interface that needs a queue_id set should set it with multiple calls -like the one above until proper priorities are set for all interfaces. On -distributions that allow configuration via initscripts, multiple 'queue_id' -arguments can be added to BONDING_OPTS to set all needed slave queues. - -These queue id's can be used in conjunction with the tc utility to configure -a multiqueue qdisc and filters to bias certain traffic to transmit on certain -slave devices. For instance, say we wanted, in the above configuration to -force all traffic bound to 192.168.1.100 to use eth1 in the bond as its output -device. The following commands would accomplish this: - -# tc qdisc add dev bond0 handle 1 root multiq - -# tc filter add dev bond0 protocol ip parent 1: prio 1 u32 match ip dst \ - 192.168.1.100 action skbedit queue_mapping 2 - -These commands tell the kernel to attach a multiqueue queue discipline to the -bond0 interface and filter traffic enqueued to it, such that packets with a dst -ip of 192.168.1.100 have their output queue mapping value overwritten to 2. -This value is then passed into the driver, causing the normal output path -selection policy to be overridden, selecting instead qid 2, which maps to eth1. - -Note that qid values begin at 1. Qid 0 is reserved to initiate to the driver -that normal output policy selection should take place. One benefit to simply -leaving the qid for a slave to 0 is the multiqueue awareness in the bonding -driver that is now present. This awareness allows tc filters to be placed on -slave devices as well as bond devices and the bonding driver will simply act as -a pass-through for selecting output queues on the slave device rather than -output port selection. - -This feature first appeared in bonding driver version 3.7.0 and support for -output slave selection was limited to round-robin and active-backup modes. - -3.7 Configuring LACP for 802.3ad mode in a more secure way ----------------------------------------------------------- - -When using 802.3ad bonding mode, the Actor (host) and Partner (switch) -exchange LACPDUs. These LACPDUs cannot be sniffed, because they are -destined to link local mac addresses (which switches/bridges are not -supposed to forward). However, most of the values are easily predictable -or are simply the machine's MAC address (which is trivially known to all -other hosts in the same L2). This implies that other machines in the L2 -domain can spoof LACPDU packets from other hosts to the switch and potentially -cause mayhem by joining (from the point of view of the switch) another -machine's aggregate, thus receiving a portion of that hosts incoming -traffic and / or spoofing traffic from that machine themselves (potentially -even successfully terminating some portion of flows). Though this is not -a likely scenario, one could avoid this possibility by simply configuring -few bonding parameters: - - (a) ad_actor_system : You can set a random mac-address that can be used for - these LACPDU exchanges. The value can not be either NULL or Multicast. - Also it's preferable to set the local-admin bit. Following shell code - generates a random mac-address as described above. - - # sys_mac_addr=$(printf '%02x:%02x:%02x:%02x:%02x:%02x' \ - $(( (RANDOM & 0xFE) | 0x02 )) \ - $(( RANDOM & 0xFF )) \ - $(( RANDOM & 0xFF )) \ - $(( RANDOM & 0xFF )) \ - $(( RANDOM & 0xFF )) \ - $(( RANDOM & 0xFF ))) - # echo $sys_mac_addr > /sys/class/net/bond0/bonding/ad_actor_system - - (b) ad_actor_sys_prio : Randomize the system priority. The default value - is 65535, but system can take the value from 1 - 65535. Following shell - code generates random priority and sets it. - - # sys_prio=$(( 1 + RANDOM + RANDOM )) - # echo $sys_prio > /sys/class/net/bond0/bonding/ad_actor_sys_prio - - (c) ad_user_port_key : Use the user portion of the port-key. The default - keeps this empty. These are the upper 10 bits of the port-key and value - ranges from 0 - 1023. Following shell code generates these 10 bits and - sets it. - - # usr_port_key=$(( RANDOM & 0x3FF )) - # echo $usr_port_key > /sys/class/net/bond0/bonding/ad_user_port_key - - -4 Querying Bonding Configuration -================================= - -4.1 Bonding Configuration -------------------------- - - Each bonding device has a read-only file residing in the -/proc/net/bonding directory. The file contents include information -about the bonding configuration, options and state of each slave. - - For example, the contents of /proc/net/bonding/bond0 after the -driver is loaded with parameters of mode=0 and miimon=1000 is -generally as follows: - - Ethernet Channel Bonding Driver: 2.6.1 (October 29, 2004) - Bonding Mode: load balancing (round-robin) - Currently Active Slave: eth0 - MII Status: up - MII Polling Interval (ms): 1000 - Up Delay (ms): 0 - Down Delay (ms): 0 - - Slave Interface: eth1 - MII Status: up - Link Failure Count: 1 - - Slave Interface: eth0 - MII Status: up - Link Failure Count: 1 - - The precise format and contents will change depending upon the -bonding configuration, state, and version of the bonding driver. - -4.2 Network configuration -------------------------- - - The network configuration can be inspected using the ifconfig -command. Bonding devices will have the MASTER flag set; Bonding slave -devices will have the SLAVE flag set. The ifconfig output does not -contain information on which slaves are associated with which masters. - - In the example below, the bond0 interface is the master -(MASTER) while eth0 and eth1 are slaves (SLAVE). Notice all slaves of -bond0 have the same MAC address (HWaddr) as bond0 for all modes except -TLB and ALB that require a unique MAC address for each slave. - -# /sbin/ifconfig -bond0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 - inet addr:XXX.XXX.XXX.YYY Bcast:XXX.XXX.XXX.255 Mask:255.255.252.0 - UP BROADCAST RUNNING MASTER MULTICAST MTU:1500 Metric:1 - RX packets:7224794 errors:0 dropped:0 overruns:0 frame:0 - TX packets:3286647 errors:1 dropped:0 overruns:1 carrier:0 - collisions:0 txqueuelen:0 - -eth0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 - UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1 - RX packets:3573025 errors:0 dropped:0 overruns:0 frame:0 - TX packets:1643167 errors:1 dropped:0 overruns:1 carrier:0 - collisions:0 txqueuelen:100 - Interrupt:10 Base address:0x1080 - -eth1 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 - UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1 - RX packets:3651769 errors:0 dropped:0 overruns:0 frame:0 - TX packets:1643480 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:100 - Interrupt:9 Base address:0x1400 - -5. Switch Configuration -======================= - - For this section, "switch" refers to whatever system the -bonded devices are directly connected to (i.e., where the other end of -the cable plugs into). This may be an actual dedicated switch device, -or it may be another regular system (e.g., another computer running -Linux), - - The active-backup, balance-tlb and balance-alb modes do not -require any specific configuration of the switch. - - The 802.3ad mode requires that the switch have the appropriate -ports configured as an 802.3ad aggregation. The precise method used -to configure this varies from switch to switch, but, for example, a -Cisco 3550 series switch requires that the appropriate ports first be -grouped together in a single etherchannel instance, then that -etherchannel is set to mode "lacp" to enable 802.3ad (instead of -standard EtherChannel). - - The balance-rr, balance-xor and broadcast modes generally -require that the switch have the appropriate ports grouped together. -The nomenclature for such a group differs between switches, it may be -called an "etherchannel" (as in the Cisco example, above), a "trunk -group" or some other similar variation. For these modes, each switch -will also have its own configuration options for the switch's transmit -policy to the bond. Typical choices include XOR of either the MAC or -IP addresses. The transmit policy of the two peers does not need to -match. For these three modes, the bonding mode really selects a -transmit policy for an EtherChannel group; all three will interoperate -with another EtherChannel group. - - -6. 802.1q VLAN Support -====================== - - It is possible to configure VLAN devices over a bond interface -using the 8021q driver. However, only packets coming from the 8021q -driver and passing through bonding will be tagged by default. Self -generated packets, for example, bonding's learning packets or ARP -packets generated by either ALB mode or the ARP monitor mechanism, are -tagged internally by bonding itself. As a result, bonding must -"learn" the VLAN IDs configured above it, and use those IDs to tag -self generated packets. - - For reasons of simplicity, and to support the use of adapters -that can do VLAN hardware acceleration offloading, the bonding -interface declares itself as fully hardware offloading capable, it gets -the add_vid/kill_vid notifications to gather the necessary -information, and it propagates those actions to the slaves. In case -of mixed adapter types, hardware accelerated tagged packets that -should go through an adapter that is not offloading capable are -"un-accelerated" by the bonding driver so the VLAN tag sits in the -regular location. - - VLAN interfaces *must* be added on top of a bonding interface -only after enslaving at least one slave. The bonding interface has a -hardware address of 00:00:00:00:00:00 until the first slave is added. -If the VLAN interface is created prior to the first enslavement, it -would pick up the all-zeroes hardware address. Once the first slave -is attached to the bond, the bond device itself will pick up the -slave's hardware address, which is then available for the VLAN device. - - Also, be aware that a similar problem can occur if all slaves -are released from a bond that still has one or more VLAN interfaces on -top of it. When a new slave is added, the bonding interface will -obtain its hardware address from the first slave, which might not -match the hardware address of the VLAN interfaces (which was -ultimately copied from an earlier slave). - - There are two methods to insure that the VLAN device operates -with the correct hardware address if all slaves are removed from a -bond interface: - - 1. Remove all VLAN interfaces then recreate them - - 2. Set the bonding interface's hardware address so that it -matches the hardware address of the VLAN interfaces. - - Note that changing a VLAN interface's HW address would set the -underlying device -- i.e. the bonding interface -- to promiscuous -mode, which might not be what you want. - - -7. Link Monitoring -================== - - The bonding driver at present supports two schemes for -monitoring a slave device's link state: the ARP monitor and the MII -monitor. - - At the present time, due to implementation restrictions in the -bonding driver itself, it is not possible to enable both ARP and MII -monitoring simultaneously. - -7.1 ARP Monitor Operation -------------------------- - - The ARP monitor operates as its name suggests: it sends ARP -queries to one or more designated peer systems on the network, and -uses the response as an indication that the link is operating. This -gives some assurance that traffic is actually flowing to and from one -or more peers on the local network. - - The ARP monitor relies on the device driver itself to verify -that traffic is flowing. In particular, the driver must keep up to -date the last receive time, dev->last_rx. Drivers that use NETIF_F_LLTX -flag must also update netdev_queue->trans_start. If they do not, then the -ARP monitor will immediately fail any slaves using that driver, and -those slaves will stay down. If networking monitoring (tcpdump, etc) -shows the ARP requests and replies on the network, then it may be that -your device driver is not updating last_rx and trans_start. - -7.2 Configuring Multiple ARP Targets ------------------------------------- - - While ARP monitoring can be done with just one target, it can -be useful in a High Availability setup to have several targets to -monitor. In the case of just one target, the target itself may go -down or have a problem making it unresponsive to ARP requests. Having -an additional target (or several) increases the reliability of the ARP -monitoring. - - Multiple ARP targets must be separated by commas as follows: - -# example options for ARP monitoring with three targets -alias bond0 bonding -options bond0 arp_interval=60 arp_ip_target=192.168.0.1,192.168.0.3,192.168.0.9 - - For just a single target the options would resemble: - -# example options for ARP monitoring with one target -alias bond0 bonding -options bond0 arp_interval=60 arp_ip_target=192.168.0.100 - - -7.3 MII Monitor Operation -------------------------- - - The MII monitor monitors only the carrier state of the local -network interface. It accomplishes this in one of three ways: by -depending upon the device driver to maintain its carrier state, by -querying the device's MII registers, or by making an ethtool query to -the device. - - If the use_carrier module parameter is 1 (the default value), -then the MII monitor will rely on the driver for carrier state -information (via the netif_carrier subsystem). As explained in the -use_carrier parameter information, above, if the MII monitor fails to -detect carrier loss on the device (e.g., when the cable is physically -disconnected), it may be that the driver does not support -netif_carrier. - - If use_carrier is 0, then the MII monitor will first query the -device's (via ioctl) MII registers and check the link state. If that -request fails (not just that it returns carrier down), then the MII -monitor will make an ethtool ETHOOL_GLINK request to attempt to obtain -the same information. If both methods fail (i.e., the driver either -does not support or had some error in processing both the MII register -and ethtool requests), then the MII monitor will assume the link is -up. - -8. Potential Sources of Trouble -=============================== - -8.1 Adventures in Routing -------------------------- - - When bonding is configured, it is important that the slave -devices not have routes that supersede routes of the master (or, -generally, not have routes at all). For example, suppose the bonding -device bond0 has two slaves, eth0 and eth1, and the routing table is -as follows: - -Kernel IP routing table -Destination Gateway Genmask Flags MSS Window irtt Iface -10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth0 -10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth1 -10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 bond0 -127.0.0.0 0.0.0.0 255.0.0.0 U 40 0 0 lo - - This routing configuration will likely still update the -receive/transmit times in the driver (needed by the ARP monitor), but -may bypass the bonding driver (because outgoing traffic to, in this -case, another host on network 10 would use eth0 or eth1 before bond0). - - The ARP monitor (and ARP itself) may become confused by this -configuration, because ARP requests (generated by the ARP monitor) -will be sent on one interface (bond0), but the corresponding reply -will arrive on a different interface (eth0). This reply looks to ARP -as an unsolicited ARP reply (because ARP matches replies on an -interface basis), and is discarded. The MII monitor is not affected -by the state of the routing table. - - The solution here is simply to insure that slaves do not have -routes of their own, and if for some reason they must, those routes do -not supersede routes of their master. This should generally be the -case, but unusual configurations or errant manual or automatic static -route additions may cause trouble. - -8.2 Ethernet Device Renaming ----------------------------- - - On systems with network configuration scripts that do not -associate physical devices directly with network interface names (so -that the same physical device always has the same "ethX" name), it may -be necessary to add some special logic to config files in -/etc/modprobe.d/. - - For example, given a modules.conf containing the following: - -alias bond0 bonding -options bond0 mode=some-mode miimon=50 -alias eth0 tg3 -alias eth1 tg3 -alias eth2 e1000 -alias eth3 e1000 - - If neither eth0 and eth1 are slaves to bond0, then when the -bond0 interface comes up, the devices may end up reordered. This -happens because bonding is loaded first, then its slave device's -drivers are loaded next. Since no other drivers have been loaded, -when the e1000 driver loads, it will receive eth0 and eth1 for its -devices, but the bonding configuration tries to enslave eth2 and eth3 -(which may later be assigned to the tg3 devices). - - Adding the following: - -add above bonding e1000 tg3 - - causes modprobe to load e1000 then tg3, in that order, when -bonding is loaded. This command is fully documented in the -modules.conf manual page. - - On systems utilizing modprobe an equivalent problem can occur. -In this case, the following can be added to config files in -/etc/modprobe.d/ as: - -softdep bonding pre: tg3 e1000 - - This will load tg3 and e1000 modules before loading the bonding one. -Full documentation on this can be found in the modprobe.d and modprobe -manual pages. - -8.3. Painfully Slow Or No Failed Link Detection By Miimon ---------------------------------------------------------- - - By default, bonding enables the use_carrier option, which -instructs bonding to trust the driver to maintain carrier state. - - As discussed in the options section, above, some drivers do -not support the netif_carrier_on/_off link state tracking system. -With use_carrier enabled, bonding will always see these links as up, -regardless of their actual state. - - Additionally, other drivers do support netif_carrier, but do -not maintain it in real time, e.g., only polling the link state at -some fixed interval. In this case, miimon will detect failures, but -only after some long period of time has expired. If it appears that -miimon is very slow in detecting link failures, try specifying -use_carrier=0 to see if that improves the failure detection time. If -it does, then it may be that the driver checks the carrier state at a -fixed interval, but does not cache the MII register values (so the -use_carrier=0 method of querying the registers directly works). If -use_carrier=0 does not improve the failover, then the driver may cache -the registers, or the problem may be elsewhere. - - Also, remember that miimon only checks for the device's -carrier state. It has no way to determine the state of devices on or -beyond other ports of a switch, or if a switch is refusing to pass -traffic while still maintaining carrier on. - -9. SNMP agents -=============== - - If running SNMP agents, the bonding driver should be loaded -before any network drivers participating in a bond. This requirement -is due to the interface index (ipAdEntIfIndex) being associated to -the first interface found with a given IP address. That is, there is -only one ipAdEntIfIndex for each IP address. For example, if eth0 and -eth1 are slaves of bond0 and the driver for eth0 is loaded before the -bonding driver, the interface for the IP address will be associated -with the eth0 interface. This configuration is shown below, the IP -address 192.168.1.1 has an interface index of 2 which indexes to eth0 -in the ifDescr table (ifDescr.2). - - interfaces.ifTable.ifEntry.ifDescr.1 = lo - interfaces.ifTable.ifEntry.ifDescr.2 = eth0 - interfaces.ifTable.ifEntry.ifDescr.3 = eth1 - interfaces.ifTable.ifEntry.ifDescr.4 = eth2 - interfaces.ifTable.ifEntry.ifDescr.5 = eth3 - interfaces.ifTable.ifEntry.ifDescr.6 = bond0 - ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.10.10.10 = 5 - ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.192.168.1.1 = 2 - ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.74.20.94 = 4 - ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.127.0.0.1 = 1 - - This problem is avoided by loading the bonding driver before -any network drivers participating in a bond. Below is an example of -loading the bonding driver first, the IP address 192.168.1.1 is -correctly associated with ifDescr.2. - - interfaces.ifTable.ifEntry.ifDescr.1 = lo - interfaces.ifTable.ifEntry.ifDescr.2 = bond0 - interfaces.ifTable.ifEntry.ifDescr.3 = eth0 - interfaces.ifTable.ifEntry.ifDescr.4 = eth1 - interfaces.ifTable.ifEntry.ifDescr.5 = eth2 - interfaces.ifTable.ifEntry.ifDescr.6 = eth3 - ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.10.10.10 = 6 - ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.192.168.1.1 = 2 - ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.74.20.94 = 5 - ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.127.0.0.1 = 1 - - While some distributions may not report the interface name in -ifDescr, the association between the IP address and IfIndex remains -and SNMP functions such as Interface_Scan_Next will report that -association. - -10. Promiscuous mode -==================== - - When running network monitoring tools, e.g., tcpdump, it is -common to enable promiscuous mode on the device, so that all traffic -is seen (instead of seeing only traffic destined for the local host). -The bonding driver handles promiscuous mode changes to the bonding -master device (e.g., bond0), and propagates the setting to the slave -devices. - - For the balance-rr, balance-xor, broadcast, and 802.3ad modes, -the promiscuous mode setting is propagated to all slaves. - - For the active-backup, balance-tlb and balance-alb modes, the -promiscuous mode setting is propagated only to the active slave. - - For balance-tlb mode, the active slave is the slave currently -receiving inbound traffic. - - For balance-alb mode, the active slave is the slave used as a -"primary." This slave is used for mode-specific control traffic, for -sending to peers that are unassigned or if the load is unbalanced. - - For the active-backup, balance-tlb and balance-alb modes, when -the active slave changes (e.g., due to a link failure), the -promiscuous setting will be propagated to the new active slave. - -11. Configuring Bonding for High Availability -============================================= - - High Availability refers to configurations that provide -maximum network availability by having redundant or backup devices, -links or switches between the host and the rest of the world. The -goal is to provide the maximum availability of network connectivity -(i.e., the network always works), even though other configurations -could provide higher throughput. - -11.1 High Availability in a Single Switch Topology --------------------------------------------------- - - If two hosts (or a host and a single switch) are directly -connected via multiple physical links, then there is no availability -penalty to optimizing for maximum bandwidth. In this case, there is -only one switch (or peer), so if it fails, there is no alternative -access to fail over to. Additionally, the bonding load balance modes -support link monitoring of their members, so if individual links fail, -the load will be rebalanced across the remaining devices. - - See Section 12, "Configuring Bonding for Maximum Throughput" -for information on configuring bonding with one peer device. - -11.2 High Availability in a Multiple Switch Topology ----------------------------------------------------- - - With multiple switches, the configuration of bonding and the -network changes dramatically. In multiple switch topologies, there is -a trade off between network availability and usable bandwidth. - - Below is a sample network, configured to maximize the -availability of the network: - - | | - |port3 port3| - +-----+----+ +-----+----+ - | |port2 ISL port2| | - | switch A +--------------------------+ switch B | - | | | | - +-----+----+ +-----++---+ - |port1 port1| - | +-------+ | - +-------------+ host1 +---------------+ - eth0 +-------+ eth1 - - In this configuration, there is a link between the two -switches (ISL, or inter switch link), and multiple ports connecting to -the outside world ("port3" on each switch). There is no technical -reason that this could not be extended to a third switch. - -11.2.1 HA Bonding Mode Selection for Multiple Switch Topology -------------------------------------------------------------- - - In a topology such as the example above, the active-backup and -broadcast modes are the only useful bonding modes when optimizing for -availability; the other modes require all links to terminate on the -same peer for them to behave rationally. - -active-backup: This is generally the preferred mode, particularly if - the switches have an ISL and play together well. If the - network configuration is such that one switch is specifically - a backup switch (e.g., has lower capacity, higher cost, etc), - then the primary option can be used to insure that the - preferred link is always used when it is available. - -broadcast: This mode is really a special purpose mode, and is suitable - only for very specific needs. For example, if the two - switches are not connected (no ISL), and the networks beyond - them are totally independent. In this case, if it is - necessary for some specific one-way traffic to reach both - independent networks, then the broadcast mode may be suitable. - -11.2.2 HA Link Monitoring Selection for Multiple Switch Topology ----------------------------------------------------------------- - - The choice of link monitoring ultimately depends upon your -switch. If the switch can reliably fail ports in response to other -failures, then either the MII or ARP monitors should work. For -example, in the above example, if the "port3" link fails at the remote -end, the MII monitor has no direct means to detect this. The ARP -monitor could be configured with a target at the remote end of port3, -thus detecting that failure without switch support. - - In general, however, in a multiple switch topology, the ARP -monitor can provide a higher level of reliability in detecting end to -end connectivity failures (which may be caused by the failure of any -individual component to pass traffic for any reason). Additionally, -the ARP monitor should be configured with multiple targets (at least -one for each switch in the network). This will insure that, -regardless of which switch is active, the ARP monitor has a suitable -target to query. - - Note, also, that of late many switches now support a functionality -generally referred to as "trunk failover." This is a feature of the -switch that causes the link state of a particular switch port to be set -down (or up) when the state of another switch port goes down (or up). -Its purpose is to propagate link failures from logically "exterior" ports -to the logically "interior" ports that bonding is able to monitor via -miimon. Availability and configuration for trunk failover varies by -switch, but this can be a viable alternative to the ARP monitor when using -suitable switches. - -12. Configuring Bonding for Maximum Throughput -============================================== - -12.1 Maximizing Throughput in a Single Switch Topology ------------------------------------------------------- - - In a single switch configuration, the best method to maximize -throughput depends upon the application and network environment. The -various load balancing modes each have strengths and weaknesses in -different environments, as detailed below. - - For this discussion, we will break down the topologies into -two categories. Depending upon the destination of most traffic, we -categorize them into either "gatewayed" or "local" configurations. - - In a gatewayed configuration, the "switch" is acting primarily -as a router, and the majority of traffic passes through this router to -other networks. An example would be the following: - - - +----------+ +----------+ - | |eth0 port1| | to other networks - | Host A +---------------------+ router +-------------------> - | +---------------------+ | Hosts B and C are out - | |eth1 port2| | here somewhere - +----------+ +----------+ - - The router may be a dedicated router device, or another host -acting as a gateway. For our discussion, the important point is that -the majority of traffic from Host A will pass through the router to -some other network before reaching its final destination. - - In a gatewayed network configuration, although Host A may -communicate with many other systems, all of its traffic will be sent -and received via one other peer on the local network, the router. - - Note that the case of two systems connected directly via -multiple physical links is, for purposes of configuring bonding, the -same as a gatewayed configuration. In that case, it happens that all -traffic is destined for the "gateway" itself, not some other network -beyond the gateway. - - In a local configuration, the "switch" is acting primarily as -a switch, and the majority of traffic passes through this switch to -reach other stations on the same network. An example would be the -following: - - +----------+ +----------+ +--------+ - | |eth0 port1| +-------+ Host B | - | Host A +------------+ switch |port3 +--------+ - | +------------+ | +--------+ - | |eth1 port2| +------------------+ Host C | - +----------+ +----------+port4 +--------+ - - - Again, the switch may be a dedicated switch device, or another -host acting as a gateway. For our discussion, the important point is -that the majority of traffic from Host A is destined for other hosts -on the same local network (Hosts B and C in the above example). - - In summary, in a gatewayed configuration, traffic to and from -the bonded device will be to the same MAC level peer on the network -(the gateway itself, i.e., the router), regardless of its final -destination. In a local configuration, traffic flows directly to and -from the final destinations, thus, each destination (Host B, Host C) -will be addressed directly by their individual MAC addresses. - - This distinction between a gatewayed and a local network -configuration is important because many of the load balancing modes -available use the MAC addresses of the local network source and -destination to make load balancing decisions. The behavior of each -mode is described below. - - -12.1.1 MT Bonding Mode Selection for Single Switch Topology ------------------------------------------------------------ - - This configuration is the easiest to set up and to understand, -although you will have to decide which bonding mode best suits your -needs. The trade offs for each mode are detailed below: - -balance-rr: This mode is the only mode that will permit a single - TCP/IP connection to stripe traffic across multiple - interfaces. It is therefore the only mode that will allow a - single TCP/IP stream to utilize more than one interface's - worth of throughput. This comes at a cost, however: the - striping generally results in peer systems receiving packets out - of order, causing TCP/IP's congestion control system to kick - in, often by retransmitting segments. - - It is possible to adjust TCP/IP's congestion limits by - altering the net.ipv4.tcp_reordering sysctl parameter. The - usual default value is 3. But keep in mind TCP stack is able - to automatically increase this when it detects reorders. - - Note that the fraction of packets that will be delivered out of - order is highly variable, and is unlikely to be zero. The level - of reordering depends upon a variety of factors, including the - networking interfaces, the switch, and the topology of the - configuration. Speaking in general terms, higher speed network - cards produce more reordering (due to factors such as packet - coalescing), and a "many to many" topology will reorder at a - higher rate than a "many slow to one fast" configuration. - - Many switches do not support any modes that stripe traffic - (instead choosing a port based upon IP or MAC level addresses); - for those devices, traffic for a particular connection flowing - through the switch to a balance-rr bond will not utilize greater - than one interface's worth of bandwidth. - - If you are utilizing protocols other than TCP/IP, UDP for - example, and your application can tolerate out of order - delivery, then this mode can allow for single stream datagram - performance that scales near linearly as interfaces are added - to the bond. - - This mode requires the switch to have the appropriate ports - configured for "etherchannel" or "trunking." - -active-backup: There is not much advantage in this network topology to - the active-backup mode, as the inactive backup devices are all - connected to the same peer as the primary. In this case, a - load balancing mode (with link monitoring) will provide the - same level of network availability, but with increased - available bandwidth. On the plus side, active-backup mode - does not require any configuration of the switch, so it may - have value if the hardware available does not support any of - the load balance modes. - -balance-xor: This mode will limit traffic such that packets destined - for specific peers will always be sent over the same - interface. Since the destination is determined by the MAC - addresses involved, this mode works best in a "local" network - configuration (as described above), with destinations all on - the same local network. This mode is likely to be suboptimal - if all your traffic is passed through a single router (i.e., a - "gatewayed" network configuration, as described above). - - As with balance-rr, the switch ports need to be configured for - "etherchannel" or "trunking." - -broadcast: Like active-backup, there is not much advantage to this - mode in this type of network topology. - -802.3ad: This mode can be a good choice for this type of network - topology. The 802.3ad mode is an IEEE standard, so all peers - that implement 802.3ad should interoperate well. The 802.3ad - protocol includes automatic configuration of the aggregates, - so minimal manual configuration of the switch is needed - (typically only to designate that some set of devices is - available for 802.3ad). The 802.3ad standard also mandates - that frames be delivered in order (within certain limits), so - in general single connections will not see misordering of - packets. The 802.3ad mode does have some drawbacks: the - standard mandates that all devices in the aggregate operate at - the same speed and duplex. Also, as with all bonding load - balance modes other than balance-rr, no single connection will - be able to utilize more than a single interface's worth of - bandwidth. - - Additionally, the linux bonding 802.3ad implementation - distributes traffic by peer (using an XOR of MAC addresses - and packet type ID), so in a "gatewayed" configuration, all - outgoing traffic will generally use the same device. Incoming - traffic may also end up on a single device, but that is - dependent upon the balancing policy of the peer's 802.3ad - implementation. In a "local" configuration, traffic will be - distributed across the devices in the bond. - - Finally, the 802.3ad mode mandates the use of the MII monitor, - therefore, the ARP monitor is not available in this mode. - -balance-tlb: The balance-tlb mode balances outgoing traffic by peer. - Since the balancing is done according to MAC address, in a - "gatewayed" configuration (as described above), this mode will - send all traffic across a single device. However, in a - "local" network configuration, this mode balances multiple - local network peers across devices in a vaguely intelligent - manner (not a simple XOR as in balance-xor or 802.3ad mode), - so that mathematically unlucky MAC addresses (i.e., ones that - XOR to the same value) will not all "bunch up" on a single - interface. - - Unlike 802.3ad, interfaces may be of differing speeds, and no - special switch configuration is required. On the down side, - in this mode all incoming traffic arrives over a single - interface, this mode requires certain ethtool support in the - network device driver of the slave interfaces, and the ARP - monitor is not available. - -balance-alb: This mode is everything that balance-tlb is, and more. - It has all of the features (and restrictions) of balance-tlb, - and will also balance incoming traffic from local network - peers (as described in the Bonding Module Options section, - above). - - The only additional down side to this mode is that the network - device driver must support changing the hardware address while - the device is open. - -12.1.2 MT Link Monitoring for Single Switch Topology ----------------------------------------------------- - - The choice of link monitoring may largely depend upon which -mode you choose to use. The more advanced load balancing modes do not -support the use of the ARP monitor, and are thus restricted to using -the MII monitor (which does not provide as high a level of end to end -assurance as the ARP monitor). - -12.2 Maximum Throughput in a Multiple Switch Topology ------------------------------------------------------ - - Multiple switches may be utilized to optimize for throughput -when they are configured in parallel as part of an isolated network -between two or more systems, for example: - - +-----------+ - | Host A | - +-+---+---+-+ - | | | - +--------+ | +---------+ - | | | - +------+---+ +-----+----+ +-----+----+ - | Switch A | | Switch B | | Switch C | - +------+---+ +-----+----+ +-----+----+ - | | | - +--------+ | +---------+ - | | | - +-+---+---+-+ - | Host B | - +-----------+ - - In this configuration, the switches are isolated from one -another. One reason to employ a topology such as this is for an -isolated network with many hosts (a cluster configured for high -performance, for example), using multiple smaller switches can be more -cost effective than a single larger switch, e.g., on a network with 24 -hosts, three 24 port switches can be significantly less expensive than -a single 72 port switch. - - If access beyond the network is required, an individual host -can be equipped with an additional network device connected to an -external network; this host then additionally acts as a gateway. - -12.2.1 MT Bonding Mode Selection for Multiple Switch Topology -------------------------------------------------------------- - - In actual practice, the bonding mode typically employed in -configurations of this type is balance-rr. Historically, in this -network configuration, the usual caveats about out of order packet -delivery are mitigated by the use of network adapters that do not do -any kind of packet coalescing (via the use of NAPI, or because the -device itself does not generate interrupts until some number of -packets has arrived). When employed in this fashion, the balance-rr -mode allows individual connections between two hosts to effectively -utilize greater than one interface's bandwidth. - -12.2.2 MT Link Monitoring for Multiple Switch Topology ------------------------------------------------------- - - Again, in actual practice, the MII monitor is most often used -in this configuration, as performance is given preference over -availability. The ARP monitor will function in this topology, but its -advantages over the MII monitor are mitigated by the volume of probes -needed as the number of systems involved grows (remember that each -host in the network is configured with bonding). - -13. Switch Behavior Issues -========================== - -13.1 Link Establishment and Failover Delays -------------------------------------------- - - Some switches exhibit undesirable behavior with regard to the -timing of link up and down reporting by the switch. - - First, when a link comes up, some switches may indicate that -the link is up (carrier available), but not pass traffic over the -interface for some period of time. This delay is typically due to -some type of autonegotiation or routing protocol, but may also occur -during switch initialization (e.g., during recovery after a switch -failure). If you find this to be a problem, specify an appropriate -value to the updelay bonding module option to delay the use of the -relevant interface(s). - - Second, some switches may "bounce" the link state one or more -times while a link is changing state. This occurs most commonly while -the switch is initializing. Again, an appropriate updelay value may -help. - - Note that when a bonding interface has no active links, the -driver will immediately reuse the first link that goes up, even if the -updelay parameter has been specified (the updelay is ignored in this -case). If there are slave interfaces waiting for the updelay timeout -to expire, the interface that first went into that state will be -immediately reused. This reduces down time of the network if the -value of updelay has been overestimated, and since this occurs only in -cases with no connectivity, there is no additional penalty for -ignoring the updelay. - - In addition to the concerns about switch timings, if your -switches take a long time to go into backup mode, it may be desirable -to not activate a backup interface immediately after a link goes down. -Failover may be delayed via the downdelay bonding module option. - -13.2 Duplicated Incoming Packets --------------------------------- - - NOTE: Starting with version 3.0.2, the bonding driver has logic to -suppress duplicate packets, which should largely eliminate this problem. -The following description is kept for reference. - - It is not uncommon to observe a short burst of duplicated -traffic when the bonding device is first used, or after it has been -idle for some period of time. This is most easily observed by issuing -a "ping" to some other host on the network, and noticing that the -output from ping flags duplicates (typically one per slave). - - For example, on a bond in active-backup mode with five slaves -all connected to one switch, the output may appear as follows: - -# ping -n 10.0.4.2 -PING 10.0.4.2 (10.0.4.2) from 10.0.3.10 : 56(84) bytes of data. -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.7 ms -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) -64 bytes from 10.0.4.2: icmp_seq=2 ttl=64 time=0.216 ms -64 bytes from 10.0.4.2: icmp_seq=3 ttl=64 time=0.267 ms -64 bytes from 10.0.4.2: icmp_seq=4 ttl=64 time=0.222 ms - - This is not due to an error in the bonding driver, rather, it -is a side effect of how many switches update their MAC forwarding -tables. Initially, the switch does not associate the MAC address in -the packet with a particular switch port, and so it may send the -traffic to all ports until its MAC forwarding table is updated. Since -the interfaces attached to the bond may occupy multiple ports on a -single switch, when the switch (temporarily) floods the traffic to all -ports, the bond device receives multiple copies of the same packet -(one per slave device). - - The duplicated packet behavior is switch dependent, some -switches exhibit this, and some do not. On switches that display this -behavior, it can be induced by clearing the MAC forwarding table (on -most Cisco switches, the privileged command "clear mac address-table -dynamic" will accomplish this). - -14. Hardware Specific Considerations -==================================== - - This section contains additional information for configuring -bonding on specific hardware platforms, or for interfacing bonding -with particular switches or other devices. - -14.1 IBM BladeCenter --------------------- - - This applies to the JS20 and similar systems. - - On the JS20 blades, the bonding driver supports only -balance-rr, active-backup, balance-tlb and balance-alb modes. This is -largely due to the network topology inside the BladeCenter, detailed -below. - -JS20 network adapter information --------------------------------- - - All JS20s come with two Broadcom Gigabit Ethernet ports -integrated on the planar (that's "motherboard" in IBM-speak). In the -BladeCenter chassis, the eth0 port of all JS20 blades is hard wired to -I/O Module #1; similarly, all eth1 ports are wired to I/O Module #2. -An add-on Broadcom daughter card can be installed on a JS20 to provide -two more Gigabit Ethernet ports. These ports, eth2 and eth3, are -wired to I/O Modules 3 and 4, respectively. - - Each I/O Module may contain either a switch or a passthrough -module (which allows ports to be directly connected to an external -switch). Some bonding modes require a specific BladeCenter internal -network topology in order to function; these are detailed below. - - Additional BladeCenter-specific networking information can be -found in two IBM Redbooks (www.ibm.com/redbooks): - -"IBM eServer BladeCenter Networking Options" -"IBM eServer BladeCenter Layer 2-7 Network Switching" - -BladeCenter networking configuration ------------------------------------- - - Because a BladeCenter can be configured in a very large number -of ways, this discussion will be confined to describing basic -configurations. - - Normally, Ethernet Switch Modules (ESMs) are used in I/O -modules 1 and 2. In this configuration, the eth0 and eth1 ports of a -JS20 will be connected to different internal switches (in the -respective I/O modules). - - A passthrough module (OPM or CPM, optical or copper, -passthrough module) connects the I/O module directly to an external -switch. By using PMs in I/O module #1 and #2, the eth0 and eth1 -interfaces of a JS20 can be redirected to the outside world and -connected to a common external switch. - - Depending upon the mix of ESMs and PMs, the network will -appear to bonding as either a single switch topology (all PMs) or as a -multiple switch topology (one or more ESMs, zero or more PMs). It is -also possible to connect ESMs together, resulting in a configuration -much like the example in "High Availability in a Multiple Switch -Topology," above. - -Requirements for specific modes -------------------------------- - - The balance-rr mode requires the use of passthrough modules -for devices in the bond, all connected to an common external switch. -That switch must be configured for "etherchannel" or "trunking" on the -appropriate ports, as is usual for balance-rr. - - The balance-alb and balance-tlb modes will function with -either switch modules or passthrough modules (or a mix). The only -specific requirement for these modes is that all network interfaces -must be able to reach all destinations for traffic sent over the -bonding device (i.e., the network must converge at some point outside -the BladeCenter). - - The active-backup mode has no additional requirements. - -Link monitoring issues ----------------------- - - When an Ethernet Switch Module is in place, only the ARP -monitor will reliably detect link loss to an external switch. This is -nothing unusual, but examination of the BladeCenter cabinet would -suggest that the "external" network ports are the ethernet ports for -the system, when it fact there is a switch between these "external" -ports and the devices on the JS20 system itself. The MII monitor is -only able to detect link failures between the ESM and the JS20 system. - - When a passthrough module is in place, the MII monitor does -detect failures to the "external" port, which is then directly -connected to the JS20 system. - -Other concerns --------------- - - The Serial Over LAN (SoL) link is established over the primary -ethernet (eth0) only, therefore, any loss of link to eth0 will result -in losing your SoL connection. It will not fail over with other -network traffic, as the SoL system is beyond the control of the -bonding driver. - - It may be desirable to disable spanning tree on the switch -(either the internal Ethernet Switch Module, or an external switch) to -avoid fail-over delay issues when using bonding. - - -15. Frequently Asked Questions -============================== - -1. Is it SMP safe? - - Yes. The old 2.0.xx channel bonding patch was not SMP safe. -The new driver was designed to be SMP safe from the start. - -2. What type of cards will work with it? - - Any Ethernet type cards (you can even mix cards - a Intel -EtherExpress PRO/100 and a 3com 3c905b, for example). For most modes, -devices need not be of the same speed. - - Starting with version 3.2.1, bonding also supports Infiniband -slaves in active-backup mode. - -3. How many bonding devices can I have? - - There is no limit. - -4. How many slaves can a bonding device have? - - This is limited only by the number of network interfaces Linux -supports and/or the number of network cards you can place in your -system. - -5. What happens when a slave link dies? - - If link monitoring is enabled, then the failing device will be -disabled. The active-backup mode will fail over to a backup link, and -other modes will ignore the failed link. The link will continue to be -monitored, and should it recover, it will rejoin the bond (in whatever -manner is appropriate for the mode). See the sections on High -Availability and the documentation for each mode for additional -information. - - Link monitoring can be enabled via either the miimon or -arp_interval parameters (described in the module parameters section, -above). In general, miimon monitors the carrier state as sensed by -the underlying network device, and the arp monitor (arp_interval) -monitors connectivity to another host on the local network. - - If no link monitoring is configured, the bonding driver will -be unable to detect link failures, and will assume that all links are -always available. This will likely result in lost packets, and a -resulting degradation of performance. The precise performance loss -depends upon the bonding mode and network configuration. - -6. Can bonding be used for High Availability? - - Yes. See the section on High Availability for details. - -7. Which switches/systems does it work with? - - The full answer to this depends upon the desired mode. - - In the basic balance modes (balance-rr and balance-xor), it -works with any system that supports etherchannel (also called -trunking). Most managed switches currently available have such -support, and many unmanaged switches as well. - - The advanced balance modes (balance-tlb and balance-alb) do -not have special switch requirements, but do need device drivers that -support specific features (described in the appropriate section under -module parameters, above). - - In 802.3ad mode, it works with systems that support IEEE -802.3ad Dynamic Link Aggregation. Most managed and many unmanaged -switches currently available support 802.3ad. - - The active-backup mode should work with any Layer-II switch. - -8. Where does a bonding device get its MAC address from? - - When using slave devices that have fixed MAC addresses, or when -the fail_over_mac option is enabled, the bonding device's MAC address is -the MAC address of the active slave. - - For other configurations, if not explicitly configured (with -ifconfig or ip link), the MAC address of the bonding device is taken from -its first slave device. This MAC address is then passed to all following -slaves and remains persistent (even if the first slave is removed) until -the bonding device is brought down or reconfigured. - - If you wish to change the MAC address, you can set it with -ifconfig or ip link: - -# ifconfig bond0 hw ether 00:11:22:33:44:55 - -# ip link set bond0 address 66:77:88:99:aa:bb - - The MAC address can be also changed by bringing down/up the -device and then changing its slaves (or their order): - -# ifconfig bond0 down ; modprobe -r bonding -# ifconfig bond0 .... up -# ifenslave bond0 eth... - - This method will automatically take the address from the next -slave that is added. - - To restore your slaves' MAC addresses, you need to detach them -from the bond (`ifenslave -d bond0 eth0'). The bonding driver will -then restore the MAC addresses that the slaves had before they were -enslaved. - -16. Resources and Links -======================= - - The latest version of the bonding driver can be found in the latest -version of the linux kernel, found on http://kernel.org - - The latest version of this document can be found in the latest kernel -source (named Documentation/networking/bonding.txt). - - Discussions regarding the usage of the bonding driver take place on the -bonding-devel mailing list, hosted at sourceforge.net. If you have questions or -problems, post them to the list. The list address is: - -bonding-devel@lists.sourceforge.net - - The administrative interface (to subscribe or unsubscribe) can -be found at: - -https://lists.sourceforge.net/lists/listinfo/bonding-devel - - Discussions regarding the development of the bonding driver take place -on the main Linux network mailing list, hosted at vger.kernel.org. The list -address is: - -netdev@vger.kernel.org - - The administrative interface (to subscribe or unsubscribe) can -be found at: - -http://vger.kernel.org/vger-lists.html#netdev - -Donald Becker's Ethernet Drivers and diag programs may be found at : - - http://web.archive.org/web/*/http://www.scyld.com/network/ - -You will also find a lot of information regarding Ethernet, NWay, MII, -etc. at www.scyld.com. - --- END -- diff --git a/Documentation/networking/device_drivers/intel/e100.rst b/Documentation/networking/device_drivers/intel/e100.rst index caf023cc88de..3ac21e7119a7 100644 --- a/Documentation/networking/device_drivers/intel/e100.rst +++ b/Documentation/networking/device_drivers/intel/e100.rst @@ -33,7 +33,7 @@ The following features are now available in supported kernels: - SNMP Channel Bonding documentation can be found in the Linux kernel source: -/Documentation/networking/bonding.txt +/Documentation/networking/bonding.rst Identifying Your Adapter diff --git a/Documentation/networking/device_drivers/intel/ixgb.rst b/Documentation/networking/device_drivers/intel/ixgb.rst index 945018207a92..ab624f1a44a8 100644 --- a/Documentation/networking/device_drivers/intel/ixgb.rst +++ b/Documentation/networking/device_drivers/intel/ixgb.rst @@ -37,7 +37,7 @@ The following features are available in this kernel: - SNMP Channel Bonding documentation can be found in the Linux kernel source: -/Documentation/networking/bonding.txt +/Documentation/networking/bonding.rst The driver information previously displayed in the /proc filesystem is not supported in this release. Alternatively, you can use ethtool (version 1.6 diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index fbf845fbaff7..22b872834ef0 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -44,6 +44,7 @@ Contents: atm ax25 baycom + bonding .. only:: subproject and html diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index b103fbdd0f68..4ab6d343fd86 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -50,7 +50,7 @@ config BONDING The driver supports multiple bonding modes to allow for both high performance and high availability operation. - Refer to for more + Refer to for more information. To compile this driver as a module, choose M here: the module -- cgit v1.2.3-59-g8ed1b From 92f06f4226fd9bdd6fbbd2e8b84601fc14b5855e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:25 +0200 Subject: docs: networking: convert cdc_mbim.txt to ReST - add SPDX header; - mark code blocks and literals as such; - use :field: markup; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/cdc_mbim.rst | 355 ++++++++++++++++++++++++++++++++++ Documentation/networking/cdc_mbim.txt | 339 -------------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 356 insertions(+), 339 deletions(-) create mode 100644 Documentation/networking/cdc_mbim.rst delete mode 100644 Documentation/networking/cdc_mbim.txt diff --git a/Documentation/networking/cdc_mbim.rst b/Documentation/networking/cdc_mbim.rst new file mode 100644 index 000000000000..0048409c06b4 --- /dev/null +++ b/Documentation/networking/cdc_mbim.rst @@ -0,0 +1,355 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================================================== +cdc_mbim - Driver for CDC MBIM Mobile Broadband modems +====================================================== + +The cdc_mbim driver supports USB devices conforming to the "Universal +Serial Bus Communications Class Subclass Specification for Mobile +Broadband Interface Model" [1], which is a further development of +"Universal Serial Bus Communications Class Subclass Specifications for +Network Control Model Devices" [2] optimized for Mobile Broadband +devices, aka "3G/LTE modems". + + +Command Line Parameters +======================= + +The cdc_mbim driver has no parameters of its own. But the probing +behaviour for NCM 1.0 backwards compatible MBIM functions (an +"NCM/MBIM function" as defined in section 3.2 of [1]) is affected +by a cdc_ncm driver parameter: + +prefer_mbim +----------- +:Type: Boolean +:Valid Range: N/Y (0-1) +:Default Value: Y (MBIM is preferred) + +This parameter sets the system policy for NCM/MBIM functions. Such +functions will be handled by either the cdc_ncm driver or the cdc_mbim +driver depending on the prefer_mbim setting. Setting prefer_mbim=N +makes the cdc_mbim driver ignore these functions and lets the cdc_ncm +driver handle them instead. + +The parameter is writable, and can be changed at any time. A manual +unbind/bind is required to make the change effective for NCM/MBIM +functions bound to the "wrong" driver + + +Basic usage +=========== + +MBIM functions are inactive when unmanaged. The cdc_mbim driver only +provides a userspace interface to the MBIM control channel, and will +not participate in the management of the function. This implies that a +userspace MBIM management application always is required to enable a +MBIM function. + +Such userspace applications includes, but are not limited to: + + - mbimcli (included with the libmbim [3] library), and + - ModemManager [4] + +Establishing a MBIM IP session reequires at least these actions by the +management application: + + - open the control channel + - configure network connection settings + - connect to network + - configure IP interface + +Management application development +---------------------------------- +The driver <-> userspace interfaces are described below. The MBIM +control channel protocol is described in [1]. + + +MBIM control channel userspace ABI +================================== + +/dev/cdc-wdmX character device +------------------------------ +The driver creates a two-way pipe to the MBIM function control channel +using the cdc-wdm driver as a subdriver. The userspace end of the +control channel pipe is a /dev/cdc-wdmX character device. + +The cdc_mbim driver does not process or police messages on the control +channel. The channel is fully delegated to the userspace management +application. It is therefore up to this application to ensure that it +complies with all the control channel requirements in [1]. + +The cdc-wdmX device is created as a child of the MBIM control +interface USB device. The character device associated with a specific +MBIM function can be looked up using sysfs. For example:: + + bjorn@nemi:~$ ls /sys/bus/usb/drivers/cdc_mbim/2-4:2.12/usbmisc + cdc-wdm0 + + bjorn@nemi:~$ grep . /sys/bus/usb/drivers/cdc_mbim/2-4:2.12/usbmisc/cdc-wdm0/dev + 180:0 + + +USB configuration descriptors +----------------------------- +The wMaxControlMessage field of the CDC MBIM functional descriptor +limits the maximum control message size. The managament application is +responsible for negotiating a control message size complying with the +requirements in section 9.3.1 of [1], taking this descriptor field +into consideration. + +The userspace application can access the CDC MBIM functional +descriptor of a MBIM function using either of the two USB +configuration descriptor kernel interfaces described in [6] or [7]. + +See also the ioctl documentation below. + + +Fragmentation +------------- +The userspace application is responsible for all control message +fragmentation and defragmentaion, as described in section 9.5 of [1]. + + +/dev/cdc-wdmX write() +--------------------- +The MBIM control messages from the management application *must not* +exceed the negotiated control message size. + + +/dev/cdc-wdmX read() +-------------------- +The management application *must* accept control messages of up the +negotiated control message size. + + +/dev/cdc-wdmX ioctl() +--------------------- +IOCTL_WDM_MAX_COMMAND: Get Maximum Command Size +This ioctl returns the wMaxControlMessage field of the CDC MBIM +functional descriptor for MBIM devices. This is intended as a +convenience, eliminating the need to parse the USB descriptors from +userspace. + +:: + + #include + #include + #include + #include + #include + int main() + { + __u16 max; + int fd = open("/dev/cdc-wdm0", O_RDWR); + if (!ioctl(fd, IOCTL_WDM_MAX_COMMAND, &max)) + printf("wMaxControlMessage is %d\n", max); + } + + +Custom device services +---------------------- +The MBIM specification allows vendors to freely define additional +services. This is fully supported by the cdc_mbim driver. + +Support for new MBIM services, including vendor specified services, is +implemented entirely in userspace, like the rest of the MBIM control +protocol + +New services should be registered in the MBIM Registry [5]. + + + +MBIM data channel userspace ABI +=============================== + +wwanY network device +-------------------- +The cdc_mbim driver represents the MBIM data channel as a single +network device of the "wwan" type. This network device is initially +mapped to MBIM IP session 0. + + +Multiplexed IP sessions (IPS) +----------------------------- +MBIM allows multiplexing up to 256 IP sessions over a single USB data +channel. The cdc_mbim driver models such IP sessions as 802.1q VLAN +subdevices of the master wwanY device, mapping MBIM IP session Z to +VLAN ID Z for all values of Z greater than 0. + +The device maximum Z is given in the MBIM_DEVICE_CAPS_INFO structure +described in section 10.5.1 of [1]. + +The userspace management application is responsible for adding new +VLAN links prior to establishing MBIM IP sessions where the SessionId +is greater than 0. These links can be added by using the normal VLAN +kernel interfaces, either ioctl or netlink. + +For example, adding a link for a MBIM IP session with SessionId 3:: + + ip link add link wwan0 name wwan0.3 type vlan id 3 + +The driver will automatically map the "wwan0.3" network device to MBIM +IP session 3. + + +Device Service Streams (DSS) +---------------------------- +MBIM also allows up to 256 non-IP data streams to be multiplexed over +the same shared USB data channel. The cdc_mbim driver models these +sessions as another set of 802.1q VLAN subdevices of the master wwanY +device, mapping MBIM DSS session A to VLAN ID (256 + A) for all values +of A. + +The device maximum A is given in the MBIM_DEVICE_SERVICES_INFO +structure described in section 10.5.29 of [1]. + +The DSS VLAN subdevices are used as a practical interface between the +shared MBIM data channel and a MBIM DSS aware userspace application. +It is not intended to be presented as-is to an end user. The +assumption is that a userspace application initiating a DSS session +also takes care of the necessary framing of the DSS data, presenting +the stream to the end user in an appropriate way for the stream type. + +The network device ABI requires a dummy ethernet header for every DSS +data frame being transported. The contents of this header is +arbitrary, with the following exceptions: + + - TX frames using an IP protocol (0x0800 or 0x86dd) will be dropped + - RX frames will have the protocol field set to ETH_P_802_3 (but will + not be properly formatted 802.3 frames) + - RX frames will have the destination address set to the hardware + address of the master device + +The DSS supporting userspace management application is responsible for +adding the dummy ethernet header on TX and stripping it on RX. + +This is a simple example using tools commonly available, exporting +DssSessionId 5 as a pty character device pointed to by a /dev/nmea +symlink:: + + ip link add link wwan0 name wwan0.dss5 type vlan id 261 + ip link set dev wwan0.dss5 up + socat INTERFACE:wwan0.dss5,type=2 PTY:,echo=0,link=/dev/nmea + +This is only an example, most suitable for testing out a DSS +service. Userspace applications supporting specific MBIM DSS services +are expected to use the tools and programming interfaces required by +that service. + +Note that adding VLAN links for DSS sessions is entirely optional. A +management application may instead choose to bind a packet socket +directly to the master network device, using the received VLAN tags to +map frames to the correct DSS session and adding 18 byte VLAN ethernet +headers with the appropriate tag on TX. In this case using a socket +filter is recommended, matching only the DSS VLAN subset. This avoid +unnecessary copying of unrelated IP session data to userspace. For +example:: + + static struct sock_filter dssfilter[] = { + /* use special negative offsets to get VLAN tag */ + BPF_STMT(BPF_LD|BPF_B|BPF_ABS, SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 1, 0, 6), /* true */ + + /* verify DSS VLAN range */ + BPF_STMT(BPF_LD|BPF_H|BPF_ABS, SKF_AD_OFF + SKF_AD_VLAN_TAG), + BPF_JUMP(BPF_JMP|BPF_JGE|BPF_K, 256, 0, 4), /* 256 is first DSS VLAN */ + BPF_JUMP(BPF_JMP|BPF_JGE|BPF_K, 512, 3, 0), /* 511 is last DSS VLAN */ + + /* verify ethertype */ + BPF_STMT(BPF_LD|BPF_H|BPF_ABS, 2 * ETH_ALEN), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, ETH_P_802_3, 0, 1), + + BPF_STMT(BPF_RET|BPF_K, (u_int)-1), /* accept */ + BPF_STMT(BPF_RET|BPF_K, 0), /* ignore */ + }; + + + +Tagged IP session 0 VLAN +------------------------ +As described above, MBIM IP session 0 is treated as special by the +driver. It is initially mapped to untagged frames on the wwanY +network device. + +This mapping implies a few restrictions on multiplexed IPS and DSS +sessions, which may not always be practical: + + - no IPS or DSS session can use a frame size greater than the MTU on + IP session 0 + - no IPS or DSS session can be in the up state unless the network + device representing IP session 0 also is up + +These problems can be avoided by optionally making the driver map IP +session 0 to a VLAN subdevice, similar to all other IP sessions. This +behaviour is triggered by adding a VLAN link for the magic VLAN ID +4094. The driver will then immediately start mapping MBIM IP session +0 to this VLAN, and will drop untagged frames on the master wwanY +device. + +Tip: It might be less confusing to the end user to name this VLAN +subdevice after the MBIM SessionID instead of the VLAN ID. For +example:: + + ip link add link wwan0 name wwan0.0 type vlan id 4094 + + +VLAN mapping +------------ + +Summarizing the cdc_mbim driver mapping described above, we have this +relationship between VLAN tags on the wwanY network device and MBIM +sessions on the shared USB data channel:: + + VLAN ID MBIM type MBIM SessionID Notes + --------------------------------------------------------- + untagged IPS 0 a) + 1 - 255 IPS 1 - 255 + 256 - 511 DSS 0 - 255 + 512 - 4093 b) + 4094 IPS 0 c) + + a) if no VLAN ID 4094 link exists, else dropped + b) unsupported VLAN range, unconditionally dropped + c) if a VLAN ID 4094 link exists, else dropped + + + + +References +========== + + 1) USB Implementers Forum, Inc. - "Universal Serial Bus + Communications Class Subclass Specification for Mobile Broadband + Interface Model", Revision 1.0 (Errata 1), May 1, 2013 + + - http://www.usb.org/developers/docs/devclass_docs/ + + 2) USB Implementers Forum, Inc. - "Universal Serial Bus + Communications Class Subclass Specifications for Network Control + Model Devices", Revision 1.0 (Errata 1), November 24, 2010 + + - http://www.usb.org/developers/docs/devclass_docs/ + + 3) libmbim - "a glib-based library for talking to WWAN modems and + devices which speak the Mobile Interface Broadband Model (MBIM) + protocol" + + - http://www.freedesktop.org/wiki/Software/libmbim/ + + 4) ModemManager - "a DBus-activated daemon which controls mobile + broadband (2G/3G/4G) devices and connections" + + - http://www.freedesktop.org/wiki/Software/ModemManager/ + + 5) "MBIM (Mobile Broadband Interface Model) Registry" + + - http://compliance.usb.org/mbim/ + + 6) "/sys/kernel/debug/usb/devices output format" + + - Documentation/driver-api/usb/usb.rst + + 7) "/sys/bus/usb/devices/.../descriptors" + + - Documentation/ABI/stable/sysfs-bus-usb diff --git a/Documentation/networking/cdc_mbim.txt b/Documentation/networking/cdc_mbim.txt deleted file mode 100644 index 4e68f0bc5dba..000000000000 --- a/Documentation/networking/cdc_mbim.txt +++ /dev/null @@ -1,339 +0,0 @@ - cdc_mbim - Driver for CDC MBIM Mobile Broadband modems - ======================================================== - -The cdc_mbim driver supports USB devices conforming to the "Universal -Serial Bus Communications Class Subclass Specification for Mobile -Broadband Interface Model" [1], which is a further development of -"Universal Serial Bus Communications Class Subclass Specifications for -Network Control Model Devices" [2] optimized for Mobile Broadband -devices, aka "3G/LTE modems". - - -Command Line Parameters -======================= - -The cdc_mbim driver has no parameters of its own. But the probing -behaviour for NCM 1.0 backwards compatible MBIM functions (an -"NCM/MBIM function" as defined in section 3.2 of [1]) is affected -by a cdc_ncm driver parameter: - -prefer_mbim ------------ -Type: Boolean -Valid Range: N/Y (0-1) -Default Value: Y (MBIM is preferred) - -This parameter sets the system policy for NCM/MBIM functions. Such -functions will be handled by either the cdc_ncm driver or the cdc_mbim -driver depending on the prefer_mbim setting. Setting prefer_mbim=N -makes the cdc_mbim driver ignore these functions and lets the cdc_ncm -driver handle them instead. - -The parameter is writable, and can be changed at any time. A manual -unbind/bind is required to make the change effective for NCM/MBIM -functions bound to the "wrong" driver - - -Basic usage -=========== - -MBIM functions are inactive when unmanaged. The cdc_mbim driver only -provides a userspace interface to the MBIM control channel, and will -not participate in the management of the function. This implies that a -userspace MBIM management application always is required to enable a -MBIM function. - -Such userspace applications includes, but are not limited to: - - mbimcli (included with the libmbim [3] library), and - - ModemManager [4] - -Establishing a MBIM IP session reequires at least these actions by the -management application: - - open the control channel - - configure network connection settings - - connect to network - - configure IP interface - -Management application development ----------------------------------- -The driver <-> userspace interfaces are described below. The MBIM -control channel protocol is described in [1]. - - -MBIM control channel userspace ABI -================================== - -/dev/cdc-wdmX character device ------------------------------- -The driver creates a two-way pipe to the MBIM function control channel -using the cdc-wdm driver as a subdriver. The userspace end of the -control channel pipe is a /dev/cdc-wdmX character device. - -The cdc_mbim driver does not process or police messages on the control -channel. The channel is fully delegated to the userspace management -application. It is therefore up to this application to ensure that it -complies with all the control channel requirements in [1]. - -The cdc-wdmX device is created as a child of the MBIM control -interface USB device. The character device associated with a specific -MBIM function can be looked up using sysfs. For example: - - bjorn@nemi:~$ ls /sys/bus/usb/drivers/cdc_mbim/2-4:2.12/usbmisc - cdc-wdm0 - - bjorn@nemi:~$ grep . /sys/bus/usb/drivers/cdc_mbim/2-4:2.12/usbmisc/cdc-wdm0/dev - 180:0 - - -USB configuration descriptors ------------------------------ -The wMaxControlMessage field of the CDC MBIM functional descriptor -limits the maximum control message size. The managament application is -responsible for negotiating a control message size complying with the -requirements in section 9.3.1 of [1], taking this descriptor field -into consideration. - -The userspace application can access the CDC MBIM functional -descriptor of a MBIM function using either of the two USB -configuration descriptor kernel interfaces described in [6] or [7]. - -See also the ioctl documentation below. - - -Fragmentation -------------- -The userspace application is responsible for all control message -fragmentation and defragmentaion, as described in section 9.5 of [1]. - - -/dev/cdc-wdmX write() ---------------------- -The MBIM control messages from the management application *must not* -exceed the negotiated control message size. - - -/dev/cdc-wdmX read() --------------------- -The management application *must* accept control messages of up the -negotiated control message size. - - -/dev/cdc-wdmX ioctl() --------------------- -IOCTL_WDM_MAX_COMMAND: Get Maximum Command Size -This ioctl returns the wMaxControlMessage field of the CDC MBIM -functional descriptor for MBIM devices. This is intended as a -convenience, eliminating the need to parse the USB descriptors from -userspace. - - #include - #include - #include - #include - #include - int main() - { - __u16 max; - int fd = open("/dev/cdc-wdm0", O_RDWR); - if (!ioctl(fd, IOCTL_WDM_MAX_COMMAND, &max)) - printf("wMaxControlMessage is %d\n", max); - } - - -Custom device services ----------------------- -The MBIM specification allows vendors to freely define additional -services. This is fully supported by the cdc_mbim driver. - -Support for new MBIM services, including vendor specified services, is -implemented entirely in userspace, like the rest of the MBIM control -protocol - -New services should be registered in the MBIM Registry [5]. - - - -MBIM data channel userspace ABI -=============================== - -wwanY network device --------------------- -The cdc_mbim driver represents the MBIM data channel as a single -network device of the "wwan" type. This network device is initially -mapped to MBIM IP session 0. - - -Multiplexed IP sessions (IPS) ------------------------------ -MBIM allows multiplexing up to 256 IP sessions over a single USB data -channel. The cdc_mbim driver models such IP sessions as 802.1q VLAN -subdevices of the master wwanY device, mapping MBIM IP session Z to -VLAN ID Z for all values of Z greater than 0. - -The device maximum Z is given in the MBIM_DEVICE_CAPS_INFO structure -described in section 10.5.1 of [1]. - -The userspace management application is responsible for adding new -VLAN links prior to establishing MBIM IP sessions where the SessionId -is greater than 0. These links can be added by using the normal VLAN -kernel interfaces, either ioctl or netlink. - -For example, adding a link for a MBIM IP session with SessionId 3: - - ip link add link wwan0 name wwan0.3 type vlan id 3 - -The driver will automatically map the "wwan0.3" network device to MBIM -IP session 3. - - -Device Service Streams (DSS) ----------------------------- -MBIM also allows up to 256 non-IP data streams to be multiplexed over -the same shared USB data channel. The cdc_mbim driver models these -sessions as another set of 802.1q VLAN subdevices of the master wwanY -device, mapping MBIM DSS session A to VLAN ID (256 + A) for all values -of A. - -The device maximum A is given in the MBIM_DEVICE_SERVICES_INFO -structure described in section 10.5.29 of [1]. - -The DSS VLAN subdevices are used as a practical interface between the -shared MBIM data channel and a MBIM DSS aware userspace application. -It is not intended to be presented as-is to an end user. The -assumption is that a userspace application initiating a DSS session -also takes care of the necessary framing of the DSS data, presenting -the stream to the end user in an appropriate way for the stream type. - -The network device ABI requires a dummy ethernet header for every DSS -data frame being transported. The contents of this header is -arbitrary, with the following exceptions: - - TX frames using an IP protocol (0x0800 or 0x86dd) will be dropped - - RX frames will have the protocol field set to ETH_P_802_3 (but will - not be properly formatted 802.3 frames) - - RX frames will have the destination address set to the hardware - address of the master device - -The DSS supporting userspace management application is responsible for -adding the dummy ethernet header on TX and stripping it on RX. - -This is a simple example using tools commonly available, exporting -DssSessionId 5 as a pty character device pointed to by a /dev/nmea -symlink: - - ip link add link wwan0 name wwan0.dss5 type vlan id 261 - ip link set dev wwan0.dss5 up - socat INTERFACE:wwan0.dss5,type=2 PTY:,echo=0,link=/dev/nmea - -This is only an example, most suitable for testing out a DSS -service. Userspace applications supporting specific MBIM DSS services -are expected to use the tools and programming interfaces required by -that service. - -Note that adding VLAN links for DSS sessions is entirely optional. A -management application may instead choose to bind a packet socket -directly to the master network device, using the received VLAN tags to -map frames to the correct DSS session and adding 18 byte VLAN ethernet -headers with the appropriate tag on TX. In this case using a socket -filter is recommended, matching only the DSS VLAN subset. This avoid -unnecessary copying of unrelated IP session data to userspace. For -example: - - static struct sock_filter dssfilter[] = { - /* use special negative offsets to get VLAN tag */ - BPF_STMT(BPF_LD|BPF_B|BPF_ABS, SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 1, 0, 6), /* true */ - - /* verify DSS VLAN range */ - BPF_STMT(BPF_LD|BPF_H|BPF_ABS, SKF_AD_OFF + SKF_AD_VLAN_TAG), - BPF_JUMP(BPF_JMP|BPF_JGE|BPF_K, 256, 0, 4), /* 256 is first DSS VLAN */ - BPF_JUMP(BPF_JMP|BPF_JGE|BPF_K, 512, 3, 0), /* 511 is last DSS VLAN */ - - /* verify ethertype */ - BPF_STMT(BPF_LD|BPF_H|BPF_ABS, 2 * ETH_ALEN), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, ETH_P_802_3, 0, 1), - - BPF_STMT(BPF_RET|BPF_K, (u_int)-1), /* accept */ - BPF_STMT(BPF_RET|BPF_K, 0), /* ignore */ - }; - - - -Tagged IP session 0 VLAN ------------------------- -As described above, MBIM IP session 0 is treated as special by the -driver. It is initially mapped to untagged frames on the wwanY -network device. - -This mapping implies a few restrictions on multiplexed IPS and DSS -sessions, which may not always be practical: - - no IPS or DSS session can use a frame size greater than the MTU on - IP session 0 - - no IPS or DSS session can be in the up state unless the network - device representing IP session 0 also is up - -These problems can be avoided by optionally making the driver map IP -session 0 to a VLAN subdevice, similar to all other IP sessions. This -behaviour is triggered by adding a VLAN link for the magic VLAN ID -4094. The driver will then immediately start mapping MBIM IP session -0 to this VLAN, and will drop untagged frames on the master wwanY -device. - -Tip: It might be less confusing to the end user to name this VLAN -subdevice after the MBIM SessionID instead of the VLAN ID. For -example: - - ip link add link wwan0 name wwan0.0 type vlan id 4094 - - -VLAN mapping ------------- - -Summarizing the cdc_mbim driver mapping described above, we have this -relationship between VLAN tags on the wwanY network device and MBIM -sessions on the shared USB data channel: - - VLAN ID MBIM type MBIM SessionID Notes - --------------------------------------------------------- - untagged IPS 0 a) - 1 - 255 IPS 1 - 255 - 256 - 511 DSS 0 - 255 - 512 - 4093 b) - 4094 IPS 0 c) - - a) if no VLAN ID 4094 link exists, else dropped - b) unsupported VLAN range, unconditionally dropped - c) if a VLAN ID 4094 link exists, else dropped - - - - -References -========== - -[1] USB Implementers Forum, Inc. - "Universal Serial Bus - Communications Class Subclass Specification for Mobile Broadband - Interface Model", Revision 1.0 (Errata 1), May 1, 2013 - - http://www.usb.org/developers/docs/devclass_docs/ - -[2] USB Implementers Forum, Inc. - "Universal Serial Bus - Communications Class Subclass Specifications for Network Control - Model Devices", Revision 1.0 (Errata 1), November 24, 2010 - - http://www.usb.org/developers/docs/devclass_docs/ - -[3] libmbim - "a glib-based library for talking to WWAN modems and - devices which speak the Mobile Interface Broadband Model (MBIM) - protocol" - - http://www.freedesktop.org/wiki/Software/libmbim/ - -[4] ModemManager - "a DBus-activated daemon which controls mobile - broadband (2G/3G/4G) devices and connections" - - http://www.freedesktop.org/wiki/Software/ModemManager/ - -[5] "MBIM (Mobile Broadband Interface Model) Registry" - - http://compliance.usb.org/mbim/ - -[6] "/sys/kernel/debug/usb/devices output format" - - Documentation/driver-api/usb/usb.rst - -[7] "/sys/bus/usb/devices/.../descriptors" - - Documentation/ABI/stable/sysfs-bus-usb diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 22b872834ef0..55802abd65a0 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -45,6 +45,7 @@ Contents: ax25 baycom bonding + cdc_mbim .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 99b0e82dc5e36edb625f519121d4398628f05e95 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:26 +0200 Subject: docs: networking: convert cops.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/cops.rst | 80 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/cops.txt | 63 ------------------------------ Documentation/networking/index.rst | 1 + drivers/net/appletalk/Kconfig | 2 +- 4 files changed, 82 insertions(+), 64 deletions(-) create mode 100644 Documentation/networking/cops.rst delete mode 100644 Documentation/networking/cops.txt diff --git a/Documentation/networking/cops.rst b/Documentation/networking/cops.rst new file mode 100644 index 000000000000..964ba80599a9 --- /dev/null +++ b/Documentation/networking/cops.rst @@ -0,0 +1,80 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================== +The COPS LocalTalk Linux driver (cops.c) +======================================== + +By Jay Schulist + +This driver has two modes and they are: Dayna mode and Tangent mode. +Each mode corresponds with the type of card. It has been found +that there are 2 main types of cards and all other cards are +the same and just have different names or only have minor differences +such as more IO ports. As this driver is tested it will +become more clear exactly what cards are supported. + +Right now these cards are known to work with the COPS driver. The +LT-200 cards work in a somewhat more limited capacity than the +DL200 cards, which work very well and are in use by many people. + +TANGENT driver mode: + - Tangent ATB-II, Novell NL-1000, Daystar Digital LT-200 + +DAYNA driver mode: + - Dayna DL2000/DaynaTalk PC (Half Length), COPS LT-95, + - Farallon PhoneNET PC III, Farallon PhoneNET PC II + +Other cards possibly supported mode unknown though: + - Dayna DL2000 (Full length) + +The COPS driver defaults to using Dayna mode. To change the driver's +mode if you built a driver with dual support use board_type=1 or +board_type=2 for Dayna or Tangent with insmod. + +Operation/loading of the driver +=============================== + +Use modprobe like this: /sbin/modprobe cops.o (IO #) (IRQ #) +If you do not specify any options the driver will try and use the IO = 0x240, +IRQ = 5. As of right now I would only use IRQ 5 for the card, if autoprobing. + +To load multiple COPS driver Localtalk cards you can do one of the following:: + + insmod cops io=0x240 irq=5 + insmod -o cops2 cops io=0x260 irq=3 + +Or in lilo.conf put something like this:: + + append="ether=5,0x240,lt0 ether=3,0x260,lt1" + +Then bring up the interface with ifconfig. It will look something like this:: + + lt0 Link encap:UNSPEC HWaddr 00-00-00-00-00-00-00-F7-00-00-00-00-00-00-00-00 + inet addr:192.168.1.2 Bcast:192.168.1.255 Mask:255.255.255.0 + UP BROADCAST RUNNING NOARP MULTICAST MTU:600 Metric:1 + RX packets:0 errors:0 dropped:0 overruns:0 frame:0 + TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 coll:0 + +Netatalk Configuration +====================== + +You will need to configure atalkd with something like the following to make +it work with the cops.c driver. + +* For single LTalk card use:: + + dummy -seed -phase 2 -net 2000 -addr 2000.10 -zone "1033" + lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" + +* For multiple cards, Ethernet and LocalTalk:: + + eth0 -seed -phase 2 -net 3000 -addr 3000.20 -zone "1033" + lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" + +* For multiple LocalTalk cards, and an Ethernet card. + +* Order seems to matter here, Ethernet last:: + + lt0 -seed -phase 1 -net 1000 -addr 1000.10 -zone "LocalTalk1" + lt1 -seed -phase 1 -net 2000 -addr 2000.20 -zone "LocalTalk2" + eth0 -seed -phase 2 -net 3000 -addr 3000.30 -zone "EtherTalk" diff --git a/Documentation/networking/cops.txt b/Documentation/networking/cops.txt deleted file mode 100644 index 3e344b448e07..000000000000 --- a/Documentation/networking/cops.txt +++ /dev/null @@ -1,63 +0,0 @@ -Text File for the COPS LocalTalk Linux driver (cops.c). - By Jay Schulist - -This driver has two modes and they are: Dayna mode and Tangent mode. -Each mode corresponds with the type of card. It has been found -that there are 2 main types of cards and all other cards are -the same and just have different names or only have minor differences -such as more IO ports. As this driver is tested it will -become more clear exactly what cards are supported. - -Right now these cards are known to work with the COPS driver. The -LT-200 cards work in a somewhat more limited capacity than the -DL200 cards, which work very well and are in use by many people. - -TANGENT driver mode: - Tangent ATB-II, Novell NL-1000, Daystar Digital LT-200 -DAYNA driver mode: - Dayna DL2000/DaynaTalk PC (Half Length), COPS LT-95, - Farallon PhoneNET PC III, Farallon PhoneNET PC II -Other cards possibly supported mode unknown though: - Dayna DL2000 (Full length) - -The COPS driver defaults to using Dayna mode. To change the driver's -mode if you built a driver with dual support use board_type=1 or -board_type=2 for Dayna or Tangent with insmod. - -** Operation/loading of the driver. -Use modprobe like this: /sbin/modprobe cops.o (IO #) (IRQ #) -If you do not specify any options the driver will try and use the IO = 0x240, -IRQ = 5. As of right now I would only use IRQ 5 for the card, if autoprobing. - -To load multiple COPS driver Localtalk cards you can do one of the following. - -insmod cops io=0x240 irq=5 -insmod -o cops2 cops io=0x260 irq=3 - -Or in lilo.conf put something like this: - append="ether=5,0x240,lt0 ether=3,0x260,lt1" - -Then bring up the interface with ifconfig. It will look something like this: -lt0 Link encap:UNSPEC HWaddr 00-00-00-00-00-00-00-F7-00-00-00-00-00-00-00-00 - inet addr:192.168.1.2 Bcast:192.168.1.255 Mask:255.255.255.0 - UP BROADCAST RUNNING NOARP MULTICAST MTU:600 Metric:1 - RX packets:0 errors:0 dropped:0 overruns:0 frame:0 - TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 coll:0 - -** Netatalk Configuration -You will need to configure atalkd with something like the following to make -it work with the cops.c driver. - -* For single LTalk card use. -dummy -seed -phase 2 -net 2000 -addr 2000.10 -zone "1033" -lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" - -* For multiple cards, Ethernet and LocalTalk. -eth0 -seed -phase 2 -net 3000 -addr 3000.20 -zone "1033" -lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" - -* For multiple LocalTalk cards, and an Ethernet card. -* Order seems to matter here, Ethernet last. -lt0 -seed -phase 1 -net 1000 -addr 1000.10 -zone "LocalTalk1" -lt1 -seed -phase 1 -net 2000 -addr 2000.20 -zone "LocalTalk2" -eth0 -seed -phase 2 -net 3000 -addr 3000.30 -zone "EtherTalk" diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 55802abd65a0..7b596810d479 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -46,6 +46,7 @@ Contents: baycom bonding cdc_mbim + cops .. only:: subproject and html diff --git a/drivers/net/appletalk/Kconfig b/drivers/net/appletalk/Kconfig index af509b05ac5c..d4e51c048f62 100644 --- a/drivers/net/appletalk/Kconfig +++ b/drivers/net/appletalk/Kconfig @@ -59,7 +59,7 @@ config COPS package. This driver is experimental, which means that it may not work. This driver will only work if you choose "AppleTalk DDP" networking support, above. - Please read the file . + Please read the file . config COPS_DAYNA bool "Dayna firmware support" -- cgit v1.2.3-59-g8ed1b From 9a9891fbdf935c270388fca856c117ad71c02458 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:27 +0200 Subject: docs: networking: convert cxacru.txt to ReST - add SPDX header; - add a document title; - mark code blocks and literals as such; - mark lists as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/cxacru.rst | 120 ++++++++++++++++++++++++++++++++++++ Documentation/networking/cxacru.txt | 100 ------------------------------ Documentation/networking/index.rst | 1 + 3 files changed, 121 insertions(+), 100 deletions(-) create mode 100644 Documentation/networking/cxacru.rst delete mode 100644 Documentation/networking/cxacru.txt diff --git a/Documentation/networking/cxacru.rst b/Documentation/networking/cxacru.rst new file mode 100644 index 000000000000..6088af2ffeda --- /dev/null +++ b/Documentation/networking/cxacru.rst @@ -0,0 +1,120 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +ATM cxacru device driver +======================== + +Firmware is required for this device: http://accessrunner.sourceforge.net/ + +While it is capable of managing/maintaining the ADSL connection without the +module loaded, the device will sometimes stop responding after unloading the +driver and it is necessary to unplug/remove power to the device to fix this. + +Note: support for cxacru-cf.bin has been removed. It was not loaded correctly +so it had no effect on the device configuration. Fixing it could have stopped +existing devices working when an invalid configuration is supplied. + +There is a script cxacru-cf.py to convert an existing file to the sysfs form. + +Detected devices will appear as ATM devices named "cxacru". In /sys/class/atm/ +these are directories named cxacruN where N is the device number. A symlink +named device points to the USB interface device's directory which contains +several sysfs attribute files for retrieving device statistics: + +* adsl_controller_version + +* adsl_headend +* adsl_headend_environment + + - Information about the remote headend. + +* adsl_config + + - Configuration writing interface. + - Write parameters in hexadecimal format =, + separated by whitespace, e.g.: + + "1=0 a=5" + + - Up to 7 parameters at a time will be sent and the modem will restart + the ADSL connection when any value is set. These are logged for future + reference. + +* downstream_attenuation (dB) +* downstream_bits_per_frame +* downstream_rate (kbps) +* downstream_snr_margin (dB) + + - Downstream stats. + +* upstream_attenuation (dB) +* upstream_bits_per_frame +* upstream_rate (kbps) +* upstream_snr_margin (dB) +* transmitter_power (dBm/Hz) + + - Upstream stats. + +* downstream_crc_errors +* downstream_fec_errors +* downstream_hec_errors +* upstream_crc_errors +* upstream_fec_errors +* upstream_hec_errors + + - Error counts. + +* line_startable + + - Indicates that ADSL support on the device + is/can be enabled, see adsl_start. + +* line_status + + - "initialising" + - "down" + - "attempting to activate" + - "training" + - "channel analysis" + - "exchange" + - "waiting" + - "up" + + Changes between "down" and "attempting to activate" + if there is no signal. + +* link_status + + - "not connected" + - "connected" + - "lost" + +* mac_address + +* modulation + + - "" (when not connected) + - "ANSI T1.413" + - "ITU-T G.992.1 (G.DMT)" + - "ITU-T G.992.2 (G.LITE)" + +* startup_attempts + + - Count of total attempts to initialise ADSL. + +To enable/disable ADSL, the following can be written to the adsl_state file: + + - "start" + - "stop + - "restart" (stops, waits 1.5s, then starts) + - "poll" (used to resume status polling if it was disabled due to failure) + +Changes in adsl/line state are reported via kernel log messages:: + + [4942145.150704] ATM dev 0: ADSL state: running + [4942243.663766] ATM dev 0: ADSL line: down + [4942249.665075] ATM dev 0: ADSL line: attempting to activate + [4942253.654954] ATM dev 0: ADSL line: training + [4942255.666387] ATM dev 0: ADSL line: channel analysis + [4942259.656262] ATM dev 0: ADSL line: exchange + [2635357.696901] ATM dev 0: ADSL line: up (8128 kb/s down | 832 kb/s up) diff --git a/Documentation/networking/cxacru.txt b/Documentation/networking/cxacru.txt deleted file mode 100644 index 2cce04457b4d..000000000000 --- a/Documentation/networking/cxacru.txt +++ /dev/null @@ -1,100 +0,0 @@ -Firmware is required for this device: http://accessrunner.sourceforge.net/ - -While it is capable of managing/maintaining the ADSL connection without the -module loaded, the device will sometimes stop responding after unloading the -driver and it is necessary to unplug/remove power to the device to fix this. - -Note: support for cxacru-cf.bin has been removed. It was not loaded correctly -so it had no effect on the device configuration. Fixing it could have stopped -existing devices working when an invalid configuration is supplied. - -There is a script cxacru-cf.py to convert an existing file to the sysfs form. - -Detected devices will appear as ATM devices named "cxacru". In /sys/class/atm/ -these are directories named cxacruN where N is the device number. A symlink -named device points to the USB interface device's directory which contains -several sysfs attribute files for retrieving device statistics: - -* adsl_controller_version - -* adsl_headend -* adsl_headend_environment - Information about the remote headend. - -* adsl_config - Configuration writing interface. - Write parameters in hexadecimal format =, - separated by whitespace, e.g.: - "1=0 a=5" - Up to 7 parameters at a time will be sent and the modem will restart - the ADSL connection when any value is set. These are logged for future - reference. - -* downstream_attenuation (dB) -* downstream_bits_per_frame -* downstream_rate (kbps) -* downstream_snr_margin (dB) - Downstream stats. - -* upstream_attenuation (dB) -* upstream_bits_per_frame -* upstream_rate (kbps) -* upstream_snr_margin (dB) -* transmitter_power (dBm/Hz) - Upstream stats. - -* downstream_crc_errors -* downstream_fec_errors -* downstream_hec_errors -* upstream_crc_errors -* upstream_fec_errors -* upstream_hec_errors - Error counts. - -* line_startable - Indicates that ADSL support on the device - is/can be enabled, see adsl_start. - -* line_status - "initialising" - "down" - "attempting to activate" - "training" - "channel analysis" - "exchange" - "waiting" - "up" - - Changes between "down" and "attempting to activate" - if there is no signal. - -* link_status - "not connected" - "connected" - "lost" - -* mac_address - -* modulation - "" (when not connected) - "ANSI T1.413" - "ITU-T G.992.1 (G.DMT)" - "ITU-T G.992.2 (G.LITE)" - -* startup_attempts - Count of total attempts to initialise ADSL. - -To enable/disable ADSL, the following can be written to the adsl_state file: - "start" - "stop - "restart" (stops, waits 1.5s, then starts) - "poll" (used to resume status polling if it was disabled due to failure) - -Changes in adsl/line state are reported via kernel log messages: - [4942145.150704] ATM dev 0: ADSL state: running - [4942243.663766] ATM dev 0: ADSL line: down - [4942249.665075] ATM dev 0: ADSL line: attempting to activate - [4942253.654954] ATM dev 0: ADSL line: training - [4942255.666387] ATM dev 0: ADSL line: channel analysis - [4942259.656262] ATM dev 0: ADSL line: exchange - [2635357.696901] ATM dev 0: ADSL line: up (8128 kb/s down | 832 kb/s up) diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 7b596810d479..4c8e896490e0 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -47,6 +47,7 @@ Contents: bonding cdc_mbim cops + cxacru .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 33155bac6519f545137d9c46d2e59e5f8332dd50 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:28 +0200 Subject: docs: networking: convert dccp.txt to ReST - add SPDX header; - adjust title markup; - comment out text-only TOC from html/pdf output; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/dccp.rst | 216 +++++++++++++++++++++++++++++++++++++ Documentation/networking/dccp.txt | 207 ----------------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 217 insertions(+), 207 deletions(-) create mode 100644 Documentation/networking/dccp.rst delete mode 100644 Documentation/networking/dccp.txt diff --git a/Documentation/networking/dccp.rst b/Documentation/networking/dccp.rst new file mode 100644 index 000000000000..dde16be04456 --- /dev/null +++ b/Documentation/networking/dccp.rst @@ -0,0 +1,216 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +DCCP protocol +============= + + +.. Contents + - Introduction + - Missing features + - Socket options + - Sysctl variables + - IOCTLs + - Other tunables + - Notes + + +Introduction +============ +Datagram Congestion Control Protocol (DCCP) is an unreliable, connection +oriented protocol designed to solve issues present in UDP and TCP, particularly +for real-time and multimedia (streaming) traffic. +It divides into a base protocol (RFC 4340) and pluggable congestion control +modules called CCIDs. Like pluggable TCP congestion control, at least one CCID +needs to be enabled in order for the protocol to function properly. In the Linux +implementation, this is the TCP-like CCID2 (RFC 4341). Additional CCIDs, such as +the TCP-friendly CCID3 (RFC 4342), are optional. +For a brief introduction to CCIDs and suggestions for choosing a CCID to match +given applications, see section 10 of RFC 4340. + +It has a base protocol and pluggable congestion control IDs (CCIDs). + +DCCP is a Proposed Standard (RFC 2026), and the homepage for DCCP as a protocol +is at http://www.ietf.org/html.charters/dccp-charter.html + + +Missing features +================ +The Linux DCCP implementation does not currently support all the features that are +specified in RFCs 4340...42. + +The known bugs are at: + + http://www.linuxfoundation.org/collaborate/workgroups/networking/todo#DCCP + +For more up-to-date versions of the DCCP implementation, please consider using +the experimental DCCP test tree; instructions for checking this out are on: +http://www.linuxfoundation.org/collaborate/workgroups/networking/dccp_testing#Experimental_DCCP_source_tree + + +Socket options +============== +DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes +a policy ID as argument and can only be set before the connection (i.e. changes +during an established connection are not supported). Currently, two policies are +defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, +and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an +u32 priority value as ancillary data to sendmsg(), where higher numbers indicate +a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to +be formatted using a cmsg(3) message header filled in as follows:: + + cmsg->cmsg_level = SOL_DCCP; + cmsg->cmsg_type = DCCP_SCM_PRIORITY; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ + +DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero +value is always interpreted as unbounded queue length. If different from zero, +the interpretation of this parameter depends on the current dequeuing policy +(see above): the "simple" policy will enforce a fixed queue size by returning +EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the +lowest-priority packet first. The default value for this parameter is +initialised from /proc/sys/net/dccp/default/tx_qlen. + +DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of +service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, +the socket will fall back to 0 (which means that no meaningful service code +is present). On active sockets this is set before connect(); specifying more +than one code has no effect (all subsequent service codes are ignored). The +case is different for passive sockets, where multiple service codes (up to 32) +can be set before calling bind(). + +DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet +size (application payload size) in bytes, see RFC 4340, section 14. + +DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs +supported by the endpoint. The option value is an array of type uint8_t whose +size is passed as option length. The minimum array size is 4 elements, the +value returned in the optlen argument always reflects the true number of +built-in CCIDs. + +DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same +time, combining the operation of the next two socket options. This option is +preferable over the latter two, since often applications will use the same +type of CCID for both directions; and mixed use of CCIDs is not currently well +understood. This socket option takes as argument at least one uint8_t value, or +an array of uint8_t values, which must match available CCIDS (see above). CCIDs +must be registered on the socket before calling connect() or listen(). + +DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets +the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. +Please note that the getsockopt argument type here is ``int``, not uint8_t. + +DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. + +DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold +timewait state when closing the connection (RFC 4340, 8.3). The usual case is +that the closing server sends a CloseReq, whereupon the client holds timewait +state. When this boolean socket option is on, the server sends a Close instead +and will enter TIMEWAIT. This option must be set after accept() returns. + +DCCP_SOCKOPT_SEND_CSCOV and DCCP_SOCKOPT_RECV_CSCOV are used for setting the +partial checksum coverage (RFC 4340, sec. 9.2). The default is that checksums +always cover the entire packet and that only fully covered application data is +accepted by the receiver. Hence, when using this feature on the sender, it must +be enabled at the receiver, too with suitable choice of CsCov. + +DCCP_SOCKOPT_SEND_CSCOV sets the sender checksum coverage. Values in the + range 0..15 are acceptable. The default setting is 0 (full coverage), + values between 1..15 indicate partial coverage. + +DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it + sets a threshold, where again values 0..15 are acceptable. The default + of 0 means that all packets with a partial coverage will be discarded. + Values in the range 1..15 indicate that packets with minimally such a + coverage value are also acceptable. The higher the number, the more + restrictive this setting (see [RFC 4340, sec. 9.2.1]). Partial coverage + settings are inherited to the child socket after accept(). + +The following two options apply to CCID 3 exclusively and are getsockopt()-only. +In either case, a TFRC info struct (defined in ) is returned. + +DCCP_SOCKOPT_CCID_RX_INFO + Returns a ``struct tfrc_rx_info`` in optval; the buffer for optval and + optlen must be set to at least sizeof(struct tfrc_rx_info). + +DCCP_SOCKOPT_CCID_TX_INFO + Returns a ``struct tfrc_tx_info`` in optval; the buffer for optval and + optlen must be set to at least sizeof(struct tfrc_tx_info). + +On unidirectional connections it is useful to close the unused half-connection +via shutdown (SHUT_WR or SHUT_RD): this will reduce per-packet processing costs. + + +Sysctl variables +================ +Several DCCP default parameters can be managed by the following sysctls +(sysctl net.dccp.default or /proc/sys/net/dccp/default): + +request_retries + The number of active connection initiation retries (the number of + Requests minus one) before timing out. In addition, it also governs + the behaviour of the other, passive side: this variable also sets + the number of times DCCP repeats sending a Response when the initial + handshake does not progress from RESPOND to OPEN (i.e. when no Ack + is received after the initial Request). This value should be greater + than 0, suggested is less than 10. Analogue of tcp_syn_retries. + +retries1 + How often a DCCP Response is retransmitted until the listening DCCP + side considers its connecting peer dead. Analogue of tcp_retries1. + +retries2 + The number of times a general DCCP packet is retransmitted. This has + importance for retransmitted acknowledgments and feature negotiation, + data packets are never retransmitted. Analogue of tcp_retries2. + +tx_ccid = 2 + Default CCID for the sender-receiver half-connection. Depending on the + choice of CCID, the Send Ack Vector feature is enabled automatically. + +rx_ccid = 2 + Default CCID for the receiver-sender half-connection; see tx_ccid. + +seq_window = 100 + The initial sequence window (sec. 7.5.2) of the sender. This influences + the local ackno validity and the remote seqno validity windows (7.5.1). + Values in the range Wmin = 32 (RFC 4340, 7.5.2) up to 2^32-1 can be set. + +tx_qlen = 5 + The size of the transmit buffer in packets. A value of 0 corresponds + to an unbounded transmit buffer. + +sync_ratelimit = 125 ms + The timeout between subsequent DCCP-Sync packets sent in response to + sequence-invalid packets on the same socket (RFC 4340, 7.5.4). The unit + of this parameter is milliseconds; a value of 0 disables rate-limiting. + + +IOCTLS +====== +FIONREAD + Works as in udp(7): returns in the ``int`` argument pointer the size of + the next pending datagram in bytes, or 0 when no datagram is pending. + + +Other tunables +============== +Per-route rto_min support + CCID-2 supports the RTAX_RTO_MIN per-route setting for the minimum value + of the RTO timer. This setting can be modified via the 'rto_min' option + of iproute2; for example:: + + > ip route change 10.0.0.0/24 rto_min 250j dev wlan0 + > ip route add 10.0.0.254/32 rto_min 800j dev wlan0 + > ip route show dev wlan0 + + CCID-3 also supports the rto_min setting: it is used to define the lower + bound for the expiry of the nofeedback timer. This can be useful on LANs + with very low RTTs (e.g., loopback, Gbit ethernet). + + +Notes +===== +DCCP does not travel through NAT successfully at present on many boxes. This is +because the checksum covers the pseudo-header as per TCP and UDP. Linux NAT +support for DCCP has been added. diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt deleted file mode 100644 index 55c575fcaf17..000000000000 --- a/Documentation/networking/dccp.txt +++ /dev/null @@ -1,207 +0,0 @@ -DCCP protocol -============= - - -Contents -======== -- Introduction -- Missing features -- Socket options -- Sysctl variables -- IOCTLs -- Other tunables -- Notes - - -Introduction -============ -Datagram Congestion Control Protocol (DCCP) is an unreliable, connection -oriented protocol designed to solve issues present in UDP and TCP, particularly -for real-time and multimedia (streaming) traffic. -It divides into a base protocol (RFC 4340) and pluggable congestion control -modules called CCIDs. Like pluggable TCP congestion control, at least one CCID -needs to be enabled in order for the protocol to function properly. In the Linux -implementation, this is the TCP-like CCID2 (RFC 4341). Additional CCIDs, such as -the TCP-friendly CCID3 (RFC 4342), are optional. -For a brief introduction to CCIDs and suggestions for choosing a CCID to match -given applications, see section 10 of RFC 4340. - -It has a base protocol and pluggable congestion control IDs (CCIDs). - -DCCP is a Proposed Standard (RFC 2026), and the homepage for DCCP as a protocol -is at http://www.ietf.org/html.charters/dccp-charter.html - - -Missing features -================ -The Linux DCCP implementation does not currently support all the features that are -specified in RFCs 4340...42. - -The known bugs are at: - http://www.linuxfoundation.org/collaborate/workgroups/networking/todo#DCCP - -For more up-to-date versions of the DCCP implementation, please consider using -the experimental DCCP test tree; instructions for checking this out are on: -http://www.linuxfoundation.org/collaborate/workgroups/networking/dccp_testing#Experimental_DCCP_source_tree - - -Socket options -============== -DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes -a policy ID as argument and can only be set before the connection (i.e. changes -during an established connection are not supported). Currently, two policies are -defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, -and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an -u32 priority value as ancillary data to sendmsg(), where higher numbers indicate -a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to -be formatted using a cmsg(3) message header filled in as follows: - cmsg->cmsg_level = SOL_DCCP; - cmsg->cmsg_type = DCCP_SCM_PRIORITY; - cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ - -DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero -value is always interpreted as unbounded queue length. If different from zero, -the interpretation of this parameter depends on the current dequeuing policy -(see above): the "simple" policy will enforce a fixed queue size by returning -EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the -lowest-priority packet first. The default value for this parameter is -initialised from /proc/sys/net/dccp/default/tx_qlen. - -DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of -service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, -the socket will fall back to 0 (which means that no meaningful service code -is present). On active sockets this is set before connect(); specifying more -than one code has no effect (all subsequent service codes are ignored). The -case is different for passive sockets, where multiple service codes (up to 32) -can be set before calling bind(). - -DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet -size (application payload size) in bytes, see RFC 4340, section 14. - -DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs -supported by the endpoint. The option value is an array of type uint8_t whose -size is passed as option length. The minimum array size is 4 elements, the -value returned in the optlen argument always reflects the true number of -built-in CCIDs. - -DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same -time, combining the operation of the next two socket options. This option is -preferable over the latter two, since often applications will use the same -type of CCID for both directions; and mixed use of CCIDs is not currently well -understood. This socket option takes as argument at least one uint8_t value, or -an array of uint8_t values, which must match available CCIDS (see above). CCIDs -must be registered on the socket before calling connect() or listen(). - -DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets -the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. -Please note that the getsockopt argument type here is `int', not uint8_t. - -DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. - -DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold -timewait state when closing the connection (RFC 4340, 8.3). The usual case is -that the closing server sends a CloseReq, whereupon the client holds timewait -state. When this boolean socket option is on, the server sends a Close instead -and will enter TIMEWAIT. This option must be set after accept() returns. - -DCCP_SOCKOPT_SEND_CSCOV and DCCP_SOCKOPT_RECV_CSCOV are used for setting the -partial checksum coverage (RFC 4340, sec. 9.2). The default is that checksums -always cover the entire packet and that only fully covered application data is -accepted by the receiver. Hence, when using this feature on the sender, it must -be enabled at the receiver, too with suitable choice of CsCov. - -DCCP_SOCKOPT_SEND_CSCOV sets the sender checksum coverage. Values in the - range 0..15 are acceptable. The default setting is 0 (full coverage), - values between 1..15 indicate partial coverage. -DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it - sets a threshold, where again values 0..15 are acceptable. The default - of 0 means that all packets with a partial coverage will be discarded. - Values in the range 1..15 indicate that packets with minimally such a - coverage value are also acceptable. The higher the number, the more - restrictive this setting (see [RFC 4340, sec. 9.2.1]). Partial coverage - settings are inherited to the child socket after accept(). - -The following two options apply to CCID 3 exclusively and are getsockopt()-only. -In either case, a TFRC info struct (defined in ) is returned. -DCCP_SOCKOPT_CCID_RX_INFO - Returns a `struct tfrc_rx_info' in optval; the buffer for optval and - optlen must be set to at least sizeof(struct tfrc_rx_info). -DCCP_SOCKOPT_CCID_TX_INFO - Returns a `struct tfrc_tx_info' in optval; the buffer for optval and - optlen must be set to at least sizeof(struct tfrc_tx_info). - -On unidirectional connections it is useful to close the unused half-connection -via shutdown (SHUT_WR or SHUT_RD): this will reduce per-packet processing costs. - - -Sysctl variables -================ -Several DCCP default parameters can be managed by the following sysctls -(sysctl net.dccp.default or /proc/sys/net/dccp/default): - -request_retries - The number of active connection initiation retries (the number of - Requests minus one) before timing out. In addition, it also governs - the behaviour of the other, passive side: this variable also sets - the number of times DCCP repeats sending a Response when the initial - handshake does not progress from RESPOND to OPEN (i.e. when no Ack - is received after the initial Request). This value should be greater - than 0, suggested is less than 10. Analogue of tcp_syn_retries. - -retries1 - How often a DCCP Response is retransmitted until the listening DCCP - side considers its connecting peer dead. Analogue of tcp_retries1. - -retries2 - The number of times a general DCCP packet is retransmitted. This has - importance for retransmitted acknowledgments and feature negotiation, - data packets are never retransmitted. Analogue of tcp_retries2. - -tx_ccid = 2 - Default CCID for the sender-receiver half-connection. Depending on the - choice of CCID, the Send Ack Vector feature is enabled automatically. - -rx_ccid = 2 - Default CCID for the receiver-sender half-connection; see tx_ccid. - -seq_window = 100 - The initial sequence window (sec. 7.5.2) of the sender. This influences - the local ackno validity and the remote seqno validity windows (7.5.1). - Values in the range Wmin = 32 (RFC 4340, 7.5.2) up to 2^32-1 can be set. - -tx_qlen = 5 - The size of the transmit buffer in packets. A value of 0 corresponds - to an unbounded transmit buffer. - -sync_ratelimit = 125 ms - The timeout between subsequent DCCP-Sync packets sent in response to - sequence-invalid packets on the same socket (RFC 4340, 7.5.4). The unit - of this parameter is milliseconds; a value of 0 disables rate-limiting. - - -IOCTLS -====== -FIONREAD - Works as in udp(7): returns in the `int' argument pointer the size of - the next pending datagram in bytes, or 0 when no datagram is pending. - - -Other tunables -============== -Per-route rto_min support - CCID-2 supports the RTAX_RTO_MIN per-route setting for the minimum value - of the RTO timer. This setting can be modified via the 'rto_min' option - of iproute2; for example: - > ip route change 10.0.0.0/24 rto_min 250j dev wlan0 - > ip route add 10.0.0.254/32 rto_min 800j dev wlan0 - > ip route show dev wlan0 - CCID-3 also supports the rto_min setting: it is used to define the lower - bound for the expiry of the nofeedback timer. This can be useful on LANs - with very low RTTs (e.g., loopback, Gbit ethernet). - - -Notes -===== -DCCP does not travel through NAT successfully at present on many boxes. This is -because the checksum covers the pseudo-header as per TCP and UDP. Linux NAT -support for DCCP has been added. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 4c8e896490e0..3894043332de 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -48,6 +48,7 @@ Contents: cdc_mbim cops cxacru + dccp .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 8447bb44ef7c452cbc94c04fc38d4946a3ef9165 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:29 +0200 Subject: docs: networking: convert dctcp.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/dctcp.rst | 52 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/dctcp.txt | 44 -------------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 53 insertions(+), 44 deletions(-) create mode 100644 Documentation/networking/dctcp.rst delete mode 100644 Documentation/networking/dctcp.txt diff --git a/Documentation/networking/dctcp.rst b/Documentation/networking/dctcp.rst new file mode 100644 index 000000000000..4cc8bb2dad50 --- /dev/null +++ b/Documentation/networking/dctcp.rst @@ -0,0 +1,52 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +DCTCP (DataCenter TCP) +====================== + +DCTCP is an enhancement to the TCP congestion control algorithm for data +center networks and leverages Explicit Congestion Notification (ECN) in +the data center network to provide multi-bit feedback to the end hosts. + +To enable it on end hosts:: + + sysctl -w net.ipv4.tcp_congestion_control=dctcp + sysctl -w net.ipv4.tcp_ecn_fallback=0 (optional) + +All switches in the data center network running DCTCP must support ECN +marking and be configured for marking when reaching defined switch buffer +thresholds. The default ECN marking threshold heuristic for DCTCP on +switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps, +but might need further careful tweaking. + +For more details, see below documents: + +Paper: + +The algorithm is further described in detail in the following two +SIGCOMM/SIGMETRICS papers: + + i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + + "Data Center TCP (DCTCP)", Data Center Networks session" + + Proc. ACM SIGCOMM, New Delhi, 2010. + + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192 + +ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + + "Analysis of DCTCP: Stability, Convergence, and Fairness" + Proc. ACM SIGMETRICS, San Jose, 2011. + + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + +IETF informational draft: + + http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 + +DCTCP site: + + http://simula.stanford.edu/~alizade/Site/DCTCP.html diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt deleted file mode 100644 index 13a857753208..000000000000 --- a/Documentation/networking/dctcp.txt +++ /dev/null @@ -1,44 +0,0 @@ -DCTCP (DataCenter TCP) ----------------------- - -DCTCP is an enhancement to the TCP congestion control algorithm for data -center networks and leverages Explicit Congestion Notification (ECN) in -the data center network to provide multi-bit feedback to the end hosts. - -To enable it on end hosts: - - sysctl -w net.ipv4.tcp_congestion_control=dctcp - sysctl -w net.ipv4.tcp_ecn_fallback=0 (optional) - -All switches in the data center network running DCTCP must support ECN -marking and be configured for marking when reaching defined switch buffer -thresholds. The default ECN marking threshold heuristic for DCTCP on -switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps, -but might need further careful tweaking. - -For more details, see below documents: - -Paper: - -The algorithm is further described in detail in the following two -SIGCOMM/SIGMETRICS papers: - - i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, - Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: - "Data Center TCP (DCTCP)", Data Center Networks session - Proc. ACM SIGCOMM, New Delhi, 2010. - http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf - http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192 - -ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: - "Analysis of DCTCP: Stability, Convergence, and Fairness" - Proc. ACM SIGMETRICS, San Jose, 2011. - http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf - -IETF informational draft: - - http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 - -DCTCP site: - - http://simula.stanford.edu/~alizade/Site/DCTCP.html diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 3894043332de..9e83d3bda4e0 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -49,6 +49,7 @@ Contents: cops cxacru dccp + dctcp .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 9a69fb9c21c4bf4107becb877729544759bdd059 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:30 +0200 Subject: docs: networking: convert decnet.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark lists as such; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/networking/decnet.rst | 243 ++++++++++++++++++++++++ Documentation/networking/decnet.txt | 230 ---------------------- Documentation/networking/index.rst | 1 + MAINTAINERS | 2 +- net/decnet/Kconfig | 4 +- 6 files changed, 248 insertions(+), 234 deletions(-) create mode 100644 Documentation/networking/decnet.rst delete mode 100644 Documentation/networking/decnet.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f2a93c8679e8..b23ab11587a6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -831,7 +831,7 @@ decnet.addr= [HW,NET] Format: [,] - See also Documentation/networking/decnet.txt. + See also Documentation/networking/decnet.rst. default_hugepagesz= [same as hugepagesz=] The size of the default diff --git a/Documentation/networking/decnet.rst b/Documentation/networking/decnet.rst new file mode 100644 index 000000000000..b8bc11ff8370 --- /dev/null +++ b/Documentation/networking/decnet.rst @@ -0,0 +1,243 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Linux DECnet Networking Layer Information +========================================= + +1. Other documentation.... +========================== + + - Project Home Pages + - http://www.chygwyn.com/ - Kernel info + - http://linux-decnet.sourceforge.net/ - Userland tools + - http://www.sourceforge.net/projects/linux-decnet/ - Status page + +2. Configuring the kernel +========================= + +Be sure to turn on the following options: + + - CONFIG_DECNET (obviously) + - CONFIG_PROC_FS (to see what's going on) + - CONFIG_SYSCTL (for easy configuration) + +if you want to try out router support (not properly debugged yet) +you'll need the following options as well... + + - CONFIG_DECNET_ROUTER (to be able to add/delete routes) + - CONFIG_NETFILTER (will be required for the DECnet routing daemon) + +Don't turn on SIOCGIFCONF support for DECnet unless you are really sure +that you need it, in general you won't and it can cause ifconfig to +malfunction. + +Run time configuration has changed slightly from the 2.4 system. If you +want to configure an endnode, then the simplified procedure is as follows: + + - Set the MAC address on your ethernet card before starting _any_ other + network protocols. + +As soon as your network card is brought into the UP state, DECnet should +start working. If you need something more complicated or are unsure how +to set the MAC address, see the next section. Also all configurations which +worked with 2.4 will work under 2.5 with no change. + +3. Command line options +======================= + +You can set a DECnet address on the kernel command line for compatibility +with the 2.4 configuration procedure, but in general it's not needed any more. +If you do st a DECnet address on the command line, it has only one purpose +which is that its added to the addresses on the loopback device. + +With 2.4 kernels, DECnet would only recognise addresses as local if they +were added to the loopback device. In 2.5, any local interface address +can be used to loop back to the local machine. Of course this does not +prevent you adding further addresses to the loopback device if you +want to. + +N.B. Since the address list of an interface determines the addresses for +which "hello" messages are sent, if you don't set an address on the loopback +interface then you won't see any entries in /proc/net/neigh for the local +host until such time as you start a connection. This doesn't affect the +operation of the local communications in any other way though. + +The kernel command line takes options looking like the following:: + + decnet.addr=1,2 + +the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels +and early 2.3.xx kernels, you must use a comma when specifying the +DECnet address like this. For more recent 2.3.xx kernels, you may +use almost any character except space, although a `.` would be the most +obvious choice :-) + +There used to be a third number specifying the node type. This option +has gone away in favour of a per interface node type. This is now set +using /proc/sys/net/decnet/conf//forwarding. This file can be +set with a single digit, 0=EndNode, 1=L1 Router and 2=L2 Router. + +There are also equivalent options for modules. The node address can +also be set through the /proc/sys/net/decnet/ files, as can other system +parameters. + +Currently the only supported devices are ethernet and ip_gre. The +ethernet address of your ethernet card has to be set according to the DECnet +address of the node in order for it to be autoconfigured (and then appear in +/proc/net/decnet_dev). There is a utility available at the above +FTP sites called dn2ethaddr which can compute the correct ethernet +address to use. The address can be set by ifconfig either before or +at the time the device is brought up. If you are using RedHat you can +add the line:: + + MACADDR=AA:00:04:00:03:04 + +or something similar, to /etc/sysconfig/network-scripts/ifcfg-eth0 or +wherever your network card's configuration lives. Setting the MAC address +of your ethernet card to an address starting with "hi-ord" will cause a +DECnet address which matches to be added to the interface (which you can +verify with iproute2). + +The default device for routing can be set through the /proc filesystem +by setting /proc/sys/net/decnet/default_device to the +device you want DECnet to route packets out of when no specific route +is available. Usually this will be eth0, for example:: + + echo -n "eth0" >/proc/sys/net/decnet/default_device + +If you don't set the default device, then it will default to the first +ethernet card which has been autoconfigured as described above. You can +confirm that by looking in the default_device file of course. + +There is a list of what the other files under /proc/sys/net/decnet/ do +on the kernel patch web site (shown above). + +4. Run time kernel configuration +================================ + + +This is either done through the sysctl/proc interface (see the kernel web +pages for details on what the various options do) or through the iproute2 +package in the same way as IPv4/6 configuration is performed. + +Documentation for iproute2 is included with the package, although there is +as yet no specific section on DECnet, most of the features apply to both +IP and DECnet, albeit with DECnet addresses instead of IP addresses and +a reduced functionality. + +If you want to configure a DECnet router you'll need the iproute2 package +since its the _only_ way to add and delete routes currently. Eventually +there will be a routing daemon to send and receive routing messages for +each interface and update the kernel routing tables accordingly. The +routing daemon will use netfilter to listen to routing packets, and +rtnetlink to update the kernels routing tables. + +The DECnet raw socket layer has been removed since it was there purely +for use by the routing daemon which will now use netfilter (a much cleaner +and more generic solution) instead. + +5. How can I tell if its working? +================================= + +Here is a quick guide of what to look for in order to know if your DECnet +kernel subsystem is working. + + - Is the node address set (see /proc/sys/net/decnet/node_address) + - Is the node of the correct type + (see /proc/sys/net/decnet/conf//forwarding) + - Is the Ethernet MAC address of each Ethernet card set to match + the DECnet address. If in doubt use the dn2ethaddr utility available + at the ftp archive. + - If the previous two steps are satisfied, and the Ethernet card is up, + you should find that it is listed in /proc/net/decnet_dev and also + that it appears as a directory in /proc/sys/net/decnet/conf/. The + loopback device (lo) should also appear and is required to communicate + within a node. + - If you have any DECnet routers on your network, they should appear + in /proc/net/decnet_neigh, otherwise this file will only contain the + entry for the node itself (if it doesn't check to see if lo is up). + - If you want to send to any node which is not listed in the + /proc/net/decnet_neigh file, you'll need to set the default device + to point to an Ethernet card with connection to a router. This is + again done with the /proc/sys/net/decnet/default_device file. + - Try starting a simple server and client, like the dnping/dnmirror + over the loopback interface. With luck they should communicate. + For this step and those after, you'll need the DECnet library + which can be obtained from the above ftp sites as well as the + actual utilities themselves. + - If this seems to work, then try talking to a node on your local + network, and see if you can obtain the same results. + - At this point you are on your own... :-) + +6. How to send a bug report +=========================== + +If you've found a bug and want to report it, then there are several things +you can do to help me work out exactly what it is that is wrong. Useful +information (_most_ of which _is_ _essential_) includes: + + - What kernel version are you running ? + - What version of the patch are you running ? + - How far though the above set of tests can you get ? + - What is in the /proc/decnet* files and /proc/sys/net/decnet/* files ? + - Which services are you running ? + - Which client caused the problem ? + - How much data was being transferred ? + - Was the network congested ? + - How can the problem be reproduced ? + - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of + tcpdump don't understand how to dump DECnet properly, so including + the hex listing of the packet contents is _essential_, usually the -x flag. + You may also need to increase the length grabbed with the -s flag. The + -e flag also provides very useful information (ethernet MAC addresses)) + +7. MAC FAQ +========== + +A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet +interact and how to get the best performance from your hardware. + +Ethernet cards are designed to normally only pass received network frames +to a host computer when they are addressed to it, or to the broadcast address. + +Linux has an interface which allows the setting of extra addresses for +an ethernet card to listen to. If the ethernet card supports it, the +filtering operation will be done in hardware, if not the extra unwanted packets +received will be discarded by the host computer. In the latter case, +significant processor time and bus bandwidth can be used up on a busy +network (see the NAPI documentation for a longer explanation of these +effects). + +DECnet makes use of this interface to allow running DECnet on an ethernet +card which has already been configured using TCP/IP (presumably using the +built in MAC address of the card, as usual) and/or to allow multiple DECnet +addresses on each physical interface. If you do this, be aware that if your +ethernet card doesn't support perfect hashing in its MAC address filter +then your computer will be doing more work than required. Some cards +will simply set themselves into promiscuous mode in order to receive +packets from the DECnet specified addresses. So if you have one of these +cards its better to set the MAC address of the card as described above +to gain the best efficiency. Better still is to use a card which supports +NAPI as well. + + +8. Mailing list +=============== + +If you are keen to get involved in development, or want to ask questions +about configuration, or even just report bugs, then there is a mailing +list that you can join, details are at: + +http://sourceforge.net/mail/?group_id=4993 + +9. Legal Info +============= + +The Linux DECnet project team have placed their code under the GPL. The +software is provided "as is" and without warranty express or implied. +DECnet is a trademark of Compaq. This software is not a product of +Compaq. We acknowledge the help of people at Compaq in providing extra +documentation above and beyond what was previously publicly available. + +Steve Whitehouse + diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt deleted file mode 100644 index d192f8b9948b..000000000000 --- a/Documentation/networking/decnet.txt +++ /dev/null @@ -1,230 +0,0 @@ - Linux DECnet Networking Layer Information - =========================================== - -1) Other documentation.... - - o Project Home Pages - http://www.chygwyn.com/ - Kernel info - http://linux-decnet.sourceforge.net/ - Userland tools - http://www.sourceforge.net/projects/linux-decnet/ - Status page - -2) Configuring the kernel - -Be sure to turn on the following options: - - CONFIG_DECNET (obviously) - CONFIG_PROC_FS (to see what's going on) - CONFIG_SYSCTL (for easy configuration) - -if you want to try out router support (not properly debugged yet) -you'll need the following options as well... - - CONFIG_DECNET_ROUTER (to be able to add/delete routes) - CONFIG_NETFILTER (will be required for the DECnet routing daemon) - -Don't turn on SIOCGIFCONF support for DECnet unless you are really sure -that you need it, in general you won't and it can cause ifconfig to -malfunction. - -Run time configuration has changed slightly from the 2.4 system. If you -want to configure an endnode, then the simplified procedure is as follows: - - o Set the MAC address on your ethernet card before starting _any_ other - network protocols. - -As soon as your network card is brought into the UP state, DECnet should -start working. If you need something more complicated or are unsure how -to set the MAC address, see the next section. Also all configurations which -worked with 2.4 will work under 2.5 with no change. - -3) Command line options - -You can set a DECnet address on the kernel command line for compatibility -with the 2.4 configuration procedure, but in general it's not needed any more. -If you do st a DECnet address on the command line, it has only one purpose -which is that its added to the addresses on the loopback device. - -With 2.4 kernels, DECnet would only recognise addresses as local if they -were added to the loopback device. In 2.5, any local interface address -can be used to loop back to the local machine. Of course this does not -prevent you adding further addresses to the loopback device if you -want to. - -N.B. Since the address list of an interface determines the addresses for -which "hello" messages are sent, if you don't set an address on the loopback -interface then you won't see any entries in /proc/net/neigh for the local -host until such time as you start a connection. This doesn't affect the -operation of the local communications in any other way though. - -The kernel command line takes options looking like the following: - - decnet.addr=1,2 - -the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels -and early 2.3.xx kernels, you must use a comma when specifying the -DECnet address like this. For more recent 2.3.xx kernels, you may -use almost any character except space, although a `.` would be the most -obvious choice :-) - -There used to be a third number specifying the node type. This option -has gone away in favour of a per interface node type. This is now set -using /proc/sys/net/decnet/conf//forwarding. This file can be -set with a single digit, 0=EndNode, 1=L1 Router and 2=L2 Router. - -There are also equivalent options for modules. The node address can -also be set through the /proc/sys/net/decnet/ files, as can other system -parameters. - -Currently the only supported devices are ethernet and ip_gre. The -ethernet address of your ethernet card has to be set according to the DECnet -address of the node in order for it to be autoconfigured (and then appear in -/proc/net/decnet_dev). There is a utility available at the above -FTP sites called dn2ethaddr which can compute the correct ethernet -address to use. The address can be set by ifconfig either before or -at the time the device is brought up. If you are using RedHat you can -add the line: - - MACADDR=AA:00:04:00:03:04 - -or something similar, to /etc/sysconfig/network-scripts/ifcfg-eth0 or -wherever your network card's configuration lives. Setting the MAC address -of your ethernet card to an address starting with "hi-ord" will cause a -DECnet address which matches to be added to the interface (which you can -verify with iproute2). - -The default device for routing can be set through the /proc filesystem -by setting /proc/sys/net/decnet/default_device to the -device you want DECnet to route packets out of when no specific route -is available. Usually this will be eth0, for example: - - echo -n "eth0" >/proc/sys/net/decnet/default_device - -If you don't set the default device, then it will default to the first -ethernet card which has been autoconfigured as described above. You can -confirm that by looking in the default_device file of course. - -There is a list of what the other files under /proc/sys/net/decnet/ do -on the kernel patch web site (shown above). - -4) Run time kernel configuration - -This is either done through the sysctl/proc interface (see the kernel web -pages for details on what the various options do) or through the iproute2 -package in the same way as IPv4/6 configuration is performed. - -Documentation for iproute2 is included with the package, although there is -as yet no specific section on DECnet, most of the features apply to both -IP and DECnet, albeit with DECnet addresses instead of IP addresses and -a reduced functionality. - -If you want to configure a DECnet router you'll need the iproute2 package -since its the _only_ way to add and delete routes currently. Eventually -there will be a routing daemon to send and receive routing messages for -each interface and update the kernel routing tables accordingly. The -routing daemon will use netfilter to listen to routing packets, and -rtnetlink to update the kernels routing tables. - -The DECnet raw socket layer has been removed since it was there purely -for use by the routing daemon which will now use netfilter (a much cleaner -and more generic solution) instead. - -5) How can I tell if its working ? - -Here is a quick guide of what to look for in order to know if your DECnet -kernel subsystem is working. - - - Is the node address set (see /proc/sys/net/decnet/node_address) - - Is the node of the correct type - (see /proc/sys/net/decnet/conf//forwarding) - - Is the Ethernet MAC address of each Ethernet card set to match - the DECnet address. If in doubt use the dn2ethaddr utility available - at the ftp archive. - - If the previous two steps are satisfied, and the Ethernet card is up, - you should find that it is listed in /proc/net/decnet_dev and also - that it appears as a directory in /proc/sys/net/decnet/conf/. The - loopback device (lo) should also appear and is required to communicate - within a node. - - If you have any DECnet routers on your network, they should appear - in /proc/net/decnet_neigh, otherwise this file will only contain the - entry for the node itself (if it doesn't check to see if lo is up). - - If you want to send to any node which is not listed in the - /proc/net/decnet_neigh file, you'll need to set the default device - to point to an Ethernet card with connection to a router. This is - again done with the /proc/sys/net/decnet/default_device file. - - Try starting a simple server and client, like the dnping/dnmirror - over the loopback interface. With luck they should communicate. - For this step and those after, you'll need the DECnet library - which can be obtained from the above ftp sites as well as the - actual utilities themselves. - - If this seems to work, then try talking to a node on your local - network, and see if you can obtain the same results. - - At this point you are on your own... :-) - -6) How to send a bug report - -If you've found a bug and want to report it, then there are several things -you can do to help me work out exactly what it is that is wrong. Useful -information (_most_ of which _is_ _essential_) includes: - - - What kernel version are you running ? - - What version of the patch are you running ? - - How far though the above set of tests can you get ? - - What is in the /proc/decnet* files and /proc/sys/net/decnet/* files ? - - Which services are you running ? - - Which client caused the problem ? - - How much data was being transferred ? - - Was the network congested ? - - How can the problem be reproduced ? - - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of - tcpdump don't understand how to dump DECnet properly, so including - the hex listing of the packet contents is _essential_, usually the -x flag. - You may also need to increase the length grabbed with the -s flag. The - -e flag also provides very useful information (ethernet MAC addresses)) - -7) MAC FAQ - -A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet -interact and how to get the best performance from your hardware. - -Ethernet cards are designed to normally only pass received network frames -to a host computer when they are addressed to it, or to the broadcast address. - -Linux has an interface which allows the setting of extra addresses for -an ethernet card to listen to. If the ethernet card supports it, the -filtering operation will be done in hardware, if not the extra unwanted packets -received will be discarded by the host computer. In the latter case, -significant processor time and bus bandwidth can be used up on a busy -network (see the NAPI documentation for a longer explanation of these -effects). - -DECnet makes use of this interface to allow running DECnet on an ethernet -card which has already been configured using TCP/IP (presumably using the -built in MAC address of the card, as usual) and/or to allow multiple DECnet -addresses on each physical interface. If you do this, be aware that if your -ethernet card doesn't support perfect hashing in its MAC address filter -then your computer will be doing more work than required. Some cards -will simply set themselves into promiscuous mode in order to receive -packets from the DECnet specified addresses. So if you have one of these -cards its better to set the MAC address of the card as described above -to gain the best efficiency. Better still is to use a card which supports -NAPI as well. - - -8) Mailing list - -If you are keen to get involved in development, or want to ask questions -about configuration, or even just report bugs, then there is a mailing -list that you can join, details are at: - -http://sourceforge.net/mail/?group_id=4993 - -9) Legal Info - -The Linux DECnet project team have placed their code under the GPL. The -software is provided "as is" and without warranty express or implied. -DECnet is a trademark of Compaq. This software is not a product of -Compaq. We acknowledge the help of people at Compaq in providing extra -documentation above and beyond what was previously publicly available. - -Steve Whitehouse - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 9e83d3bda4e0..e17432492745 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -50,6 +50,7 @@ Contents: cxacru dccp dctcp + decnet .. only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index 453fe0713e68..7323bfc1720f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4728,7 +4728,7 @@ DECnet NETWORK LAYER L: linux-decnet-user@lists.sourceforge.net S: Orphan W: http://linux-decnet.sourceforge.net -F: Documentation/networking/decnet.txt +F: Documentation/networking/decnet.rst F: net/decnet/ DECSTATION PLATFORM SUPPORT diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index 0935453ccfd5..8f98fb2f2ec9 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -15,7 +15,7 @@ config DECNET . More detailed documentation is available in - . + . Be sure to say Y to "/proc file system support" and "Sysctl support" below when using DECnet, since you will need sysctl support to aid @@ -40,4 +40,4 @@ config DECNET_ROUTER filtering" option will be required for the forthcoming routing daemon to work. - See for more information. + See for more information. -- cgit v1.2.3-59-g8ed1b From 5f32c920c23b75654a839aa87c344b2bcaf350e2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:31 +0200 Subject: docs: networking: convert defza.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - use :field: markup for the version number; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/defza.rst | 63 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/defza.txt | 57 ---------------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 64 insertions(+), 57 deletions(-) create mode 100644 Documentation/networking/defza.rst delete mode 100644 Documentation/networking/defza.txt diff --git a/Documentation/networking/defza.rst b/Documentation/networking/defza.rst new file mode 100644 index 000000000000..73c2f793ea26 --- /dev/null +++ b/Documentation/networking/defza.rst @@ -0,0 +1,63 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================================== +Notes on the DEC FDDIcontroller 700 (DEFZA-xx) driver +===================================================== + +:Version: v.1.1.4 + + +DEC FDDIcontroller 700 is DEC's first-generation TURBOchannel FDDI +network card, designed in 1990 specifically for the DECstation 5000 +model 200 workstation. The board is a single attachment station and +it was manufactured in two variations, both of which are supported. + +First is the SAS MMF DEFZA-AA option, the original design implementing +the standard MMF-PMD, however with a pair of ST connectors rather than +the usual MIC connector. The other one is the SAS ThinWire/STP DEFZA-CA +option, denoted 700-C, with the network medium selectable by a switch +between the DEC proprietary ThinWire-PMD using a BNC connector and the +standard STP-PMD using a DE-9F connector. This option can interface to +a DECconcentrator 500 device and, in the case of the STP-PMD, also other +FDDI equipment and was designed to make it easier to transition from +existing IEEE 802.3 10BASE2 Ethernet and IEEE 802.5 Token Ring networks +by providing means to reuse existing cabling. + +This driver handles any number of cards installed in a single system. +They get fddi0, fddi1, etc. interface names assigned in the order of +increasing TURBOchannel slot numbers. + +The board only supports DMA on the receive side. Transmission involves +the use of PIO. As a result under a heavy transmission load there will +be a significant impact on system performance. + +The board supports a 64-entry CAM for matching destination addresses. +Two entries are preoccupied by the Directed Beacon and Ring Purger +multicast addresses and the rest is used as a multicast filter. An +all-multi mode is also supported for LLC frames and it is used if +requested explicitly or if the CAM overflows. The promiscuous mode +supports separate enables for LLC and SMT frames, but this driver +doesn't support changing them individually. + + +Known problems: + +None. + + +To do: + +5. MAC address change. The card does not support changing the Media + Access Controller's address registers but a similar effect can be + achieved by adding an alias to the CAM. There is no way to disable + matching against the original address though. + +7. Queueing incoming/outgoing SMT frames in the driver if the SMT + receive/RMC transmit ring is full. (?) + +8. Retrieving/reporting FDDI/SNMP stats. + + +Both success and failure reports are welcome. + +Maciej W. Rozycki diff --git a/Documentation/networking/defza.txt b/Documentation/networking/defza.txt deleted file mode 100644 index 663e4a906751..000000000000 --- a/Documentation/networking/defza.txt +++ /dev/null @@ -1,57 +0,0 @@ -Notes on the DEC FDDIcontroller 700 (DEFZA-xx) driver v.1.1.4. - - -DEC FDDIcontroller 700 is DEC's first-generation TURBOchannel FDDI -network card, designed in 1990 specifically for the DECstation 5000 -model 200 workstation. The board is a single attachment station and -it was manufactured in two variations, both of which are supported. - -First is the SAS MMF DEFZA-AA option, the original design implementing -the standard MMF-PMD, however with a pair of ST connectors rather than -the usual MIC connector. The other one is the SAS ThinWire/STP DEFZA-CA -option, denoted 700-C, with the network medium selectable by a switch -between the DEC proprietary ThinWire-PMD using a BNC connector and the -standard STP-PMD using a DE-9F connector. This option can interface to -a DECconcentrator 500 device and, in the case of the STP-PMD, also other -FDDI equipment and was designed to make it easier to transition from -existing IEEE 802.3 10BASE2 Ethernet and IEEE 802.5 Token Ring networks -by providing means to reuse existing cabling. - -This driver handles any number of cards installed in a single system. -They get fddi0, fddi1, etc. interface names assigned in the order of -increasing TURBOchannel slot numbers. - -The board only supports DMA on the receive side. Transmission involves -the use of PIO. As a result under a heavy transmission load there will -be a significant impact on system performance. - -The board supports a 64-entry CAM for matching destination addresses. -Two entries are preoccupied by the Directed Beacon and Ring Purger -multicast addresses and the rest is used as a multicast filter. An -all-multi mode is also supported for LLC frames and it is used if -requested explicitly or if the CAM overflows. The promiscuous mode -supports separate enables for LLC and SMT frames, but this driver -doesn't support changing them individually. - - -Known problems: - -None. - - -To do: - -5. MAC address change. The card does not support changing the Media - Access Controller's address registers but a similar effect can be - achieved by adding an alias to the CAM. There is no way to disable - matching against the original address though. - -7. Queueing incoming/outgoing SMT frames in the driver if the SMT - receive/RMC transmit ring is full. (?) - -8. Retrieving/reporting FDDI/SNMP stats. - - -Both success and failure reports are welcome. - -Maciej W. Rozycki diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e17432492745..c893127004b9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -51,6 +51,7 @@ Contents: dccp dctcp decnet + defza .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 9dfe1361261be48c92fd7cb26909cbcd5d496220 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:32 +0200 Subject: docs: networking: convert dns_resolver.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - comment out text-only TOC from html/pdf output; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/dns_resolver.rst | 155 +++++++++++++++++++++++++++++ Documentation/networking/dns_resolver.txt | 157 ------------------------------ Documentation/networking/index.rst | 1 + net/ceph/Kconfig | 2 +- net/dns_resolver/Kconfig | 2 +- net/dns_resolver/dns_key.c | 2 +- net/dns_resolver/dns_query.c | 2 +- 7 files changed, 160 insertions(+), 161 deletions(-) create mode 100644 Documentation/networking/dns_resolver.rst delete mode 100644 Documentation/networking/dns_resolver.txt diff --git a/Documentation/networking/dns_resolver.rst b/Documentation/networking/dns_resolver.rst new file mode 100644 index 000000000000..add4d59a99a5 --- /dev/null +++ b/Documentation/networking/dns_resolver.rst @@ -0,0 +1,155 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +DNS Resolver Module +=================== + +.. Contents: + + - Overview. + - Compilation. + - Setting up. + - Usage. + - Mechanism. + - Debugging. + + +Overview +======== + +The DNS resolver module provides a way for kernel services to make DNS queries +by way of requesting a key of key type dns_resolver. These queries are +upcalled to userspace through /sbin/request-key. + +These routines must be supported by userspace tools dns.upcall, cifs.upcall and +request-key. It is under development and does not yet provide the full feature +set. The features it does support include: + + (*) Implements the dns_resolver key_type to contact userspace. + +It does not yet support the following AFS features: + + (*) Dns query support for AFSDB resource record. + +This code is extracted from the CIFS filesystem. + + +Compilation +=========== + +The module should be enabled by turning on the kernel configuration options:: + + CONFIG_DNS_RESOLVER - tristate "DNS Resolver support" + + +Setting up +========== + +To set up this facility, the /etc/request-key.conf file must be altered so that +/sbin/request-key can appropriately direct the upcalls. For example, to handle +basic dname to IPv4/IPv6 address resolution, the following line should be +added:: + + + #OP TYPE DESC CO-INFO PROGRAM ARG1 ARG2 ARG3 ... + #====== ============ ======= ======= ========================== + create dns_resolver * * /usr/sbin/cifs.upcall %k + +To direct a query for query type 'foo', a line of the following should be added +before the more general line given above as the first match is the one taken:: + + create dns_resolver foo:* * /usr/sbin/dns.foo %k + + +Usage +===== + +To make use of this facility, one of the following functions that are +implemented in the module can be called after doing:: + + #include + + :: + + int dns_query(const char *type, const char *name, size_t namelen, + const char *options, char **_result, time_t *_expiry); + + This is the basic access function. It looks for a cached DNS query and if + it doesn't find it, it upcalls to userspace to make a new DNS query, which + may then be cached. The key description is constructed as a string of the + form:: + + [:] + + where optionally specifies the particular upcall program to invoke, + and thus the type of query to do, and specifies the string to be + looked up. The default query type is a straight hostname to IP address + set lookup. + + The name parameter is not required to be a NUL-terminated string, and its + length should be given by the namelen argument. + + The options parameter may be NULL or it may be a set of options + appropriate to the query type. + + The return value is a string appropriate to the query type. For instance, + for the default query type it is just a list of comma-separated IPv4 and + IPv6 addresses. The caller must free the result. + + The length of the result string is returned on success, and a negative + error code is returned otherwise. -EKEYREJECTED will be returned if the + DNS lookup failed. + + If _expiry is non-NULL, the expiry time (TTL) of the result will be + returned also. + +The kernel maintains an internal keyring in which it caches looked up keys. +This can be cleared by any process that has the CAP_SYS_ADMIN capability by +the use of KEYCTL_KEYRING_CLEAR on the keyring ID. + + +Reading DNS Keys from Userspace +=============================== + +Keys of dns_resolver type can be read from userspace using keyctl_read() or +"keyctl read/print/pipe". + + +Mechanism +========= + +The dnsresolver module registers a key type called "dns_resolver". Keys of +this type are used to transport and cache DNS lookup results from userspace. + +When dns_query() is invoked, it calls request_key() to search the local +keyrings for a cached DNS result. If that fails to find one, it upcalls to +userspace to get a new result. + +Upcalls to userspace are made through the request_key() upcall vector, and are +directed by means of configuration lines in /etc/request-key.conf that tell +/sbin/request-key what program to run to instantiate the key. + +The upcall handler program is responsible for querying the DNS, processing the +result into a form suitable for passing to the keyctl_instantiate_key() +routine. This then passes the data to dns_resolver_instantiate() which strips +off and processes any options included in the data, and then attaches the +remainder of the string to the key as its payload. + +The upcall handler program should set the expiry time on the key to that of the +lowest TTL of all the records it has extracted a result from. This means that +the key will be discarded and recreated when the data it holds has expired. + +dns_query() returns a copy of the value attached to the key, or an error if +that is indicated instead. + +See for further +information about request-key function. + + +Debugging +========= + +Debugging messages can be turned on dynamically by writing a 1 into the +following file:: + + /sys/module/dnsresolver/parameters/debug diff --git a/Documentation/networking/dns_resolver.txt b/Documentation/networking/dns_resolver.txt deleted file mode 100644 index eaa8f9a6fd5d..000000000000 --- a/Documentation/networking/dns_resolver.txt +++ /dev/null @@ -1,157 +0,0 @@ - =================== - DNS Resolver Module - =================== - -Contents: - - - Overview. - - Compilation. - - Setting up. - - Usage. - - Mechanism. - - Debugging. - - -======== -OVERVIEW -======== - -The DNS resolver module provides a way for kernel services to make DNS queries -by way of requesting a key of key type dns_resolver. These queries are -upcalled to userspace through /sbin/request-key. - -These routines must be supported by userspace tools dns.upcall, cifs.upcall and -request-key. It is under development and does not yet provide the full feature -set. The features it does support include: - - (*) Implements the dns_resolver key_type to contact userspace. - -It does not yet support the following AFS features: - - (*) Dns query support for AFSDB resource record. - -This code is extracted from the CIFS filesystem. - - -=========== -COMPILATION -=========== - -The module should be enabled by turning on the kernel configuration options: - - CONFIG_DNS_RESOLVER - tristate "DNS Resolver support" - - -========== -SETTING UP -========== - -To set up this facility, the /etc/request-key.conf file must be altered so that -/sbin/request-key can appropriately direct the upcalls. For example, to handle -basic dname to IPv4/IPv6 address resolution, the following line should be -added: - - #OP TYPE DESC CO-INFO PROGRAM ARG1 ARG2 ARG3 ... - #====== ============ ======= ======= ========================== - create dns_resolver * * /usr/sbin/cifs.upcall %k - -To direct a query for query type 'foo', a line of the following should be added -before the more general line given above as the first match is the one taken. - - create dns_resolver foo:* * /usr/sbin/dns.foo %k - - -===== -USAGE -===== - -To make use of this facility, one of the following functions that are -implemented in the module can be called after doing: - - #include - - (1) int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry); - - This is the basic access function. It looks for a cached DNS query and if - it doesn't find it, it upcalls to userspace to make a new DNS query, which - may then be cached. The key description is constructed as a string of the - form: - - [:] - - where optionally specifies the particular upcall program to invoke, - and thus the type of query to do, and specifies the string to be - looked up. The default query type is a straight hostname to IP address - set lookup. - - The name parameter is not required to be a NUL-terminated string, and its - length should be given by the namelen argument. - - The options parameter may be NULL or it may be a set of options - appropriate to the query type. - - The return value is a string appropriate to the query type. For instance, - for the default query type it is just a list of comma-separated IPv4 and - IPv6 addresses. The caller must free the result. - - The length of the result string is returned on success, and a negative - error code is returned otherwise. -EKEYREJECTED will be returned if the - DNS lookup failed. - - If _expiry is non-NULL, the expiry time (TTL) of the result will be - returned also. - -The kernel maintains an internal keyring in which it caches looked up keys. -This can be cleared by any process that has the CAP_SYS_ADMIN capability by -the use of KEYCTL_KEYRING_CLEAR on the keyring ID. - - -=============================== -READING DNS KEYS FROM USERSPACE -=============================== - -Keys of dns_resolver type can be read from userspace using keyctl_read() or -"keyctl read/print/pipe". - - -========= -MECHANISM -========= - -The dnsresolver module registers a key type called "dns_resolver". Keys of -this type are used to transport and cache DNS lookup results from userspace. - -When dns_query() is invoked, it calls request_key() to search the local -keyrings for a cached DNS result. If that fails to find one, it upcalls to -userspace to get a new result. - -Upcalls to userspace are made through the request_key() upcall vector, and are -directed by means of configuration lines in /etc/request-key.conf that tell -/sbin/request-key what program to run to instantiate the key. - -The upcall handler program is responsible for querying the DNS, processing the -result into a form suitable for passing to the keyctl_instantiate_key() -routine. This then passes the data to dns_resolver_instantiate() which strips -off and processes any options included in the data, and then attaches the -remainder of the string to the key as its payload. - -The upcall handler program should set the expiry time on the key to that of the -lowest TTL of all the records it has extracted a result from. This means that -the key will be discarded and recreated when the data it holds has expired. - -dns_query() returns a copy of the value attached to the key, or an error if -that is indicated instead. - -See for further -information about request-key function. - - -========= -DEBUGGING -========= - -Debugging messages can be turned on dynamically by writing a 1 into the -following file: - - /sys/module/dnsresolver/parameters/debug diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c893127004b9..55746038a7e9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -52,6 +52,7 @@ Contents: dctcp decnet defza + dns_resolver .. only:: subproject and html diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index 2e8e6f904920..d7bec7adc267 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -39,6 +39,6 @@ config CEPH_LIB_USE_DNS_RESOLVER be resolved using the CONFIG_DNS_RESOLVER facility. For information on how to use CONFIG_DNS_RESOLVER consult - Documentation/networking/dns_resolver.txt + Documentation/networking/dns_resolver.rst If unsure, say N. diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig index 0a1c2238b4bd..255df9b6e9e8 100644 --- a/net/dns_resolver/Kconfig +++ b/net/dns_resolver/Kconfig @@ -19,7 +19,7 @@ config DNS_RESOLVER SMB2 later. DNS Resolver is supported by the userspace upcall helper "/sbin/dns.resolver" via /etc/request-key.conf. - See for further + See for further information. To compile this as a module, choose M here: the module will be called diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index ad53eb31d40f..3aced951d5ab 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -1,6 +1,6 @@ /* Key type used to cache DNS lookups made by the kernel * - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index cab4e0df924f..82b084cc1cc6 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -1,7 +1,7 @@ /* Upcall routine, designed to work as a key type and working through * /sbin/request-key to contact userspace when handling DNS queries. * - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) -- cgit v1.2.3-59-g8ed1b From 28d23311ff35ac97ff20608f47c84c95d6389c33 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:33 +0200 Subject: docs: networking: convert driver.txt to ReST - add SPDX header; - add a document title; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/driver.rst | 97 +++++++++++++++++++++++++++++++++++++ Documentation/networking/driver.txt | 93 ----------------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 98 insertions(+), 93 deletions(-) create mode 100644 Documentation/networking/driver.rst delete mode 100644 Documentation/networking/driver.txt diff --git a/Documentation/networking/driver.rst b/Documentation/networking/driver.rst new file mode 100644 index 000000000000..c8f59dbda46f --- /dev/null +++ b/Documentation/networking/driver.rst @@ -0,0 +1,97 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Softnet Driver Issues +===================== + +Transmit path guidelines: + +1) The ndo_start_xmit method must not return NETDEV_TX_BUSY under + any normal circumstances. It is considered a hard error unless + there is no way your device can tell ahead of time when it's + transmit function will become busy. + + Instead it must maintain the queue properly. For example, + for a driver implementing scatter-gather this means:: + + static netdev_tx_t drv_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev) + { + struct drv *dp = netdev_priv(dev); + + lock_tx(dp); + ... + /* This is a hard error log it. */ + if (TX_BUFFS_AVAIL(dp) <= (skb_shinfo(skb)->nr_frags + 1)) { + netif_stop_queue(dev); + unlock_tx(dp); + printk(KERN_ERR PFX "%s: BUG! Tx Ring full when queue awake!\n", + dev->name); + return NETDEV_TX_BUSY; + } + + ... queue packet to card ... + ... update tx consumer index ... + + if (TX_BUFFS_AVAIL(dp) <= (MAX_SKB_FRAGS + 1)) + netif_stop_queue(dev); + + ... + unlock_tx(dp); + ... + return NETDEV_TX_OK; + } + + And then at the end of your TX reclamation event handling:: + + if (netif_queue_stopped(dp->dev) && + TX_BUFFS_AVAIL(dp) > (MAX_SKB_FRAGS + 1)) + netif_wake_queue(dp->dev); + + For a non-scatter-gather supporting card, the three tests simply become:: + + /* This is a hard error log it. */ + if (TX_BUFFS_AVAIL(dp) <= 0) + + and:: + + if (TX_BUFFS_AVAIL(dp) == 0) + + and:: + + if (netif_queue_stopped(dp->dev) && + TX_BUFFS_AVAIL(dp) > 0) + netif_wake_queue(dp->dev); + +2) An ndo_start_xmit method must not modify the shared parts of a + cloned SKB. + +3) Do not forget that once you return NETDEV_TX_OK from your + ndo_start_xmit method, it is your driver's responsibility to free + up the SKB and in some finite amount of time. + + For example, this means that it is not allowed for your TX + mitigation scheme to let TX packets "hang out" in the TX + ring unreclaimed forever if no new TX packets are sent. + This error can deadlock sockets waiting for send buffer room + to be freed up. + + If you return NETDEV_TX_BUSY from the ndo_start_xmit method, you + must not keep any reference to that SKB and you must not attempt + to free it up. + +Probing guidelines: + +1) Any hardware layer address you obtain for your device should + be verified. For example, for ethernet check it with + linux/etherdevice.h:is_valid_ether_addr() + +Close/stop guidelines: + +1) After the ndo_stop routine has been called, the hardware must + not receive or transmit any data. All in flight packets must + be aborted. If necessary, poll or wait for completion of + any reset commands. + +2) The ndo_stop routine will be called by unregister_netdevice + if device is still UP. diff --git a/Documentation/networking/driver.txt b/Documentation/networking/driver.txt deleted file mode 100644 index da59e2884130..000000000000 --- a/Documentation/networking/driver.txt +++ /dev/null @@ -1,93 +0,0 @@ -Document about softnet driver issues - -Transmit path guidelines: - -1) The ndo_start_xmit method must not return NETDEV_TX_BUSY under - any normal circumstances. It is considered a hard error unless - there is no way your device can tell ahead of time when it's - transmit function will become busy. - - Instead it must maintain the queue properly. For example, - for a driver implementing scatter-gather this means: - - static netdev_tx_t drv_hard_start_xmit(struct sk_buff *skb, - struct net_device *dev) - { - struct drv *dp = netdev_priv(dev); - - lock_tx(dp); - ... - /* This is a hard error log it. */ - if (TX_BUFFS_AVAIL(dp) <= (skb_shinfo(skb)->nr_frags + 1)) { - netif_stop_queue(dev); - unlock_tx(dp); - printk(KERN_ERR PFX "%s: BUG! Tx Ring full when queue awake!\n", - dev->name); - return NETDEV_TX_BUSY; - } - - ... queue packet to card ... - ... update tx consumer index ... - - if (TX_BUFFS_AVAIL(dp) <= (MAX_SKB_FRAGS + 1)) - netif_stop_queue(dev); - - ... - unlock_tx(dp); - ... - return NETDEV_TX_OK; - } - - And then at the end of your TX reclamation event handling: - - if (netif_queue_stopped(dp->dev) && - TX_BUFFS_AVAIL(dp) > (MAX_SKB_FRAGS + 1)) - netif_wake_queue(dp->dev); - - For a non-scatter-gather supporting card, the three tests simply become: - - /* This is a hard error log it. */ - if (TX_BUFFS_AVAIL(dp) <= 0) - - and: - - if (TX_BUFFS_AVAIL(dp) == 0) - - and: - - if (netif_queue_stopped(dp->dev) && - TX_BUFFS_AVAIL(dp) > 0) - netif_wake_queue(dp->dev); - -2) An ndo_start_xmit method must not modify the shared parts of a - cloned SKB. - -3) Do not forget that once you return NETDEV_TX_OK from your - ndo_start_xmit method, it is your driver's responsibility to free - up the SKB and in some finite amount of time. - - For example, this means that it is not allowed for your TX - mitigation scheme to let TX packets "hang out" in the TX - ring unreclaimed forever if no new TX packets are sent. - This error can deadlock sockets waiting for send buffer room - to be freed up. - - If you return NETDEV_TX_BUSY from the ndo_start_xmit method, you - must not keep any reference to that SKB and you must not attempt - to free it up. - -Probing guidelines: - -1) Any hardware layer address you obtain for your device should - be verified. For example, for ethernet check it with - linux/etherdevice.h:is_valid_ether_addr() - -Close/stop guidelines: - -1) After the ndo_stop routine has been called, the hardware must - not receive or transmit any data. All in flight packets must - be aborted. If necessary, poll or wait for completion of - any reset commands. - -2) The ndo_stop routine will be called by unregister_netdevice - if device is still UP. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 55746038a7e9..313f66900bce 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -53,6 +53,7 @@ Contents: decnet defza dns_resolver + driver .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 06df65723b69a3d4691737503654400c35f9ca5a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:34 +0200 Subject: docs: networking: convert eql.txt to ReST - add SPDX header; - add a document title; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/eql.rst | 373 ++++++++++++++++++++++++++ Documentation/networking/eql.txt | 528 ------------------------------------- Documentation/networking/index.rst | 1 + drivers/net/Kconfig | 2 +- 4 files changed, 375 insertions(+), 529 deletions(-) create mode 100644 Documentation/networking/eql.rst delete mode 100644 Documentation/networking/eql.txt diff --git a/Documentation/networking/eql.rst b/Documentation/networking/eql.rst new file mode 100644 index 000000000000..a628c4c81166 --- /dev/null +++ b/Documentation/networking/eql.rst @@ -0,0 +1,373 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================== +EQL Driver: Serial IP Load Balancing HOWTO +========================================== + + Simon "Guru Aleph-Null" Janes, simon@ncm.com + + v1.1, February 27, 1995 + + This is the manual for the EQL device driver. EQL is a software device + that lets you load-balance IP serial links (SLIP or uncompressed PPP) + to increase your bandwidth. It will not reduce your latency (i.e. ping + times) except in the case where you already have lots of traffic on + your link, in which it will help them out. This driver has been tested + with the 1.1.75 kernel, and is known to have patched cleanly with + 1.1.86. Some testing with 1.1.92 has been done with the v1.1 patch + which was only created to patch cleanly in the very latest kernel + source trees. (Yes, it worked fine.) + +1. Introduction +=============== + + Which is worse? A huge fee for a 56K leased line or two phone lines? + It's probably the former. If you find yourself craving more bandwidth, + and have a ISP that is flexible, it is now possible to bind modems + together to work as one point-to-point link to increase your + bandwidth. All without having to have a special black box on either + side. + + + The eql driver has only been tested with the Livingston PortMaster-2e + terminal server. I do not know if other terminal servers support load- + balancing, but I do know that the PortMaster does it, and does it + almost as well as the eql driver seems to do it (-- Unfortunately, in + my testing so far, the Livingston PortMaster 2e's load-balancing is a + good 1 to 2 KB/s slower than the test machine working with a 28.8 Kbps + and 14.4 Kbps connection. However, I am not sure that it really is + the PortMaster, or if it's Linux's TCP drivers. I'm told that Linux's + TCP implementation is pretty fast though.--) + + + I suggest to ISPs out there that it would probably be fair to charge + a load-balancing client 75% of the cost of the second line and 50% of + the cost of the third line etc... + + + Hey, we can all dream you know... + + +2. Kernel Configuration +======================= + + Here I describe the general steps of getting a kernel up and working + with the eql driver. From patching, building, to installing. + + +2.1. Patching The Kernel +------------------------ + + If you do not have or cannot get a copy of the kernel with the eql + driver folded into it, get your copy of the driver from + ftp://slaughter.ncm.com/pub/Linux/LOAD_BALANCING/eql-1.1.tar.gz. + Unpack this archive someplace obvious like /usr/local/src/. It will + create the following files:: + + -rw-r--r-- guru/ncm 198 Jan 19 18:53 1995 eql-1.1/NO-WARRANTY + -rw-r--r-- guru/ncm 30620 Feb 27 21:40 1995 eql-1.1/eql-1.1.patch + -rwxr-xr-x guru/ncm 16111 Jan 12 22:29 1995 eql-1.1/eql_enslave + -rw-r--r-- guru/ncm 2195 Jan 10 21:48 1995 eql-1.1/eql_enslave.c + + Unpack a recent kernel (something after 1.1.92) someplace convenient + like say /usr/src/linux-1.1.92.eql. Use symbolic links to point + /usr/src/linux to this development directory. + + + Apply the patch by running the commands:: + + cd /usr/src + patch + ". Here are some example enslavings:: + + eql_enslave eql sl0 28800 + eql_enslave eql ppp0 14400 + eql_enslave eql sl1 57600 + + When you want to free a device from its life of slavery, you can + either down the device with ifconfig (eql will automatically bury the + dead slave and remove it from its queue) or use eql_emancipate to free + it. (-- Or just ifconfig it down, and the eql driver will take it out + for you.--):: + + eql_emancipate eql sl0 + eql_emancipate eql ppp0 + eql_emancipate eql sl1 + + +3.3. DSLIP Configuration for the eql Device +------------------------------------------- + + The general idea is to bring up and keep up as many SLIP connections + as you need, automatically. + + +3.3.1. /etc/slip/runslip.conf +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + Here is an example runslip.conf:: + + name sl-line-1 + enabled + baud 38400 + mtu 576 + ducmd -e /etc/slip/dialout/cua2-288.xp -t 9 + command eql_enslave eql $interface 28800 + address 198.67.33.239 + line /dev/cua2 + + name sl-line-2 + enabled + baud 38400 + mtu 576 + ducmd -e /etc/slip/dialout/cua3-288.xp -t 9 + command eql_enslave eql $interface 28800 + address 198.67.33.239 + line /dev/cua3 + + +3.4. Using PPP and the eql Device +--------------------------------- + + I have not yet done any load-balancing testing for PPP devices, mainly + because I don't have a PPP-connection manager like SLIP has with + DSLIP. I did find a good tip from LinuxNET:Billy for PPP performance: + make sure you have asyncmap set to something so that control + characters are not escaped. + + + I tried to fix up a PPP script/system for redialing lost PPP + connections for use with the eql driver the weekend of Feb 25-26 '95 + (Hereafter known as the 8-hour PPP Hate Festival). Perhaps later this + year. + + +4. About the Slave Scheduler Algorithm +====================================== + + The slave scheduler probably could be replaced with a dozen other + things and push traffic much faster. The formula in the current set + up of the driver was tuned to handle slaves with wildly different + bits-per-second "priorities". + + + All testing I have done was with two 28.8 V.FC modems, one connecting + at 28800 bps or slower, and the other connecting at 14400 bps all the + time. + + + One version of the scheduler was able to push 5.3 K/s through the + 28800 and 14400 connections, but when the priorities on the links were + very wide apart (57600 vs. 14400) the "faster" modem received all + traffic and the "slower" modem starved. + + +5. Testers' Reports +=================== + + Some people have experimented with the eql device with newer + kernels (than 1.1.75). I have since updated the driver to patch + cleanly in newer kernels because of the removal of the old "slave- + balancing" driver config option. + + + - icee from LinuxNET patched 1.1.86 without any rejects and was able + to boot the kernel and enslave a couple of ISDN PPP links. + +5.1. Randolph Bentson's Test Report +----------------------------------- + + :: + + From bentson@grieg.seaslug.org Wed Feb 8 19:08:09 1995 + Date: Tue, 7 Feb 95 22:57 PST + From: Randolph Bentson + To: guru@ncm.com + Subject: EQL driver tests + + + I have been checking out your eql driver. (Nice work, that!) + Although you may already done this performance testing, here + are some data I've discovered. + + Randolph Bentson + bentson@grieg.seaslug.org + +------------------------------------------------------------------ + + + A pseudo-device driver, EQL, written by Simon Janes, can be used + to bundle multiple SLIP connections into what appears to be a + single connection. This allows one to improve dial-up network + connectivity gradually, without having to buy expensive DSU/CSU + hardware and services. + + I have done some testing of this software, with two goals in + mind: first, to ensure it actually works as described and + second, as a method of exercising my device driver. + + The following performance measurements were derived from a set + of SLIP connections run between two Linux systems (1.1.84) using + a 486DX2/66 with a Cyclom-8Ys and a 486SLC/40 with a Cyclom-16Y. + (Ports 0,1,2,3 were used. A later configuration will distribute + port selection across the different Cirrus chips on the boards.) + Once a link was established, I timed a binary ftp transfer of + 289284 bytes of data. If there were no overhead (packet headers, + inter-character and inter-packet delays, etc.) the transfers + would take the following times:: + + bits/sec seconds + 345600 8.3 + 234600 12.3 + 172800 16.7 + 153600 18.8 + 76800 37.6 + 57600 50.2 + 38400 75.3 + 28800 100.4 + 19200 150.6 + 9600 301.3 + + A single line running at the lower speeds and with large packets + comes to within 2% of this. Performance is limited for the higher + speeds (as predicted by the Cirrus databook) to an aggregate of + about 160 kbits/sec. The next round of testing will distribute + the load across two or more Cirrus chips. + + The good news is that one gets nearly the full advantage of the + second, third, and fourth line's bandwidth. (The bad news is + that the connection establishment seemed fragile for the higher + speeds. Once established, the connection seemed robust enough.) + + ====== ======== === ======== ======= ======= === + #lines speed mtu seconds theory actual %of + kbit/sec duration speed speed max + ====== ======== === ======== ======= ======= === + 3 115200 900 _ 345600 + 3 115200 400 18.1 345600 159825 46 + 2 115200 900 _ 230400 + 2 115200 600 18.1 230400 159825 69 + 2 115200 400 19.3 230400 149888 65 + 4 57600 900 _ 234600 + 4 57600 600 _ 234600 + 4 57600 400 _ 234600 + 3 57600 600 20.9 172800 138413 80 + 3 57600 900 21.2 172800 136455 78 + 3 115200 600 21.7 345600 133311 38 + 3 57600 400 22.5 172800 128571 74 + 4 38400 900 25.2 153600 114795 74 + 4 38400 600 26.4 153600 109577 71 + 4 38400 400 27.3 153600 105965 68 + 2 57600 900 29.1 115200 99410.3 86 + 1 115200 900 30.7 115200 94229.3 81 + 2 57600 600 30.2 115200 95789.4 83 + 3 38400 900 30.3 115200 95473.3 82 + 3 38400 600 31.2 115200 92719.2 80 + 1 115200 600 31.3 115200 92423 80 + 2 57600 400 32.3 115200 89561.6 77 + 1 115200 400 32.8 115200 88196.3 76 + 3 38400 400 33.5 115200 86353.4 74 + 2 38400 900 43.7 76800 66197.7 86 + 2 38400 600 44 76800 65746.4 85 + 2 38400 400 47.2 76800 61289 79 + 4 19200 900 50.8 76800 56945.7 74 + 4 19200 400 53.2 76800 54376.7 70 + 4 19200 600 53.7 76800 53870.4 70 + 1 57600 900 54.6 57600 52982.4 91 + 1 57600 600 56.2 57600 51474 89 + 3 19200 900 60.5 57600 47815.5 83 + 1 57600 400 60.2 57600 48053.8 83 + 3 19200 600 62 57600 46658.7 81 + 3 19200 400 64.7 57600 44711.6 77 + 1 38400 900 79.4 38400 36433.8 94 + 1 38400 600 82.4 38400 35107.3 91 + 2 19200 900 84.4 38400 34275.4 89 + 1 38400 400 86.8 38400 33327.6 86 + 2 19200 600 87.6 38400 33023.3 85 + 2 19200 400 91.2 38400 31719.7 82 + 4 9600 900 94.7 38400 30547.4 79 + 4 9600 400 106 38400 27290.9 71 + 4 9600 600 110 38400 26298.5 68 + 3 9600 900 118 28800 24515.6 85 + 3 9600 600 120 28800 24107 83 + 3 9600 400 131 28800 22082.7 76 + 1 19200 900 155 19200 18663.5 97 + 1 19200 600 161 19200 17968 93 + 1 19200 400 170 19200 17016.7 88 + 2 9600 600 176 19200 16436.6 85 + 2 9600 900 180 19200 16071.3 83 + 2 9600 400 181 19200 15982.5 83 + 1 9600 900 305 9600 9484.72 98 + 1 9600 600 314 9600 9212.87 95 + 1 9600 400 332 9600 8713.37 90 + ====== ======== === ======== ======= ======= === + +5.2. Anthony Healy's Report +--------------------------- + + :: + + Date: Mon, 13 Feb 1995 16:17:29 +1100 (EST) + From: Antony Healey + To: Simon Janes + Subject: Re: Load Balancing + + Hi Simon, + I've installed your patch and it works great. I have trialed + it over twin SL/IP lines, just over null modems, but I was + able to data at over 48Kb/s [ISDN link -Simon]. I managed a + transfer of up to 7.5 Kbyte/s on one go, but averaged around + 6.4 Kbyte/s, which I think is pretty cool. :) diff --git a/Documentation/networking/eql.txt b/Documentation/networking/eql.txt deleted file mode 100644 index 0f1550150f05..000000000000 --- a/Documentation/networking/eql.txt +++ /dev/null @@ -1,528 +0,0 @@ - EQL Driver: Serial IP Load Balancing HOWTO - Simon "Guru Aleph-Null" Janes, simon@ncm.com - v1.1, February 27, 1995 - - This is the manual for the EQL device driver. EQL is a software device - that lets you load-balance IP serial links (SLIP or uncompressed PPP) - to increase your bandwidth. It will not reduce your latency (i.e. ping - times) except in the case where you already have lots of traffic on - your link, in which it will help them out. This driver has been tested - with the 1.1.75 kernel, and is known to have patched cleanly with - 1.1.86. Some testing with 1.1.92 has been done with the v1.1 patch - which was only created to patch cleanly in the very latest kernel - source trees. (Yes, it worked fine.) - - 1. Introduction - - Which is worse? A huge fee for a 56K leased line or two phone lines? - It's probably the former. If you find yourself craving more bandwidth, - and have a ISP that is flexible, it is now possible to bind modems - together to work as one point-to-point link to increase your - bandwidth. All without having to have a special black box on either - side. - - - The eql driver has only been tested with the Livingston PortMaster-2e - terminal server. I do not know if other terminal servers support load- - balancing, but I do know that the PortMaster does it, and does it - almost as well as the eql driver seems to do it (-- Unfortunately, in - my testing so far, the Livingston PortMaster 2e's load-balancing is a - good 1 to 2 KB/s slower than the test machine working with a 28.8 Kbps - and 14.4 Kbps connection. However, I am not sure that it really is - the PortMaster, or if it's Linux's TCP drivers. I'm told that Linux's - TCP implementation is pretty fast though.--) - - - I suggest to ISPs out there that it would probably be fair to charge - a load-balancing client 75% of the cost of the second line and 50% of - the cost of the third line etc... - - - Hey, we can all dream you know... - - - 2. Kernel Configuration - - Here I describe the general steps of getting a kernel up and working - with the eql driver. From patching, building, to installing. - - - 2.1. Patching The Kernel - - If you do not have or cannot get a copy of the kernel with the eql - driver folded into it, get your copy of the driver from - ftp://slaughter.ncm.com/pub/Linux/LOAD_BALANCING/eql-1.1.tar.gz. - Unpack this archive someplace obvious like /usr/local/src/. It will - create the following files: - - - - ______________________________________________________________________ - -rw-r--r-- guru/ncm 198 Jan 19 18:53 1995 eql-1.1/NO-WARRANTY - -rw-r--r-- guru/ncm 30620 Feb 27 21:40 1995 eql-1.1/eql-1.1.patch - -rwxr-xr-x guru/ncm 16111 Jan 12 22:29 1995 eql-1.1/eql_enslave - -rw-r--r-- guru/ncm 2195 Jan 10 21:48 1995 eql-1.1/eql_enslave.c - ______________________________________________________________________ - - Unpack a recent kernel (something after 1.1.92) someplace convenient - like say /usr/src/linux-1.1.92.eql. Use symbolic links to point - /usr/src/linux to this development directory. - - - Apply the patch by running the commands: - - - ______________________________________________________________________ - cd /usr/src - patch - ". Here are some example enslavings: - - - - ______________________________________________________________________ - eql_enslave eql sl0 28800 - eql_enslave eql ppp0 14400 - eql_enslave eql sl1 57600 - ______________________________________________________________________ - - - - - - When you want to free a device from its life of slavery, you can - either down the device with ifconfig (eql will automatically bury the - dead slave and remove it from its queue) or use eql_emancipate to free - it. (-- Or just ifconfig it down, and the eql driver will take it out - for you.--) - - - - ______________________________________________________________________ - eql_emancipate eql sl0 - eql_emancipate eql ppp0 - eql_emancipate eql sl1 - ______________________________________________________________________ - - - - - - 3.3. DSLIP Configuration for the eql Device - - The general idea is to bring up and keep up as many SLIP connections - as you need, automatically. - - - 3.3.1. /etc/slip/runslip.conf - - Here is an example runslip.conf: - - - - - - - - - - - - - - - - ______________________________________________________________________ - name sl-line-1 - enabled - baud 38400 - mtu 576 - ducmd -e /etc/slip/dialout/cua2-288.xp -t 9 - command eql_enslave eql $interface 28800 - address 198.67.33.239 - line /dev/cua2 - - name sl-line-2 - enabled - baud 38400 - mtu 576 - ducmd -e /etc/slip/dialout/cua3-288.xp -t 9 - command eql_enslave eql $interface 28800 - address 198.67.33.239 - line /dev/cua3 - ______________________________________________________________________ - - - - - - 3.4. Using PPP and the eql Device - - I have not yet done any load-balancing testing for PPP devices, mainly - because I don't have a PPP-connection manager like SLIP has with - DSLIP. I did find a good tip from LinuxNET:Billy for PPP performance: - make sure you have asyncmap set to something so that control - characters are not escaped. - - - I tried to fix up a PPP script/system for redialing lost PPP - connections for use with the eql driver the weekend of Feb 25-26 '95 - (Hereafter known as the 8-hour PPP Hate Festival). Perhaps later this - year. - - - 4. About the Slave Scheduler Algorithm - - The slave scheduler probably could be replaced with a dozen other - things and push traffic much faster. The formula in the current set - up of the driver was tuned to handle slaves with wildly different - bits-per-second "priorities". - - - All testing I have done was with two 28.8 V.FC modems, one connecting - at 28800 bps or slower, and the other connecting at 14400 bps all the - time. - - - One version of the scheduler was able to push 5.3 K/s through the - 28800 and 14400 connections, but when the priorities on the links were - very wide apart (57600 vs. 14400) the "faster" modem received all - traffic and the "slower" modem starved. - - - 5. Testers' Reports - - Some people have experimented with the eql device with newer - kernels (than 1.1.75). I have since updated the driver to patch - cleanly in newer kernels because of the removal of the old "slave- - balancing" driver config option. - - - o icee from LinuxNET patched 1.1.86 without any rejects and was able - to boot the kernel and enslave a couple of ISDN PPP links. - - 5.1. Randolph Bentson's Test Report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From bentson@grieg.seaslug.org Wed Feb 8 19:08:09 1995 - Date: Tue, 7 Feb 95 22:57 PST - From: Randolph Bentson - To: guru@ncm.com - Subject: EQL driver tests - - - I have been checking out your eql driver. (Nice work, that!) - Although you may already done this performance testing, here - are some data I've discovered. - - Randolph Bentson - bentson@grieg.seaslug.org - - --------------------------------------------------------- - - - A pseudo-device driver, EQL, written by Simon Janes, can be used - to bundle multiple SLIP connections into what appears to be a - single connection. This allows one to improve dial-up network - connectivity gradually, without having to buy expensive DSU/CSU - hardware and services. - - I have done some testing of this software, with two goals in - mind: first, to ensure it actually works as described and - second, as a method of exercising my device driver. - - The following performance measurements were derived from a set - of SLIP connections run between two Linux systems (1.1.84) using - a 486DX2/66 with a Cyclom-8Ys and a 486SLC/40 with a Cyclom-16Y. - (Ports 0,1,2,3 were used. A later configuration will distribute - port selection across the different Cirrus chips on the boards.) - Once a link was established, I timed a binary ftp transfer of - 289284 bytes of data. If there were no overhead (packet headers, - inter-character and inter-packet delays, etc.) the transfers - would take the following times: - - bits/sec seconds - 345600 8.3 - 234600 12.3 - 172800 16.7 - 153600 18.8 - 76800 37.6 - 57600 50.2 - 38400 75.3 - 28800 100.4 - 19200 150.6 - 9600 301.3 - - A single line running at the lower speeds and with large packets - comes to within 2% of this. Performance is limited for the higher - speeds (as predicted by the Cirrus databook) to an aggregate of - about 160 kbits/sec. The next round of testing will distribute - the load across two or more Cirrus chips. - - The good news is that one gets nearly the full advantage of the - second, third, and fourth line's bandwidth. (The bad news is - that the connection establishment seemed fragile for the higher - speeds. Once established, the connection seemed robust enough.) - - #lines speed mtu seconds theory actual %of - kbit/sec duration speed speed max - 3 115200 900 _ 345600 - 3 115200 400 18.1 345600 159825 46 - 2 115200 900 _ 230400 - 2 115200 600 18.1 230400 159825 69 - 2 115200 400 19.3 230400 149888 65 - 4 57600 900 _ 234600 - 4 57600 600 _ 234600 - 4 57600 400 _ 234600 - 3 57600 600 20.9 172800 138413 80 - 3 57600 900 21.2 172800 136455 78 - 3 115200 600 21.7 345600 133311 38 - 3 57600 400 22.5 172800 128571 74 - 4 38400 900 25.2 153600 114795 74 - 4 38400 600 26.4 153600 109577 71 - 4 38400 400 27.3 153600 105965 68 - 2 57600 900 29.1 115200 99410.3 86 - 1 115200 900 30.7 115200 94229.3 81 - 2 57600 600 30.2 115200 95789.4 83 - 3 38400 900 30.3 115200 95473.3 82 - 3 38400 600 31.2 115200 92719.2 80 - 1 115200 600 31.3 115200 92423 80 - 2 57600 400 32.3 115200 89561.6 77 - 1 115200 400 32.8 115200 88196.3 76 - 3 38400 400 33.5 115200 86353.4 74 - 2 38400 900 43.7 76800 66197.7 86 - 2 38400 600 44 76800 65746.4 85 - 2 38400 400 47.2 76800 61289 79 - 4 19200 900 50.8 76800 56945.7 74 - 4 19200 400 53.2 76800 54376.7 70 - 4 19200 600 53.7 76800 53870.4 70 - 1 57600 900 54.6 57600 52982.4 91 - 1 57600 600 56.2 57600 51474 89 - 3 19200 900 60.5 57600 47815.5 83 - 1 57600 400 60.2 57600 48053.8 83 - 3 19200 600 62 57600 46658.7 81 - 3 19200 400 64.7 57600 44711.6 77 - 1 38400 900 79.4 38400 36433.8 94 - 1 38400 600 82.4 38400 35107.3 91 - 2 19200 900 84.4 38400 34275.4 89 - 1 38400 400 86.8 38400 33327.6 86 - 2 19200 600 87.6 38400 33023.3 85 - 2 19200 400 91.2 38400 31719.7 82 - 4 9600 900 94.7 38400 30547.4 79 - 4 9600 400 106 38400 27290.9 71 - 4 9600 600 110 38400 26298.5 68 - 3 9600 900 118 28800 24515.6 85 - 3 9600 600 120 28800 24107 83 - 3 9600 400 131 28800 22082.7 76 - 1 19200 900 155 19200 18663.5 97 - 1 19200 600 161 19200 17968 93 - 1 19200 400 170 19200 17016.7 88 - 2 9600 600 176 19200 16436.6 85 - 2 9600 900 180 19200 16071.3 83 - 2 9600 400 181 19200 15982.5 83 - 1 9600 900 305 9600 9484.72 98 - 1 9600 600 314 9600 9212.87 95 - 1 9600 400 332 9600 8713.37 90 - - - - - - 5.2. Anthony Healy's Report - - - - - - - - Date: Mon, 13 Feb 1995 16:17:29 +1100 (EST) - From: Antony Healey - To: Simon Janes - Subject: Re: Load Balancing - - Hi Simon, - I've installed your patch and it works great. I have trialed - it over twin SL/IP lines, just over null modems, but I was - able to data at over 48Kb/s [ISDN link -Simon]. I managed a - transfer of up to 7.5 Kbyte/s on one go, but averaged around - 6.4 Kbyte/s, which I think is pretty cool. :) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 313f66900bce..9ef6ef42bdc5 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -54,6 +54,7 @@ Contents: defza dns_resolver driver + eql .. only:: subproject and html diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 4ab6d343fd86..c822f4a6d166 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -126,7 +126,7 @@ config EQUALIZER Linux driver or with a Livingston Portmaster 2e. Say Y if you want this and read - . You may also want to read + . You may also want to read section 6.2 of the NET-3-HOWTO, available from . -- cgit v1.2.3-59-g8ed1b From aee113427c5d205730b2c1a023661799f41aca23 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:35 +0200 Subject: docs: networking: convert fib_trie.txt to ReST - add SPDX header; - adjust title markup; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/fib_trie.rst | 149 ++++++++++++++++++++++++++++++++++ Documentation/networking/fib_trie.txt | 145 --------------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 150 insertions(+), 145 deletions(-) create mode 100644 Documentation/networking/fib_trie.rst delete mode 100644 Documentation/networking/fib_trie.txt diff --git a/Documentation/networking/fib_trie.rst b/Documentation/networking/fib_trie.rst new file mode 100644 index 000000000000..f1435b7fcdb7 --- /dev/null +++ b/Documentation/networking/fib_trie.rst @@ -0,0 +1,149 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +LC-trie implementation notes +============================ + +Node types +---------- +leaf + An end node with data. This has a copy of the relevant key, along + with 'hlist' with routing table entries sorted by prefix length. + See struct leaf and struct leaf_info. + +trie node or tnode + An internal node, holding an array of child (leaf or tnode) pointers, + indexed through a subset of the key. See Level Compression. + +A few concepts explained +------------------------ +Bits (tnode) + The number of bits in the key segment used for indexing into the + child array - the "child index". See Level Compression. + +Pos (tnode) + The position (in the key) of the key segment used for indexing into + the child array. See Path Compression. + +Path Compression / skipped bits + Any given tnode is linked to from the child array of its parent, using + a segment of the key specified by the parent's "pos" and "bits" + In certain cases, this tnode's own "pos" will not be immediately + adjacent to the parent (pos+bits), but there will be some bits + in the key skipped over because they represent a single path with no + deviations. These "skipped bits" constitute Path Compression. + Note that the search algorithm will simply skip over these bits when + searching, making it necessary to save the keys in the leaves to + verify that they actually do match the key we are searching for. + +Level Compression / child arrays + the trie is kept level balanced moving, under certain conditions, the + children of a full child (see "full_children") up one level, so that + instead of a pure binary tree, each internal node ("tnode") may + contain an arbitrarily large array of links to several children. + Conversely, a tnode with a mostly empty child array (see empty_children) + may be "halved", having some of its children moved downwards one level, + in order to avoid ever-increasing child arrays. + +empty_children + the number of positions in the child array of a given tnode that are + NULL. + +full_children + the number of children of a given tnode that aren't path compressed. + (in other words, they aren't NULL or leaves and their "pos" is equal + to this tnode's "pos"+"bits"). + + (The word "full" here is used more in the sense of "complete" than + as the opposite of "empty", which might be a tad confusing.) + +Comments +--------- + +We have tried to keep the structure of the code as close to fib_hash as +possible to allow verification and help up reviewing. + +fib_find_node() + A good start for understanding this code. This function implements a + straightforward trie lookup. + +fib_insert_node() + Inserts a new leaf node in the trie. This is bit more complicated than + fib_find_node(). Inserting a new node means we might have to run the + level compression algorithm on part of the trie. + +trie_leaf_remove() + Looks up a key, deletes it and runs the level compression algorithm. + +trie_rebalance() + The key function for the dynamic trie after any change in the trie + it is run to optimize and reorganize. It will walk the trie upwards + towards the root from a given tnode, doing a resize() at each step + to implement level compression. + +resize() + Analyzes a tnode and optimizes the child array size by either inflating + or shrinking it repeatedly until it fulfills the criteria for optimal + level compression. This part follows the original paper pretty closely + and there may be some room for experimentation here. + +inflate() + Doubles the size of the child array within a tnode. Used by resize(). + +halve() + Halves the size of the child array within a tnode - the inverse of + inflate(). Used by resize(); + +fn_trie_insert(), fn_trie_delete(), fn_trie_select_default() + The route manipulation functions. Should conform pretty closely to the + corresponding functions in fib_hash. + +fn_trie_flush() + This walks the full trie (using nextleaf()) and searches for empty + leaves which have to be removed. + +fn_trie_dump() + Dumps the routing table ordered by prefix length. This is somewhat + slower than the corresponding fib_hash function, as we have to walk the + entire trie for each prefix length. In comparison, fib_hash is organized + as one "zone"/hash per prefix length. + +Locking +------- + +fib_lock is used for an RW-lock in the same way that this is done in fib_hash. +However, the functions are somewhat separated for other possible locking +scenarios. It might conceivably be possible to run trie_rebalance via RCU +to avoid read_lock in the fn_trie_lookup() function. + +Main lookup mechanism +--------------------- +fn_trie_lookup() is the main lookup function. + +The lookup is in its simplest form just like fib_find_node(). We descend the +trie, key segment by key segment, until we find a leaf. check_leaf() does +the fib_semantic_match in the leaf's sorted prefix hlist. + +If we find a match, we are done. + +If we don't find a match, we enter prefix matching mode. The prefix length, +starting out at the same as the key length, is reduced one step at a time, +and we backtrack upwards through the trie trying to find a longest matching +prefix. The goal is always to reach a leaf and get a positive result from the +fib_semantic_match mechanism. + +Inside each tnode, the search for longest matching prefix consists of searching +through the child array, chopping off (zeroing) the least significant "1" of +the child index until we find a match or the child index consists of nothing but +zeros. + +At this point we backtrack (t->stats.backtrack++) up the trie, continuing to +chop off part of the key in order to find the longest matching prefix. + +At this point we will repeatedly descend subtries to look for a match, and there +are some optimizations available that can provide us with "shortcuts" to avoid +descending into dead ends. Look for "HL_OPTIMIZE" sections in the code. + +To alleviate any doubts about the correctness of the route selection process, +a new netlink operation has been added. Look for NETLINK_FIB_LOOKUP, which +gives userland access to fib_lookup(). diff --git a/Documentation/networking/fib_trie.txt b/Documentation/networking/fib_trie.txt deleted file mode 100644 index fe719388518b..000000000000 --- a/Documentation/networking/fib_trie.txt +++ /dev/null @@ -1,145 +0,0 @@ - LC-trie implementation notes. - -Node types ----------- -leaf - An end node with data. This has a copy of the relevant key, along - with 'hlist' with routing table entries sorted by prefix length. - See struct leaf and struct leaf_info. - -trie node or tnode - An internal node, holding an array of child (leaf or tnode) pointers, - indexed through a subset of the key. See Level Compression. - -A few concepts explained ------------------------- -Bits (tnode) - The number of bits in the key segment used for indexing into the - child array - the "child index". See Level Compression. - -Pos (tnode) - The position (in the key) of the key segment used for indexing into - the child array. See Path Compression. - -Path Compression / skipped bits - Any given tnode is linked to from the child array of its parent, using - a segment of the key specified by the parent's "pos" and "bits" - In certain cases, this tnode's own "pos" will not be immediately - adjacent to the parent (pos+bits), but there will be some bits - in the key skipped over because they represent a single path with no - deviations. These "skipped bits" constitute Path Compression. - Note that the search algorithm will simply skip over these bits when - searching, making it necessary to save the keys in the leaves to - verify that they actually do match the key we are searching for. - -Level Compression / child arrays - the trie is kept level balanced moving, under certain conditions, the - children of a full child (see "full_children") up one level, so that - instead of a pure binary tree, each internal node ("tnode") may - contain an arbitrarily large array of links to several children. - Conversely, a tnode with a mostly empty child array (see empty_children) - may be "halved", having some of its children moved downwards one level, - in order to avoid ever-increasing child arrays. - -empty_children - the number of positions in the child array of a given tnode that are - NULL. - -full_children - the number of children of a given tnode that aren't path compressed. - (in other words, they aren't NULL or leaves and their "pos" is equal - to this tnode's "pos"+"bits"). - - (The word "full" here is used more in the sense of "complete" than - as the opposite of "empty", which might be a tad confusing.) - -Comments ---------- - -We have tried to keep the structure of the code as close to fib_hash as -possible to allow verification and help up reviewing. - -fib_find_node() - A good start for understanding this code. This function implements a - straightforward trie lookup. - -fib_insert_node() - Inserts a new leaf node in the trie. This is bit more complicated than - fib_find_node(). Inserting a new node means we might have to run the - level compression algorithm on part of the trie. - -trie_leaf_remove() - Looks up a key, deletes it and runs the level compression algorithm. - -trie_rebalance() - The key function for the dynamic trie after any change in the trie - it is run to optimize and reorganize. It will walk the trie upwards - towards the root from a given tnode, doing a resize() at each step - to implement level compression. - -resize() - Analyzes a tnode and optimizes the child array size by either inflating - or shrinking it repeatedly until it fulfills the criteria for optimal - level compression. This part follows the original paper pretty closely - and there may be some room for experimentation here. - -inflate() - Doubles the size of the child array within a tnode. Used by resize(). - -halve() - Halves the size of the child array within a tnode - the inverse of - inflate(). Used by resize(); - -fn_trie_insert(), fn_trie_delete(), fn_trie_select_default() - The route manipulation functions. Should conform pretty closely to the - corresponding functions in fib_hash. - -fn_trie_flush() - This walks the full trie (using nextleaf()) and searches for empty - leaves which have to be removed. - -fn_trie_dump() - Dumps the routing table ordered by prefix length. This is somewhat - slower than the corresponding fib_hash function, as we have to walk the - entire trie for each prefix length. In comparison, fib_hash is organized - as one "zone"/hash per prefix length. - -Locking -------- - -fib_lock is used for an RW-lock in the same way that this is done in fib_hash. -However, the functions are somewhat separated for other possible locking -scenarios. It might conceivably be possible to run trie_rebalance via RCU -to avoid read_lock in the fn_trie_lookup() function. - -Main lookup mechanism ---------------------- -fn_trie_lookup() is the main lookup function. - -The lookup is in its simplest form just like fib_find_node(). We descend the -trie, key segment by key segment, until we find a leaf. check_leaf() does -the fib_semantic_match in the leaf's sorted prefix hlist. - -If we find a match, we are done. - -If we don't find a match, we enter prefix matching mode. The prefix length, -starting out at the same as the key length, is reduced one step at a time, -and we backtrack upwards through the trie trying to find a longest matching -prefix. The goal is always to reach a leaf and get a positive result from the -fib_semantic_match mechanism. - -Inside each tnode, the search for longest matching prefix consists of searching -through the child array, chopping off (zeroing) the least significant "1" of -the child index until we find a match or the child index consists of nothing but -zeros. - -At this point we backtrack (t->stats.backtrack++) up the trie, continuing to -chop off part of the key in order to find the longest matching prefix. - -At this point we will repeatedly descend subtries to look for a match, and there -are some optimizations available that can provide us with "shortcuts" to avoid -descending into dead ends. Look for "HL_OPTIMIZE" sections in the code. - -To alleviate any doubts about the correctness of the route selection process, -a new netlink operation has been added. Look for NETLINK_FIB_LOOKUP, which -gives userland access to fib_lookup(). diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 9ef6ef42bdc5..807abe25ae4b 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,6 +55,7 @@ Contents: dns_resolver driver eql + fib_trie .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From cb3f0d56e153398a035eb22769d2cb2837f29747 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:36 +0200 Subject: docs: networking: convert filter.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - use footnote markup; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/bpf/index.rst | 4 +- Documentation/networking/filter.rst | 1651 ++++++++++++++++++++++++++++++ Documentation/networking/filter.txt | 1545 ---------------------------- Documentation/networking/index.rst | 1 + Documentation/networking/packet_mmap.txt | 2 +- MAINTAINERS | 2 +- tools/bpf/bpf_asm.c | 2 +- tools/bpf/bpf_dbg.c | 2 +- 8 files changed, 1658 insertions(+), 1551 deletions(-) create mode 100644 Documentation/networking/filter.rst delete mode 100644 Documentation/networking/filter.txt diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index f99677f3572f..38b4db8be7a2 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -7,7 +7,7 @@ Filter) facility, with a focus on the extended BPF version (eBPF). This kernel side documentation is still work in progress. The main textual documentation is (for historical reasons) described in -`Documentation/networking/filter.txt`_, which describe both classical +`Documentation/networking/filter.rst`_, which describe both classical and extended BPF instruction-set. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. @@ -59,7 +59,7 @@ Testing and debugging BPF .. Links: -.. _Documentation/networking/filter.txt: ../networking/filter.txt +.. _Documentation/networking/filter.rst: ../networking/filter.txt .. _man-pages: https://www.kernel.org/doc/man-pages/ .. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html .. _BPF and XDP Reference Guide: http://cilium.readthedocs.io/en/latest/bpf/ diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst new file mode 100644 index 000000000000..a1d3e192b9fa --- /dev/null +++ b/Documentation/networking/filter.rst @@ -0,0 +1,1651 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================================= +Linux Socket Filtering aka Berkeley Packet Filter (BPF) +======================================================= + +Introduction +------------ + +Linux Socket Filtering (LSF) is derived from the Berkeley Packet Filter. +Though there are some distinct differences between the BSD and Linux +Kernel filtering, but when we speak of BPF or LSF in Linux context, we +mean the very same mechanism of filtering in the Linux kernel. + +BPF allows a user-space program to attach a filter onto any socket and +allow or disallow certain types of data to come through the socket. LSF +follows exactly the same filter code structure as BSD's BPF, so referring +to the BSD bpf.4 manpage is very helpful in creating filters. + +On Linux, BPF is much simpler than on BSD. One does not have to worry +about devices or anything like that. You simply create your filter code, +send it to the kernel via the SO_ATTACH_FILTER option and if your filter +code passes the kernel check on it, you then immediately begin filtering +data on that socket. + +You can also detach filters from your socket via the SO_DETACH_FILTER +option. This will probably not be used much since when you close a socket +that has a filter on it the filter is automagically removed. The other +less common case may be adding a different filter on the same socket where +you had another filter that is still running: the kernel takes care of +removing the old one and placing your new one in its place, assuming your +filter has passed the checks, otherwise if it fails the old filter will +remain on that socket. + +SO_LOCK_FILTER option allows to lock the filter attached to a socket. Once +set, a filter cannot be removed or changed. This allows one process to +setup a socket, attach a filter, lock it then drop privileges and be +assured that the filter will be kept until the socket is closed. + +The biggest user of this construct might be libpcap. Issuing a high-level +filter command like `tcpdump -i em1 port 22` passes through the libpcap +internal compiler that generates a structure that can eventually be loaded +via SO_ATTACH_FILTER to the kernel. `tcpdump -i em1 port 22 -ddd` +displays what is being placed into this structure. + +Although we were only speaking about sockets here, BPF in Linux is used +in many more places. There's xt_bpf for netfilter, cls_bpf in the kernel +qdisc layer, SECCOMP-BPF (SECure COMPuting [1]_), and lots of other places +such as team driver, PTP code, etc where BPF is being used. + +.. [1] Documentation/userspace-api/seccomp_filter.rst + +Original BPF paper: + +Steven McCanne and Van Jacobson. 1993. The BSD packet filter: a new +architecture for user-level packet capture. In Proceedings of the +USENIX Winter 1993 Conference Proceedings on USENIX Winter 1993 +Conference Proceedings (USENIX'93). USENIX Association, Berkeley, +CA, USA, 2-2. [http://www.tcpdump.org/papers/bpf-usenix93.pdf] + +Structure +--------- + +User space applications include which contains the +following relevant structures:: + + struct sock_filter { /* Filter block */ + __u16 code; /* Actual filter code */ + __u8 jt; /* Jump true */ + __u8 jf; /* Jump false */ + __u32 k; /* Generic multiuse field */ + }; + +Such a structure is assembled as an array of 4-tuples, that contains +a code, jt, jf and k value. jt and jf are jump offsets and k a generic +value to be used for a provided code:: + + struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ + unsigned short len; /* Number of filter blocks */ + struct sock_filter __user *filter; + }; + +For socket filtering, a pointer to this structure (as shown in +follow-up example) is being passed to the kernel through setsockopt(2). + +Example +------- + +:: + + #include + #include + #include + #include + /* ... */ + + /* From the example above: tcpdump -i em1 port 22 -dd */ + struct sock_filter code[] = { + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 8, 0x000086dd }, + { 0x30, 0, 0, 0x00000014 }, + { 0x15, 2, 0, 0x00000084 }, + { 0x15, 1, 0, 0x00000006 }, + { 0x15, 0, 17, 0x00000011 }, + { 0x28, 0, 0, 0x00000036 }, + { 0x15, 14, 0, 0x00000016 }, + { 0x28, 0, 0, 0x00000038 }, + { 0x15, 12, 13, 0x00000016 }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 2, 0, 0x00000084 }, + { 0x15, 1, 0, 0x00000006 }, + { 0x15, 0, 8, 0x00000011 }, + { 0x28, 0, 0, 0x00000014 }, + { 0x45, 6, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x0000000e }, + { 0x48, 0, 0, 0x0000000e }, + { 0x15, 2, 0, 0x00000016 }, + { 0x48, 0, 0, 0x00000010 }, + { 0x15, 0, 1, 0x00000016 }, + { 0x06, 0, 0, 0x0000ffff }, + { 0x06, 0, 0, 0x00000000 }, + }; + + struct sock_fprog bpf = { + .len = ARRAY_SIZE(code), + .filter = code, + }; + + sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (sock < 0) + /* ... bail out ... */ + + ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)); + if (ret < 0) + /* ... bail out ... */ + + /* ... */ + close(sock); + +The above example code attaches a socket filter for a PF_PACKET socket +in order to let all IPv4/IPv6 packets with port 22 pass. The rest will +be dropped for this socket. + +The setsockopt(2) call to SO_DETACH_FILTER doesn't need any arguments +and SO_LOCK_FILTER for preventing the filter to be detached, takes an +integer value with 0 or 1. + +Note that socket filters are not restricted to PF_PACKET sockets only, +but can also be used on other socket families. + +Summary of system calls: + + * setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &val, sizeof(val)); + * setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)); + * setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER, &val, sizeof(val)); + +Normally, most use cases for socket filtering on packet sockets will be +covered by libpcap in high-level syntax, so as an application developer +you should stick to that. libpcap wraps its own layer around all that. + +Unless i) using/linking to libpcap is not an option, ii) the required BPF +filters use Linux extensions that are not supported by libpcap's compiler, +iii) a filter might be more complex and not cleanly implementable with +libpcap's compiler, or iv) particular filter codes should be optimized +differently than libpcap's internal compiler does; then in such cases +writing such a filter "by hand" can be of an alternative. For example, +xt_bpf and cls_bpf users might have requirements that could result in +more complex filter code, or one that cannot be expressed with libpcap +(e.g. different return codes for various code paths). Moreover, BPF JIT +implementors may wish to manually write test cases and thus need low-level +access to BPF code as well. + +BPF engine and instruction set +------------------------------ + +Under tools/bpf/ there's a small helper tool called bpf_asm which can +be used to write low-level filters for example scenarios mentioned in the +previous section. Asm-like syntax mentioned here has been implemented in +bpf_asm and will be used for further explanations (instead of dealing with +less readable opcodes directly, principles are the same). The syntax is +closely modelled after Steven McCanne's and Van Jacobson's BPF paper. + +The BPF architecture consists of the following basic elements: + + ======= ==================================================== + Element Description + ======= ==================================================== + A 32 bit wide accumulator + X 32 bit wide X register + M[] 16 x 32 bit wide misc registers aka "scratch memory + store", addressable from 0 to 15 + ======= ==================================================== + +A program, that is translated by bpf_asm into "opcodes" is an array that +consists of the following elements (as already mentioned):: + + op:16, jt:8, jf:8, k:32 + +The element op is a 16 bit wide opcode that has a particular instruction +encoded. jt and jf are two 8 bit wide jump targets, one for condition +"jump if true", the other one "jump if false". Eventually, element k +contains a miscellaneous argument that can be interpreted in different +ways depending on the given instruction in op. + +The instruction set consists of load, store, branch, alu, miscellaneous +and return instructions that are also represented in bpf_asm syntax. This +table lists all bpf_asm instructions available resp. what their underlying +opcodes as defined in linux/filter.h stand for: + + =========== =================== ===================== + Instruction Addressing mode Description + =========== =================== ===================== + ld 1, 2, 3, 4, 12 Load word into A + ldi 4 Load word into A + ldh 1, 2 Load half-word into A + ldb 1, 2 Load byte into A + ldx 3, 4, 5, 12 Load word into X + ldxi 4 Load word into X + ldxb 5 Load byte into X + + st 3 Store A into M[] + stx 3 Store X into M[] + + jmp 6 Jump to label + ja 6 Jump to label + jeq 7, 8, 9, 10 Jump on A == + jneq 9, 10 Jump on A != + jne 9, 10 Jump on A != + jlt 9, 10 Jump on A < + jle 9, 10 Jump on A <= + jgt 7, 8, 9, 10 Jump on A > + jge 7, 8, 9, 10 Jump on A >= + jset 7, 8, 9, 10 Jump on A & + + add 0, 4 A + + sub 0, 4 A - + mul 0, 4 A * + div 0, 4 A / + mod 0, 4 A % + neg !A + and 0, 4 A & + or 0, 4 A | + xor 0, 4 A ^ + lsh 0, 4 A << + rsh 0, 4 A >> + + tax Copy A into X + txa Copy X into A + + ret 4, 11 Return + =========== =================== ===================== + +The next table shows addressing formats from the 2nd column: + + =============== =================== =============================================== + Addressing mode Syntax Description + =============== =================== =============================================== + 0 x/%x Register X + 1 [k] BHW at byte offset k in the packet + 2 [x + k] BHW at the offset X + k in the packet + 3 M[k] Word at offset k in M[] + 4 #k Literal value stored in k + 5 4*([k]&0xf) Lower nibble * 4 at byte offset k in the packet + 6 L Jump label L + 7 #k,Lt,Lf Jump to Lt if true, otherwise jump to Lf + 8 x/%x,Lt,Lf Jump to Lt if true, otherwise jump to Lf + 9 #k,Lt Jump to Lt if predicate is true + 10 x/%x,Lt Jump to Lt if predicate is true + 11 a/%a Accumulator A + 12 extension BPF extension + =============== =================== =============================================== + +The Linux kernel also has a couple of BPF extensions that are used along +with the class of load instructions by "overloading" the k argument with +a negative offset + a particular extension offset. The result of such BPF +extensions are loaded into A. + +Possible BPF extensions are shown in the following table: + + =================================== ================================================= + Extension Description + =================================== ================================================= + len skb->len + proto skb->protocol + type skb->pkt_type + poff Payload start offset + ifidx skb->dev->ifindex + nla Netlink attribute of type X with offset A + nlan Nested Netlink attribute of type X with offset A + mark skb->mark + queue skb->queue_mapping + hatype skb->dev->type + rxhash skb->hash + cpu raw_smp_processor_id() + vlan_tci skb_vlan_tag_get(skb) + vlan_avail skb_vlan_tag_present(skb) + vlan_tpid skb->vlan_proto + rand prandom_u32() + =================================== ================================================= + +These extensions can also be prefixed with '#'. +Examples for low-level BPF: + +**ARP packets**:: + + ldh [12] + jne #0x806, drop + ret #-1 + drop: ret #0 + +**IPv4 TCP packets**:: + + ldh [12] + jne #0x800, drop + ldb [23] + jneq #6, drop + ret #-1 + drop: ret #0 + +**(Accelerated) VLAN w/ id 10**:: + + ld vlan_tci + jneq #10, drop + ret #-1 + drop: ret #0 + +**icmp random packet sampling, 1 in 4**: + + ldh [12] + jne #0x800, drop + ldb [23] + jneq #1, drop + # get a random uint32 number + ld rand + mod #4 + jneq #1, drop + ret #-1 + drop: ret #0 + +**SECCOMP filter example**:: + + ld [4] /* offsetof(struct seccomp_data, arch) */ + jne #0xc000003e, bad /* AUDIT_ARCH_X86_64 */ + ld [0] /* offsetof(struct seccomp_data, nr) */ + jeq #15, good /* __NR_rt_sigreturn */ + jeq #231, good /* __NR_exit_group */ + jeq #60, good /* __NR_exit */ + jeq #0, good /* __NR_read */ + jeq #1, good /* __NR_write */ + jeq #5, good /* __NR_fstat */ + jeq #9, good /* __NR_mmap */ + jeq #14, good /* __NR_rt_sigprocmask */ + jeq #13, good /* __NR_rt_sigaction */ + jeq #35, good /* __NR_nanosleep */ + bad: ret #0 /* SECCOMP_RET_KILL_THREAD */ + good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */ + +The above example code can be placed into a file (here called "foo"), and +then be passed to the bpf_asm tool for generating opcodes, output that xt_bpf +and cls_bpf understands and can directly be loaded with. Example with above +ARP code:: + + $ ./bpf_asm foo + 4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0, + +In copy and paste C-like output:: + + $ ./bpf_asm -c foo + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 1, 0x00000806 }, + { 0x06, 0, 0, 0xffffffff }, + { 0x06, 0, 0, 0000000000 }, + +In particular, as usage with xt_bpf or cls_bpf can result in more complex BPF +filters that might not be obvious at first, it's good to test filters before +attaching to a live system. For that purpose, there's a small tool called +bpf_dbg under tools/bpf/ in the kernel source directory. This debugger allows +for testing BPF filters against given pcap files, single stepping through the +BPF code on the pcap's packets and to do BPF machine register dumps. + +Starting bpf_dbg is trivial and just requires issuing:: + + # ./bpf_dbg + +In case input and output do not equal stdin/stdout, bpf_dbg takes an +alternative stdin source as a first argument, and an alternative stdout +sink as a second one, e.g. `./bpf_dbg test_in.txt test_out.txt`. + +Other than that, a particular libreadline configuration can be set via +file "~/.bpf_dbg_init" and the command history is stored in the file +"~/.bpf_dbg_history". + +Interaction in bpf_dbg happens through a shell that also has auto-completion +support (follow-up example commands starting with '>' denote bpf_dbg shell). +The usual workflow would be to ... + +* load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0 + Loads a BPF filter from standard output of bpf_asm, or transformed via + e.g. ``tcpdump -iem1 -ddd port 22 | tr '\n' ','``. Note that for JIT + debugging (next section), this command creates a temporary socket and + loads the BPF code into the kernel. Thus, this will also be useful for + JIT developers. + +* load pcap foo.pcap + + Loads standard tcpdump pcap file. + +* run [] + +bpf passes:1 fails:9 + Runs through all packets from a pcap to account how many passes and fails + the filter will generate. A limit of packets to traverse can be given. + +* disassemble:: + + l0: ldh [12] + l1: jeq #0x800, l2, l5 + l2: ldb [23] + l3: jeq #0x1, l4, l5 + l4: ret #0xffff + l5: ret #0 + + Prints out BPF code disassembly. + +* dump:: + + /* { op, jt, jf, k }, */ + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 3, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 0, 1, 0x00000001 }, + { 0x06, 0, 0, 0x0000ffff }, + { 0x06, 0, 0, 0000000000 }, + + Prints out C-style BPF code dump. + +* breakpoint 0:: + + breakpoint at: l0: ldh [12] + +* breakpoint 1:: + + breakpoint at: l1: jeq #0x800, l2, l5 + + ... + + Sets breakpoints at particular BPF instructions. Issuing a `run` command + will walk through the pcap file continuing from the current packet and + break when a breakpoint is being hit (another `run` will continue from + the currently active breakpoint executing next instructions): + + * run:: + + -- register dump -- + pc: [0] <-- program counter + code: [40] jt[0] jf[0] k[12] <-- plain BPF code of current instruction + curr: l0: ldh [12] <-- disassembly of current instruction + A: [00000000][0] <-- content of A (hex, decimal) + X: [00000000][0] <-- content of X (hex, decimal) + M[0,15]: [00000000][0] <-- folded content of M (hex, decimal) + -- packet dump -- <-- Current packet from pcap (hex) + len: 42 + 0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01 + 16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26 + 32: 00 00 00 00 00 00 0a 3b 01 01 + (breakpoint) + > + + * breakpoint:: + + breakpoints: 0 1 + + Prints currently set breakpoints. + +* step [-, +] + + Performs single stepping through the BPF program from the current pc + offset. Thus, on each step invocation, above register dump is issued. + This can go forwards and backwards in time, a plain `step` will break + on the next BPF instruction, thus +1. (No `run` needs to be issued here.) + +* select + + Selects a given packet from the pcap file to continue from. Thus, on + the next `run` or `step`, the BPF program is being evaluated against + the user pre-selected packet. Numbering starts just as in Wireshark + with index 1. + +* quit + + Exits bpf_dbg. + +JIT compiler +------------ + +The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, +PowerPC, ARM, ARM64, MIPS, RISC-V and s390 and can be enabled through +CONFIG_BPF_JIT. The JIT compiler is transparently invoked for each +attached filter from user space or for internal kernel users if it has +been previously enabled by root:: + + echo 1 > /proc/sys/net/core/bpf_jit_enable + +For JIT developers, doing audits etc, each compile run can output the generated +opcode image into the kernel log via:: + + echo 2 > /proc/sys/net/core/bpf_jit_enable + +Example output from dmesg:: + + [ 3389.935842] flen=6 proglen=70 pass=3 image=ffffffffa0069c8f + [ 3389.935847] JIT code: 00000000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 68 + [ 3389.935849] JIT code: 00000010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 00 00 00 + [ 3389.935850] JIT code: 00000020: e8 1d 94 ff e0 3d 00 08 00 00 75 16 be 17 00 00 + [ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00 + [ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3 + +When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and +setting any other value than that will return in failure. This is even the case for +setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel log +is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is the +generally recommended approach instead. + +In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for +generating disassembly out of the kernel log's hexdump:: + + # ./bpf_jit_disasm + 70 bytes emitted from JIT compiler (pass:3, flen:6) + ffffffffa0069c8f + : + 0: push %rbp + 1: mov %rsp,%rbp + 4: sub $0x60,%rsp + 8: mov %rbx,-0x8(%rbp) + c: mov 0x68(%rdi),%r9d + 10: sub 0x6c(%rdi),%r9d + 14: mov 0xd8(%rdi),%r8 + 1b: mov $0xc,%esi + 20: callq 0xffffffffe0ff9442 + 25: cmp $0x800,%eax + 2a: jne 0x0000000000000042 + 2c: mov $0x17,%esi + 31: callq 0xffffffffe0ff945e + 36: cmp $0x1,%eax + 39: jne 0x0000000000000042 + 3b: mov $0xffff,%eax + 40: jmp 0x0000000000000044 + 42: xor %eax,%eax + 44: leaveq + 45: retq + + Issuing option `-o` will "annotate" opcodes to resulting assembler + instructions, which can be very useful for JIT developers: + + # ./bpf_jit_disasm -o + 70 bytes emitted from JIT compiler (pass:3, flen:6) + ffffffffa0069c8f + : + 0: push %rbp + 55 + 1: mov %rsp,%rbp + 48 89 e5 + 4: sub $0x60,%rsp + 48 83 ec 60 + 8: mov %rbx,-0x8(%rbp) + 48 89 5d f8 + c: mov 0x68(%rdi),%r9d + 44 8b 4f 68 + 10: sub 0x6c(%rdi),%r9d + 44 2b 4f 6c + 14: mov 0xd8(%rdi),%r8 + 4c 8b 87 d8 00 00 00 + 1b: mov $0xc,%esi + be 0c 00 00 00 + 20: callq 0xffffffffe0ff9442 + e8 1d 94 ff e0 + 25: cmp $0x800,%eax + 3d 00 08 00 00 + 2a: jne 0x0000000000000042 + 75 16 + 2c: mov $0x17,%esi + be 17 00 00 00 + 31: callq 0xffffffffe0ff945e + e8 28 94 ff e0 + 36: cmp $0x1,%eax + 83 f8 01 + 39: jne 0x0000000000000042 + 75 07 + 3b: mov $0xffff,%eax + b8 ff ff 00 00 + 40: jmp 0x0000000000000044 + eb 02 + 42: xor %eax,%eax + 31 c0 + 44: leaveq + c9 + 45: retq + c3 + +For BPF JIT developers, bpf_jit_disasm, bpf_asm and bpf_dbg provides a useful +toolchain for developing and testing the kernel's JIT compiler. + +BPF kernel internals +-------------------- +Internally, for the kernel interpreter, a different instruction set +format with similar underlying principles from BPF described in previous +paragraphs is being used. However, the instruction set format is modelled +closer to the underlying architecture to mimic native instruction sets, so +that a better performance can be achieved (more details later). This new +ISA is called 'eBPF' or 'internal BPF' interchangeably. (Note: eBPF which +originates from [e]xtended BPF is not the same as BPF extensions! While +eBPF is an ISA, BPF extensions date back to classic BPF's 'overloading' +of BPF_LD | BPF_{B,H,W} | BPF_ABS instruction.) + +It is designed to be JITed with one to one mapping, which can also open up +the possibility for GCC/LLVM compilers to generate optimized eBPF code through +an eBPF backend that performs almost as fast as natively compiled code. + +The new instruction set was originally designed with the possible goal in +mind to write programs in "restricted C" and compile into eBPF with a optional +GCC/LLVM backend, so that it can just-in-time map to modern 64-bit CPUs with +minimal performance overhead over two steps, that is, C -> eBPF -> native code. + +Currently, the new format is being used for running user BPF programs, which +includes seccomp BPF, classic socket filters, cls_bpf traffic classifier, +team driver's classifier for its load-balancing mode, netfilter's xt_bpf +extension, PTP dissector/classifier, and much more. They are all internally +converted by the kernel into the new instruction set representation and run +in the eBPF interpreter. For in-kernel handlers, this all works transparently +by using bpf_prog_create() for setting up the filter, resp. +bpf_prog_destroy() for destroying it. The macro +BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed +code to run the filter. 'filter' is a pointer to struct bpf_prog that we +got from bpf_prog_create(), and 'ctx' the given context (e.g. +skb pointer). All constraints and restrictions from bpf_check_classic() apply +before a conversion to the new layout is being done behind the scenes! + +Currently, the classic BPF format is being used for JITing on most +32-bit architectures, whereas x86-64, aarch64, s390x, powerpc64, +sparc64, arm32, riscv64, riscv32 perform JIT compilation from eBPF +instruction set. + +Some core changes of the new internal format: + +- Number of registers increase from 2 to 10: + + The old format had two registers A and X, and a hidden frame pointer. The + new layout extends this to be 10 internal registers and a read-only frame + pointer. Since 64-bit CPUs are passing arguments to functions via registers + the number of args from eBPF program to in-kernel function is restricted + to 5 and one register is used to accept return value from an in-kernel + function. Natively, x86_64 passes first 6 arguments in registers, aarch64/ + sparcv9/mips64 have 7 - 8 registers for arguments; x86_64 has 6 callee saved + registers, and aarch64/sparcv9/mips64 have 11 or more callee saved registers. + + Therefore, eBPF calling convention is defined as: + + * R0 - return value from in-kernel function, and exit value for eBPF program + * R1 - R5 - arguments from eBPF program to in-kernel function + * R6 - R9 - callee saved registers that in-kernel function will preserve + * R10 - read-only frame pointer to access stack + + Thus, all eBPF registers map one to one to HW registers on x86_64, aarch64, + etc, and eBPF calling convention maps directly to ABIs used by the kernel on + 64-bit architectures. + + On 32-bit architectures JIT may map programs that use only 32-bit arithmetic + and may let more complex programs to be interpreted. + + R0 - R5 are scratch registers and eBPF program needs spill/fill them if + necessary across calls. Note that there is only one eBPF program (== one + eBPF main routine) and it cannot call other eBPF functions, it can only + call predefined in-kernel functions, though. + +- Register width increases from 32-bit to 64-bit: + + Still, the semantics of the original 32-bit ALU operations are preserved + via 32-bit subregisters. All eBPF registers are 64-bit with 32-bit lower + subregisters that zero-extend into 64-bit if they are being written to. + That behavior maps directly to x86_64 and arm64 subregister definition, but + makes other JITs more difficult. + + 32-bit architectures run 64-bit internal BPF programs via interpreter. + Their JITs may convert BPF programs that only use 32-bit subregisters into + native instruction set and let the rest being interpreted. + + Operation is 64-bit, because on 64-bit architectures, pointers are also + 64-bit wide, and we want to pass 64-bit values in/out of kernel functions, + so 32-bit eBPF registers would otherwise require to define register-pair + ABI, thus, there won't be able to use a direct eBPF register to HW register + mapping and JIT would need to do combine/split/move operations for every + register in and out of the function, which is complex, bug prone and slow. + Another reason is the use of atomic 64-bit counters. + +- Conditional jt/jf targets replaced with jt/fall-through: + + While the original design has constructs such as ``if (cond) jump_true; + else jump_false;``, they are being replaced into alternative constructs like + ``if (cond) jump_true; /* else fall-through */``. + +- Introduces bpf_call insn and register passing convention for zero overhead + calls from/to other kernel functions: + + Before an in-kernel function call, the internal BPF program needs to + place function arguments into R1 to R5 registers to satisfy calling + convention, then the interpreter will take them from registers and pass + to in-kernel function. If R1 - R5 registers are mapped to CPU registers + that are used for argument passing on given architecture, the JIT compiler + doesn't need to emit extra moves. Function arguments will be in the correct + registers and BPF_CALL instruction will be JITed as single 'call' HW + instruction. This calling convention was picked to cover common call + situations without performance penalty. + + After an in-kernel function call, R1 - R5 are reset to unreadable and R0 has + a return value of the function. Since R6 - R9 are callee saved, their state + is preserved across the call. + + For example, consider three C functions:: + + u64 f1() { return (*_f2)(1); } + u64 f2(u64 a) { return f3(a + 1, a); } + u64 f3(u64 a, u64 b) { return a - b; } + + GCC can compile f1, f3 into x86_64:: + + f1: + movl $1, %edi + movq _f2(%rip), %rax + jmp *%rax + f3: + movq %rdi, %rax + subq %rsi, %rax + ret + + Function f2 in eBPF may look like:: + + f2: + bpf_mov R2, R1 + bpf_add R1, 1 + bpf_call f3 + bpf_exit + + If f2 is JITed and the pointer stored to ``_f2``. The calls f1 -> f2 -> f3 and + returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to + be used to call into f2. + + For practical reasons all eBPF programs have only one argument 'ctx' which is + already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs + can call kernel functions with up to 5 arguments. Calls with 6 or more arguments + are currently not supported, but these restrictions can be lifted if necessary + in the future. + + On 64-bit architectures all register map to HW registers one to one. For + example, x86_64 JIT compiler can map them as ... + + :: + + R0 - rax + R1 - rdi + R2 - rsi + R3 - rdx + R4 - rcx + R5 - r8 + R6 - rbx + R7 - r13 + R8 - r14 + R9 - r15 + R10 - rbp + + ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing + and rbx, r12 - r15 are callee saved. + + Then the following internal BPF pseudo-program:: + + bpf_mov R6, R1 /* save ctx */ + bpf_mov R2, 2 + bpf_mov R3, 3 + bpf_mov R4, 4 + bpf_mov R5, 5 + bpf_call foo + bpf_mov R7, R0 /* save foo() return value */ + bpf_mov R1, R6 /* restore ctx for next call */ + bpf_mov R2, 6 + bpf_mov R3, 7 + bpf_mov R4, 8 + bpf_mov R5, 9 + bpf_call bar + bpf_add R0, R7 + bpf_exit + + After JIT to x86_64 may look like:: + + push %rbp + mov %rsp,%rbp + sub $0x228,%rsp + mov %rbx,-0x228(%rbp) + mov %r13,-0x220(%rbp) + mov %rdi,%rbx + mov $0x2,%esi + mov $0x3,%edx + mov $0x4,%ecx + mov $0x5,%r8d + callq foo + mov %rax,%r13 + mov %rbx,%rdi + mov $0x6,%esi + mov $0x7,%edx + mov $0x8,%ecx + mov $0x9,%r8d + callq bar + add %r13,%rax + mov -0x228(%rbp),%rbx + mov -0x220(%rbp),%r13 + leaveq + retq + + Which is in this example equivalent in C to:: + + u64 bpf_filter(u64 ctx) + { + return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9); + } + + In-kernel functions foo() and bar() with prototype: u64 (*)(u64 arg1, u64 + arg2, u64 arg3, u64 arg4, u64 arg5); will receive arguments in proper + registers and place their return value into ``%rax`` which is R0 in eBPF. + Prologue and epilogue are emitted by JIT and are implicit in the + interpreter. R0-R5 are scratch registers, so eBPF program needs to preserve + them across the calls as defined by calling convention. + + For example the following program is invalid:: + + bpf_mov R1, 1 + bpf_call foo + bpf_mov R0, R1 + bpf_exit + + After the call the registers R1-R5 contain junk values and cannot be read. + An in-kernel eBPF verifier is used to validate internal BPF programs. + +Also in the new design, eBPF is limited to 4096 insns, which means that any +program will terminate quickly and will only call a fixed number of kernel +functions. Original BPF and the new format are two operand instructions, +which helps to do one-to-one mapping between eBPF insn and x86 insn during JIT. + +The input context pointer for invoking the interpreter function is generic, +its content is defined by a specific use case. For seccomp register R1 points +to seccomp_data, for converted BPF filters R1 points to a skb. + +A program, that is translated internally consists of the following elements:: + + op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 + +So far 87 internal BPF instructions were implemented. 8-bit 'op' opcode field +has room for new instructions. Some of them may use 16/24/32 byte encoding. New +instructions must be multiple of 8 bytes to preserve backward compatibility. + +Internal BPF is a general purpose RISC instruction set. Not every register and +every instruction are used during translation from original BPF to new format. +For example, socket filters are not using ``exclusive add`` instruction, but +tracing filters may do to maintain counters of events, for example. Register R9 +is not used by socket filters either, but more complex filters may be running +out of registers and would have to resort to spill/fill to stack. + +Internal BPF can be used as a generic assembler for last step performance +optimizations, socket filters and seccomp are using it as assembler. Tracing +filters may use it as assembler to generate code from kernel. In kernel usage +may not be bounded by security considerations, since generated internal BPF code +may be optimizing internal code path and not being exposed to the user space. +Safety of internal BPF can come from a verifier (TBD). In such use cases as +described, it may be used as safe instruction set. + +Just like the original BPF, the new format runs within a controlled environment, +is deterministic and the kernel can easily prove that. The safety of the program +can be determined in two steps: first step does depth-first-search to disallow +loops and other CFG validation; second step starts from the first insn and +descends all possible paths. It simulates execution of every insn and observes +the state change of registers and stack. + +eBPF opcode encoding +-------------------- + +eBPF is reusing most of the opcode encoding from classic to simplify conversion +of classic BPF to eBPF. For arithmetic and jump instructions the 8-bit 'code' +field is divided into three parts:: + + +----------------+--------+--------------------+ + | 4 bits | 1 bit | 3 bits | + | operation code | source | instruction class | + +----------------+--------+--------------------+ + (MSB) (LSB) + +Three LSB bits store instruction class which is one of: + + =================== =============== + Classic BPF classes eBPF classes + =================== =============== + BPF_LD 0x00 BPF_LD 0x00 + BPF_LDX 0x01 BPF_LDX 0x01 + BPF_ST 0x02 BPF_ST 0x02 + BPF_STX 0x03 BPF_STX 0x03 + BPF_ALU 0x04 BPF_ALU 0x04 + BPF_JMP 0x05 BPF_JMP 0x05 + BPF_RET 0x06 BPF_JMP32 0x06 + BPF_MISC 0x07 BPF_ALU64 0x07 + =================== =============== + +When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ... + + :: + + BPF_K 0x00 + BPF_X 0x08 + + * in classic BPF, this means:: + + BPF_SRC(code) == BPF_X - use register X as source operand + BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand + + * in eBPF, this means:: + + BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand + BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand + +... and four MSB bits store operation code. + +If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of:: + + BPF_ADD 0x00 + BPF_SUB 0x10 + BPF_MUL 0x20 + BPF_DIV 0x30 + BPF_OR 0x40 + BPF_AND 0x50 + BPF_LSH 0x60 + BPF_RSH 0x70 + BPF_NEG 0x80 + BPF_MOD 0x90 + BPF_XOR 0xa0 + BPF_MOV 0xb0 /* eBPF only: mov reg to reg */ + BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */ + BPF_END 0xd0 /* eBPF only: endianness conversion */ + +If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of:: + + BPF_JA 0x00 /* BPF_JMP only */ + BPF_JEQ 0x10 + BPF_JGT 0x20 + BPF_JGE 0x30 + BPF_JSET 0x40 + BPF_JNE 0x50 /* eBPF only: jump != */ + BPF_JSGT 0x60 /* eBPF only: signed '>' */ + BPF_JSGE 0x70 /* eBPF only: signed '>=' */ + BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */ + BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */ + BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ + BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ + BPF_JSLT 0xc0 /* eBPF only: signed '<' */ + BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ + +So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF +and eBPF. There are only two registers in classic BPF, so it means A += X. +In eBPF it means dst_reg = (u32) dst_reg + (u32) src_reg; similarly, +BPF_XOR | BPF_K | BPF_ALU means A ^= imm32 in classic BPF and analogous +src_reg = (u32) src_reg ^ (u32) imm32 in eBPF. + +Classic BPF is using BPF_MISC class to represent A = X and X = A moves. +eBPF is using BPF_MOV | BPF_X | BPF_ALU code instead. Since there are no +BPF_MISC operations in eBPF, the class 7 is used as BPF_ALU64 to mean +exactly the same operations as BPF_ALU, but with 64-bit wide operands +instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.: +dst_reg = dst_reg + src_reg + +Classic BPF wastes the whole BPF_RET class to represent a single ``ret`` +operation. Classic BPF_RET | BPF_K means copy imm32 into return register +and perform function exit. eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT +in eBPF means function exit only. The eBPF program needs to store return +value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as +BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide +operands for the comparisons instead. + +For load and store instructions the 8-bit 'code' field is divided as:: + + +--------+--------+-------------------+ + | 3 bits | 2 bits | 3 bits | + | mode | size | instruction class | + +--------+--------+-------------------+ + (MSB) (LSB) + +Size modifier is one of ... + +:: + + BPF_W 0x00 /* word */ + BPF_H 0x08 /* half word */ + BPF_B 0x10 /* byte */ + BPF_DW 0x18 /* eBPF only, double word */ + +... which encodes size of load/store operation:: + + B - 1 byte + H - 2 byte + W - 4 byte + DW - 8 byte (eBPF only) + +Mode modifier is one of:: + + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ + BPF_ABS 0x20 + BPF_IND 0x40 + BPF_MEM 0x60 + BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ + BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ + BPF_XADD 0xc0 /* eBPF only, exclusive add */ + +eBPF has two non-generic instructions: (BPF_ABS | | BPF_LD) and +(BPF_IND | | BPF_LD) which are used to access packet data. + +They had to be carried over from classic to have strong performance of +socket filters running in eBPF interpreter. These instructions can only +be used when interpreter context is a pointer to ``struct sk_buff`` and +have seven implicit operands. Register R6 is an implicit input that must +contain pointer to sk_buff. Register R0 is an implicit output which contains +the data fetched from the packet. Registers R1-R5 are scratch registers +and must not be used to store the data across BPF_ABS | BPF_LD or +BPF_IND | BPF_LD instructions. + +These instructions have implicit program exit condition as well. When +eBPF program is trying to access the data beyond the packet boundary, +the interpreter will abort the execution of the program. JIT compilers +therefore must preserve this property. src_reg and imm32 fields are +explicit inputs to these instructions. + +For example:: + + BPF_IND | BPF_W | BPF_LD means: + + R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32)) + and R1 - R5 were scratched. + +Unlike classic BPF instruction set, eBPF has generic load/store operations:: + + BPF_MEM | | BPF_STX: *(size *) (dst_reg + off) = src_reg + BPF_MEM | | BPF_ST: *(size *) (dst_reg + off) = imm32 + BPF_MEM | | BPF_LDX: dst_reg = *(size *) (src_reg + off) + BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg + BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg + +Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and +2 byte atomic increments are not supported. + +eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists +of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single +instruction that loads 64-bit immediate value into a dst_reg. +Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads +32-bit immediate value into a register. + +eBPF verifier +------------- +The safety of the eBPF program is determined in two steps. + +First step does DAG check to disallow loops and other CFG validation. +In particular it will detect programs that have unreachable instructions. +(though classic BPF checker allows them) + +Second step starts from the first insn and descends all possible paths. +It simulates execution of every insn and observes the state change of +registers and stack. + +At the start of the program the register R1 contains a pointer to context +and has type PTR_TO_CTX. +If verifier sees an insn that does R2=R1, then R2 has now type +PTR_TO_CTX as well and can be used on the right hand side of expression. +If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=SCALAR_VALUE, +since addition of two valid pointers makes invalid pointer. +(In 'secure' mode verifier will reject any type of pointer arithmetic to make +sure that kernel addresses don't leak to unprivileged users) + +If register was never written to, it's not readable:: + + bpf_mov R0 = R2 + bpf_exit + +will be rejected, since R2 is unreadable at the start of the program. + +After kernel function call, R1-R5 are reset to unreadable and +R0 has a return type of the function. + +Since R6-R9 are callee saved, their state is preserved across the call. + +:: + + bpf_mov R6 = 1 + bpf_call foo + bpf_mov R0 = R6 + bpf_exit + +is a correct program. If there was R1 instead of R6, it would have +been rejected. + +load/store instructions are allowed only with registers of valid types, which +are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds and alignment checked. +For example:: + + bpf_mov R1 = 1 + bpf_mov R2 = 2 + bpf_xadd *(u32 *)(R1 + 3) += R2 + bpf_exit + +will be rejected, since R1 doesn't have a valid pointer type at the time of +execution of instruction bpf_xadd. + +At the start R1 type is PTR_TO_CTX (a pointer to generic ``struct bpf_context``) +A callback is used to customize verifier to restrict eBPF program access to only +certain fields within ctx structure with specified size and alignment. + +For example, the following insn:: + + bpf_ld R0 = *(u32 *)(R6 + 8) + +intends to load a word from address R6 + 8 and store it into R0 +If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know +that offset 8 of size 4 bytes can be accessed for reading, otherwise +the verifier will reject the program. +If R6=PTR_TO_STACK, then access should be aligned and be within +stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, +so it will fail verification, since it's out of bounds. + +The verifier will allow eBPF program to read data from stack only after +it wrote into it. + +Classic BPF verifier does similar check with M[0-15] memory slots. +For example:: + + bpf_ld R0 = *(u32 *)(R10 - 4) + bpf_exit + +is invalid program. +Though R10 is correct read-only register and has type PTR_TO_STACK +and R10 - 4 is within stack bounds, there were no stores into that location. + +Pointer register spill/fill is tracked as well, since four (R6-R9) +callee saved registers may not be enough for some programs. + +Allowed function calls are customized with bpf_verifier_ops->get_func_proto() +The eBPF verifier will check that registers match argument constraints. +After the call register R0 will be set to return type of the function. + +Function calls is a main mechanism to extend functionality of eBPF programs. +Socket filters may let programs to call one set of functions, whereas tracing +filters may allow completely different set. + +If a function made accessible to eBPF program, it needs to be thought through +from safety point of view. The verifier will guarantee that the function is +called with valid arguments. + +seccomp vs socket filters have different security restrictions for classic BPF. +Seccomp solves this by two stage verifier: classic BPF verifier is followed +by seccomp verifier. In case of eBPF one configurable verifier is shared for +all use cases. + +See details of eBPF verifier in kernel/bpf/verifier.c + +Register value tracking +----------------------- +In order to determine the safety of an eBPF program, the verifier must track +the range of possible values in each register and also in each stack slot. +This is done with ``struct bpf_reg_state``, defined in include/linux/ +bpf_verifier.h, which unifies tracking of scalar and pointer values. Each +register state has a type, which is either NOT_INIT (the register has not been +written to), SCALAR_VALUE (some value which is not usable as a pointer), or a +pointer type. The types of pointers describe their base, as follows: + + + PTR_TO_CTX + Pointer to bpf_context. + CONST_PTR_TO_MAP + Pointer to struct bpf_map. "Const" because arithmetic + on these pointers is forbidden. + PTR_TO_MAP_VALUE + Pointer to the value stored in a map element. + PTR_TO_MAP_VALUE_OR_NULL + Either a pointer to a map value, or NULL; map accesses + (see section 'eBPF maps', below) return this type, + which becomes a PTR_TO_MAP_VALUE when checked != NULL. + Arithmetic on these pointers is forbidden. + PTR_TO_STACK + Frame pointer. + PTR_TO_PACKET + skb->data. + PTR_TO_PACKET_END + skb->data + headlen; arithmetic forbidden. + PTR_TO_SOCKET + Pointer to struct bpf_sock_ops, implicitly refcounted. + PTR_TO_SOCKET_OR_NULL + Either a pointer to a socket, or NULL; socket lookup + returns this type, which becomes a PTR_TO_SOCKET when + checked != NULL. PTR_TO_SOCKET is reference-counted, + so programs must release the reference through the + socket release function before the end of the program. + Arithmetic on these pointers is forbidden. + +However, a pointer may be offset from this base (as a result of pointer +arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable +offset'. The former is used when an exactly-known value (e.g. an immediate +operand) is added to a pointer, while the latter is used for values which are +not exactly known. The variable offset is also used in SCALAR_VALUEs, to track +the range of possible values in the register. + +The verifier's knowledge about the variable offset consists of: + +* minimum and maximum values as unsigned +* minimum and maximum values as signed + +* knowledge of the values of individual bits, in the form of a 'tnum': a u64 + 'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown; + 1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both + mask and value; no bit should ever be 1 in both. For example, if a byte is read + into a register from memory, the register's top 56 bits are known zero, while + the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we + then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; + 0x1ff), because of potential carries. + +Besides arithmetic, the register state can also be updated by conditional +branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch +it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false' +branch it will have a umax_value of 8. A signed compare (with BPF_JSGT or +BPF_JSGE) would instead update the signed minimum/maximum values. Information +from the signed and unsigned bounds can be combined; for instance if a value is +first tested < 8 and then tested s> 4, the verifier will conclude that the value +is also > 4 and s< 8, since the bounds prevent crossing the sign boundary. + +PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all +pointers sharing that same variable offset. This is important for packet range +checks: after adding a variable to a packet pointer register A, if you then copy +it to another register B and then add a constant 4 to A, both registers will +share the same 'id' but the A will have a fixed offset of +4. Then if A is +bounds-checked and found to be less than a PTR_TO_PACKET_END, the register B is +now known to have a safe range of at least 4 bytes. See 'Direct packet access', +below, for more on PTR_TO_PACKET ranges. + +The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of +the pointer returned from a map lookup. This means that when one copy is +checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs. +As well as range-checking, the tracked information is also used for enforcing +alignment of pointer accesses. For instance, on most systems the packet pointer +is 2 bytes after a 4-byte alignment. If a program adds 14 bytes to that to jump +over the Ethernet header, then reads IHL and addes (IHL * 4), the resulting +pointer will have a variable offset known to be 4n+2 for some n, so adding the 2 +bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through +that pointer are safe. +The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common +to all copies of the pointer returned from a socket lookup. This has similar +behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but +it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly +represents a reference to the corresponding ``struct sock``. To ensure that the +reference is not leaked, it is imperative to NULL-check the reference and in +the non-NULL case, and pass the valid reference to the socket release function. + +Direct packet access +-------------------- +In cls_bpf and act_bpf programs the verifier allows direct access to the packet +data via skb->data and skb->data_end pointers. +Ex:: + + 1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */ + 2: r3 = *(u32 *)(r1 +76) /* load skb->data */ + 3: r5 = r3 + 4: r5 += 14 + 5: if r5 > r4 goto pc+16 + R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp + 6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */ + +this 2byte load from the packet is safe to do, since the program author +did check ``if (skb->data + 14 > skb->data_end) goto err`` at insn #5 which +means that in the fall-through case the register R3 (which points to skb->data) +has at least 14 directly accessible bytes. The verifier marks it +as R3=pkt(id=0,off=0,r=14). +id=0 means that no additional variables were added to the register. +off=0 means that no additional constants were added. +r=14 is the range of safe access which means that bytes [R3, R3 + 14) are ok. +Note that R5 is marked as R5=pkt(id=0,off=14,r=14). It also points +to the packet data, but constant 14 was added to the register, so +it now points to ``skb->data + 14`` and accessible range is [R5, R5 + 14 - 14) +which is zero bytes. + +More complex packet access may look like:: + + + R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp + 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */ + 7: r4 = *(u8 *)(r3 +12) + 8: r4 *= 14 + 9: r3 = *(u32 *)(r1 +76) /* load skb->data */ + 10: r3 += r4 + 11: r2 = r1 + 12: r2 <<= 48 + 13: r2 >>= 48 + 14: r3 += r2 + 15: r2 = r3 + 16: r2 += 8 + 17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */ + 18: if r2 > r1 goto pc+2 + R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp + 19: r1 = *(u8 *)(r3 +4) + +The state of the register R3 is R3=pkt(id=2,off=0,r=8) +id=2 means that two ``r3 += rX`` instructions were seen, so r3 points to some +offset within a packet and since the program author did +``if (r3 + 8 > r1) goto err`` at insn #18, the safe range is [R3, R3 + 8). +The verifier only allows 'add'/'sub' operations on packet registers. Any other +operation will set the register state to 'SCALAR_VALUE' and it won't be +available for direct packet access. + +Operation ``r3 += rX`` may overflow and become less than original skb->data, +therefore the verifier has to prevent that. So when it sees ``r3 += rX`` +instruction and rX is more than 16-bit value, any subsequent bounds-check of r3 +against skb->data_end will not give us 'range' information, so attempts to read +through the pointer will give "invalid access to packet" error. + +Ex. after insn ``r4 = *(u8 *)(r3 +12)`` (insn #7 above) the state of r4 is +R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits +of the register are guaranteed to be zero, and nothing is known about the lower +8 bits. After insn ``r4 *= 14`` the state becomes +R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit +value by constant 14 will keep upper 52 bits as zero, also the least significant +bit will be zero as 14 is even. Similarly ``r2 >>= 48`` will make +R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign +extending. This logic is implemented in adjust_reg_min_max_vals() function, +which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice +versa) and adjust_scalar_min_max_vals() for operations on two scalars. + +The end result is that bpf program author can access packet directly +using normal C code as:: + + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + struct eth_hdr *eth = data; + struct iphdr *iph = data + sizeof(*eth); + struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph); + + if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end) + return 0; + if (eth->h_proto != htons(ETH_P_IP)) + return 0; + if (iph->protocol != IPPROTO_UDP || iph->ihl != 5) + return 0; + if (udp->dest == 53 || udp->source == 9) + ...; + +which makes such programs easier to write comparing to LD_ABS insn +and significantly faster. + +eBPF maps +--------- +'maps' is a generic storage of different types for sharing data between kernel +and userspace. + +The maps are accessed from user space via BPF syscall, which has commands: + +- create a map with given type and attributes + ``map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)`` + using attr->map_type, attr->key_size, attr->value_size, attr->max_entries + returns process-local file descriptor or negative error + +- lookup key in a given map + ``err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)`` + using attr->map_fd, attr->key, attr->value + returns zero and stores found elem into value or negative error + +- create or update key/value pair in a given map + ``err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)`` + using attr->map_fd, attr->key, attr->value + returns zero or negative error + +- find and delete element by key in a given map + ``err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)`` + using attr->map_fd, attr->key + +- to delete map: close(fd) + Exiting process will delete maps automatically + +userspace programs use this syscall to create/access maps that eBPF programs +are concurrently updating. + +maps can have different types: hash, array, bloom filter, radix-tree, etc. + +The map is defined by: + + - type + - max number of elements + - key size in bytes + - value size in bytes + +Pruning +------- +The verifier does not actually walk all possible paths through the program. For +each new branch to analyse, the verifier looks at all the states it's previously +been in when at this instruction. If any of them contain the current state as a +subset, the branch is 'pruned' - that is, the fact that the previous state was +accepted implies the current state would be as well. For instance, if in the +previous state, r1 held a packet-pointer, and in the current state, r1 holds a +packet-pointer with a range as long or longer and at least as strict an +alignment, then r1 is safe. Similarly, if r2 was NOT_INIT before then it can't +have been used by any path from that point, so any value in r2 (including +another NOT_INIT) is safe. The implementation is in the function regsafe(). +Pruning considers not only the registers but also the stack (and any spilled +registers it may hold). They must all be safe for the branch to be pruned. +This is implemented in states_equal(). + +Understanding eBPF verifier messages +------------------------------------ + +The following are few examples of invalid eBPF programs and verifier error +messages as seen in the log: + +Program with unreachable instructions:: + + static struct bpf_insn prog[] = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), + }; + +Error: + + unreachable insn 1 + +Program that reads uninitialized register:: + + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + +Error:: + + 0: (bf) r0 = r2 + R2 !read_ok + +Program that doesn't initialize R0 before exiting:: + + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + +Error:: + + 0: (bf) r2 = r1 + 1: (95) exit + R0 !read_ok + +Program that accesses stack out of bounds:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 +8) = 0 + invalid stack off=8 size=8 + +Program that doesn't initialize stack before passing its address into function:: + + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + +Error:: + + 0: (bf) r2 = r10 + 1: (07) r2 += -8 + 2: (b7) r1 = 0x0 + 3: (85) call 1 + invalid indirect read from stack off -8+0 size 8 + +Program that uses invalid map_fd=0 while calling to map_lookup_elem() function:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + fd 0 is not pointing to valid bpf_map + +Program that doesn't check return value of map_lookup_elem() before accessing +map element:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + 5: (7a) *(u64 *)(r0 +0) = 0 + R0 invalid mem access 'map_value_or_null' + +Program that correctly checks map_lookup_elem() returned value for NULL, but +accesses the memory with incorrect alignment:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+1 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +4) = 0 + misaligned access off 4 size 8 + +Program that correctly checks map_lookup_elem() returned value for NULL and +accesses memory with correct alignment in one side of 'if' branch, but fails +to do so in the other side of 'if' branch:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+2 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +0) = 0 + 7: (95) exit + + from 5 to 8: R0=imm0 R10=fp + 8: (7a) *(u64 *)(r0 +0) = 1 + R0 invalid mem access 'imm' + +Program that performs a socket lookup then sets the pointer to NULL without +checking it:: + + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (b7) r2 = 0 + 1: (63) *(u32 *)(r10 -8) = r2 + 2: (bf) r2 = r10 + 3: (07) r2 += -8 + 4: (b7) r3 = 4 + 5: (b7) r4 = 0 + 6: (b7) r5 = 0 + 7: (85) call bpf_sk_lookup_tcp#65 + 8: (b7) r0 = 0 + 9: (95) exit + Unreleased reference id=1, alloc_insn=7 + +Program that performs a socket lookup but does not NULL-check the returned +value:: + + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), + BPF_EXIT_INSN(), + +Error:: + + 0: (b7) r2 = 0 + 1: (63) *(u32 *)(r10 -8) = r2 + 2: (bf) r2 = r10 + 3: (07) r2 += -8 + 4: (b7) r3 = 4 + 5: (b7) r4 = 0 + 6: (b7) r5 = 0 + 7: (85) call bpf_sk_lookup_tcp#65 + 8: (95) exit + Unreleased reference id=1, alloc_insn=7 + +Testing +------- + +Next to the BPF toolchain, the kernel also ships a test module that contains +various test cases for classic and internal BPF that can be executed against +the BPF interpreter and JIT compiler. It can be found in lib/test_bpf.c and +enabled via Kconfig:: + + CONFIG_TEST_BPF=m + +After the module has been built and installed, the test suite can be executed +via insmod or modprobe against 'test_bpf' module. Results of the test cases +including timings in nsec can be found in the kernel log (dmesg). + +Misc +---- + +Also trinity, the Linux syscall fuzzer, has built-in support for BPF and +SECCOMP-BPF kernel fuzzing. + +Written by +---------- + +The document was written in the hope that it is found useful and in order +to give potential BPF hackers or security auditors a better overview of +the underlying architecture. + +- Jay Schulist +- Daniel Borkmann +- Alexei Starovoitov diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt deleted file mode 100644 index 2f0f8b17dade..000000000000 --- a/Documentation/networking/filter.txt +++ /dev/null @@ -1,1545 +0,0 @@ -Linux Socket Filtering aka Berkeley Packet Filter (BPF) -======================================================= - -Introduction ------------- - -Linux Socket Filtering (LSF) is derived from the Berkeley Packet Filter. -Though there are some distinct differences between the BSD and Linux -Kernel filtering, but when we speak of BPF or LSF in Linux context, we -mean the very same mechanism of filtering in the Linux kernel. - -BPF allows a user-space program to attach a filter onto any socket and -allow or disallow certain types of data to come through the socket. LSF -follows exactly the same filter code structure as BSD's BPF, so referring -to the BSD bpf.4 manpage is very helpful in creating filters. - -On Linux, BPF is much simpler than on BSD. One does not have to worry -about devices or anything like that. You simply create your filter code, -send it to the kernel via the SO_ATTACH_FILTER option and if your filter -code passes the kernel check on it, you then immediately begin filtering -data on that socket. - -You can also detach filters from your socket via the SO_DETACH_FILTER -option. This will probably not be used much since when you close a socket -that has a filter on it the filter is automagically removed. The other -less common case may be adding a different filter on the same socket where -you had another filter that is still running: the kernel takes care of -removing the old one and placing your new one in its place, assuming your -filter has passed the checks, otherwise if it fails the old filter will -remain on that socket. - -SO_LOCK_FILTER option allows to lock the filter attached to a socket. Once -set, a filter cannot be removed or changed. This allows one process to -setup a socket, attach a filter, lock it then drop privileges and be -assured that the filter will be kept until the socket is closed. - -The biggest user of this construct might be libpcap. Issuing a high-level -filter command like `tcpdump -i em1 port 22` passes through the libpcap -internal compiler that generates a structure that can eventually be loaded -via SO_ATTACH_FILTER to the kernel. `tcpdump -i em1 port 22 -ddd` -displays what is being placed into this structure. - -Although we were only speaking about sockets here, BPF in Linux is used -in many more places. There's xt_bpf for netfilter, cls_bpf in the kernel -qdisc layer, SECCOMP-BPF (SECure COMPuting [1]), and lots of other places -such as team driver, PTP code, etc where BPF is being used. - - [1] Documentation/userspace-api/seccomp_filter.rst - -Original BPF paper: - -Steven McCanne and Van Jacobson. 1993. The BSD packet filter: a new -architecture for user-level packet capture. In Proceedings of the -USENIX Winter 1993 Conference Proceedings on USENIX Winter 1993 -Conference Proceedings (USENIX'93). USENIX Association, Berkeley, -CA, USA, 2-2. [http://www.tcpdump.org/papers/bpf-usenix93.pdf] - -Structure ---------- - -User space applications include which contains the -following relevant structures: - -struct sock_filter { /* Filter block */ - __u16 code; /* Actual filter code */ - __u8 jt; /* Jump true */ - __u8 jf; /* Jump false */ - __u32 k; /* Generic multiuse field */ -}; - -Such a structure is assembled as an array of 4-tuples, that contains -a code, jt, jf and k value. jt and jf are jump offsets and k a generic -value to be used for a provided code. - -struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ - unsigned short len; /* Number of filter blocks */ - struct sock_filter __user *filter; -}; - -For socket filtering, a pointer to this structure (as shown in -follow-up example) is being passed to the kernel through setsockopt(2). - -Example -------- - -#include -#include -#include -#include -/* ... */ - -/* From the example above: tcpdump -i em1 port 22 -dd */ -struct sock_filter code[] = { - { 0x28, 0, 0, 0x0000000c }, - { 0x15, 0, 8, 0x000086dd }, - { 0x30, 0, 0, 0x00000014 }, - { 0x15, 2, 0, 0x00000084 }, - { 0x15, 1, 0, 0x00000006 }, - { 0x15, 0, 17, 0x00000011 }, - { 0x28, 0, 0, 0x00000036 }, - { 0x15, 14, 0, 0x00000016 }, - { 0x28, 0, 0, 0x00000038 }, - { 0x15, 12, 13, 0x00000016 }, - { 0x15, 0, 12, 0x00000800 }, - { 0x30, 0, 0, 0x00000017 }, - { 0x15, 2, 0, 0x00000084 }, - { 0x15, 1, 0, 0x00000006 }, - { 0x15, 0, 8, 0x00000011 }, - { 0x28, 0, 0, 0x00000014 }, - { 0x45, 6, 0, 0x00001fff }, - { 0xb1, 0, 0, 0x0000000e }, - { 0x48, 0, 0, 0x0000000e }, - { 0x15, 2, 0, 0x00000016 }, - { 0x48, 0, 0, 0x00000010 }, - { 0x15, 0, 1, 0x00000016 }, - { 0x06, 0, 0, 0x0000ffff }, - { 0x06, 0, 0, 0x00000000 }, -}; - -struct sock_fprog bpf = { - .len = ARRAY_SIZE(code), - .filter = code, -}; - -sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); -if (sock < 0) - /* ... bail out ... */ - -ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)); -if (ret < 0) - /* ... bail out ... */ - -/* ... */ -close(sock); - -The above example code attaches a socket filter for a PF_PACKET socket -in order to let all IPv4/IPv6 packets with port 22 pass. The rest will -be dropped for this socket. - -The setsockopt(2) call to SO_DETACH_FILTER doesn't need any arguments -and SO_LOCK_FILTER for preventing the filter to be detached, takes an -integer value with 0 or 1. - -Note that socket filters are not restricted to PF_PACKET sockets only, -but can also be used on other socket families. - -Summary of system calls: - - * setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &val, sizeof(val)); - * setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)); - * setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER, &val, sizeof(val)); - -Normally, most use cases for socket filtering on packet sockets will be -covered by libpcap in high-level syntax, so as an application developer -you should stick to that. libpcap wraps its own layer around all that. - -Unless i) using/linking to libpcap is not an option, ii) the required BPF -filters use Linux extensions that are not supported by libpcap's compiler, -iii) a filter might be more complex and not cleanly implementable with -libpcap's compiler, or iv) particular filter codes should be optimized -differently than libpcap's internal compiler does; then in such cases -writing such a filter "by hand" can be of an alternative. For example, -xt_bpf and cls_bpf users might have requirements that could result in -more complex filter code, or one that cannot be expressed with libpcap -(e.g. different return codes for various code paths). Moreover, BPF JIT -implementors may wish to manually write test cases and thus need low-level -access to BPF code as well. - -BPF engine and instruction set ------------------------------- - -Under tools/bpf/ there's a small helper tool called bpf_asm which can -be used to write low-level filters for example scenarios mentioned in the -previous section. Asm-like syntax mentioned here has been implemented in -bpf_asm and will be used for further explanations (instead of dealing with -less readable opcodes directly, principles are the same). The syntax is -closely modelled after Steven McCanne's and Van Jacobson's BPF paper. - -The BPF architecture consists of the following basic elements: - - Element Description - - A 32 bit wide accumulator - X 32 bit wide X register - M[] 16 x 32 bit wide misc registers aka "scratch memory - store", addressable from 0 to 15 - -A program, that is translated by bpf_asm into "opcodes" is an array that -consists of the following elements (as already mentioned): - - op:16, jt:8, jf:8, k:32 - -The element op is a 16 bit wide opcode that has a particular instruction -encoded. jt and jf are two 8 bit wide jump targets, one for condition -"jump if true", the other one "jump if false". Eventually, element k -contains a miscellaneous argument that can be interpreted in different -ways depending on the given instruction in op. - -The instruction set consists of load, store, branch, alu, miscellaneous -and return instructions that are also represented in bpf_asm syntax. This -table lists all bpf_asm instructions available resp. what their underlying -opcodes as defined in linux/filter.h stand for: - - Instruction Addressing mode Description - - ld 1, 2, 3, 4, 12 Load word into A - ldi 4 Load word into A - ldh 1, 2 Load half-word into A - ldb 1, 2 Load byte into A - ldx 3, 4, 5, 12 Load word into X - ldxi 4 Load word into X - ldxb 5 Load byte into X - - st 3 Store A into M[] - stx 3 Store X into M[] - - jmp 6 Jump to label - ja 6 Jump to label - jeq 7, 8, 9, 10 Jump on A == - jneq 9, 10 Jump on A != - jne 9, 10 Jump on A != - jlt 9, 10 Jump on A < - jle 9, 10 Jump on A <= - jgt 7, 8, 9, 10 Jump on A > - jge 7, 8, 9, 10 Jump on A >= - jset 7, 8, 9, 10 Jump on A & - - add 0, 4 A + - sub 0, 4 A - - mul 0, 4 A * - div 0, 4 A / - mod 0, 4 A % - neg !A - and 0, 4 A & - or 0, 4 A | - xor 0, 4 A ^ - lsh 0, 4 A << - rsh 0, 4 A >> - - tax Copy A into X - txa Copy X into A - - ret 4, 11 Return - -The next table shows addressing formats from the 2nd column: - - Addressing mode Syntax Description - - 0 x/%x Register X - 1 [k] BHW at byte offset k in the packet - 2 [x + k] BHW at the offset X + k in the packet - 3 M[k] Word at offset k in M[] - 4 #k Literal value stored in k - 5 4*([k]&0xf) Lower nibble * 4 at byte offset k in the packet - 6 L Jump label L - 7 #k,Lt,Lf Jump to Lt if true, otherwise jump to Lf - 8 x/%x,Lt,Lf Jump to Lt if true, otherwise jump to Lf - 9 #k,Lt Jump to Lt if predicate is true - 10 x/%x,Lt Jump to Lt if predicate is true - 11 a/%a Accumulator A - 12 extension BPF extension - -The Linux kernel also has a couple of BPF extensions that are used along -with the class of load instructions by "overloading" the k argument with -a negative offset + a particular extension offset. The result of such BPF -extensions are loaded into A. - -Possible BPF extensions are shown in the following table: - - Extension Description - - len skb->len - proto skb->protocol - type skb->pkt_type - poff Payload start offset - ifidx skb->dev->ifindex - nla Netlink attribute of type X with offset A - nlan Nested Netlink attribute of type X with offset A - mark skb->mark - queue skb->queue_mapping - hatype skb->dev->type - rxhash skb->hash - cpu raw_smp_processor_id() - vlan_tci skb_vlan_tag_get(skb) - vlan_avail skb_vlan_tag_present(skb) - vlan_tpid skb->vlan_proto - rand prandom_u32() - -These extensions can also be prefixed with '#'. -Examples for low-level BPF: - -** ARP packets: - - ldh [12] - jne #0x806, drop - ret #-1 - drop: ret #0 - -** IPv4 TCP packets: - - ldh [12] - jne #0x800, drop - ldb [23] - jneq #6, drop - ret #-1 - drop: ret #0 - -** (Accelerated) VLAN w/ id 10: - - ld vlan_tci - jneq #10, drop - ret #-1 - drop: ret #0 - -** icmp random packet sampling, 1 in 4 - ldh [12] - jne #0x800, drop - ldb [23] - jneq #1, drop - # get a random uint32 number - ld rand - mod #4 - jneq #1, drop - ret #-1 - drop: ret #0 - -** SECCOMP filter example: - - ld [4] /* offsetof(struct seccomp_data, arch) */ - jne #0xc000003e, bad /* AUDIT_ARCH_X86_64 */ - ld [0] /* offsetof(struct seccomp_data, nr) */ - jeq #15, good /* __NR_rt_sigreturn */ - jeq #231, good /* __NR_exit_group */ - jeq #60, good /* __NR_exit */ - jeq #0, good /* __NR_read */ - jeq #1, good /* __NR_write */ - jeq #5, good /* __NR_fstat */ - jeq #9, good /* __NR_mmap */ - jeq #14, good /* __NR_rt_sigprocmask */ - jeq #13, good /* __NR_rt_sigaction */ - jeq #35, good /* __NR_nanosleep */ - bad: ret #0 /* SECCOMP_RET_KILL_THREAD */ - good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */ - -The above example code can be placed into a file (here called "foo"), and -then be passed to the bpf_asm tool for generating opcodes, output that xt_bpf -and cls_bpf understands and can directly be loaded with. Example with above -ARP code: - -$ ./bpf_asm foo -4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0, - -In copy and paste C-like output: - -$ ./bpf_asm -c foo -{ 0x28, 0, 0, 0x0000000c }, -{ 0x15, 0, 1, 0x00000806 }, -{ 0x06, 0, 0, 0xffffffff }, -{ 0x06, 0, 0, 0000000000 }, - -In particular, as usage with xt_bpf or cls_bpf can result in more complex BPF -filters that might not be obvious at first, it's good to test filters before -attaching to a live system. For that purpose, there's a small tool called -bpf_dbg under tools/bpf/ in the kernel source directory. This debugger allows -for testing BPF filters against given pcap files, single stepping through the -BPF code on the pcap's packets and to do BPF machine register dumps. - -Starting bpf_dbg is trivial and just requires issuing: - -# ./bpf_dbg - -In case input and output do not equal stdin/stdout, bpf_dbg takes an -alternative stdin source as a first argument, and an alternative stdout -sink as a second one, e.g. `./bpf_dbg test_in.txt test_out.txt`. - -Other than that, a particular libreadline configuration can be set via -file "~/.bpf_dbg_init" and the command history is stored in the file -"~/.bpf_dbg_history". - -Interaction in bpf_dbg happens through a shell that also has auto-completion -support (follow-up example commands starting with '>' denote bpf_dbg shell). -The usual workflow would be to ... - -> load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0 - Loads a BPF filter from standard output of bpf_asm, or transformed via - e.g. `tcpdump -iem1 -ddd port 22 | tr '\n' ','`. Note that for JIT - debugging (next section), this command creates a temporary socket and - loads the BPF code into the kernel. Thus, this will also be useful for - JIT developers. - -> load pcap foo.pcap - Loads standard tcpdump pcap file. - -> run [] -bpf passes:1 fails:9 - Runs through all packets from a pcap to account how many passes and fails - the filter will generate. A limit of packets to traverse can be given. - -> disassemble -l0: ldh [12] -l1: jeq #0x800, l2, l5 -l2: ldb [23] -l3: jeq #0x1, l4, l5 -l4: ret #0xffff -l5: ret #0 - Prints out BPF code disassembly. - -> dump -/* { op, jt, jf, k }, */ -{ 0x28, 0, 0, 0x0000000c }, -{ 0x15, 0, 3, 0x00000800 }, -{ 0x30, 0, 0, 0x00000017 }, -{ 0x15, 0, 1, 0x00000001 }, -{ 0x06, 0, 0, 0x0000ffff }, -{ 0x06, 0, 0, 0000000000 }, - Prints out C-style BPF code dump. - -> breakpoint 0 -breakpoint at: l0: ldh [12] -> breakpoint 1 -breakpoint at: l1: jeq #0x800, l2, l5 - ... - Sets breakpoints at particular BPF instructions. Issuing a `run` command - will walk through the pcap file continuing from the current packet and - break when a breakpoint is being hit (another `run` will continue from - the currently active breakpoint executing next instructions): - - > run - -- register dump -- - pc: [0] <-- program counter - code: [40] jt[0] jf[0] k[12] <-- plain BPF code of current instruction - curr: l0: ldh [12] <-- disassembly of current instruction - A: [00000000][0] <-- content of A (hex, decimal) - X: [00000000][0] <-- content of X (hex, decimal) - M[0,15]: [00000000][0] <-- folded content of M (hex, decimal) - -- packet dump -- <-- Current packet from pcap (hex) - len: 42 - 0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01 - 16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26 - 32: 00 00 00 00 00 00 0a 3b 01 01 - (breakpoint) - > - -> breakpoint -breakpoints: 0 1 - Prints currently set breakpoints. - -> step [-, +] - Performs single stepping through the BPF program from the current pc - offset. Thus, on each step invocation, above register dump is issued. - This can go forwards and backwards in time, a plain `step` will break - on the next BPF instruction, thus +1. (No `run` needs to be issued here.) - -> select - Selects a given packet from the pcap file to continue from. Thus, on - the next `run` or `step`, the BPF program is being evaluated against - the user pre-selected packet. Numbering starts just as in Wireshark - with index 1. - -> quit -# - Exits bpf_dbg. - -JIT compiler ------------- - -The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, -PowerPC, ARM, ARM64, MIPS, RISC-V and s390 and can be enabled through -CONFIG_BPF_JIT. The JIT compiler is transparently invoked for each -attached filter from user space or for internal kernel users if it has -been previously enabled by root: - - echo 1 > /proc/sys/net/core/bpf_jit_enable - -For JIT developers, doing audits etc, each compile run can output the generated -opcode image into the kernel log via: - - echo 2 > /proc/sys/net/core/bpf_jit_enable - -Example output from dmesg: - -[ 3389.935842] flen=6 proglen=70 pass=3 image=ffffffffa0069c8f -[ 3389.935847] JIT code: 00000000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 68 -[ 3389.935849] JIT code: 00000010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 00 00 00 -[ 3389.935850] JIT code: 00000020: e8 1d 94 ff e0 3d 00 08 00 00 75 16 be 17 00 00 -[ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00 -[ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3 - -When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and -setting any other value than that will return in failure. This is even the case for -setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel log -is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is the -generally recommended approach instead. - -In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for -generating disassembly out of the kernel log's hexdump: - -# ./bpf_jit_disasm -70 bytes emitted from JIT compiler (pass:3, flen:6) -ffffffffa0069c8f + : - 0: push %rbp - 1: mov %rsp,%rbp - 4: sub $0x60,%rsp - 8: mov %rbx,-0x8(%rbp) - c: mov 0x68(%rdi),%r9d - 10: sub 0x6c(%rdi),%r9d - 14: mov 0xd8(%rdi),%r8 - 1b: mov $0xc,%esi - 20: callq 0xffffffffe0ff9442 - 25: cmp $0x800,%eax - 2a: jne 0x0000000000000042 - 2c: mov $0x17,%esi - 31: callq 0xffffffffe0ff945e - 36: cmp $0x1,%eax - 39: jne 0x0000000000000042 - 3b: mov $0xffff,%eax - 40: jmp 0x0000000000000044 - 42: xor %eax,%eax - 44: leaveq - 45: retq - -Issuing option `-o` will "annotate" opcodes to resulting assembler -instructions, which can be very useful for JIT developers: - -# ./bpf_jit_disasm -o -70 bytes emitted from JIT compiler (pass:3, flen:6) -ffffffffa0069c8f + : - 0: push %rbp - 55 - 1: mov %rsp,%rbp - 48 89 e5 - 4: sub $0x60,%rsp - 48 83 ec 60 - 8: mov %rbx,-0x8(%rbp) - 48 89 5d f8 - c: mov 0x68(%rdi),%r9d - 44 8b 4f 68 - 10: sub 0x6c(%rdi),%r9d - 44 2b 4f 6c - 14: mov 0xd8(%rdi),%r8 - 4c 8b 87 d8 00 00 00 - 1b: mov $0xc,%esi - be 0c 00 00 00 - 20: callq 0xffffffffe0ff9442 - e8 1d 94 ff e0 - 25: cmp $0x800,%eax - 3d 00 08 00 00 - 2a: jne 0x0000000000000042 - 75 16 - 2c: mov $0x17,%esi - be 17 00 00 00 - 31: callq 0xffffffffe0ff945e - e8 28 94 ff e0 - 36: cmp $0x1,%eax - 83 f8 01 - 39: jne 0x0000000000000042 - 75 07 - 3b: mov $0xffff,%eax - b8 ff ff 00 00 - 40: jmp 0x0000000000000044 - eb 02 - 42: xor %eax,%eax - 31 c0 - 44: leaveq - c9 - 45: retq - c3 - -For BPF JIT developers, bpf_jit_disasm, bpf_asm and bpf_dbg provides a useful -toolchain for developing and testing the kernel's JIT compiler. - -BPF kernel internals --------------------- -Internally, for the kernel interpreter, a different instruction set -format with similar underlying principles from BPF described in previous -paragraphs is being used. However, the instruction set format is modelled -closer to the underlying architecture to mimic native instruction sets, so -that a better performance can be achieved (more details later). This new -ISA is called 'eBPF' or 'internal BPF' interchangeably. (Note: eBPF which -originates from [e]xtended BPF is not the same as BPF extensions! While -eBPF is an ISA, BPF extensions date back to classic BPF's 'overloading' -of BPF_LD | BPF_{B,H,W} | BPF_ABS instruction.) - -It is designed to be JITed with one to one mapping, which can also open up -the possibility for GCC/LLVM compilers to generate optimized eBPF code through -an eBPF backend that performs almost as fast as natively compiled code. - -The new instruction set was originally designed with the possible goal in -mind to write programs in "restricted C" and compile into eBPF with a optional -GCC/LLVM backend, so that it can just-in-time map to modern 64-bit CPUs with -minimal performance overhead over two steps, that is, C -> eBPF -> native code. - -Currently, the new format is being used for running user BPF programs, which -includes seccomp BPF, classic socket filters, cls_bpf traffic classifier, -team driver's classifier for its load-balancing mode, netfilter's xt_bpf -extension, PTP dissector/classifier, and much more. They are all internally -converted by the kernel into the new instruction set representation and run -in the eBPF interpreter. For in-kernel handlers, this all works transparently -by using bpf_prog_create() for setting up the filter, resp. -bpf_prog_destroy() for destroying it. The macro -BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed -code to run the filter. 'filter' is a pointer to struct bpf_prog that we -got from bpf_prog_create(), and 'ctx' the given context (e.g. -skb pointer). All constraints and restrictions from bpf_check_classic() apply -before a conversion to the new layout is being done behind the scenes! - -Currently, the classic BPF format is being used for JITing on most -32-bit architectures, whereas x86-64, aarch64, s390x, powerpc64, -sparc64, arm32, riscv64, riscv32 perform JIT compilation from eBPF -instruction set. - -Some core changes of the new internal format: - -- Number of registers increase from 2 to 10: - - The old format had two registers A and X, and a hidden frame pointer. The - new layout extends this to be 10 internal registers and a read-only frame - pointer. Since 64-bit CPUs are passing arguments to functions via registers - the number of args from eBPF program to in-kernel function is restricted - to 5 and one register is used to accept return value from an in-kernel - function. Natively, x86_64 passes first 6 arguments in registers, aarch64/ - sparcv9/mips64 have 7 - 8 registers for arguments; x86_64 has 6 callee saved - registers, and aarch64/sparcv9/mips64 have 11 or more callee saved registers. - - Therefore, eBPF calling convention is defined as: - - * R0 - return value from in-kernel function, and exit value for eBPF program - * R1 - R5 - arguments from eBPF program to in-kernel function - * R6 - R9 - callee saved registers that in-kernel function will preserve - * R10 - read-only frame pointer to access stack - - Thus, all eBPF registers map one to one to HW registers on x86_64, aarch64, - etc, and eBPF calling convention maps directly to ABIs used by the kernel on - 64-bit architectures. - - On 32-bit architectures JIT may map programs that use only 32-bit arithmetic - and may let more complex programs to be interpreted. - - R0 - R5 are scratch registers and eBPF program needs spill/fill them if - necessary across calls. Note that there is only one eBPF program (== one - eBPF main routine) and it cannot call other eBPF functions, it can only - call predefined in-kernel functions, though. - -- Register width increases from 32-bit to 64-bit: - - Still, the semantics of the original 32-bit ALU operations are preserved - via 32-bit subregisters. All eBPF registers are 64-bit with 32-bit lower - subregisters that zero-extend into 64-bit if they are being written to. - That behavior maps directly to x86_64 and arm64 subregister definition, but - makes other JITs more difficult. - - 32-bit architectures run 64-bit internal BPF programs via interpreter. - Their JITs may convert BPF programs that only use 32-bit subregisters into - native instruction set and let the rest being interpreted. - - Operation is 64-bit, because on 64-bit architectures, pointers are also - 64-bit wide, and we want to pass 64-bit values in/out of kernel functions, - so 32-bit eBPF registers would otherwise require to define register-pair - ABI, thus, there won't be able to use a direct eBPF register to HW register - mapping and JIT would need to do combine/split/move operations for every - register in and out of the function, which is complex, bug prone and slow. - Another reason is the use of atomic 64-bit counters. - -- Conditional jt/jf targets replaced with jt/fall-through: - - While the original design has constructs such as "if (cond) jump_true; - else jump_false;", they are being replaced into alternative constructs like - "if (cond) jump_true; /* else fall-through */". - -- Introduces bpf_call insn and register passing convention for zero overhead - calls from/to other kernel functions: - - Before an in-kernel function call, the internal BPF program needs to - place function arguments into R1 to R5 registers to satisfy calling - convention, then the interpreter will take them from registers and pass - to in-kernel function. If R1 - R5 registers are mapped to CPU registers - that are used for argument passing on given architecture, the JIT compiler - doesn't need to emit extra moves. Function arguments will be in the correct - registers and BPF_CALL instruction will be JITed as single 'call' HW - instruction. This calling convention was picked to cover common call - situations without performance penalty. - - After an in-kernel function call, R1 - R5 are reset to unreadable and R0 has - a return value of the function. Since R6 - R9 are callee saved, their state - is preserved across the call. - - For example, consider three C functions: - - u64 f1() { return (*_f2)(1); } - u64 f2(u64 a) { return f3(a + 1, a); } - u64 f3(u64 a, u64 b) { return a - b; } - - GCC can compile f1, f3 into x86_64: - - f1: - movl $1, %edi - movq _f2(%rip), %rax - jmp *%rax - f3: - movq %rdi, %rax - subq %rsi, %rax - ret - - Function f2 in eBPF may look like: - - f2: - bpf_mov R2, R1 - bpf_add R1, 1 - bpf_call f3 - bpf_exit - - If f2 is JITed and the pointer stored to '_f2'. The calls f1 -> f2 -> f3 and - returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to - be used to call into f2. - - For practical reasons all eBPF programs have only one argument 'ctx' which is - already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs - can call kernel functions with up to 5 arguments. Calls with 6 or more arguments - are currently not supported, but these restrictions can be lifted if necessary - in the future. - - On 64-bit architectures all register map to HW registers one to one. For - example, x86_64 JIT compiler can map them as ... - - R0 - rax - R1 - rdi - R2 - rsi - R3 - rdx - R4 - rcx - R5 - r8 - R6 - rbx - R7 - r13 - R8 - r14 - R9 - r15 - R10 - rbp - - ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing - and rbx, r12 - r15 are callee saved. - - Then the following internal BPF pseudo-program: - - bpf_mov R6, R1 /* save ctx */ - bpf_mov R2, 2 - bpf_mov R3, 3 - bpf_mov R4, 4 - bpf_mov R5, 5 - bpf_call foo - bpf_mov R7, R0 /* save foo() return value */ - bpf_mov R1, R6 /* restore ctx for next call */ - bpf_mov R2, 6 - bpf_mov R3, 7 - bpf_mov R4, 8 - bpf_mov R5, 9 - bpf_call bar - bpf_add R0, R7 - bpf_exit - - After JIT to x86_64 may look like: - - push %rbp - mov %rsp,%rbp - sub $0x228,%rsp - mov %rbx,-0x228(%rbp) - mov %r13,-0x220(%rbp) - mov %rdi,%rbx - mov $0x2,%esi - mov $0x3,%edx - mov $0x4,%ecx - mov $0x5,%r8d - callq foo - mov %rax,%r13 - mov %rbx,%rdi - mov $0x6,%esi - mov $0x7,%edx - mov $0x8,%ecx - mov $0x9,%r8d - callq bar - add %r13,%rax - mov -0x228(%rbp),%rbx - mov -0x220(%rbp),%r13 - leaveq - retq - - Which is in this example equivalent in C to: - - u64 bpf_filter(u64 ctx) - { - return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9); - } - - In-kernel functions foo() and bar() with prototype: u64 (*)(u64 arg1, u64 - arg2, u64 arg3, u64 arg4, u64 arg5); will receive arguments in proper - registers and place their return value into '%rax' which is R0 in eBPF. - Prologue and epilogue are emitted by JIT and are implicit in the - interpreter. R0-R5 are scratch registers, so eBPF program needs to preserve - them across the calls as defined by calling convention. - - For example the following program is invalid: - - bpf_mov R1, 1 - bpf_call foo - bpf_mov R0, R1 - bpf_exit - - After the call the registers R1-R5 contain junk values and cannot be read. - An in-kernel eBPF verifier is used to validate internal BPF programs. - -Also in the new design, eBPF is limited to 4096 insns, which means that any -program will terminate quickly and will only call a fixed number of kernel -functions. Original BPF and the new format are two operand instructions, -which helps to do one-to-one mapping between eBPF insn and x86 insn during JIT. - -The input context pointer for invoking the interpreter function is generic, -its content is defined by a specific use case. For seccomp register R1 points -to seccomp_data, for converted BPF filters R1 points to a skb. - -A program, that is translated internally consists of the following elements: - - op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 - -So far 87 internal BPF instructions were implemented. 8-bit 'op' opcode field -has room for new instructions. Some of them may use 16/24/32 byte encoding. New -instructions must be multiple of 8 bytes to preserve backward compatibility. - -Internal BPF is a general purpose RISC instruction set. Not every register and -every instruction are used during translation from original BPF to new format. -For example, socket filters are not using 'exclusive add' instruction, but -tracing filters may do to maintain counters of events, for example. Register R9 -is not used by socket filters either, but more complex filters may be running -out of registers and would have to resort to spill/fill to stack. - -Internal BPF can be used as a generic assembler for last step performance -optimizations, socket filters and seccomp are using it as assembler. Tracing -filters may use it as assembler to generate code from kernel. In kernel usage -may not be bounded by security considerations, since generated internal BPF code -may be optimizing internal code path and not being exposed to the user space. -Safety of internal BPF can come from a verifier (TBD). In such use cases as -described, it may be used as safe instruction set. - -Just like the original BPF, the new format runs within a controlled environment, -is deterministic and the kernel can easily prove that. The safety of the program -can be determined in two steps: first step does depth-first-search to disallow -loops and other CFG validation; second step starts from the first insn and -descends all possible paths. It simulates execution of every insn and observes -the state change of registers and stack. - -eBPF opcode encoding --------------------- - -eBPF is reusing most of the opcode encoding from classic to simplify conversion -of classic BPF to eBPF. For arithmetic and jump instructions the 8-bit 'code' -field is divided into three parts: - - +----------------+--------+--------------------+ - | 4 bits | 1 bit | 3 bits | - | operation code | source | instruction class | - +----------------+--------+--------------------+ - (MSB) (LSB) - -Three LSB bits store instruction class which is one of: - - Classic BPF classes: eBPF classes: - - BPF_LD 0x00 BPF_LD 0x00 - BPF_LDX 0x01 BPF_LDX 0x01 - BPF_ST 0x02 BPF_ST 0x02 - BPF_STX 0x03 BPF_STX 0x03 - BPF_ALU 0x04 BPF_ALU 0x04 - BPF_JMP 0x05 BPF_JMP 0x05 - BPF_RET 0x06 BPF_JMP32 0x06 - BPF_MISC 0x07 BPF_ALU64 0x07 - -When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ... - - BPF_K 0x00 - BPF_X 0x08 - - * in classic BPF, this means: - - BPF_SRC(code) == BPF_X - use register X as source operand - BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand - - * in eBPF, this means: - - BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand - BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand - -... and four MSB bits store operation code. - -If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of: - - BPF_ADD 0x00 - BPF_SUB 0x10 - BPF_MUL 0x20 - BPF_DIV 0x30 - BPF_OR 0x40 - BPF_AND 0x50 - BPF_LSH 0x60 - BPF_RSH 0x70 - BPF_NEG 0x80 - BPF_MOD 0x90 - BPF_XOR 0xa0 - BPF_MOV 0xb0 /* eBPF only: mov reg to reg */ - BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */ - BPF_END 0xd0 /* eBPF only: endianness conversion */ - -If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of: - - BPF_JA 0x00 /* BPF_JMP only */ - BPF_JEQ 0x10 - BPF_JGT 0x20 - BPF_JGE 0x30 - BPF_JSET 0x40 - BPF_JNE 0x50 /* eBPF only: jump != */ - BPF_JSGT 0x60 /* eBPF only: signed '>' */ - BPF_JSGE 0x70 /* eBPF only: signed '>=' */ - BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */ - BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */ - BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ - BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ - BPF_JSLT 0xc0 /* eBPF only: signed '<' */ - BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ - -So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF -and eBPF. There are only two registers in classic BPF, so it means A += X. -In eBPF it means dst_reg = (u32) dst_reg + (u32) src_reg; similarly, -BPF_XOR | BPF_K | BPF_ALU means A ^= imm32 in classic BPF and analogous -src_reg = (u32) src_reg ^ (u32) imm32 in eBPF. - -Classic BPF is using BPF_MISC class to represent A = X and X = A moves. -eBPF is using BPF_MOV | BPF_X | BPF_ALU code instead. Since there are no -BPF_MISC operations in eBPF, the class 7 is used as BPF_ALU64 to mean -exactly the same operations as BPF_ALU, but with 64-bit wide operands -instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.: -dst_reg = dst_reg + src_reg - -Classic BPF wastes the whole BPF_RET class to represent a single 'ret' -operation. Classic BPF_RET | BPF_K means copy imm32 into return register -and perform function exit. eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT -in eBPF means function exit only. The eBPF program needs to store return -value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as -BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide -operands for the comparisons instead. - -For load and store instructions the 8-bit 'code' field is divided as: - - +--------+--------+-------------------+ - | 3 bits | 2 bits | 3 bits | - | mode | size | instruction class | - +--------+--------+-------------------+ - (MSB) (LSB) - -Size modifier is one of ... - - BPF_W 0x00 /* word */ - BPF_H 0x08 /* half word */ - BPF_B 0x10 /* byte */ - BPF_DW 0x18 /* eBPF only, double word */ - -... which encodes size of load/store operation: - - B - 1 byte - H - 2 byte - W - 4 byte - DW - 8 byte (eBPF only) - -Mode modifier is one of: - - BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ - BPF_ABS 0x20 - BPF_IND 0x40 - BPF_MEM 0x60 - BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ - BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ - BPF_XADD 0xc0 /* eBPF only, exclusive add */ - -eBPF has two non-generic instructions: (BPF_ABS | | BPF_LD) and -(BPF_IND | | BPF_LD) which are used to access packet data. - -They had to be carried over from classic to have strong performance of -socket filters running in eBPF interpreter. These instructions can only -be used when interpreter context is a pointer to 'struct sk_buff' and -have seven implicit operands. Register R6 is an implicit input that must -contain pointer to sk_buff. Register R0 is an implicit output which contains -the data fetched from the packet. Registers R1-R5 are scratch registers -and must not be used to store the data across BPF_ABS | BPF_LD or -BPF_IND | BPF_LD instructions. - -These instructions have implicit program exit condition as well. When -eBPF program is trying to access the data beyond the packet boundary, -the interpreter will abort the execution of the program. JIT compilers -therefore must preserve this property. src_reg and imm32 fields are -explicit inputs to these instructions. - -For example: - - BPF_IND | BPF_W | BPF_LD means: - - R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32)) - and R1 - R5 were scratched. - -Unlike classic BPF instruction set, eBPF has generic load/store operations: - -BPF_MEM | | BPF_STX: *(size *) (dst_reg + off) = src_reg -BPF_MEM | | BPF_ST: *(size *) (dst_reg + off) = imm32 -BPF_MEM | | BPF_LDX: dst_reg = *(size *) (src_reg + off) -BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg -BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg - -Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and -2 byte atomic increments are not supported. - -eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists -of two consecutive 'struct bpf_insn' 8-byte blocks and interpreted as single -instruction that loads 64-bit immediate value into a dst_reg. -Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads -32-bit immediate value into a register. - -eBPF verifier -------------- -The safety of the eBPF program is determined in two steps. - -First step does DAG check to disallow loops and other CFG validation. -In particular it will detect programs that have unreachable instructions. -(though classic BPF checker allows them) - -Second step starts from the first insn and descends all possible paths. -It simulates execution of every insn and observes the state change of -registers and stack. - -At the start of the program the register R1 contains a pointer to context -and has type PTR_TO_CTX. -If verifier sees an insn that does R2=R1, then R2 has now type -PTR_TO_CTX as well and can be used on the right hand side of expression. -If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=SCALAR_VALUE, -since addition of two valid pointers makes invalid pointer. -(In 'secure' mode verifier will reject any type of pointer arithmetic to make -sure that kernel addresses don't leak to unprivileged users) - -If register was never written to, it's not readable: - bpf_mov R0 = R2 - bpf_exit -will be rejected, since R2 is unreadable at the start of the program. - -After kernel function call, R1-R5 are reset to unreadable and -R0 has a return type of the function. - -Since R6-R9 are callee saved, their state is preserved across the call. - bpf_mov R6 = 1 - bpf_call foo - bpf_mov R0 = R6 - bpf_exit -is a correct program. If there was R1 instead of R6, it would have -been rejected. - -load/store instructions are allowed only with registers of valid types, which -are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds and alignment checked. -For example: - bpf_mov R1 = 1 - bpf_mov R2 = 2 - bpf_xadd *(u32 *)(R1 + 3) += R2 - bpf_exit -will be rejected, since R1 doesn't have a valid pointer type at the time of -execution of instruction bpf_xadd. - -At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context') -A callback is used to customize verifier to restrict eBPF program access to only -certain fields within ctx structure with specified size and alignment. - -For example, the following insn: - bpf_ld R0 = *(u32 *)(R6 + 8) -intends to load a word from address R6 + 8 and store it into R0 -If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know -that offset 8 of size 4 bytes can be accessed for reading, otherwise -the verifier will reject the program. -If R6=PTR_TO_STACK, then access should be aligned and be within -stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, -so it will fail verification, since it's out of bounds. - -The verifier will allow eBPF program to read data from stack only after -it wrote into it. -Classic BPF verifier does similar check with M[0-15] memory slots. -For example: - bpf_ld R0 = *(u32 *)(R10 - 4) - bpf_exit -is invalid program. -Though R10 is correct read-only register and has type PTR_TO_STACK -and R10 - 4 is within stack bounds, there were no stores into that location. - -Pointer register spill/fill is tracked as well, since four (R6-R9) -callee saved registers may not be enough for some programs. - -Allowed function calls are customized with bpf_verifier_ops->get_func_proto() -The eBPF verifier will check that registers match argument constraints. -After the call register R0 will be set to return type of the function. - -Function calls is a main mechanism to extend functionality of eBPF programs. -Socket filters may let programs to call one set of functions, whereas tracing -filters may allow completely different set. - -If a function made accessible to eBPF program, it needs to be thought through -from safety point of view. The verifier will guarantee that the function is -called with valid arguments. - -seccomp vs socket filters have different security restrictions for classic BPF. -Seccomp solves this by two stage verifier: classic BPF verifier is followed -by seccomp verifier. In case of eBPF one configurable verifier is shared for -all use cases. - -See details of eBPF verifier in kernel/bpf/verifier.c - -Register value tracking ------------------------ -In order to determine the safety of an eBPF program, the verifier must track -the range of possible values in each register and also in each stack slot. -This is done with 'struct bpf_reg_state', defined in include/linux/ -bpf_verifier.h, which unifies tracking of scalar and pointer values. Each -register state has a type, which is either NOT_INIT (the register has not been -written to), SCALAR_VALUE (some value which is not usable as a pointer), or a -pointer type. The types of pointers describe their base, as follows: - PTR_TO_CTX Pointer to bpf_context. - CONST_PTR_TO_MAP Pointer to struct bpf_map. "Const" because arithmetic - on these pointers is forbidden. - PTR_TO_MAP_VALUE Pointer to the value stored in a map element. - PTR_TO_MAP_VALUE_OR_NULL - Either a pointer to a map value, or NULL; map accesses - (see section 'eBPF maps', below) return this type, - which becomes a PTR_TO_MAP_VALUE when checked != NULL. - Arithmetic on these pointers is forbidden. - PTR_TO_STACK Frame pointer. - PTR_TO_PACKET skb->data. - PTR_TO_PACKET_END skb->data + headlen; arithmetic forbidden. - PTR_TO_SOCKET Pointer to struct bpf_sock_ops, implicitly refcounted. - PTR_TO_SOCKET_OR_NULL - Either a pointer to a socket, or NULL; socket lookup - returns this type, which becomes a PTR_TO_SOCKET when - checked != NULL. PTR_TO_SOCKET is reference-counted, - so programs must release the reference through the - socket release function before the end of the program. - Arithmetic on these pointers is forbidden. -However, a pointer may be offset from this base (as a result of pointer -arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable -offset'. The former is used when an exactly-known value (e.g. an immediate -operand) is added to a pointer, while the latter is used for values which are -not exactly known. The variable offset is also used in SCALAR_VALUEs, to track -the range of possible values in the register. -The verifier's knowledge about the variable offset consists of: -* minimum and maximum values as unsigned -* minimum and maximum values as signed -* knowledge of the values of individual bits, in the form of a 'tnum': a u64 -'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown; -1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both -mask and value; no bit should ever be 1 in both. For example, if a byte is read -into a register from memory, the register's top 56 bits are known zero, while -the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we -then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; -0x1ff), because of potential carries. - -Besides arithmetic, the register state can also be updated by conditional -branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch -it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false' -branch it will have a umax_value of 8. A signed compare (with BPF_JSGT or -BPF_JSGE) would instead update the signed minimum/maximum values. Information -from the signed and unsigned bounds can be combined; for instance if a value is -first tested < 8 and then tested s> 4, the verifier will conclude that the value -is also > 4 and s< 8, since the bounds prevent crossing the sign boundary. - -PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all -pointers sharing that same variable offset. This is important for packet range -checks: after adding a variable to a packet pointer register A, if you then copy -it to another register B and then add a constant 4 to A, both registers will -share the same 'id' but the A will have a fixed offset of +4. Then if A is -bounds-checked and found to be less than a PTR_TO_PACKET_END, the register B is -now known to have a safe range of at least 4 bytes. See 'Direct packet access', -below, for more on PTR_TO_PACKET ranges. - -The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of -the pointer returned from a map lookup. This means that when one copy is -checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs. -As well as range-checking, the tracked information is also used for enforcing -alignment of pointer accesses. For instance, on most systems the packet pointer -is 2 bytes after a 4-byte alignment. If a program adds 14 bytes to that to jump -over the Ethernet header, then reads IHL and addes (IHL * 4), the resulting -pointer will have a variable offset known to be 4n+2 for some n, so adding the 2 -bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through -that pointer are safe. -The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common -to all copies of the pointer returned from a socket lookup. This has similar -behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but -it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly -represents a reference to the corresponding 'struct sock'. To ensure that the -reference is not leaked, it is imperative to NULL-check the reference and in -the non-NULL case, and pass the valid reference to the socket release function. - -Direct packet access --------------------- -In cls_bpf and act_bpf programs the verifier allows direct access to the packet -data via skb->data and skb->data_end pointers. -Ex: -1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */ -2: r3 = *(u32 *)(r1 +76) /* load skb->data */ -3: r5 = r3 -4: r5 += 14 -5: if r5 > r4 goto pc+16 -R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp -6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */ - -this 2byte load from the packet is safe to do, since the program author -did check 'if (skb->data + 14 > skb->data_end) goto err' at insn #5 which -means that in the fall-through case the register R3 (which points to skb->data) -has at least 14 directly accessible bytes. The verifier marks it -as R3=pkt(id=0,off=0,r=14). -id=0 means that no additional variables were added to the register. -off=0 means that no additional constants were added. -r=14 is the range of safe access which means that bytes [R3, R3 + 14) are ok. -Note that R5 is marked as R5=pkt(id=0,off=14,r=14). It also points -to the packet data, but constant 14 was added to the register, so -it now points to 'skb->data + 14' and accessible range is [R5, R5 + 14 - 14) -which is zero bytes. - -More complex packet access may look like: - R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp - 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */ - 7: r4 = *(u8 *)(r3 +12) - 8: r4 *= 14 - 9: r3 = *(u32 *)(r1 +76) /* load skb->data */ -10: r3 += r4 -11: r2 = r1 -12: r2 <<= 48 -13: r2 >>= 48 -14: r3 += r2 -15: r2 = r3 -16: r2 += 8 -17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */ -18: if r2 > r1 goto pc+2 - R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp -19: r1 = *(u8 *)(r3 +4) -The state of the register R3 is R3=pkt(id=2,off=0,r=8) -id=2 means that two 'r3 += rX' instructions were seen, so r3 points to some -offset within a packet and since the program author did -'if (r3 + 8 > r1) goto err' at insn #18, the safe range is [R3, R3 + 8). -The verifier only allows 'add'/'sub' operations on packet registers. Any other -operation will set the register state to 'SCALAR_VALUE' and it won't be -available for direct packet access. -Operation 'r3 += rX' may overflow and become less than original skb->data, -therefore the verifier has to prevent that. So when it sees 'r3 += rX' -instruction and rX is more than 16-bit value, any subsequent bounds-check of r3 -against skb->data_end will not give us 'range' information, so attempts to read -through the pointer will give "invalid access to packet" error. -Ex. after insn 'r4 = *(u8 *)(r3 +12)' (insn #7 above) the state of r4 is -R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits -of the register are guaranteed to be zero, and nothing is known about the lower -8 bits. After insn 'r4 *= 14' the state becomes -R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit -value by constant 14 will keep upper 52 bits as zero, also the least significant -bit will be zero as 14 is even. Similarly 'r2 >>= 48' will make -R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign -extending. This logic is implemented in adjust_reg_min_max_vals() function, -which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice -versa) and adjust_scalar_min_max_vals() for operations on two scalars. - -The end result is that bpf program author can access packet directly -using normal C code as: - void *data = (void *)(long)skb->data; - void *data_end = (void *)(long)skb->data_end; - struct eth_hdr *eth = data; - struct iphdr *iph = data + sizeof(*eth); - struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph); - - if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end) - return 0; - if (eth->h_proto != htons(ETH_P_IP)) - return 0; - if (iph->protocol != IPPROTO_UDP || iph->ihl != 5) - return 0; - if (udp->dest == 53 || udp->source == 9) - ...; -which makes such programs easier to write comparing to LD_ABS insn -and significantly faster. - -eBPF maps ---------- -'maps' is a generic storage of different types for sharing data between kernel -and userspace. - -The maps are accessed from user space via BPF syscall, which has commands: -- create a map with given type and attributes - map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) - using attr->map_type, attr->key_size, attr->value_size, attr->max_entries - returns process-local file descriptor or negative error - -- lookup key in a given map - err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) - using attr->map_fd, attr->key, attr->value - returns zero and stores found elem into value or negative error - -- create or update key/value pair in a given map - err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) - using attr->map_fd, attr->key, attr->value - returns zero or negative error - -- find and delete element by key in a given map - err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) - using attr->map_fd, attr->key - -- to delete map: close(fd) - Exiting process will delete maps automatically - -userspace programs use this syscall to create/access maps that eBPF programs -are concurrently updating. - -maps can have different types: hash, array, bloom filter, radix-tree, etc. - -The map is defined by: - . type - . max number of elements - . key size in bytes - . value size in bytes - -Pruning -------- -The verifier does not actually walk all possible paths through the program. For -each new branch to analyse, the verifier looks at all the states it's previously -been in when at this instruction. If any of them contain the current state as a -subset, the branch is 'pruned' - that is, the fact that the previous state was -accepted implies the current state would be as well. For instance, if in the -previous state, r1 held a packet-pointer, and in the current state, r1 holds a -packet-pointer with a range as long or longer and at least as strict an -alignment, then r1 is safe. Similarly, if r2 was NOT_INIT before then it can't -have been used by any path from that point, so any value in r2 (including -another NOT_INIT) is safe. The implementation is in the function regsafe(). -Pruning considers not only the registers but also the stack (and any spilled -registers it may hold). They must all be safe for the branch to be pruned. -This is implemented in states_equal(). - -Understanding eBPF verifier messages ------------------------------------- - -The following are few examples of invalid eBPF programs and verifier error -messages as seen in the log: - -Program with unreachable instructions: -static struct bpf_insn prog[] = { - BPF_EXIT_INSN(), - BPF_EXIT_INSN(), -}; -Error: - unreachable insn 1 - -Program that reads uninitialized register: - BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), - BPF_EXIT_INSN(), -Error: - 0: (bf) r0 = r2 - R2 !read_ok - -Program that doesn't initialize R0 before exiting: - BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), - BPF_EXIT_INSN(), -Error: - 0: (bf) r2 = r1 - 1: (95) exit - R0 !read_ok - -Program that accesses stack out of bounds: - BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 +8) = 0 - invalid stack off=8 size=8 - -Program that doesn't initialize stack before passing its address into function: - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), -Error: - 0: (bf) r2 = r10 - 1: (07) r2 += -8 - 2: (b7) r1 = 0x0 - 3: (85) call 1 - invalid indirect read from stack off -8+0 size 8 - -Program that uses invalid map_fd=0 while calling to map_lookup_elem() function: - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 0x0 - 4: (85) call 1 - fd 0 is not pointing to valid bpf_map - -Program that doesn't check return value of map_lookup_elem() before accessing -map element: - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 0x0 - 4: (85) call 1 - 5: (7a) *(u64 *)(r0 +0) = 0 - R0 invalid mem access 'map_value_or_null' - -Program that correctly checks map_lookup_elem() returned value for NULL, but -accesses the memory with incorrect alignment: - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 1 - 4: (85) call 1 - 5: (15) if r0 == 0x0 goto pc+1 - R0=map_ptr R10=fp - 6: (7a) *(u64 *)(r0 +4) = 0 - misaligned access off 4 size 8 - -Program that correctly checks map_lookup_elem() returned value for NULL and -accesses memory with correct alignment in one side of 'if' branch, but fails -to do so in the other side of 'if' branch: - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), - BPF_EXIT_INSN(), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 1 - 4: (85) call 1 - 5: (15) if r0 == 0x0 goto pc+2 - R0=map_ptr R10=fp - 6: (7a) *(u64 *)(r0 +0) = 0 - 7: (95) exit - - from 5 to 8: R0=imm0 R10=fp - 8: (7a) *(u64 *)(r0 +0) = 1 - R0 invalid mem access 'imm' - -Program that performs a socket lookup then sets the pointer to NULL without -checking it: -value: - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_MOV64_IMM(BPF_REG_3, 4), - BPF_MOV64_IMM(BPF_REG_4, 0), - BPF_MOV64_IMM(BPF_REG_5, 0), - BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), -Error: - 0: (b7) r2 = 0 - 1: (63) *(u32 *)(r10 -8) = r2 - 2: (bf) r2 = r10 - 3: (07) r2 += -8 - 4: (b7) r3 = 4 - 5: (b7) r4 = 0 - 6: (b7) r5 = 0 - 7: (85) call bpf_sk_lookup_tcp#65 - 8: (b7) r0 = 0 - 9: (95) exit - Unreleased reference id=1, alloc_insn=7 - -Program that performs a socket lookup but does not NULL-check the returned -value: - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_MOV64_IMM(BPF_REG_3, 4), - BPF_MOV64_IMM(BPF_REG_4, 0), - BPF_MOV64_IMM(BPF_REG_5, 0), - BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), - BPF_EXIT_INSN(), -Error: - 0: (b7) r2 = 0 - 1: (63) *(u32 *)(r10 -8) = r2 - 2: (bf) r2 = r10 - 3: (07) r2 += -8 - 4: (b7) r3 = 4 - 5: (b7) r4 = 0 - 6: (b7) r5 = 0 - 7: (85) call bpf_sk_lookup_tcp#65 - 8: (95) exit - Unreleased reference id=1, alloc_insn=7 - -Testing -------- - -Next to the BPF toolchain, the kernel also ships a test module that contains -various test cases for classic and internal BPF that can be executed against -the BPF interpreter and JIT compiler. It can be found in lib/test_bpf.c and -enabled via Kconfig: - - CONFIG_TEST_BPF=m - -After the module has been built and installed, the test suite can be executed -via insmod or modprobe against 'test_bpf' module. Results of the test cases -including timings in nsec can be found in the kernel log (dmesg). - -Misc ----- - -Also trinity, the Linux syscall fuzzer, has built-in support for BPF and -SECCOMP-BPF kernel fuzzing. - -Written by ----------- - -The document was written in the hope that it is found useful and in order -to give potential BPF hackers or security auditors a better overview of -the underlying architecture. - -Jay Schulist -Daniel Borkmann -Alexei Starovoitov diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 807abe25ae4b..144ed838c1a9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -56,6 +56,7 @@ Contents: driver eql fib_trie + filter .. only:: subproject and html diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 999eb41da81d..494614573c67 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -1051,7 +1051,7 @@ for more information on hardware timestamps. ------------------------------------------------------------------------------- - Packet sockets work well together with Linux socket filters, thus you also - might want to have a look at Documentation/networking/filter.txt + might want to have a look at Documentation/networking/filter.rst -------------------------------------------------------------------------------- + THANKS diff --git a/MAINTAINERS b/MAINTAINERS index 7323bfc1720f..4ec6d2741d36 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3192,7 +3192,7 @@ Q: https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147 T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git F: Documentation/bpf/ -F: Documentation/networking/filter.txt +F: Documentation/networking/filter.rst F: arch/*/net/* F: include/linux/bpf* F: include/linux/filter.h diff --git a/tools/bpf/bpf_asm.c b/tools/bpf/bpf_asm.c index e5f95e3eede3..0063c3c029e7 100644 --- a/tools/bpf/bpf_asm.c +++ b/tools/bpf/bpf_asm.c @@ -11,7 +11,7 @@ * * How to get into it: * - * 1) read Documentation/networking/filter.txt + * 1) read Documentation/networking/filter.rst * 2) Run `bpf_asm [-c] ` to translate into binary * blob that is loadable with xt_bpf, cls_bpf et al. Note: -c will * pretty print a C-like construct. diff --git a/tools/bpf/bpf_dbg.c b/tools/bpf/bpf_dbg.c index 9d3766e653a9..a0ebcdf59c31 100644 --- a/tools/bpf/bpf_dbg.c +++ b/tools/bpf/bpf_dbg.c @@ -13,7 +13,7 @@ * for making a verdict when multiple simple BPF programs are combined * into one in order to prevent parsing same headers multiple times. * - * More on how to debug BPF opcodes see Documentation/networking/filter.txt + * More on how to debug BPF opcodes see Documentation/networking/filter.rst * which is the main document on BPF. Mini howto for getting started: * * 1) `./bpf_dbg` to enter the shell (shell cmds denoted with '>'): -- cgit v1.2.3-59-g8ed1b From 62502dff2c5012c19727bd992b0101a816095f1e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:37 +0200 Subject: docs: networking: convert fore200e.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/fore200e.rst | 66 +++++++++++++++++++++++++++++++++++ Documentation/networking/fore200e.txt | 64 --------------------------------- Documentation/networking/index.rst | 1 + drivers/atm/Kconfig | 2 +- 4 files changed, 68 insertions(+), 65 deletions(-) create mode 100644 Documentation/networking/fore200e.rst delete mode 100644 Documentation/networking/fore200e.txt diff --git a/Documentation/networking/fore200e.rst b/Documentation/networking/fore200e.rst new file mode 100644 index 000000000000..55df9ec09ac8 --- /dev/null +++ b/Documentation/networking/fore200e.rst @@ -0,0 +1,66 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================= +FORE Systems PCA-200E/SBA-200E ATM NIC driver +============================================= + +This driver adds support for the FORE Systems 200E-series ATM adapters +to the Linux operating system. It is based on the earlier PCA-200E driver +written by Uwe Dannowski. + +The driver simultaneously supports PCA-200E and SBA-200E adapters on +i386, alpha (untested), powerpc, sparc and sparc64 archs. + +The intent is to enable the use of different models of FORE adapters at the +same time, by hosts that have several bus interfaces (such as PCI+SBUS, +or PCI+EISA). + +Only PCI and SBUS devices are currently supported by the driver, but support +for other bus interfaces such as EISA should not be too hard to add. + + +Firmware Copyright Notice +------------------------- + +Please read the fore200e_firmware_copyright file present +in the linux/drivers/atm directory for details and restrictions. + + +Firmware Updates +---------------- + +The FORE Systems 200E-series driver is shipped with firmware data being +uploaded to the ATM adapters at system boot time or at module loading time. +The supplied firmware images should work with all adapters. + +However, if you encounter problems (the firmware doesn't start or the driver +is unable to read the PROM data), you may consider trying another firmware +version. Alternative binary firmware images can be found somewhere on the +ForeThought CD-ROM supplied with your adapter by FORE Systems. + +You can also get the latest firmware images from FORE Systems at +https://en.wikipedia.org/wiki/FORE_Systems. Register TACTics Online and go to +the 'software updates' pages. The firmware binaries are part of +the various ForeThought software distributions. + +Notice that different versions of the PCA-200E firmware exist, depending +on the endianness of the host architecture. The driver is shipped with +both little and big endian PCA firmware images. + +Name and location of the new firmware images can be set at kernel +configuration time: + +1. Copy the new firmware binary files (with .bin, .bin1 or .bin2 suffix) + to some directory, such as linux/drivers/atm. + +2. Reconfigure your kernel to set the new firmware name and location. + Expected pathnames are absolute or relative to the drivers/atm directory. + +3. Rebuild and re-install your kernel or your module. + + +Feedback +-------- + +Feedback is welcome. Please send success stories/bug reports/ +patches/improvement/comments/flames to . diff --git a/Documentation/networking/fore200e.txt b/Documentation/networking/fore200e.txt deleted file mode 100644 index 1f98f62b4370..000000000000 --- a/Documentation/networking/fore200e.txt +++ /dev/null @@ -1,64 +0,0 @@ - -FORE Systems PCA-200E/SBA-200E ATM NIC driver ---------------------------------------------- - -This driver adds support for the FORE Systems 200E-series ATM adapters -to the Linux operating system. It is based on the earlier PCA-200E driver -written by Uwe Dannowski. - -The driver simultaneously supports PCA-200E and SBA-200E adapters on -i386, alpha (untested), powerpc, sparc and sparc64 archs. - -The intent is to enable the use of different models of FORE adapters at the -same time, by hosts that have several bus interfaces (such as PCI+SBUS, -or PCI+EISA). - -Only PCI and SBUS devices are currently supported by the driver, but support -for other bus interfaces such as EISA should not be too hard to add. - - -Firmware Copyright Notice -------------------------- - -Please read the fore200e_firmware_copyright file present -in the linux/drivers/atm directory for details and restrictions. - - -Firmware Updates ----------------- - -The FORE Systems 200E-series driver is shipped with firmware data being -uploaded to the ATM adapters at system boot time or at module loading time. -The supplied firmware images should work with all adapters. - -However, if you encounter problems (the firmware doesn't start or the driver -is unable to read the PROM data), you may consider trying another firmware -version. Alternative binary firmware images can be found somewhere on the -ForeThought CD-ROM supplied with your adapter by FORE Systems. - -You can also get the latest firmware images from FORE Systems at -https://en.wikipedia.org/wiki/FORE_Systems. Register TACTics Online and go to -the 'software updates' pages. The firmware binaries are part of -the various ForeThought software distributions. - -Notice that different versions of the PCA-200E firmware exist, depending -on the endianness of the host architecture. The driver is shipped with -both little and big endian PCA firmware images. - -Name and location of the new firmware images can be set at kernel -configuration time: - -1. Copy the new firmware binary files (with .bin, .bin1 or .bin2 suffix) - to some directory, such as linux/drivers/atm. - -2. Reconfigure your kernel to set the new firmware name and location. - Expected pathnames are absolute or relative to the drivers/atm directory. - -3. Rebuild and re-install your kernel or your module. - - -Feedback --------- - -Feedback is welcome. Please send success stories/bug reports/ -patches/improvement/comments/flames to . diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 144ed838c1a9..b2fb8b907d68 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -57,6 +57,7 @@ Contents: eql fib_trie filter + fore200e .. only:: subproject and html diff --git a/drivers/atm/Kconfig b/drivers/atm/Kconfig index 8c37294f1d1e..4af7cbdcc349 100644 --- a/drivers/atm/Kconfig +++ b/drivers/atm/Kconfig @@ -336,7 +336,7 @@ config ATM_FORE200E on PCI and SBUS hosts. Say Y (or M to compile as a module named fore_200e) here if you have one of these ATM adapters. - See the file for + See the file for further details. config ATM_FORE200E_USE_TASKLET -- cgit v1.2.3-59-g8ed1b From 5b0d74b54c7f1cb9c65955df78dffe112e1959c1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:38 +0200 Subject: docs: networking: convert framerelay.txt to ReST - add SPDX header; - add a document title; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/framerelay.rst | 44 +++++++++++++++++++++++++++++++++ Documentation/networking/framerelay.txt | 39 ----------------------------- Documentation/networking/index.rst | 1 + drivers/net/wan/Kconfig | 4 +-- 4 files changed, 47 insertions(+), 41 deletions(-) create mode 100644 Documentation/networking/framerelay.rst delete mode 100644 Documentation/networking/framerelay.txt diff --git a/Documentation/networking/framerelay.rst b/Documentation/networking/framerelay.rst new file mode 100644 index 000000000000..6d904399ec6d --- /dev/null +++ b/Documentation/networking/framerelay.rst @@ -0,0 +1,44 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================ +Frame Relay (FR) +================ + +Frame Relay (FR) support for linux is built into a two tiered system of device +drivers. The upper layer implements RFC1490 FR specification, and uses the +Data Link Connection Identifier (DLCI) as its hardware address. Usually these +are assigned by your network supplier, they give you the number/numbers of +the Virtual Connections (VC) assigned to you. + +Each DLCI is a point-to-point link between your machine and a remote one. +As such, a separate device is needed to accommodate the routing. Within the +net-tools archives is 'dlcicfg'. This program will communicate with the +base "DLCI" device, and create new net devices named 'dlci00', 'dlci01'... +The configuration script will ask you how many DLCIs you need, as well as +how many DLCIs you want to assign to each Frame Relay Access Device (FRAD). + +The DLCI uses a number of function calls to communicate with the FRAD, all +of which are stored in the FRAD's private data area. assoc/deassoc, +activate/deactivate and dlci_config. The DLCI supplies a receive function +to the FRAD to accept incoming packets. + +With this initial offering, only 1 FRAD driver is available. With many thanks +to Sangoma Technologies, David Mandelstam & Gene Kozin, the S502A, S502E & +S508 are supported. This driver is currently set up for only FR, but as +Sangoma makes more firmware modules available, it can be updated to provide +them as well. + +Configuration of the FRAD makes use of another net-tools program, 'fradcfg'. +This program makes use of a configuration file (which dlcicfg can also read) +to specify the types of boards to be configured as FRADs, as well as perform +any board specific configuration. The Sangoma module of fradcfg loads the +FR firmware into the card, sets the irq/port/memory information, and provides +an initial configuration. + +Additional FRAD device drivers can be added as hardware is available. + +At this time, the dlcicfg and fradcfg programs have not been incorporated into +the net-tools distribution. They can be found at ftp.invlogic.com, in +/pub/linux. Note that with OS/2 FTPD, you end up in /pub by default, so just +use 'cd linux'. v0.10 is for use on pre-2.0.3 and earlier, v0.15 is for +pre-2.0.4 and later. diff --git a/Documentation/networking/framerelay.txt b/Documentation/networking/framerelay.txt deleted file mode 100644 index 1a0b720440dd..000000000000 --- a/Documentation/networking/framerelay.txt +++ /dev/null @@ -1,39 +0,0 @@ -Frame Relay (FR) support for linux is built into a two tiered system of device -drivers. The upper layer implements RFC1490 FR specification, and uses the -Data Link Connection Identifier (DLCI) as its hardware address. Usually these -are assigned by your network supplier, they give you the number/numbers of -the Virtual Connections (VC) assigned to you. - -Each DLCI is a point-to-point link between your machine and a remote one. -As such, a separate device is needed to accommodate the routing. Within the -net-tools archives is 'dlcicfg'. This program will communicate with the -base "DLCI" device, and create new net devices named 'dlci00', 'dlci01'... -The configuration script will ask you how many DLCIs you need, as well as -how many DLCIs you want to assign to each Frame Relay Access Device (FRAD). - -The DLCI uses a number of function calls to communicate with the FRAD, all -of which are stored in the FRAD's private data area. assoc/deassoc, -activate/deactivate and dlci_config. The DLCI supplies a receive function -to the FRAD to accept incoming packets. - -With this initial offering, only 1 FRAD driver is available. With many thanks -to Sangoma Technologies, David Mandelstam & Gene Kozin, the S502A, S502E & -S508 are supported. This driver is currently set up for only FR, but as -Sangoma makes more firmware modules available, it can be updated to provide -them as well. - -Configuration of the FRAD makes use of another net-tools program, 'fradcfg'. -This program makes use of a configuration file (which dlcicfg can also read) -to specify the types of boards to be configured as FRADs, as well as perform -any board specific configuration. The Sangoma module of fradcfg loads the -FR firmware into the card, sets the irq/port/memory information, and provides -an initial configuration. - -Additional FRAD device drivers can be added as hardware is available. - -At this time, the dlcicfg and fradcfg programs have not been incorporated into -the net-tools distribution. They can be found at ftp.invlogic.com, in -/pub/linux. Note that with OS/2 FTPD, you end up in /pub by default, so just -use 'cd linux'. v0.10 is for use on pre-2.0.3 and earlier, v0.15 is for -pre-2.0.4 and later. - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b2fb8b907d68..4e225f1f7039 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -58,6 +58,7 @@ Contents: fib_trie filter fore200e + framerelay .. only:: subproject and html diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig index dbc0e3f7a3e2..3e21726c36e8 100644 --- a/drivers/net/wan/Kconfig +++ b/drivers/net/wan/Kconfig @@ -336,7 +336,7 @@ config DLCI To use frame relay, you need supporting hardware (called FRAD) and certain programs from the net-tools package as explained in - . + . To compile this driver as a module, choose M here: the module will be called dlci. @@ -361,7 +361,7 @@ config SDLA These are multi-protocol cards, but only Frame Relay is supported by the driver at this time. Please read - . + . To compile this driver as a module, choose M here: the module will be called sdla. -- cgit v1.2.3-59-g8ed1b From 16128ad8f927850a1121b7645c6381341d9c0b63 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:39 +0200 Subject: docs: networking: convert generic-hdlc.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/generic-hdlc.rst | 170 ++++++++++++++++++++++++++++++ Documentation/networking/generic-hdlc.txt | 132 ----------------------- Documentation/networking/index.rst | 1 + 3 files changed, 171 insertions(+), 132 deletions(-) create mode 100644 Documentation/networking/generic-hdlc.rst delete mode 100644 Documentation/networking/generic-hdlc.txt diff --git a/Documentation/networking/generic-hdlc.rst b/Documentation/networking/generic-hdlc.rst new file mode 100644 index 000000000000..1c3bb5cb98d4 --- /dev/null +++ b/Documentation/networking/generic-hdlc.rst @@ -0,0 +1,170 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +Generic HDLC layer +================== + +Krzysztof Halasa + + +Generic HDLC layer currently supports: + +1. Frame Relay (ANSI, CCITT, Cisco and no LMI) + + - Normal (routed) and Ethernet-bridged (Ethernet device emulation) + interfaces can share a single PVC. + - ARP support (no InARP support in the kernel - there is an + experimental InARP user-space daemon available on: + http://www.kernel.org/pub/linux/utils/net/hdlc/). + +2. raw HDLC - either IP (IPv4) interface or Ethernet device emulation +3. Cisco HDLC +4. PPP +5. X.25 (uses X.25 routines). + +Generic HDLC is a protocol driver only - it needs a low-level driver +for your particular hardware. + +Ethernet device emulation (using HDLC or Frame-Relay PVC) is compatible +with IEEE 802.1Q (VLANs) and 802.1D (Ethernet bridging). + + +Make sure the hdlc.o and the hardware driver are loaded. It should +create a number of "hdlc" (hdlc0 etc) network devices, one for each +WAN port. You'll need the "sethdlc" utility, get it from: + + http://www.kernel.org/pub/linux/utils/net/hdlc/ + +Compile sethdlc.c utility:: + + gcc -O2 -Wall -o sethdlc sethdlc.c + +Make sure you're using a correct version of sethdlc for your kernel. + +Use sethdlc to set physical interface, clock rate, HDLC mode used, +and add any required PVCs if using Frame Relay. +Usually you want something like:: + + sethdlc hdlc0 clock int rate 128000 + sethdlc hdlc0 cisco interval 10 timeout 25 + +or:: + + sethdlc hdlc0 rs232 clock ext + sethdlc hdlc0 fr lmi ansi + sethdlc hdlc0 create 99 + ifconfig hdlc0 up + ifconfig pvc0 localIP pointopoint remoteIP + +In Frame Relay mode, ifconfig master hdlc device up (without assigning +any IP address to it) before using pvc devices. + + +Setting interface: + +* v35 | rs232 | x21 | t1 | e1 + - sets physical interface for a given port + if the card has software-selectable interfaces + loopback + - activate hardware loopback (for testing only) +* clock ext + - both RX clock and TX clock external +* clock int + - both RX clock and TX clock internal +* clock txint + - RX clock external, TX clock internal +* clock txfromrx + - RX clock external, TX clock derived from RX clock +* rate + - sets clock rate in bps (for "int" or "txint" clock only) + + +Setting protocol: + +* hdlc - sets raw HDLC (IP-only) mode + + nrz / nrzi / fm-mark / fm-space / manchester - sets transmission code + + no-parity / crc16 / crc16-pr0 (CRC16 with preset zeros) / crc32-itu + + crc16-itu (CRC16 with ITU-T polynomial) / crc16-itu-pr0 - sets parity + +* hdlc-eth - Ethernet device emulation using HDLC. Parity and encoding + as above. + +* cisco - sets Cisco HDLC mode (IP, IPv6 and IPX supported) + + interval - time in seconds between keepalive packets + + timeout - time in seconds after last received keepalive packet before + we assume the link is down + +* ppp - sets synchronous PPP mode + +* x25 - sets X.25 mode + +* fr - Frame Relay mode + + lmi ansi / ccitt / cisco / none - LMI (link management) type + + dce - Frame Relay DCE (network) side LMI instead of default DTE (user). + + It has nothing to do with clocks! + + - t391 - link integrity verification polling timer (in seconds) - user + - t392 - polling verification timer (in seconds) - network + - n391 - full status polling counter - user + - n392 - error threshold - both user and network + - n393 - monitored events count - both user and network + +Frame-Relay only: + +* create n | delete n - adds / deletes PVC interface with DLCI #n. + Newly created interface will be named pvc0, pvc1 etc. + +* create ether n | delete ether n - adds a device for Ethernet-bridged + frames. The device will be named pvceth0, pvceth1 etc. + + + + +Board-specific issues +--------------------- + +n2.o and c101.o need parameters to work:: + + insmod n2 hw=io,irq,ram,ports[:io,irq,...] + +example:: + + insmod n2 hw=0x300,10,0xD0000,01 + +or:: + + insmod c101 hw=irq,ram[:irq,...] + +example:: + + insmod c101 hw=9,0xdc000 + +If built into the kernel, these drivers need kernel (command line) parameters:: + + n2.hw=io,irq,ram,ports:... + +or:: + + c101.hw=irq,ram:... + + + +If you have a problem with N2, C101 or PLX200SYN card, you can issue the +"private" command to see port's packet descriptor rings (in kernel logs):: + + sethdlc hdlc0 private + +The hardware driver has to be build with #define DEBUG_RINGS. +Attaching this info to bug reports would be helpful. Anyway, let me know +if you have problems using this. + +For patches and other info look at: +. diff --git a/Documentation/networking/generic-hdlc.txt b/Documentation/networking/generic-hdlc.txt deleted file mode 100644 index 4eb3cc40b702..000000000000 --- a/Documentation/networking/generic-hdlc.txt +++ /dev/null @@ -1,132 +0,0 @@ -Generic HDLC layer -Krzysztof Halasa - - -Generic HDLC layer currently supports: -1. Frame Relay (ANSI, CCITT, Cisco and no LMI) - - Normal (routed) and Ethernet-bridged (Ethernet device emulation) - interfaces can share a single PVC. - - ARP support (no InARP support in the kernel - there is an - experimental InARP user-space daemon available on: - http://www.kernel.org/pub/linux/utils/net/hdlc/). -2. raw HDLC - either IP (IPv4) interface or Ethernet device emulation -3. Cisco HDLC -4. PPP -5. X.25 (uses X.25 routines). - -Generic HDLC is a protocol driver only - it needs a low-level driver -for your particular hardware. - -Ethernet device emulation (using HDLC or Frame-Relay PVC) is compatible -with IEEE 802.1Q (VLANs) and 802.1D (Ethernet bridging). - - -Make sure the hdlc.o and the hardware driver are loaded. It should -create a number of "hdlc" (hdlc0 etc) network devices, one for each -WAN port. You'll need the "sethdlc" utility, get it from: - http://www.kernel.org/pub/linux/utils/net/hdlc/ - -Compile sethdlc.c utility: - gcc -O2 -Wall -o sethdlc sethdlc.c -Make sure you're using a correct version of sethdlc for your kernel. - -Use sethdlc to set physical interface, clock rate, HDLC mode used, -and add any required PVCs if using Frame Relay. -Usually you want something like: - - sethdlc hdlc0 clock int rate 128000 - sethdlc hdlc0 cisco interval 10 timeout 25 -or - sethdlc hdlc0 rs232 clock ext - sethdlc hdlc0 fr lmi ansi - sethdlc hdlc0 create 99 - ifconfig hdlc0 up - ifconfig pvc0 localIP pointopoint remoteIP - -In Frame Relay mode, ifconfig master hdlc device up (without assigning -any IP address to it) before using pvc devices. - - -Setting interface: - -* v35 | rs232 | x21 | t1 | e1 - sets physical interface for a given port - if the card has software-selectable interfaces - loopback - activate hardware loopback (for testing only) -* clock ext - both RX clock and TX clock external -* clock int - both RX clock and TX clock internal -* clock txint - RX clock external, TX clock internal -* clock txfromrx - RX clock external, TX clock derived from RX clock -* rate - sets clock rate in bps (for "int" or "txint" clock only) - - -Setting protocol: - -* hdlc - sets raw HDLC (IP-only) mode - nrz / nrzi / fm-mark / fm-space / manchester - sets transmission code - no-parity / crc16 / crc16-pr0 (CRC16 with preset zeros) / crc32-itu - crc16-itu (CRC16 with ITU-T polynomial) / crc16-itu-pr0 - sets parity - -* hdlc-eth - Ethernet device emulation using HDLC. Parity and encoding - as above. - -* cisco - sets Cisco HDLC mode (IP, IPv6 and IPX supported) - interval - time in seconds between keepalive packets - timeout - time in seconds after last received keepalive packet before - we assume the link is down - -* ppp - sets synchronous PPP mode - -* x25 - sets X.25 mode - -* fr - Frame Relay mode - lmi ansi / ccitt / cisco / none - LMI (link management) type - dce - Frame Relay DCE (network) side LMI instead of default DTE (user). - It has nothing to do with clocks! - t391 - link integrity verification polling timer (in seconds) - user - t392 - polling verification timer (in seconds) - network - n391 - full status polling counter - user - n392 - error threshold - both user and network - n393 - monitored events count - both user and network - -Frame-Relay only: -* create n | delete n - adds / deletes PVC interface with DLCI #n. - Newly created interface will be named pvc0, pvc1 etc. - -* create ether n | delete ether n - adds a device for Ethernet-bridged - frames. The device will be named pvceth0, pvceth1 etc. - - - - -Board-specific issues ---------------------- - -n2.o and c101.o need parameters to work: - - insmod n2 hw=io,irq,ram,ports[:io,irq,...] -example: - insmod n2 hw=0x300,10,0xD0000,01 - -or - insmod c101 hw=irq,ram[:irq,...] -example: - insmod c101 hw=9,0xdc000 - -If built into the kernel, these drivers need kernel (command line) parameters: - n2.hw=io,irq,ram,ports:... -or - c101.hw=irq,ram:... - - - -If you have a problem with N2, C101 or PLX200SYN card, you can issue the -"private" command to see port's packet descriptor rings (in kernel logs): - - sethdlc hdlc0 private - -The hardware driver has to be build with #define DEBUG_RINGS. -Attaching this info to bug reports would be helpful. Anyway, let me know -if you have problems using this. - -For patches and other info look at: -. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 4e225f1f7039..d34824b27264 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -59,6 +59,7 @@ Contents: filter fore200e framerelay + generic-hdlc .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 110662503de20f21ab22cf409753124d0977a339 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:40 +0200 Subject: docs: networking: convert generic_netlink.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/generic_netlink.rst | 9 +++++++++ Documentation/networking/generic_netlink.txt | 3 --- Documentation/networking/index.rst | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 Documentation/networking/generic_netlink.rst delete mode 100644 Documentation/networking/generic_netlink.txt diff --git a/Documentation/networking/generic_netlink.rst b/Documentation/networking/generic_netlink.rst new file mode 100644 index 000000000000..59e04ccf80c1 --- /dev/null +++ b/Documentation/networking/generic_netlink.rst @@ -0,0 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Generic Netlink +=============== + +A wiki document on how to use Generic Netlink can be found here: + + * http://www.linuxfoundation.org/collaborate/workgroups/networking/generic_netlink_howto diff --git a/Documentation/networking/generic_netlink.txt b/Documentation/networking/generic_netlink.txt deleted file mode 100644 index 3e071115ca90..000000000000 --- a/Documentation/networking/generic_netlink.txt +++ /dev/null @@ -1,3 +0,0 @@ -A wiki document on how to use Generic Netlink can be found here: - - * http://www.linuxfoundation.org/collaborate/workgroups/networking/generic_netlink_howto diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index d34824b27264..42e556509e22 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -60,6 +60,7 @@ Contents: fore200e framerelay generic-hdlc + generic_netlink .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 8c498935585680284e5f3e5294d3c901b7c89d57 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:41 +0200 Subject: docs: networking: convert gen_stats.txt to ReST - add SPDX header; - mark code blocks and literals as such; - mark tables as such; - mark lists as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/gen_stats.rst | 129 +++++++++++++++++++++++++++++++++ Documentation/networking/gen_stats.txt | 119 ------------------------------ Documentation/networking/index.rst | 1 + net/core/gen_stats.c | 2 +- 4 files changed, 131 insertions(+), 120 deletions(-) create mode 100644 Documentation/networking/gen_stats.rst delete mode 100644 Documentation/networking/gen_stats.txt diff --git a/Documentation/networking/gen_stats.rst b/Documentation/networking/gen_stats.rst new file mode 100644 index 000000000000..595a83b9a61b --- /dev/null +++ b/Documentation/networking/gen_stats.rst @@ -0,0 +1,129 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +Generic networking statistics for netlink users +=============================================== + +Statistic counters are grouped into structs: + +==================== ===================== ===================== +Struct TLV type Description +==================== ===================== ===================== +gnet_stats_basic TCA_STATS_BASIC Basic statistics +gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator +gnet_stats_queue TCA_STATS_QUEUE Queue statistics +none TCA_STATS_APP Application specific +==================== ===================== ===================== + + +Collecting: +----------- + +Declare the statistic structs you need:: + + struct mystruct { + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + ... + }; + +Update statistics, in dequeue() methods only, (while owning qdisc->running):: + + mystruct->tstats.packet++; + mystruct->qstats.backlog += skb->pkt_len; + + +Export to userspace (Dump): +--------------------------- + +:: + + my_dumping_routine(struct sk_buff *skb, ...) + { + struct gnet_dump dump; + + if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; + + if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || + gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || + gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) + goto rtattr_failure; + + if (gnet_stats_finish_copy(&dump) < 0) + goto rtattr_failure; + ... + } + +TCA_STATS/TCA_XSTATS backward compatibility: +-------------------------------------------- + +Prior users of struct tc_stats and xstats can maintain backward +compatibility by calling the compat wrappers to keep providing the +existing TLV types:: + + my_dumping_routine(struct sk_buff *skb, ...) + { + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; + ... + } + +A struct tc_stats will be filled out during gnet_stats_copy_* calls +and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app +was called. + + +Locking: +-------- + +Locks are taken before writing and released once all statistics have +been written. Locks are always released in case of an error. You +are responsible for making sure that the lock is initialized. + + +Rate Estimator: +--------------- + +0) Prepare an estimator attribute. Most likely this would be in user + space. The value of this TLV should contain a tc_estimator structure. + As usual, such a TLV needs to be 32 bit aligned and therefore the + length needs to be appropriately set, etc. The estimator interval + and ewma log need to be converted to the appropriate values. + tc_estimator.c::tc_setup_estimator() is advisable to be used as the + conversion routine. It does a few clever things. It takes a time + interval in microsecs, a time constant also in microsecs and a struct + tc_estimator to be populated. The returned tc_estimator can be + transported to the kernel. Transfer such a structure in a TLV of type + TCA_RATE to your code in the kernel. + +In the kernel when setting up: + +1) make sure you have basic stats and rate stats setup first. +2) make sure you have initialized stats lock that is used to setup such + stats. +3) Now initialize a new estimator:: + + int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, + mystats_lock, attr_with_tcestimator_struct); + + if ret == 0 + success + else + failed + +From now on, every time you dump my_rate_est_stats it will contain +up-to-date info. + +Once you are done, call gen_kill_estimator(my_basicstats, +my_rate_est_stats) Make sure that my_basicstats and my_rate_est_stats +are still valid (i.e still exist) at the time of making this call. + + +Authors: +-------- +- Thomas Graf +- Jamal Hadi Salim diff --git a/Documentation/networking/gen_stats.txt b/Documentation/networking/gen_stats.txt deleted file mode 100644 index 179b18ce45ff..000000000000 --- a/Documentation/networking/gen_stats.txt +++ /dev/null @@ -1,119 +0,0 @@ -Generic networking statistics for netlink users -====================================================================== - -Statistic counters are grouped into structs: - -Struct TLV type Description ----------------------------------------------------------------------- -gnet_stats_basic TCA_STATS_BASIC Basic statistics -gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator -gnet_stats_queue TCA_STATS_QUEUE Queue statistics -none TCA_STATS_APP Application specific - - -Collecting: ------------ - -Declare the statistic structs you need: -struct mystruct { - struct gnet_stats_basic bstats; - struct gnet_stats_queue qstats; - ... -}; - -Update statistics, in dequeue() methods only, (while owning qdisc->running) -mystruct->tstats.packet++; -mystruct->qstats.backlog += skb->pkt_len; - - -Export to userspace (Dump): ---------------------------- - -my_dumping_routine(struct sk_buff *skb, ...) -{ - struct gnet_dump dump; - - if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; - - if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || - gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || - gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) - goto rtattr_failure; - - if (gnet_stats_finish_copy(&dump) < 0) - goto rtattr_failure; - ... -} - -TCA_STATS/TCA_XSTATS backward compatibility: --------------------------------------------- - -Prior users of struct tc_stats and xstats can maintain backward -compatibility by calling the compat wrappers to keep providing the -existing TLV types. - -my_dumping_routine(struct sk_buff *skb, ...) -{ - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; - ... -} - -A struct tc_stats will be filled out during gnet_stats_copy_* calls -and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app -was called. - - -Locking: --------- - -Locks are taken before writing and released once all statistics have -been written. Locks are always released in case of an error. You -are responsible for making sure that the lock is initialized. - - -Rate Estimator: --------------- - -0) Prepare an estimator attribute. Most likely this would be in user - space. The value of this TLV should contain a tc_estimator structure. - As usual, such a TLV needs to be 32 bit aligned and therefore the - length needs to be appropriately set, etc. The estimator interval - and ewma log need to be converted to the appropriate values. - tc_estimator.c::tc_setup_estimator() is advisable to be used as the - conversion routine. It does a few clever things. It takes a time - interval in microsecs, a time constant also in microsecs and a struct - tc_estimator to be populated. The returned tc_estimator can be - transported to the kernel. Transfer such a structure in a TLV of type - TCA_RATE to your code in the kernel. - -In the kernel when setting up: -1) make sure you have basic stats and rate stats setup first. -2) make sure you have initialized stats lock that is used to setup such - stats. -3) Now initialize a new estimator: - - int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, - mystats_lock, attr_with_tcestimator_struct); - - if ret == 0 - success - else - failed - -From now on, every time you dump my_rate_est_stats it will contain -up-to-date info. - -Once you are done, call gen_kill_estimator(my_basicstats, -my_rate_est_stats) Make sure that my_basicstats and my_rate_est_stats -are still valid (i.e still exist) at the time of making this call. - - -Authors: --------- -Thomas Graf -Jamal Hadi Salim diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 42e556509e22..33afbb67f3fa 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -61,6 +61,7 @@ Contents: framerelay generic-hdlc generic_netlink + gen_stats .. only:: subproject and html diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 1d653fbfcf52..e491b083b348 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -6,7 +6,7 @@ * Jamal Hadi Salim * Alexey Kuznetsov, * - * See Documentation/networking/gen_stats.txt + * See Documentation/networking/gen_stats.rst */ #include -- cgit v1.2.3-59-g8ed1b From 81baecb6f6dc507f1b565e711b5193cdbb3fa939 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:42 +0200 Subject: docs: networking: convert gtp.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - add notes markups; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/gtp.rst | 251 +++++++++++++++++++++++++++++++++++++ Documentation/networking/gtp.txt | 230 --------------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 252 insertions(+), 230 deletions(-) create mode 100644 Documentation/networking/gtp.rst delete mode 100644 Documentation/networking/gtp.txt diff --git a/Documentation/networking/gtp.rst b/Documentation/networking/gtp.rst new file mode 100644 index 000000000000..1563fb94b289 --- /dev/null +++ b/Documentation/networking/gtp.rst @@ -0,0 +1,251 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +The Linux kernel GTP tunneling module +===================================== + +Documentation by + Harald Welte and + Andreas Schultz + +In 'drivers/net/gtp.c' you are finding a kernel-level implementation +of a GTP tunnel endpoint. + +What is GTP +=========== + +GTP is the Generic Tunnel Protocol, which is a 3GPP protocol used for +tunneling User-IP payload between a mobile station (phone, modem) +and the interconnection between an external packet data network (such +as the internet). + +So when you start a 'data connection' from your mobile phone, the +phone will use the control plane to signal for the establishment of +such a tunnel between that external data network and the phone. The +tunnel endpoints thus reside on the phone and in the gateway. All +intermediate nodes just transport the encapsulated packet. + +The phone itself does not implement GTP but uses some other +technology-dependent protocol stack for transmitting the user IP +payload, such as LLC/SNDCP/RLC/MAC. + +At some network element inside the cellular operator infrastructure +(SGSN in case of GPRS/EGPRS or classic UMTS, hNodeB in case of a 3G +femtocell, eNodeB in case of 4G/LTE), the cellular protocol stacking +is translated into GTP *without breaking the end-to-end tunnel*. So +intermediate nodes just perform some specific relay function. + +At some point the GTP packet ends up on the so-called GGSN (GSM/UMTS) +or P-GW (LTE), which terminates the tunnel, decapsulates the packet +and forwards it onto an external packet data network. This can be +public internet, but can also be any private IP network (or even +theoretically some non-IP network like X.25). + +You can find the protocol specification in 3GPP TS 29.060, available +publicly via the 3GPP website at http://www.3gpp.org/DynaReport/29060.htm + +A direct PDF link to v13.6.0 is provided for convenience below: +http://www.etsi.org/deliver/etsi_ts/129000_129099/129060/13.06.00_60/ts_129060v130600p.pdf + +The Linux GTP tunnelling module +=============================== + +The module implements the function of a tunnel endpoint, i.e. it is +able to decapsulate tunneled IP packets in the uplink originated by +the phone, and encapsulate raw IP packets received from the external +packet network in downlink towards the phone. + +It *only* implements the so-called 'user plane', carrying the User-IP +payload, called GTP-U. It does not implement the 'control plane', +which is a signaling protocol used for establishment and teardown of +GTP tunnels (GTP-C). + +So in order to have a working GGSN/P-GW setup, you will need a +userspace program that implements the GTP-C protocol and which then +uses the netlink interface provided by the GTP-U module in the kernel +to configure the kernel module. + +This split architecture follows the tunneling modules of other +protocols, e.g. PPPoE or L2TP, where you also run a userspace daemon +to handle the tunnel establishment, authentication etc. and only the +data plane is accelerated inside the kernel. + +Don't be confused by terminology: The GTP User Plane goes through +kernel accelerated path, while the GTP Control Plane goes to +Userspace :) + +The official homepage of the module is at +https://osmocom.org/projects/linux-kernel-gtp-u/wiki + +Userspace Programs with Linux Kernel GTP-U support +================================================== + +At the time of this writing, there are at least two Free Software +implementations that implement GTP-C and can use the netlink interface +to make use of the Linux kernel GTP-U support: + +* OpenGGSN (classic 2G/3G GGSN in C): + https://osmocom.org/projects/openggsn/wiki/OpenGGSN + +* ergw (GGSN + P-GW in Erlang): + https://github.com/travelping/ergw + +Userspace Library / Command Line Utilities +========================================== + +There is a userspace library called 'libgtpnl' which is based on +libmnl and which implements a C-language API towards the netlink +interface provided by the Kernel GTP module: + +http://git.osmocom.org/libgtpnl/ + +Protocol Versions +================= + +There are two different versions of GTP-U: v0 [GSM TS 09.60] and v1 +[3GPP TS 29.281]. Both are implemented in the Kernel GTP module. +Version 0 is a legacy version, and deprecated from recent 3GPP +specifications. + +GTP-U uses UDP for transporting PDUs. The receiving UDP port is 2151 +for GTPv1-U and 3386 for GTPv0-U. + +There are three versions of GTP-C: v0, v1, and v2. As the kernel +doesn't implement GTP-C, we don't have to worry about this. It's the +responsibility of the control plane implementation in userspace to +implement that. + +IPv6 +==== + +The 3GPP specifications indicate either IPv4 or IPv6 can be used both +on the inner (user) IP layer, or on the outer (transport) layer. + +Unfortunately, the Kernel module currently supports IPv6 neither for +the User IP payload, nor for the outer IP layer. Patches or other +Contributions to fix this are most welcome! + +Mailing List +============ + +If you have questions regarding how to use the Kernel GTP module from +your own software, or want to contribute to the code, please use the +osmocom-net-grps mailing list for related discussion. The list can be +reached at osmocom-net-gprs@lists.osmocom.org and the mailman +interface for managing your subscription is at +https://lists.osmocom.org/mailman/listinfo/osmocom-net-gprs + +Issue Tracker +============= + +The Osmocom project maintains an issue tracker for the Kernel GTP-U +module at +https://osmocom.org/projects/linux-kernel-gtp-u/issues + +History / Acknowledgements +========================== + +The Module was originally created in 2012 by Harald Welte, but never +completed. Pablo came in to finish the mess Harald left behind. But +doe to a lack of user interest, it never got merged. + +In 2015, Andreas Schultz came to the rescue and fixed lots more bugs, +extended it with new features and finally pushed all of us to get it +mainline, where it was merged in 4.7.0. + +Architectural Details +===================== + +Local GTP-U entity and tunnel identification +-------------------------------------------- + +GTP-U uses UDP for transporting PDU's. The receiving UDP port is 2152 +for GTPv1-U and 3386 for GTPv0-U. + +There is only one GTP-U entity (and therefor SGSN/GGSN/S-GW/PDN-GW +instance) per IP address. Tunnel Endpoint Identifier (TEID) are unique +per GTP-U entity. + +A specific tunnel is only defined by the destination entity. Since the +destination port is constant, only the destination IP and TEID define +a tunnel. The source IP and Port have no meaning for the tunnel. + +Therefore: + + * when sending, the remote entity is defined by the remote IP and + the tunnel endpoint id. The source IP and port have no meaning and + can be changed at any time. + + * when receiving the local entity is defined by the local + destination IP and the tunnel endpoint id. The source IP and port + have no meaning and can change at any time. + +[3GPP TS 29.281] Section 4.3.0 defines this so:: + + The TEID in the GTP-U header is used to de-multiplex traffic + incoming from remote tunnel endpoints so that it is delivered to the + User plane entities in a way that allows multiplexing of different + users, different packet protocols and different QoS levels. + Therefore no two remote GTP-U endpoints shall send traffic to a + GTP-U protocol entity using the same TEID value except + for data forwarding as part of mobility procedures. + +The definition above only defines that two remote GTP-U endpoints +*should not* send to the same TEID, it *does not* forbid or exclude +such a scenario. In fact, the mentioned mobility procedures make it +necessary that the GTP-U entity accepts traffic for TEIDs from +multiple or unknown peers. + +Therefore, the receiving side identifies tunnels exclusively based on +TEIDs, not based on the source IP! + +APN vs. Network Device +====================== + +The GTP-U driver creates a Linux network device for each Gi/SGi +interface. + +[3GPP TS 29.281] calls the Gi/SGi reference point an interface. This +may lead to the impression that the GGSN/P-GW can have only one such +interface. + +Correct is that the Gi/SGi reference point defines the interworking +between +the 3GPP packet domain (PDN) based on GTP-U tunnel and IP +based networks. + +There is no provision in any of the 3GPP documents that limits the +number of Gi/SGi interfaces implemented by a GGSN/P-GW. + +[3GPP TS 29.061] Section 11.3 makes it clear that the selection of a +specific Gi/SGi interfaces is made through the Access Point Name +(APN):: + + 2. each private network manages its own addressing. In general this + will result in different private networks having overlapping + address ranges. A logically separate connection (e.g. an IP in IP + tunnel or layer 2 virtual circuit) is used between the GGSN/P-GW + and each private network. + + In this case the IP address alone is not necessarily unique. The + pair of values, Access Point Name (APN) and IPv4 address and/or + IPv6 prefixes, is unique. + +In order to support the overlapping address range use case, each APN +is mapped to a separate Gi/SGi interface (network device). + +.. note:: + + The Access Point Name is purely a control plane (GTP-C) concept. + At the GTP-U level, only Tunnel Endpoint Identifiers are present in + GTP-U packets and network devices are known + +Therefore for a given UE the mapping in IP to PDN network is: + + * network device + MS IP -> Peer IP + Peer TEID, + +and from PDN to IP network: + + * local GTP-U IP + TEID -> network device + +Furthermore, before a received T-PDU is injected into the network +device the MS IP is checked against the IP recorded in PDP context. diff --git a/Documentation/networking/gtp.txt b/Documentation/networking/gtp.txt deleted file mode 100644 index 6966bbec1ecb..000000000000 --- a/Documentation/networking/gtp.txt +++ /dev/null @@ -1,230 +0,0 @@ -The Linux kernel GTP tunneling module -====================================================================== -Documentation by Harald Welte and - Andreas Schultz - -In 'drivers/net/gtp.c' you are finding a kernel-level implementation -of a GTP tunnel endpoint. - -== What is GTP == - -GTP is the Generic Tunnel Protocol, which is a 3GPP protocol used for -tunneling User-IP payload between a mobile station (phone, modem) -and the interconnection between an external packet data network (such -as the internet). - -So when you start a 'data connection' from your mobile phone, the -phone will use the control plane to signal for the establishment of -such a tunnel between that external data network and the phone. The -tunnel endpoints thus reside on the phone and in the gateway. All -intermediate nodes just transport the encapsulated packet. - -The phone itself does not implement GTP but uses some other -technology-dependent protocol stack for transmitting the user IP -payload, such as LLC/SNDCP/RLC/MAC. - -At some network element inside the cellular operator infrastructure -(SGSN in case of GPRS/EGPRS or classic UMTS, hNodeB in case of a 3G -femtocell, eNodeB in case of 4G/LTE), the cellular protocol stacking -is translated into GTP *without breaking the end-to-end tunnel*. So -intermediate nodes just perform some specific relay function. - -At some point the GTP packet ends up on the so-called GGSN (GSM/UMTS) -or P-GW (LTE), which terminates the tunnel, decapsulates the packet -and forwards it onto an external packet data network. This can be -public internet, but can also be any private IP network (or even -theoretically some non-IP network like X.25). - -You can find the protocol specification in 3GPP TS 29.060, available -publicly via the 3GPP website at http://www.3gpp.org/DynaReport/29060.htm - -A direct PDF link to v13.6.0 is provided for convenience below: -http://www.etsi.org/deliver/etsi_ts/129000_129099/129060/13.06.00_60/ts_129060v130600p.pdf - -== The Linux GTP tunnelling module == - -The module implements the function of a tunnel endpoint, i.e. it is -able to decapsulate tunneled IP packets in the uplink originated by -the phone, and encapsulate raw IP packets received from the external -packet network in downlink towards the phone. - -It *only* implements the so-called 'user plane', carrying the User-IP -payload, called GTP-U. It does not implement the 'control plane', -which is a signaling protocol used for establishment and teardown of -GTP tunnels (GTP-C). - -So in order to have a working GGSN/P-GW setup, you will need a -userspace program that implements the GTP-C protocol and which then -uses the netlink interface provided by the GTP-U module in the kernel -to configure the kernel module. - -This split architecture follows the tunneling modules of other -protocols, e.g. PPPoE or L2TP, where you also run a userspace daemon -to handle the tunnel establishment, authentication etc. and only the -data plane is accelerated inside the kernel. - -Don't be confused by terminology: The GTP User Plane goes through -kernel accelerated path, while the GTP Control Plane goes to -Userspace :) - -The official homepage of the module is at -https://osmocom.org/projects/linux-kernel-gtp-u/wiki - -== Userspace Programs with Linux Kernel GTP-U support == - -At the time of this writing, there are at least two Free Software -implementations that implement GTP-C and can use the netlink interface -to make use of the Linux kernel GTP-U support: - -* OpenGGSN (classic 2G/3G GGSN in C): - https://osmocom.org/projects/openggsn/wiki/OpenGGSN - -* ergw (GGSN + P-GW in Erlang): - https://github.com/travelping/ergw - -== Userspace Library / Command Line Utilities == - -There is a userspace library called 'libgtpnl' which is based on -libmnl and which implements a C-language API towards the netlink -interface provided by the Kernel GTP module: - -http://git.osmocom.org/libgtpnl/ - -== Protocol Versions == - -There are two different versions of GTP-U: v0 [GSM TS 09.60] and v1 -[3GPP TS 29.281]. Both are implemented in the Kernel GTP module. -Version 0 is a legacy version, and deprecated from recent 3GPP -specifications. - -GTP-U uses UDP for transporting PDUs. The receiving UDP port is 2151 -for GTPv1-U and 3386 for GTPv0-U. - -There are three versions of GTP-C: v0, v1, and v2. As the kernel -doesn't implement GTP-C, we don't have to worry about this. It's the -responsibility of the control plane implementation in userspace to -implement that. - -== IPv6 == - -The 3GPP specifications indicate either IPv4 or IPv6 can be used both -on the inner (user) IP layer, or on the outer (transport) layer. - -Unfortunately, the Kernel module currently supports IPv6 neither for -the User IP payload, nor for the outer IP layer. Patches or other -Contributions to fix this are most welcome! - -== Mailing List == - -If yo have questions regarding how to use the Kernel GTP module from -your own software, or want to contribute to the code, please use the -osmocom-net-grps mailing list for related discussion. The list can be -reached at osmocom-net-gprs@lists.osmocom.org and the mailman -interface for managing your subscription is at -https://lists.osmocom.org/mailman/listinfo/osmocom-net-gprs - -== Issue Tracker == - -The Osmocom project maintains an issue tracker for the Kernel GTP-U -module at -https://osmocom.org/projects/linux-kernel-gtp-u/issues - -== History / Acknowledgements == - -The Module was originally created in 2012 by Harald Welte, but never -completed. Pablo came in to finish the mess Harald left behind. But -doe to a lack of user interest, it never got merged. - -In 2015, Andreas Schultz came to the rescue and fixed lots more bugs, -extended it with new features and finally pushed all of us to get it -mainline, where it was merged in 4.7.0. - -== Architectural Details == - -=== Local GTP-U entity and tunnel identification === - -GTP-U uses UDP for transporting PDU's. The receiving UDP port is 2152 -for GTPv1-U and 3386 for GTPv0-U. - -There is only one GTP-U entity (and therefor SGSN/GGSN/S-GW/PDN-GW -instance) per IP address. Tunnel Endpoint Identifier (TEID) are unique -per GTP-U entity. - -A specific tunnel is only defined by the destination entity. Since the -destination port is constant, only the destination IP and TEID define -a tunnel. The source IP and Port have no meaning for the tunnel. - -Therefore: - - * when sending, the remote entity is defined by the remote IP and - the tunnel endpoint id. The source IP and port have no meaning and - can be changed at any time. - - * when receiving the local entity is defined by the local - destination IP and the tunnel endpoint id. The source IP and port - have no meaning and can change at any time. - -[3GPP TS 29.281] Section 4.3.0 defines this so: - -> The TEID in the GTP-U header is used to de-multiplex traffic -> incoming from remote tunnel endpoints so that it is delivered to the -> User plane entities in a way that allows multiplexing of different -> users, different packet protocols and different QoS levels. -> Therefore no two remote GTP-U endpoints shall send traffic to a -> GTP-U protocol entity using the same TEID value except -> for data forwarding as part of mobility procedures. - -The definition above only defines that two remote GTP-U endpoints -*should not* send to the same TEID, it *does not* forbid or exclude -such a scenario. In fact, the mentioned mobility procedures make it -necessary that the GTP-U entity accepts traffic for TEIDs from -multiple or unknown peers. - -Therefore, the receiving side identifies tunnels exclusively based on -TEIDs, not based on the source IP! - -== APN vs. Network Device == - -The GTP-U driver creates a Linux network device for each Gi/SGi -interface. - -[3GPP TS 29.281] calls the Gi/SGi reference point an interface. This -may lead to the impression that the GGSN/P-GW can have only one such -interface. - -Correct is that the Gi/SGi reference point defines the interworking -between +the 3GPP packet domain (PDN) based on GTP-U tunnel and IP -based networks. - -There is no provision in any of the 3GPP documents that limits the -number of Gi/SGi interfaces implemented by a GGSN/P-GW. - -[3GPP TS 29.061] Section 11.3 makes it clear that the selection of a -specific Gi/SGi interfaces is made through the Access Point Name -(APN): - -> 2. each private network manages its own addressing. In general this -> will result in different private networks having overlapping -> address ranges. A logically separate connection (e.g. an IP in IP -> tunnel or layer 2 virtual circuit) is used between the GGSN/P-GW -> and each private network. -> -> In this case the IP address alone is not necessarily unique. The -> pair of values, Access Point Name (APN) and IPv4 address and/or -> IPv6 prefixes, is unique. - -In order to support the overlapping address range use case, each APN -is mapped to a separate Gi/SGi interface (network device). - -NOTE: The Access Point Name is purely a control plane (GTP-C) concept. -At the GTP-U level, only Tunnel Endpoint Identifiers are present in -GTP-U packets and network devices are known - -Therefore for a given UE the mapping in IP to PDN network is: - * network device + MS IP -> Peer IP + Peer TEID, - -and from PDN to IP network: - * local GTP-U IP + TEID -> network device - -Furthermore, before a received T-PDU is injected into the network -device the MS IP is checked against the IP recorded in PDP context. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 33afbb67f3fa..b29a08d1f941 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -62,6 +62,7 @@ Contents: generic-hdlc generic_netlink gen_stats + gtp .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 3c3a2fde4d88bb3d6c0592b4b7754f26dab9f697 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:43 +0200 Subject: docs: networking: convert hinic.txt to ReST Not much to be done here: - add SPDX header; - adjust titles and chapters, adding proper markups; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/hinic.rst | 128 +++++++++++++++++++++++++++++++++++++ Documentation/networking/hinic.txt | 125 ------------------------------------ Documentation/networking/index.rst | 1 + MAINTAINERS | 2 +- 4 files changed, 130 insertions(+), 126 deletions(-) create mode 100644 Documentation/networking/hinic.rst delete mode 100644 Documentation/networking/hinic.txt diff --git a/Documentation/networking/hinic.rst b/Documentation/networking/hinic.rst new file mode 100644 index 000000000000..867ac8f4e04a --- /dev/null +++ b/Documentation/networking/hinic.rst @@ -0,0 +1,128 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================================ +Linux Kernel Driver for Huawei Intelligent NIC(HiNIC) family +============================================================ + +Overview: +========= +HiNIC is a network interface card for the Data Center Area. + +The driver supports a range of link-speed devices (10GbE, 25GbE, 40GbE, etc.). +The driver supports also a negotiated and extendable feature set. + +Some HiNIC devices support SR-IOV. This driver is used for Physical Function +(PF). + +HiNIC devices support MSI-X interrupt vector for each Tx/Rx queue and +adaptive interrupt moderation. + +HiNIC devices support also various offload features such as checksum offload, +TCP Transmit Segmentation Offload(TSO), Receive-Side Scaling(RSS) and +LRO(Large Receive Offload). + + +Supported PCI vendor ID/device IDs: +=================================== + +19e5:1822 - HiNIC PF + + +Driver Architecture and Source Code: +==================================== + +hinic_dev - Implement a Logical Network device that is independent from +specific HW details about HW data structure formats. + +hinic_hwdev - Implement the HW details of the device and include the components +for accessing the PCI NIC. + +hinic_hwdev contains the following components: +=============================================== + +HW Interface: +============= + +The interface for accessing the pci device (DMA memory and PCI BARs). +(hinic_hw_if.c, hinic_hw_if.h) + +Configuration Status Registers Area that describes the HW Registers on the +configuration and status BAR0. (hinic_hw_csr.h) + +MGMT components: +================ + +Asynchronous Event Queues(AEQs) - The event queues for receiving messages from +the MGMT modules on the cards. (hinic_hw_eqs.c, hinic_hw_eqs.h) + +Application Programmable Interface commands(API CMD) - Interface for sending +MGMT commands to the card. (hinic_hw_api_cmd.c, hinic_hw_api_cmd.h) + +Management (MGMT) - the PF to MGMT channel that uses API CMD for sending MGMT +commands to the card and receives notifications from the MGMT modules on the +card by AEQs. Also set the addresses of the IO CMDQs in HW. +(hinic_hw_mgmt.c, hinic_hw_mgmt.h) + +IO components: +============== + +Completion Event Queues(CEQs) - The completion Event Queues that describe IO +tasks that are finished. (hinic_hw_eqs.c, hinic_hw_eqs.h) + +Work Queues(WQ) - Contain the memory and operations for use by CMD queues and +the Queue Pairs. The WQ is a Memory Block in a Page. The Block contains +pointers to Memory Areas that are the Memory for the Work Queue Elements(WQEs). +(hinic_hw_wq.c, hinic_hw_wq.h) + +Command Queues(CMDQ) - The queues for sending commands for IO management and is +used to set the QPs addresses in HW. The commands completion events are +accumulated on the CEQ that is configured to receive the CMDQ completion events. +(hinic_hw_cmdq.c, hinic_hw_cmdq.h) + +Queue Pairs(QPs) - The HW Receive and Send queues for Receiving and Transmitting +Data. (hinic_hw_qp.c, hinic_hw_qp.h, hinic_hw_qp_ctxt.h) + +IO - de/constructs all the IO components. (hinic_hw_io.c, hinic_hw_io.h) + +HW device: +========== + +HW device - de/constructs the HW Interface, the MGMT components on the +initialization of the driver and the IO components on the case of Interface +UP/DOWN Events. (hinic_hw_dev.c, hinic_hw_dev.h) + + +hinic_dev contains the following components: +=============================================== + +PCI ID table - Contains the supported PCI Vendor/Device IDs. +(hinic_pci_tbl.h) + +Port Commands - Send commands to the HW device for port management +(MAC, Vlan, MTU, ...). (hinic_port.c, hinic_port.h) + +Tx Queues - Logical Tx Queues that use the HW Send Queues for transmit. +The Logical Tx queue is not dependent on the format of the HW Send Queue. +(hinic_tx.c, hinic_tx.h) + +Rx Queues - Logical Rx Queues that use the HW Receive Queues for receive. +The Logical Rx queue is not dependent on the format of the HW Receive Queue. +(hinic_rx.c, hinic_rx.h) + +hinic_dev - de/constructs the Logical Tx and Rx Queues. +(hinic_main.c, hinic_dev.h) + + +Miscellaneous +============= + +Common functions that are used by HW and Logical Device. +(hinic_common.c, hinic_common.h) + + +Support +======= + +If an issue is identified with the released source code on the supported kernel +with a supported adapter, email the specific information related to the issue to +aviad.krawczyk@huawei.com. diff --git a/Documentation/networking/hinic.txt b/Documentation/networking/hinic.txt deleted file mode 100644 index 989366a4039c..000000000000 --- a/Documentation/networking/hinic.txt +++ /dev/null @@ -1,125 +0,0 @@ -Linux Kernel Driver for Huawei Intelligent NIC(HiNIC) family -============================================================ - -Overview: -========= -HiNIC is a network interface card for the Data Center Area. - -The driver supports a range of link-speed devices (10GbE, 25GbE, 40GbE, etc.). -The driver supports also a negotiated and extendable feature set. - -Some HiNIC devices support SR-IOV. This driver is used for Physical Function -(PF). - -HiNIC devices support MSI-X interrupt vector for each Tx/Rx queue and -adaptive interrupt moderation. - -HiNIC devices support also various offload features such as checksum offload, -TCP Transmit Segmentation Offload(TSO), Receive-Side Scaling(RSS) and -LRO(Large Receive Offload). - - -Supported PCI vendor ID/device IDs: -=================================== - -19e5:1822 - HiNIC PF - - -Driver Architecture and Source Code: -==================================== - -hinic_dev - Implement a Logical Network device that is independent from -specific HW details about HW data structure formats. - -hinic_hwdev - Implement the HW details of the device and include the components -for accessing the PCI NIC. - -hinic_hwdev contains the following components: -=============================================== - -HW Interface: -============= - -The interface for accessing the pci device (DMA memory and PCI BARs). -(hinic_hw_if.c, hinic_hw_if.h) - -Configuration Status Registers Area that describes the HW Registers on the -configuration and status BAR0. (hinic_hw_csr.h) - -MGMT components: -================ - -Asynchronous Event Queues(AEQs) - The event queues for receiving messages from -the MGMT modules on the cards. (hinic_hw_eqs.c, hinic_hw_eqs.h) - -Application Programmable Interface commands(API CMD) - Interface for sending -MGMT commands to the card. (hinic_hw_api_cmd.c, hinic_hw_api_cmd.h) - -Management (MGMT) - the PF to MGMT channel that uses API CMD for sending MGMT -commands to the card and receives notifications from the MGMT modules on the -card by AEQs. Also set the addresses of the IO CMDQs in HW. -(hinic_hw_mgmt.c, hinic_hw_mgmt.h) - -IO components: -============== - -Completion Event Queues(CEQs) - The completion Event Queues that describe IO -tasks that are finished. (hinic_hw_eqs.c, hinic_hw_eqs.h) - -Work Queues(WQ) - Contain the memory and operations for use by CMD queues and -the Queue Pairs. The WQ is a Memory Block in a Page. The Block contains -pointers to Memory Areas that are the Memory for the Work Queue Elements(WQEs). -(hinic_hw_wq.c, hinic_hw_wq.h) - -Command Queues(CMDQ) - The queues for sending commands for IO management and is -used to set the QPs addresses in HW. The commands completion events are -accumulated on the CEQ that is configured to receive the CMDQ completion events. -(hinic_hw_cmdq.c, hinic_hw_cmdq.h) - -Queue Pairs(QPs) - The HW Receive and Send queues for Receiving and Transmitting -Data. (hinic_hw_qp.c, hinic_hw_qp.h, hinic_hw_qp_ctxt.h) - -IO - de/constructs all the IO components. (hinic_hw_io.c, hinic_hw_io.h) - -HW device: -========== - -HW device - de/constructs the HW Interface, the MGMT components on the -initialization of the driver and the IO components on the case of Interface -UP/DOWN Events. (hinic_hw_dev.c, hinic_hw_dev.h) - - -hinic_dev contains the following components: -=============================================== - -PCI ID table - Contains the supported PCI Vendor/Device IDs. -(hinic_pci_tbl.h) - -Port Commands - Send commands to the HW device for port management -(MAC, Vlan, MTU, ...). (hinic_port.c, hinic_port.h) - -Tx Queues - Logical Tx Queues that use the HW Send Queues for transmit. -The Logical Tx queue is not dependent on the format of the HW Send Queue. -(hinic_tx.c, hinic_tx.h) - -Rx Queues - Logical Rx Queues that use the HW Receive Queues for receive. -The Logical Rx queue is not dependent on the format of the HW Receive Queue. -(hinic_rx.c, hinic_rx.h) - -hinic_dev - de/constructs the Logical Tx and Rx Queues. -(hinic_main.c, hinic_dev.h) - - -Miscellaneous: -============= - -Common functions that are used by HW and Logical Device. -(hinic_common.c, hinic_common.h) - - -Support -======= - -If an issue is identified with the released source code on the supported kernel -with a supported adapter, email the specific information related to the issue to -aviad.krawczyk@huawei.com. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b29a08d1f941..5a7889df1375 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -63,6 +63,7 @@ Contents: generic_netlink gen_stats gtp + hinic .. only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index 4ec6d2741d36..df5e4ccc1ccb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7815,7 +7815,7 @@ HUAWEI ETHERNET DRIVER M: Aviad Krawczyk L: netdev@vger.kernel.org S: Supported -F: Documentation/networking/hinic.txt +F: Documentation/networking/hinic.rst F: drivers/net/ethernet/huawei/hinic/ HUGETLB FILESYSTEM -- cgit v1.2.3-59-g8ed1b From 1d2698fa05f57ba2900e1ff50ac33ec85d2087d3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:44 +0200 Subject: docs: networking: convert ila.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/ila.rst | 296 +++++++++++++++++++++++++++++++++++++ Documentation/networking/ila.txt | 285 ----------------------------------- Documentation/networking/index.rst | 1 + 3 files changed, 297 insertions(+), 285 deletions(-) create mode 100644 Documentation/networking/ila.rst delete mode 100644 Documentation/networking/ila.txt diff --git a/Documentation/networking/ila.rst b/Documentation/networking/ila.rst new file mode 100644 index 000000000000..5ac0a6270b17 --- /dev/null +++ b/Documentation/networking/ila.rst @@ -0,0 +1,296 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== +Identifier Locator Addressing (ILA) +=================================== + + +Introduction +============ + +Identifier-locator addressing (ILA) is a technique used with IPv6 that +differentiates between location and identity of a network node. Part of an +address expresses the immutable identity of the node, and another part +indicates the location of the node which can be dynamic. Identifier-locator +addressing can be used to efficiently implement overlay networks for +network virtualization as well as solutions for use cases in mobility. + +ILA can be thought of as means to implement an overlay network without +encapsulation. This is accomplished by performing network address +translation on destination addresses as a packet traverses a network. To +the network, an ILA translated packet appears to be no different than any +other IPv6 packet. For instance, if the transport protocol is TCP then an +ILA translated packet looks like just another TCP/IPv6 packet. The +advantage of this is that ILA is transparent to the network so that +optimizations in the network, such as ECMP, RSS, GRO, GSO, etc., just work. + +The ILA protocol is described in Internet-Draft draft-herbert-intarea-ila. + + +ILA terminology +=============== + + - Identifier + A number that identifies an addressable node in the network + independent of its location. ILA identifiers are sixty-four + bit values. + + - Locator + A network prefix that routes to a physical host. Locators + provide the topological location of an addressed node. ILA + locators are sixty-four bit prefixes. + + - ILA mapping + A mapping of an ILA identifier to a locator (or to a + locator and meta data). An ILA domain maintains a database + that contains mappings for all destinations in the domain. + + - SIR address + An IPv6 address composed of a SIR prefix (upper sixty- + four bits) and an identifier (lower sixty-four bits). + SIR addresses are visible to applications and provide a + means for them to address nodes independent of their + location. + + - ILA address + An IPv6 address composed of a locator (upper sixty-four + bits) and an identifier (low order sixty-four bits). ILA + addresses are never visible to an application. + + - ILA host + An end host that is capable of performing ILA translations + on transmit or receive. + + - ILA router + A network node that performs ILA translation and forwarding + of translated packets. + + - ILA forwarding cache + A type of ILA router that only maintains a working set + cache of mappings. + + - ILA node + A network node capable of performing ILA translations. This + can be an ILA router, ILA forwarding cache, or ILA host. + + +Operation +========= + +There are two fundamental operations with ILA: + + - Translate a SIR address to an ILA address. This is performed on ingress + to an ILA overlay. + + - Translate an ILA address to a SIR address. This is performed on egress + from the ILA overlay. + +ILA can be deployed either on end hosts or intermediate devices in the +network; these are provided by "ILA hosts" and "ILA routers" respectively. +Configuration and datapath for these two points of deployment is somewhat +different. + +The diagram below illustrates the flow of packets through ILA as well +as showing ILA hosts and routers:: + + +--------+ +--------+ + | Host A +-+ +--->| Host B | + | | | (2) ILA (') | | + +--------+ | ...addressed.... ( ) +--------+ + V +---+--+ . packet . +---+--+ (_) + (1) SIR | | ILA |----->-------->---->| ILA | | (3) SIR + addressed +->|router| . . |router|->-+ addressed + packet +---+--+ . IPv6 . +---+--+ packet + / . Network . + / . . +--+-++--------+ + +--------+ / . . |ILA || Host | + | Host +--+ . .- -|host|| | + | | . . +--+-++--------+ + +--------+ ................ + + +Transport checksum handling +=========================== + +When an address is translated by ILA, an encapsulated transport checksum +that includes the translated address in a pseudo header may be rendered +incorrect on the wire. This is a problem for intermediate devices, +including checksum offload in NICs, that process the checksum. There are +three options to deal with this: + +- no action Allow the checksum to be incorrect on the wire. Before + a receiver verifies a checksum the ILA to SIR address + translation must be done. + +- adjust transport checksum + When ILA translation is performed the packet is parsed + and if a transport layer checksum is found then it is + adjusted to reflect the correct checksum per the + translated address. + +- checksum neutral mapping + When an address is translated the difference can be offset + elsewhere in a part of the packet that is covered by + the checksum. The low order sixteen bits of the identifier + are used. This method is preferred since it doesn't require + parsing a packet beyond the IP header and in most cases the + adjustment can be precomputed and saved with the mapping. + +Note that the checksum neutral adjustment affects the low order sixteen +bits of the identifier. When ILA to SIR address translation is done on +egress the low order bits are restored to the original value which +restores the identifier as it was originally sent. + + +Identifier types +================ + +ILA defines different types of identifiers for different use cases. + +The defined types are: + + 0: interface identifier + + 1: locally unique identifier + + 2: virtual networking identifier for IPv4 address + + 3: virtual networking identifier for IPv6 unicast address + + 4: virtual networking identifier for IPv6 multicast address + + 5: non-local address identifier + +In the current implementation of kernel ILA only locally unique identifiers +(LUID) are supported. LUID allows for a generic, unformatted 64 bit +identifier. + + +Identifier formats +================== + +Kernel ILA supports two optional fields in an identifier for formatting: +"C-bit" and "identifier type". The presence of these fields is determined +by configuration as demonstrated below. + +If the identifier type is present it occupies the three highest order +bits of an identifier. The possible values are given in the above list. + +If the C-bit is present, this is used as an indication that checksum +neutral mapping has been done. The C-bit can only be set in an +ILA address, never a SIR address. + +In the simplest format the identifier types, C-bit, and checksum +adjustment value are not present so an identifier is considered an +unstructured sixty-four bit value:: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Identifier | + + + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +The checksum neutral adjustment may be configured to always be +present using neutral-map-auto. In this case there is no C-bit, but the +checksum adjustment is in the low order 16 bits. The identifier is +still sixty-four bits:: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Identifier | + | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Checksum-neutral adjustment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +The C-bit may used to explicitly indicate that checksum neutral +mapping has been applied to an ILA address. The format is:: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | |C| Identifier | + | +-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Checksum-neutral adjustment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +The identifier type field may be present to indicate the identifier +type. If it is not present then the type is inferred based on mapping +configuration. The checksum neutral adjustment may automatically +used with the identifier type as illustrated below:: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type| Identifier | + +-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Checksum-neutral adjustment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +If the identifier type and the C-bit can be present simultaneously so +the identifier format would be:: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type|C| Identifier | + +-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Checksum-neutral adjustment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + +Configuration +============= + +There are two methods to configure ILA mappings. One is by using LWT routes +and the other is ila_xlat (called from NFHOOK PREROUTING hook). ila_xlat +is intended to be used in the receive path for ILA hosts . + +An ILA router has also been implemented in XDP. Description of that is +outside the scope of this document. + +The usage of for ILA LWT routes is: + +ip route add DEST/128 encap ila LOC csum-mode MODE ident-type TYPE via ADDR + +Destination (DEST) can either be a SIR address (for an ILA host or ingress +ILA router) or an ILA address (egress ILA router). LOC is the sixty-four +bit locator (with format W:X:Y:Z) that overwrites the upper sixty-four +bits of the destination address. Checksum MODE is one of "no-action", +"adj-transport", "neutral-map", and "neutral-map-auto". If neutral-map is +set then the C-bit will be present. Identifier TYPE one of "luid" or +"use-format." In the case of use-format, the identifier type field is +present and the effective type is taken from that. + +The usage of ila_xlat is: + +ip ila add loc_match MATCH loc LOC csum-mode MODE ident-type TYPE + +MATCH indicates the incoming locator that must be matched to apply +a the translaiton. LOC is the locator that overwrites the upper +sixty-four bits of the destination address. MODE and TYPE have the +same meanings as described above. + + +Some examples +============= + +:: + + # Configure an ILA route that uses checksum neutral mapping as well + # as type field. Note that the type field is set in the SIR address + # (the 2000 implies type is 1 which is LUID). + ip route add 3333:0:0:1:2000:0:1:87/128 encap ila 2001:0:87:0 \ + csum-mode neutral-map ident-type use-format + + # Configure an ILA LWT route that uses auto checksum neutral mapping + # (no C-bit) and configure identifier type to be LUID so that the + # identifier type field will not be present. + ip route add 3333:0:0:1:2000:0:2:87/128 encap ila 2001:0:87:1 \ + csum-mode neutral-map-auto ident-type luid + + ila_xlat configuration + + # Configure an ILA to SIR mapping that matches a locator and overwrites + # it with a SIR address (3333:0:0:1 in this example). The C-bit and + # identifier field are used. + ip ila add loc_match 2001:0:119:0 loc 3333:0:0:1 \ + csum-mode neutral-map-auto ident-type use-format + + # Configure an ILA to SIR mapping where checksum neutral is automatically + # set without the C-bit and the identifier type is configured to be LUID + # so that the identifier type field is not present. + ip ila add loc_match 2001:0:119:0 loc 3333:0:0:1 \ + csum-mode neutral-map-auto ident-type use-format diff --git a/Documentation/networking/ila.txt b/Documentation/networking/ila.txt deleted file mode 100644 index a17dac9dc915..000000000000 --- a/Documentation/networking/ila.txt +++ /dev/null @@ -1,285 +0,0 @@ -Identifier Locator Addressing (ILA) - - -Introduction -============ - -Identifier-locator addressing (ILA) is a technique used with IPv6 that -differentiates between location and identity of a network node. Part of an -address expresses the immutable identity of the node, and another part -indicates the location of the node which can be dynamic. Identifier-locator -addressing can be used to efficiently implement overlay networks for -network virtualization as well as solutions for use cases in mobility. - -ILA can be thought of as means to implement an overlay network without -encapsulation. This is accomplished by performing network address -translation on destination addresses as a packet traverses a network. To -the network, an ILA translated packet appears to be no different than any -other IPv6 packet. For instance, if the transport protocol is TCP then an -ILA translated packet looks like just another TCP/IPv6 packet. The -advantage of this is that ILA is transparent to the network so that -optimizations in the network, such as ECMP, RSS, GRO, GSO, etc., just work. - -The ILA protocol is described in Internet-Draft draft-herbert-intarea-ila. - - -ILA terminology -=============== - - - Identifier A number that identifies an addressable node in the network - independent of its location. ILA identifiers are sixty-four - bit values. - - - Locator A network prefix that routes to a physical host. Locators - provide the topological location of an addressed node. ILA - locators are sixty-four bit prefixes. - - - ILA mapping - A mapping of an ILA identifier to a locator (or to a - locator and meta data). An ILA domain maintains a database - that contains mappings for all destinations in the domain. - - - SIR address - An IPv6 address composed of a SIR prefix (upper sixty- - four bits) and an identifier (lower sixty-four bits). - SIR addresses are visible to applications and provide a - means for them to address nodes independent of their - location. - - - ILA address - An IPv6 address composed of a locator (upper sixty-four - bits) and an identifier (low order sixty-four bits). ILA - addresses are never visible to an application. - - - ILA host An end host that is capable of performing ILA translations - on transmit or receive. - - - ILA router A network node that performs ILA translation and forwarding - of translated packets. - - - ILA forwarding cache - A type of ILA router that only maintains a working set - cache of mappings. - - - ILA node A network node capable of performing ILA translations. This - can be an ILA router, ILA forwarding cache, or ILA host. - - -Operation -========= - -There are two fundamental operations with ILA: - - - Translate a SIR address to an ILA address. This is performed on ingress - to an ILA overlay. - - - Translate an ILA address to a SIR address. This is performed on egress - from the ILA overlay. - -ILA can be deployed either on end hosts or intermediate devices in the -network; these are provided by "ILA hosts" and "ILA routers" respectively. -Configuration and datapath for these two points of deployment is somewhat -different. - -The diagram below illustrates the flow of packets through ILA as well -as showing ILA hosts and routers. - - +--------+ +--------+ - | Host A +-+ +--->| Host B | - | | | (2) ILA (') | | - +--------+ | ...addressed.... ( ) +--------+ - V +---+--+ . packet . +---+--+ (_) - (1) SIR | | ILA |----->-------->---->| ILA | | (3) SIR - addressed +->|router| . . |router|->-+ addressed - packet +---+--+ . IPv6 . +---+--+ packet - / . Network . - / . . +--+-++--------+ - +--------+ / . . |ILA || Host | - | Host +--+ . .- -|host|| | - | | . . +--+-++--------+ - +--------+ ................ - - -Transport checksum handling -=========================== - -When an address is translated by ILA, an encapsulated transport checksum -that includes the translated address in a pseudo header may be rendered -incorrect on the wire. This is a problem for intermediate devices, -including checksum offload in NICs, that process the checksum. There are -three options to deal with this: - -- no action Allow the checksum to be incorrect on the wire. Before - a receiver verifies a checksum the ILA to SIR address - translation must be done. - -- adjust transport checksum - When ILA translation is performed the packet is parsed - and if a transport layer checksum is found then it is - adjusted to reflect the correct checksum per the - translated address. - -- checksum neutral mapping - When an address is translated the difference can be offset - elsewhere in a part of the packet that is covered by - the checksum. The low order sixteen bits of the identifier - are used. This method is preferred since it doesn't require - parsing a packet beyond the IP header and in most cases the - adjustment can be precomputed and saved with the mapping. - -Note that the checksum neutral adjustment affects the low order sixteen -bits of the identifier. When ILA to SIR address translation is done on -egress the low order bits are restored to the original value which -restores the identifier as it was originally sent. - - -Identifier types -================ - -ILA defines different types of identifiers for different use cases. - -The defined types are: - - 0: interface identifier - - 1: locally unique identifier - - 2: virtual networking identifier for IPv4 address - - 3: virtual networking identifier for IPv6 unicast address - - 4: virtual networking identifier for IPv6 multicast address - - 5: non-local address identifier - -In the current implementation of kernel ILA only locally unique identifiers -(LUID) are supported. LUID allows for a generic, unformatted 64 bit -identifier. - - -Identifier formats -================== - -Kernel ILA supports two optional fields in an identifier for formatting: -"C-bit" and "identifier type". The presence of these fields is determined -by configuration as demonstrated below. - -If the identifier type is present it occupies the three highest order -bits of an identifier. The possible values are given in the above list. - -If the C-bit is present, this is used as an indication that checksum -neutral mapping has been done. The C-bit can only be set in an -ILA address, never a SIR address. - -In the simplest format the identifier types, C-bit, and checksum -adjustment value are not present so an identifier is considered an -unstructured sixty-four bit value. - - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Identifier | - + + - | | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - -The checksum neutral adjustment may be configured to always be -present using neutral-map-auto. In this case there is no C-bit, but the -checksum adjustment is in the low order 16 bits. The identifier is -still sixty-four bits. - - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Identifier | - | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | Checksum-neutral adjustment | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - -The C-bit may used to explicitly indicate that checksum neutral -mapping has been applied to an ILA address. The format is: - - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | |C| Identifier | - | +-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | Checksum-neutral adjustment | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - -The identifier type field may be present to indicate the identifier -type. If it is not present then the type is inferred based on mapping -configuration. The checksum neutral adjustment may automatically -used with the identifier type as illustrated below. - - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Type| Identifier | - +-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | Checksum-neutral adjustment | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - -If the identifier type and the C-bit can be present simultaneously so -the identifier format would be: - - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Type|C| Identifier | - +-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | Checksum-neutral adjustment | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - - -Configuration -============= - -There are two methods to configure ILA mappings. One is by using LWT routes -and the other is ila_xlat (called from NFHOOK PREROUTING hook). ila_xlat -is intended to be used in the receive path for ILA hosts . - -An ILA router has also been implemented in XDP. Description of that is -outside the scope of this document. - -The usage of for ILA LWT routes is: - -ip route add DEST/128 encap ila LOC csum-mode MODE ident-type TYPE via ADDR - -Destination (DEST) can either be a SIR address (for an ILA host or ingress -ILA router) or an ILA address (egress ILA router). LOC is the sixty-four -bit locator (with format W:X:Y:Z) that overwrites the upper sixty-four -bits of the destination address. Checksum MODE is one of "no-action", -"adj-transport", "neutral-map", and "neutral-map-auto". If neutral-map is -set then the C-bit will be present. Identifier TYPE one of "luid" or -"use-format." In the case of use-format, the identifier type field is -present and the effective type is taken from that. - -The usage of ila_xlat is: - -ip ila add loc_match MATCH loc LOC csum-mode MODE ident-type TYPE - -MATCH indicates the incoming locator that must be matched to apply -a the translaiton. LOC is the locator that overwrites the upper -sixty-four bits of the destination address. MODE and TYPE have the -same meanings as described above. - - -Some examples -============= - -# Configure an ILA route that uses checksum neutral mapping as well -# as type field. Note that the type field is set in the SIR address -# (the 2000 implies type is 1 which is LUID). -ip route add 3333:0:0:1:2000:0:1:87/128 encap ila 2001:0:87:0 \ - csum-mode neutral-map ident-type use-format - -# Configure an ILA LWT route that uses auto checksum neutral mapping -# (no C-bit) and configure identifier type to be LUID so that the -# identifier type field will not be present. -ip route add 3333:0:0:1:2000:0:2:87/128 encap ila 2001:0:87:1 \ - csum-mode neutral-map-auto ident-type luid - -ila_xlat configuration - -# Configure an ILA to SIR mapping that matches a locator and overwrites -# it with a SIR address (3333:0:0:1 in this example). The C-bit and -# identifier field are used. -ip ila add loc_match 2001:0:119:0 loc 3333:0:0:1 \ - csum-mode neutral-map-auto ident-type use-format - -# Configure an ILA to SIR mapping where checksum neutral is automatically -# set without the C-bit and the identifier type is configured to be LUID -# so that the identifier type field is not present. -ip ila add loc_match 2001:0:119:0 loc 3333:0:0:1 \ - csum-mode neutral-map-auto ident-type use-format diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5a7889df1375..488971f6b650 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -64,6 +64,7 @@ Contents: gen_stats gtp hinic + ila .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 7cdb25400f7e8624414260d1b0fa70da280b2303 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:45 +0200 Subject: docs: networking: convert ipddp.txt to ReST Not much to be done here: - add SPDX header; - use a document title from existing text; - adjust a chapter markup; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/ipddp.rst | 78 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/ipddp.txt | 73 ----------------------------------- Documentation/networking/ltpc.txt | 2 +- drivers/net/appletalk/Kconfig | 4 +- 5 files changed, 82 insertions(+), 76 deletions(-) create mode 100644 Documentation/networking/ipddp.rst delete mode 100644 Documentation/networking/ipddp.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 488971f6b650..cf85d0a73144 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -65,6 +65,7 @@ Contents: gtp hinic ila + ipddp .. only:: subproject and html diff --git a/Documentation/networking/ipddp.rst b/Documentation/networking/ipddp.rst new file mode 100644 index 000000000000..be7091b77927 --- /dev/null +++ b/Documentation/networking/ipddp.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================================= +AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation +========================================================= + +Documentation ipddp.c + +This file is written by Jay Schulist + +Introduction +------------ + +AppleTalk-IP (IPDDP) is the method computers connected to AppleTalk +networks can use to communicate via IP. AppleTalk-IP is simply IP datagrams +inside AppleTalk packets. + +Through this driver you can either allow your Linux box to communicate +IP over an AppleTalk network or you can provide IP gatewaying functions +for your AppleTalk users. + +You can currently encapsulate or decapsulate AppleTalk-IP on LocalTalk, +EtherTalk and PPPTalk. The only limit on the protocol is that of what +kernel AppleTalk layer and drivers are available. + +Each mode requires its own user space software. + +Compiling AppleTalk-IP Decapsulation/Encapsulation +================================================== + +AppleTalk-IP decapsulation needs to be compiled into your kernel. You +will need to turn on AppleTalk-IP driver support. Then you will need to +select ONE of the two options; IP to AppleTalk-IP encapsulation support or +AppleTalk-IP to IP decapsulation support. If you compile the driver +statically you will only be able to use the driver for the function you have +enabled in the kernel. If you compile the driver as a module you can +select what mode you want it to run in via a module loading param. +ipddp_mode=1 for AppleTalk-IP encapsulation and ipddp_mode=2 for +AppleTalk-IP to IP decapsulation. + +Basic instructions for user space tools +======================================= + +I will briefly describe the operation of the tools, but you will +need to consult the supporting documentation for each set of tools. + +Decapsulation - You will need to download a software package called +MacGate. In this distribution there will be a tool called MacRoute +which enables you to add routes to the kernel for your Macs by hand. +Also the tool MacRegGateWay is included to register the +proper IP Gateway and IP addresses for your machine. Included in this +distribution is a patch to netatalk-1.4b2+asun2.0a17.2 (available from +ftp.u.washington.edu/pub/user-supported/asun/) this patch is optional +but it allows automatic adding and deleting of routes for Macs. (Handy +for locations with large Mac installations) + +Encapsulation - You will need to download a software daemon called ipddpd. +This software expects there to be an AppleTalk-IP gateway on the network. +You will also need to add the proper routes to route your Linux box's IP +traffic out the ipddp interface. + +Common Uses of ipddp.c +---------------------- +Of course AppleTalk-IP decapsulation and encapsulation, but specifically +decapsulation is being used most for connecting LocalTalk networks to +IP networks. Although it has been used on EtherTalk networks to allow +Macs that are only able to tunnel IP over EtherTalk. + +Encapsulation has been used to allow a Linux box stuck on a LocalTalk +network to use IP. It should work equally well if you are stuck on an +EtherTalk only network. + +Further Assistance +------------------- +You can contact me (Jay Schulist ) with any +questions regarding decapsulation or encapsulation. Bradford W. Johnson + originally wrote the ipddp.c driver for IP +encapsulation in AppleTalk. diff --git a/Documentation/networking/ipddp.txt b/Documentation/networking/ipddp.txt deleted file mode 100644 index ba5c217fffe0..000000000000 --- a/Documentation/networking/ipddp.txt +++ /dev/null @@ -1,73 +0,0 @@ -Text file for ipddp.c: - AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation - -This text file is written by Jay Schulist - -Introduction ------------- - -AppleTalk-IP (IPDDP) is the method computers connected to AppleTalk -networks can use to communicate via IP. AppleTalk-IP is simply IP datagrams -inside AppleTalk packets. - -Through this driver you can either allow your Linux box to communicate -IP over an AppleTalk network or you can provide IP gatewaying functions -for your AppleTalk users. - -You can currently encapsulate or decapsulate AppleTalk-IP on LocalTalk, -EtherTalk and PPPTalk. The only limit on the protocol is that of what -kernel AppleTalk layer and drivers are available. - -Each mode requires its own user space software. - -Compiling AppleTalk-IP Decapsulation/Encapsulation -================================================= - -AppleTalk-IP decapsulation needs to be compiled into your kernel. You -will need to turn on AppleTalk-IP driver support. Then you will need to -select ONE of the two options; IP to AppleTalk-IP encapsulation support or -AppleTalk-IP to IP decapsulation support. If you compile the driver -statically you will only be able to use the driver for the function you have -enabled in the kernel. If you compile the driver as a module you can -select what mode you want it to run in via a module loading param. -ipddp_mode=1 for AppleTalk-IP encapsulation and ipddp_mode=2 for -AppleTalk-IP to IP decapsulation. - -Basic instructions for user space tools -======================================= - -I will briefly describe the operation of the tools, but you will -need to consult the supporting documentation for each set of tools. - -Decapsulation - You will need to download a software package called -MacGate. In this distribution there will be a tool called MacRoute -which enables you to add routes to the kernel for your Macs by hand. -Also the tool MacRegGateWay is included to register the -proper IP Gateway and IP addresses for your machine. Included in this -distribution is a patch to netatalk-1.4b2+asun2.0a17.2 (available from -ftp.u.washington.edu/pub/user-supported/asun/) this patch is optional -but it allows automatic adding and deleting of routes for Macs. (Handy -for locations with large Mac installations) - -Encapsulation - You will need to download a software daemon called ipddpd. -This software expects there to be an AppleTalk-IP gateway on the network. -You will also need to add the proper routes to route your Linux box's IP -traffic out the ipddp interface. - -Common Uses of ipddp.c ----------------------- -Of course AppleTalk-IP decapsulation and encapsulation, but specifically -decapsulation is being used most for connecting LocalTalk networks to -IP networks. Although it has been used on EtherTalk networks to allow -Macs that are only able to tunnel IP over EtherTalk. - -Encapsulation has been used to allow a Linux box stuck on a LocalTalk -network to use IP. It should work equally well if you are stuck on an -EtherTalk only network. - -Further Assistance -------------------- -You can contact me (Jay Schulist ) with any -questions regarding decapsulation or encapsulation. Bradford W. Johnson - originally wrote the ipddp.c driver for IP -encapsulation in AppleTalk. diff --git a/Documentation/networking/ltpc.txt b/Documentation/networking/ltpc.txt index 0bf3220c715b..a005a73b76d0 100644 --- a/Documentation/networking/ltpc.txt +++ b/Documentation/networking/ltpc.txt @@ -99,7 +99,7 @@ treat the LocalTalk device like an ordinary Ethernet device, even if that's what it looks like to Netatalk. Instead, you follow the same procedure as for doing IP in EtherTalk. -See Documentation/networking/ipddp.txt for more information about the +See Documentation/networking/ipddp.rst for more information about the kernel driver and userspace tools needed. -------------------------------------- diff --git a/drivers/net/appletalk/Kconfig b/drivers/net/appletalk/Kconfig index d4e51c048f62..ccde6479050c 100644 --- a/drivers/net/appletalk/Kconfig +++ b/drivers/net/appletalk/Kconfig @@ -86,7 +86,7 @@ config IPDDP box is stuck on an AppleTalk only network) or decapsulate (e.g. if you want your Linux box to act as an Internet gateway for a zoo of AppleTalk connected Macs). Please see the file - for more information. + for more information. If you say Y here, the AppleTalk-IP support will be compiled into the kernel. In this case, you can either use encapsulation or @@ -107,4 +107,4 @@ config IPDDP_ENCAP IP packets inside AppleTalk frames; this is useful if your Linux box is stuck on an AppleTalk network (which hopefully contains a decapsulator somewhere). Please see - for more information. + for more information. -- cgit v1.2.3-59-g8ed1b From 9de1fcdf36e7e00693a260865a5f2a58af1c7040 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:46 +0200 Subject: docs: networking: convert ip_dynaddr.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/ip_dynaddr.rst | 40 +++++++++++++++++++++++++++++++++ Documentation/networking/ip_dynaddr.txt | 29 ------------------------ 3 files changed, 41 insertions(+), 29 deletions(-) create mode 100644 Documentation/networking/ip_dynaddr.rst delete mode 100644 Documentation/networking/ip_dynaddr.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index cf85d0a73144..f81aeb87aa28 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -66,6 +66,7 @@ Contents: hinic ila ipddp + ip_dynaddr .. only:: subproject and html diff --git a/Documentation/networking/ip_dynaddr.rst b/Documentation/networking/ip_dynaddr.rst new file mode 100644 index 000000000000..eacc0c780c7f --- /dev/null +++ b/Documentation/networking/ip_dynaddr.rst @@ -0,0 +1,40 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +IP dynamic address hack-port v0.03 +================================== + +This stuff allows diald ONESHOT connections to get established by +dynamically changing packet source address (and socket's if local procs). +It is implemented for TCP diald-box connections(1) and IP_MASQuerading(2). + +If enabled\ [#]_ and forwarding interface has changed: + + 1) Socket (and packet) source address is rewritten ON RETRANSMISSIONS + while in SYN_SENT state (diald-box processes). + 2) Out-bounded MASQueraded source address changes ON OUTPUT (when + internal host does retransmission) until a packet from outside is + received by the tunnel. + +This is specially helpful for auto dialup links (diald), where the +``actual`` outgoing address is unknown at the moment the link is +going up. So, the *same* (local AND masqueraded) connections requests that +bring the link up will be able to get established. + +.. [#] At boot, by default no address rewriting is attempted. + + To enable:: + + # echo 1 > /proc/sys/net/ipv4/ip_dynaddr + + To enable verbose mode:: + + # echo 2 > /proc/sys/net/ipv4/ip_dynaddr + + To disable (default):: + + # echo 0 > /proc/sys/net/ipv4/ip_dynaddr + +Enjoy! + +Juanjo diff --git a/Documentation/networking/ip_dynaddr.txt b/Documentation/networking/ip_dynaddr.txt deleted file mode 100644 index 45f3c1268e86..000000000000 --- a/Documentation/networking/ip_dynaddr.txt +++ /dev/null @@ -1,29 +0,0 @@ -IP dynamic address hack-port v0.03 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This stuff allows diald ONESHOT connections to get established by -dynamically changing packet source address (and socket's if local procs). -It is implemented for TCP diald-box connections(1) and IP_MASQuerading(2). - -If enabled[*] and forwarding interface has changed: - 1) Socket (and packet) source address is rewritten ON RETRANSMISSIONS - while in SYN_SENT state (diald-box processes). - 2) Out-bounded MASQueraded source address changes ON OUTPUT (when - internal host does retransmission) until a packet from outside is - received by the tunnel. - -This is specially helpful for auto dialup links (diald), where the -``actual'' outgoing address is unknown at the moment the link is -going up. So, the *same* (local AND masqueraded) connections requests that -bring the link up will be able to get established. - -[*] At boot, by default no address rewriting is attempted. - To enable: - # echo 1 > /proc/sys/net/ipv4/ip_dynaddr - To enable verbose mode: - # echo 2 > /proc/sys/net/ipv4/ip_dynaddr - To disable (default) - # echo 0 > /proc/sys/net/ipv4/ip_dynaddr - -Enjoy! - --- Juanjo -- cgit v1.2.3-59-g8ed1b From aac86c887ed66ac4f467821ebf75373124a148d7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:47 +0200 Subject: docs: networking: convert iphase.txt to ReST - add SPDX header; - adjust title using the proper markup; - mark code blocks and literals as such; - mark tables as such; - mark lists as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/iphase.rst | 193 ++++++++++++++++++++++++++++++++++++ Documentation/networking/iphase.txt | 158 ----------------------------- drivers/atm/Kconfig | 2 +- 4 files changed, 195 insertions(+), 159 deletions(-) create mode 100644 Documentation/networking/iphase.rst delete mode 100644 Documentation/networking/iphase.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index f81aeb87aa28..505eaa41ca2b 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -67,6 +67,7 @@ Contents: ila ipddp ip_dynaddr + iphase .. only:: subproject and html diff --git a/Documentation/networking/iphase.rst b/Documentation/networking/iphase.rst new file mode 100644 index 000000000000..92d9b757d75a --- /dev/null +++ b/Documentation/networking/iphase.rst @@ -0,0 +1,193 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +ATM (i)Chip IA Linux Driver Source +================================== + + READ ME FISRT + +-------------------------------------------------------------------------------- + + Read This Before You Begin! + +-------------------------------------------------------------------------------- + +Description +=========== + +This is the README file for the Interphase PCI ATM (i)Chip IA Linux driver +source release. + +The features and limitations of this driver are as follows: + + - A single VPI (VPI value of 0) is supported. + - Supports 4K VCs for the server board (with 512K control memory) and 1K + VCs for the client board (with 128K control memory). + - UBR, ABR and CBR service categories are supported. + - Only AAL5 is supported. + - Supports setting of PCR on the VCs. + - Multiple adapters in a system are supported. + - All variants of Interphase ATM PCI (i)Chip adapter cards are supported, + including x575 (OC3, control memory 128K , 512K and packet memory 128K, + 512K and 1M), x525 (UTP25) and x531 (DS3 and E3). See + http://www.iphase.com/ + for details. + - Only x86 platforms are supported. + - SMP is supported. + + +Before You Start +================ + + +Installation +------------ + +1. Installing the adapters in the system + + To install the ATM adapters in the system, follow the steps below. + + a. Login as root. + b. Shut down the system and power off the system. + c. Install one or more ATM adapters in the system. + d. Connect each adapter to a port on an ATM switch. The green 'Link' + LED on the front panel of the adapter will be on if the adapter is + connected to the switch properly when the system is powered up. + e. Power on and boot the system. + +2. [ Removed ] + +3. Rebuild kernel with ABR support + + [ a. and b. removed ] + + c. Reconfigure the kernel, choose the Interphase ia driver through "make + menuconfig" or "make xconfig". + d. Rebuild the kernel, loadable modules and the atm tools. + e. Install the new built kernel and modules and reboot. + +4. Load the adapter hardware driver (ia driver) if it is built as a module + + a. Login as root. + b. Change directory to /lib/modules//atm. + c. Run "insmod suni.o;insmod iphase.o" + The yellow 'status' LED on the front panel of the adapter will blink + while the driver is loaded in the system. + d. To verify that the 'ia' driver is loaded successfully, run the + following command:: + + cat /proc/atm/devices + + If the driver is loaded successfully, the output of the command will + be similar to the following lines:: + + Itf Type ESI/"MAC"addr AAL(TX,err,RX,err,drop) ... + 0 ia xxxxxxxxx 0 ( 0 0 0 0 0 ) 5 ( 0 0 0 0 0 ) + + You can also check the system log file /var/log/messages for messages + related to the ATM driver. + +5. Ia Driver Configuration + +5.1 Configuration of adapter buffers + The (i)Chip boards have 3 different packet RAM size variants: 128K, 512K and + 1M. The RAM size decides the number of buffers and buffer size. The default + size and number of buffers are set as following: + + ========= ======= ====== ====== ====== ====== ====== + Total Rx RAM Tx RAM Rx Buf Tx Buf Rx buf Tx buf + RAM size size size size size cnt cnt + ========= ======= ====== ====== ====== ====== ====== + 128K 64K 64K 10K 10K 6 6 + 512K 256K 256K 10K 10K 25 25 + 1M 512K 512K 10K 10K 51 51 + ========= ======= ====== ====== ====== ====== ====== + + These setting should work well in most environments, but can be + changed by typing the following command:: + + insmod /ia.o IA_RX_BUF= IA_RX_BUF_SZ= \ + IA_TX_BUF= IA_TX_BUF_SZ= + + Where: + + - RX_CNT = number of receive buffers in the range (1-128) + - RX_SIZE = size of receive buffers in the range (48-64K) + - TX_CNT = number of transmit buffers in the range (1-128) + - TX_SIZE = size of transmit buffers in the range (48-64K) + + 1. Transmit and receive buffer size must be a multiple of 4. + 2. Care should be taken so that the memory required for the + transmit and receive buffers is less than or equal to the + total adapter packet memory. + +5.2 Turn on ia debug trace + + When the ia driver is built with the CONFIG_ATM_IA_DEBUG flag, the driver + can provide more debug trace if needed. There is a bit mask variable, + IADebugFlag, which controls the output of the traces. You can find the bit + map of the IADebugFlag in iphase.h. + The debug trace can be turn on through the insmod command line option, for + example, "insmod iphase.o IADebugFlag=0xffffffff" can turn on all the debug + traces together with loading the driver. + +6. Ia Driver Test Using ttcp_atm and PVC + + For the PVC setup, the test machines can either be connected back-to-back or + through a switch. If connected through the switch, the switch must be + configured for the PVC(s). + + a. For UBR test: + + At the test machine intended to receive data, type:: + + ttcp_atm -r -a -s 0.100 + + At the other test machine, type:: + + ttcp_atm -t -a -s 0.100 -n 10000 + + Run "ttcp_atm -h" to display more options of the ttcp_atm tool. + b. For ABR test: + + It is the same as the UBR testing, but with an extra command option:: + + -Pabr:max_pcr= + + where: + + xxx = the maximum peak cell rate, from 170 - 353207. + + This option must be set on both the machines. + + c. For CBR test: + + It is the same as the UBR testing, but with an extra command option:: + + -Pcbr:max_pcr= + + where: + + xxx = the maximum peak cell rate, from 170 - 353207. + + This option may only be set on the transmit machine. + + +Outstanding Issues +================== + + + +Contact Information +------------------- + +:: + + Customer Support: + United States: Telephone: (214) 654-5555 + Fax: (214) 654-5500 + E-Mail: intouch@iphase.com + Europe: Telephone: 33 (0)1 41 15 44 00 + Fax: 33 (0)1 41 15 12 13 + World Wide Web: http://www.iphase.com + Anonymous FTP: ftp.iphase.com diff --git a/Documentation/networking/iphase.txt b/Documentation/networking/iphase.txt deleted file mode 100644 index 670b72f16585..000000000000 --- a/Documentation/networking/iphase.txt +++ /dev/null @@ -1,158 +0,0 @@ - - READ ME FISRT - ATM (i)Chip IA Linux Driver Source --------------------------------------------------------------------------------- - Read This Before You Begin! --------------------------------------------------------------------------------- - -Description ------------ - -This is the README file for the Interphase PCI ATM (i)Chip IA Linux driver -source release. - -The features and limitations of this driver are as follows: - - A single VPI (VPI value of 0) is supported. - - Supports 4K VCs for the server board (with 512K control memory) and 1K - VCs for the client board (with 128K control memory). - - UBR, ABR and CBR service categories are supported. - - Only AAL5 is supported. - - Supports setting of PCR on the VCs. - - Multiple adapters in a system are supported. - - All variants of Interphase ATM PCI (i)Chip adapter cards are supported, - including x575 (OC3, control memory 128K , 512K and packet memory 128K, - 512K and 1M), x525 (UTP25) and x531 (DS3 and E3). See - http://www.iphase.com/ - for details. - - Only x86 platforms are supported. - - SMP is supported. - - -Before You Start ----------------- - - -Installation ------------- - -1. Installing the adapters in the system - To install the ATM adapters in the system, follow the steps below. - a. Login as root. - b. Shut down the system and power off the system. - c. Install one or more ATM adapters in the system. - d. Connect each adapter to a port on an ATM switch. The green 'Link' - LED on the front panel of the adapter will be on if the adapter is - connected to the switch properly when the system is powered up. - e. Power on and boot the system. - -2. [ Removed ] - -3. Rebuild kernel with ABR support - [ a. and b. removed ] - c. Reconfigure the kernel, choose the Interphase ia driver through "make - menuconfig" or "make xconfig". - d. Rebuild the kernel, loadable modules and the atm tools. - e. Install the new built kernel and modules and reboot. - -4. Load the adapter hardware driver (ia driver) if it is built as a module - a. Login as root. - b. Change directory to /lib/modules//atm. - c. Run "insmod suni.o;insmod iphase.o" - The yellow 'status' LED on the front panel of the adapter will blink - while the driver is loaded in the system. - d. To verify that the 'ia' driver is loaded successfully, run the - following command: - - cat /proc/atm/devices - - If the driver is loaded successfully, the output of the command will - be similar to the following lines: - - Itf Type ESI/"MAC"addr AAL(TX,err,RX,err,drop) ... - 0 ia xxxxxxxxx 0 ( 0 0 0 0 0 ) 5 ( 0 0 0 0 0 ) - - You can also check the system log file /var/log/messages for messages - related to the ATM driver. - -5. Ia Driver Configuration - -5.1 Configuration of adapter buffers - The (i)Chip boards have 3 different packet RAM size variants: 128K, 512K and - 1M. The RAM size decides the number of buffers and buffer size. The default - size and number of buffers are set as following: - - Total Rx RAM Tx RAM Rx Buf Tx Buf Rx buf Tx buf - RAM size size size size size cnt cnt - -------- ------ ------ ------ ------ ------ ------ - 128K 64K 64K 10K 10K 6 6 - 512K 256K 256K 10K 10K 25 25 - 1M 512K 512K 10K 10K 51 51 - - These setting should work well in most environments, but can be - changed by typing the following command: - - insmod /ia.o IA_RX_BUF= IA_RX_BUF_SZ= \ - IA_TX_BUF= IA_TX_BUF_SZ= - Where: - RX_CNT = number of receive buffers in the range (1-128) - RX_SIZE = size of receive buffers in the range (48-64K) - TX_CNT = number of transmit buffers in the range (1-128) - TX_SIZE = size of transmit buffers in the range (48-64K) - - 1. Transmit and receive buffer size must be a multiple of 4. - 2. Care should be taken so that the memory required for the - transmit and receive buffers is less than or equal to the - total adapter packet memory. - -5.2 Turn on ia debug trace - - When the ia driver is built with the CONFIG_ATM_IA_DEBUG flag, the driver - can provide more debug trace if needed. There is a bit mask variable, - IADebugFlag, which controls the output of the traces. You can find the bit - map of the IADebugFlag in iphase.h. - The debug trace can be turn on through the insmod command line option, for - example, "insmod iphase.o IADebugFlag=0xffffffff" can turn on all the debug - traces together with loading the driver. - -6. Ia Driver Test Using ttcp_atm and PVC - - For the PVC setup, the test machines can either be connected back-to-back or - through a switch. If connected through the switch, the switch must be - configured for the PVC(s). - - a. For UBR test: - At the test machine intended to receive data, type: - ttcp_atm -r -a -s 0.100 - At the other test machine, type: - ttcp_atm -t -a -s 0.100 -n 10000 - Run "ttcp_atm -h" to display more options of the ttcp_atm tool. - b. For ABR test: - It is the same as the UBR testing, but with an extra command option: - -Pabr:max_pcr= - where: - xxx = the maximum peak cell rate, from 170 - 353207. - This option must be set on both the machines. - c. For CBR test: - It is the same as the UBR testing, but with an extra command option: - -Pcbr:max_pcr= - where: - xxx = the maximum peak cell rate, from 170 - 353207. - This option may only be set on the transmit machine. - - -OUTSTANDING ISSUES ------------------- - - - -Contact Information -------------------- - - Customer Support: - United States: Telephone: (214) 654-5555 - Fax: (214) 654-5500 - E-Mail: intouch@iphase.com - Europe: Telephone: 33 (0)1 41 15 44 00 - Fax: 33 (0)1 41 15 12 13 - World Wide Web: http://www.iphase.com - Anonymous FTP: ftp.iphase.com diff --git a/drivers/atm/Kconfig b/drivers/atm/Kconfig index 4af7cbdcc349..cfb0d16b60ad 100644 --- a/drivers/atm/Kconfig +++ b/drivers/atm/Kconfig @@ -306,7 +306,7 @@ config ATM_IA for more info about the cards. Say Y (or M to compile as a module named iphase) here if you have one of these cards. - See the file for further + See the file for further details. config ATM_IA_DEBUG -- cgit v1.2.3-59-g8ed1b From 355e656e017c3b42deb57d125d86c4cbd277d6db Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:48 +0200 Subject: docs: networking: convert ipsec.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/ipsec.rst | 46 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/ipsec.txt | 38 ------------------------------- 3 files changed, 47 insertions(+), 38 deletions(-) create mode 100644 Documentation/networking/ipsec.rst delete mode 100644 Documentation/networking/ipsec.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 505eaa41ca2b..3efb4608649a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -68,6 +68,7 @@ Contents: ipddp ip_dynaddr iphase + ipsec .. only:: subproject and html diff --git a/Documentation/networking/ipsec.rst b/Documentation/networking/ipsec.rst new file mode 100644 index 000000000000..afe9d7b48be3 --- /dev/null +++ b/Documentation/networking/ipsec.rst @@ -0,0 +1,46 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===== +IPsec +===== + + +Here documents known IPsec corner cases which need to be keep in mind when +deploy various IPsec configuration in real world production environment. + +1. IPcomp: + Small IP packet won't get compressed at sender, and failed on + policy check on receiver. + +Quote from RFC3173:: + + 2.2. Non-Expansion Policy + + If the total size of a compressed payload and the IPComp header, as + defined in section 3, is not smaller than the size of the original + payload, the IP datagram MUST be sent in the original non-compressed + form. To clarify: If an IP datagram is sent non-compressed, no + + IPComp header is added to the datagram. This policy ensures saving + the decompression processing cycles and avoiding incurring IP + datagram fragmentation when the expanded datagram is larger than the + MTU. + + Small IP datagrams are likely to expand as a result of compression. + Therefore, a numeric threshold should be applied before compression, + where IP datagrams of size smaller than the threshold are sent in the + original form without attempting compression. The numeric threshold + is implementation dependent. + +Current IPComp implementation is indeed by the book, while as in practice +when sending non-compressed packet to the peer (whether or not packet len +is smaller than the threshold or the compressed len is larger than original +packet len), the packet is dropped when checking the policy as this packet +matches the selector but not coming from any XFRM layer, i.e., with no +security path. Such naked packet will not eventually make it to upper layer. +The result is much more wired to the user when ping peer with different +payload length. + +One workaround is try to set "level use" for each policy if user observed +above scenario. The consequence of doing so is small packet(uncompressed) +will skip policy checking on receiver side. diff --git a/Documentation/networking/ipsec.txt b/Documentation/networking/ipsec.txt deleted file mode 100644 index ba794b7e51be..000000000000 --- a/Documentation/networking/ipsec.txt +++ /dev/null @@ -1,38 +0,0 @@ - -Here documents known IPsec corner cases which need to be keep in mind when -deploy various IPsec configuration in real world production environment. - -1. IPcomp: Small IP packet won't get compressed at sender, and failed on - policy check on receiver. - -Quote from RFC3173: -2.2. Non-Expansion Policy - - If the total size of a compressed payload and the IPComp header, as - defined in section 3, is not smaller than the size of the original - payload, the IP datagram MUST be sent in the original non-compressed - form. To clarify: If an IP datagram is sent non-compressed, no - - IPComp header is added to the datagram. This policy ensures saving - the decompression processing cycles and avoiding incurring IP - datagram fragmentation when the expanded datagram is larger than the - MTU. - - Small IP datagrams are likely to expand as a result of compression. - Therefore, a numeric threshold should be applied before compression, - where IP datagrams of size smaller than the threshold are sent in the - original form without attempting compression. The numeric threshold - is implementation dependent. - -Current IPComp implementation is indeed by the book, while as in practice -when sending non-compressed packet to the peer (whether or not packet len -is smaller than the threshold or the compressed len is larger than original -packet len), the packet is dropped when checking the policy as this packet -matches the selector but not coming from any XFRM layer, i.e., with no -security path. Such naked packet will not eventually make it to upper layer. -The result is much more wired to the user when ping peer with different -payload length. - -One workaround is try to set "level use" for each policy if user observed -above scenario. The consequence of doing so is small packet(uncompressed) -will skip policy checking on receiver side. -- cgit v1.2.3-59-g8ed1b From 1cec2cacaaec5d53adc04dd3ecfdb687b26c0e89 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:49 +0200 Subject: docs: networking: convert ip-sysctl.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - mark lists as such; - mark tables as such; - use footnote markup; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/sysctl/net.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/ip-sysctl.rst | 2649 +++++++++++++++++++++++ Documentation/networking/ip-sysctl.txt | 2374 -------------------- Documentation/networking/snmp_counter.rst | 2 +- net/Kconfig | 2 +- net/ipv4/Kconfig | 2 +- net/ipv4/icmp.c | 2 +- 9 files changed, 2656 insertions(+), 2380 deletions(-) create mode 100644 Documentation/networking/ip-sysctl.rst delete mode 100644 Documentation/networking/ip-sysctl.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b23ab11587a6..e37db6f1be64 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4910,7 +4910,7 @@ Set the number of tcp_metrics_hash slots. Default value is 8192 or 16384 depending on total ram pages. This is used to specify the TCP metrics - cache size. See Documentation/networking/ip-sysctl.txt + cache size. See Documentation/networking/ip-sysctl.rst "tcp_no_metrics_save" section for more details. tdfx= [HW,DRM] diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index e043c9213388..84e3348a9543 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -353,7 +353,7 @@ socket's buffer. It will not take effect unless PF_UNIX flag is specified. 3. /proc/sys/net/ipv4 - IPV4 settings ------------------------------------- -Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for +Please see: Documentation/networking/ip-sysctl.rst and ipvs-sysctl.txt for descriptions of these entries. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 3efb4608649a..7d133d8dbe2a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -69,6 +69,7 @@ Contents: ip_dynaddr iphase ipsec + ip-sysctl .. only:: subproject and html diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst new file mode 100644 index 000000000000..38f811d4b2f0 --- /dev/null +++ b/Documentation/networking/ip-sysctl.rst @@ -0,0 +1,2649 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========= +IP Sysctl +========= + +/proc/sys/net/ipv4/* Variables +============================== + +ip_forward - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + Forward Packets between interfaces. + + This variable is special, its change resets all configuration + parameters to their default state (RFC1122 for hosts, RFC1812 + for routers) + +ip_default_ttl - INTEGER + Default value of TTL field (Time To Live) for outgoing (but not + forwarded) IP packets. Should be between 1 and 255 inclusive. + Default: 64 (as recommended by RFC1700) + +ip_no_pmtu_disc - INTEGER + Disable Path MTU Discovery. If enabled in mode 1 and a + fragmentation-required ICMP is received, the PMTU to this + destination will be set to min_pmtu (see below). You will need + to raise min_pmtu to the smallest interface MTU on your system + manually if you want to avoid locally generated fragments. + + In mode 2 incoming Path MTU Discovery messages will be + discarded. Outgoing frames are handled the same as in mode 1, + implicitly setting IP_PMTUDISC_DONT on every created socket. + + Mode 3 is a hardened pmtu discover mode. The kernel will only + accept fragmentation-needed errors if the underlying protocol + can verify them besides a plain socket lookup. Current + protocols for which pmtu events will be honored are TCP, SCTP + and DCCP as they verify e.g. the sequence number or the + association. This mode should not be enabled globally but is + only intended to secure e.g. name servers in namespaces where + TCP path mtu must still work but path MTU information of other + protocols should be discarded. If enabled globally this mode + could break other protocols. + + Possible values: 0-3 + + Default: FALSE + +min_pmtu - INTEGER + default 552 - minimum discovered Path MTU + +ip_forward_use_pmtu - BOOLEAN + By default we don't trust protocol path MTUs while forwarding + because they could be easily forged and can lead to unwanted + fragmentation by the router. + You only need to enable this if you have user-space software + which tries to discover path mtus by itself and depends on the + kernel honoring this information. This is normally not the + case. + + Default: 0 (disabled) + + Possible values: + + - 0 - disabled + - 1 - enabled + +fwmark_reflect - BOOLEAN + Controls the fwmark of kernel-generated IPv4 reply packets that are not + associated with a socket for example, TCP RSTs or ICMP echo replies). + If unset, these packets have a fwmark of zero. If set, they have the + fwmark of the packet they are replying to. + + Default: 0 + +fib_multipath_use_neigh - BOOLEAN + Use status of existing neighbor entry when determining nexthop for + multipath routes. If disabled, neighbor information is not used and + packets could be directed to a failed nexthop. Only valid for kernels + built with CONFIG_IP_ROUTE_MULTIPATH enabled. + + Default: 0 (disabled) + + Possible values: + + - 0 - disabled + - 1 - enabled + +fib_multipath_hash_policy - INTEGER + Controls which hash policy to use for multipath routes. Only valid + for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled. + + Default: 0 (Layer 3) + + Possible values: + + - 0 - Layer 3 + - 1 - Layer 4 + - 2 - Layer 3 or inner Layer 3 if present + +fib_sync_mem - UNSIGNED INTEGER + Amount of dirty memory from fib entries that can be backlogged before + synchronize_rcu is forced. + + Default: 512kB Minimum: 64kB Maximum: 64MB + +ip_forward_update_priority - INTEGER + Whether to update SKB priority from "TOS" field in IPv4 header after it + is forwarded. The new SKB priority is mapped from TOS field value + according to an rt_tos2priority table (see e.g. man tc-prio). + + Default: 1 (Update priority.) + + Possible values: + + - 0 - Do not update priority. + - 1 - Update priority. + +route/max_size - INTEGER + Maximum number of routes allowed in the kernel. Increase + this when using large numbers of interfaces and/or routes. + + From linux kernel 3.6 onwards, this is deprecated for ipv4 + as route cache is no longer used. + +neigh/default/gc_thresh1 - INTEGER + Minimum number of entries to keep. Garbage collector will not + purge entries if there are fewer than this number. + + Default: 128 + +neigh/default/gc_thresh2 - INTEGER + Threshold when garbage collector becomes more aggressive about + purging entries. Entries older than 5 seconds will be cleared + when over this number. + + Default: 512 + +neigh/default/gc_thresh3 - INTEGER + Maximum number of non-PERMANENT neighbor entries allowed. Increase + this when using large numbers of interfaces and when communicating + with large numbers of directly-connected peers. + + Default: 1024 + +neigh/default/unres_qlen_bytes - INTEGER + The maximum number of bytes which may be used by packets + queued for each unresolved address by other network layers. + (added in linux 3.3) + + Setting negative value is meaningless and will return error. + + Default: SK_WMEM_MAX, (same as net.core.wmem_default). + + Exact value depends on architecture and kernel options, + but should be enough to allow queuing 256 packets + of medium size. + +neigh/default/unres_qlen - INTEGER + The maximum number of packets which may be queued for each + unresolved address by other network layers. + + (deprecated in linux 3.3) : use unres_qlen_bytes instead. + + Prior to linux 3.3, the default value is 3 which may cause + unexpected packet loss. The current default value is calculated + according to default value of unres_qlen_bytes and true size of + packet. + + Default: 101 + +mtu_expires - INTEGER + Time, in seconds, that cached PMTU information is kept. + +min_adv_mss - INTEGER + The advertised MSS depends on the first hop route MTU, but will + never be lower than this setting. + +IP Fragmentation: + +ipfrag_high_thresh - LONG INTEGER + Maximum memory used to reassemble IP fragments. + +ipfrag_low_thresh - LONG INTEGER + (Obsolete since linux-4.17) + Maximum memory used to reassemble IP fragments before the kernel + begins to remove incomplete fragment queues to free up resources. + The kernel still accepts new fragments for defragmentation. + +ipfrag_time - INTEGER + Time in seconds to keep an IP fragment in memory. + +ipfrag_max_dist - INTEGER + ipfrag_max_dist is a non-negative integer value which defines the + maximum "disorder" which is allowed among fragments which share a + common IP source address. Note that reordering of packets is + not unusual, but if a large number of fragments arrive from a source + IP address while a particular fragment queue remains incomplete, it + probably indicates that one or more fragments belonging to that queue + have been lost. When ipfrag_max_dist is positive, an additional check + is done on fragments before they are added to a reassembly queue - if + ipfrag_max_dist (or more) fragments have arrived from a particular IP + address between additions to any IP fragment queue using that source + address, it's presumed that one or more fragments in the queue are + lost. The existing fragment queue will be dropped, and a new one + started. An ipfrag_max_dist value of zero disables this check. + + Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can + result in unnecessarily dropping fragment queues when normal + reordering of packets occurs, which could lead to poor application + performance. Using a very large value, e.g. 50000, increases the + likelihood of incorrectly reassembling IP fragments that originate + from different IP datagrams, which could result in data corruption. + Default: 64 + +INET peer storage +================= + +inet_peer_threshold - INTEGER + The approximate size of the storage. Starting from this threshold + entries will be thrown aggressively. This threshold also determines + entries' time-to-live and time intervals between garbage collection + passes. More entries, less time-to-live, less GC interval. + +inet_peer_minttl - INTEGER + Minimum time-to-live of entries. Should be enough to cover fragment + time-to-live on the reassembling side. This minimum time-to-live is + guaranteed if the pool size is less than inet_peer_threshold. + Measured in seconds. + +inet_peer_maxttl - INTEGER + Maximum time-to-live of entries. Unused entries will expire after + this period of time if there is no memory pressure on the pool (i.e. + when the number of entries in the pool is very small). + Measured in seconds. + +TCP variables +============= + +somaxconn - INTEGER + Limit of socket listen() backlog, known in userspace as SOMAXCONN. + Defaults to 4096. (Was 128 before linux-5.4) + See also tcp_max_syn_backlog for additional tuning for TCP sockets. + +tcp_abort_on_overflow - BOOLEAN + If listening service is too slow to accept new connections, + reset them. Default state is FALSE. It means that if overflow + occurred due to a burst, connection will recover. Enable this + option _only_ if you are really sure that listening daemon + cannot be tuned to accept connections faster. Enabling this + option can harm clients of your server. + +tcp_adv_win_scale - INTEGER + Count buffering overhead as bytes/2^tcp_adv_win_scale + (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), + if it is <= 0. + + Possible values are [-31, 31], inclusive. + + Default: 1 + +tcp_allowed_congestion_control - STRING + Show/set the congestion control choices available to non-privileged + processes. The list is a subset of those listed in + tcp_available_congestion_control. + + Default is "reno" and the default setting (tcp_congestion_control). + +tcp_app_win - INTEGER + Reserve max(window/2^tcp_app_win, mss) of window for application + buffer. Value 0 is special, it means that nothing is reserved. + + Default: 31 + +tcp_autocorking - BOOLEAN + Enable TCP auto corking : + When applications do consecutive small write()/sendmsg() system calls, + we try to coalesce these small writes as much as possible, to lower + total amount of sent packets. This is done if at least one prior + packet for the flow is waiting in Qdisc queues or device transmit + queue. Applications can still use TCP_CORK for optimal behavior + when they know how/when to uncork their sockets. + + Default : 1 + +tcp_available_congestion_control - STRING + Shows the available congestion control choices that are registered. + More congestion control algorithms may be available as modules, + but not loaded. + +tcp_base_mss - INTEGER + The initial value of search_low to be used by the packetization layer + Path MTU discovery (MTU probing). If MTU probing is enabled, + this is the initial MSS used by the connection. + +tcp_mtu_probe_floor - INTEGER + If MTU probing is enabled this caps the minimum MSS used for search_low + for the connection. + + Default : 48 + +tcp_min_snd_mss - INTEGER + TCP SYN and SYNACK messages usually advertise an ADVMSS option, + as described in RFC 1122 and RFC 6691. + + If this ADVMSS option is smaller than tcp_min_snd_mss, + it is silently capped to tcp_min_snd_mss. + + Default : 48 (at least 8 bytes of payload per segment) + +tcp_congestion_control - STRING + Set the congestion control algorithm to be used for new + connections. The algorithm "reno" is always available, but + additional choices may be available based on kernel configuration. + Default is set as part of kernel configuration. + For passive connections, the listener congestion control choice + is inherited. + + [see setsockopt(listenfd, SOL_TCP, TCP_CONGESTION, "name" ...) ] + +tcp_dsack - BOOLEAN + Allows TCP to send "duplicate" SACKs. + +tcp_early_retrans - INTEGER + Tail loss probe (TLP) converts RTOs occurring due to tail + losses into fast recovery (draft-ietf-tcpm-rack). Note that + TLP requires RACK to function properly (see tcp_recovery below) + + Possible values: + + - 0 disables TLP + - 3 or 4 enables TLP + + Default: 3 + +tcp_ecn - INTEGER + Control use of Explicit Congestion Notification (ECN) by TCP. + ECN is used only when both ends of the TCP connection indicate + support for it. This feature is useful in avoiding losses due + to congestion by allowing supporting routers to signal + congestion before having to drop packets. + + Possible values are: + + = ===================================================== + 0 Disable ECN. Neither initiate nor accept ECN. + 1 Enable ECN when requested by incoming connections and + also request ECN on outgoing connection attempts. + 2 Enable ECN when requested by incoming connections + but do not request ECN on outgoing connections. + = ===================================================== + + Default: 2 + +tcp_ecn_fallback - BOOLEAN + If the kernel detects that ECN connection misbehaves, enable fall + back to non-ECN. Currently, this knob implements the fallback + from RFC3168, section 6.1.1.1., but we reserve that in future, + additional detection mechanisms could be implemented under this + knob. The value is not used, if tcp_ecn or per route (or congestion + control) ECN settings are disabled. + + Default: 1 (fallback enabled) + +tcp_fack - BOOLEAN + This is a legacy option, it has no effect anymore. + +tcp_fin_timeout - INTEGER + The length of time an orphaned (no longer referenced by any + application) connection will remain in the FIN_WAIT_2 state + before it is aborted at the local end. While a perfectly + valid "receive only" state for an un-orphaned connection, an + orphaned connection in FIN_WAIT_2 state could otherwise wait + forever for the remote to close its end of the connection. + + Cf. tcp_max_orphans + + Default: 60 seconds + +tcp_frto - INTEGER + Enables Forward RTO-Recovery (F-RTO) defined in RFC5682. + F-RTO is an enhanced recovery algorithm for TCP retransmission + timeouts. It is particularly beneficial in networks where the + RTT fluctuates (e.g., wireless). F-RTO is sender-side only + modification. It does not require any support from the peer. + + By default it's enabled with a non-zero value. 0 disables F-RTO. + +tcp_fwmark_accept - BOOLEAN + If set, incoming connections to listening sockets that do not have a + socket mark will set the mark of the accepting socket to the fwmark of + the incoming SYN packet. This will cause all packets on that connection + (starting from the first SYNACK) to be sent with that fwmark. The + listening socket's mark is unchanged. Listening sockets that already + have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are + unaffected. + + Default: 0 + +tcp_invalid_ratelimit - INTEGER + Limit the maximal rate for sending duplicate acknowledgments + in response to incoming TCP packets that are for an existing + connection but that are invalid due to any of these reasons: + + (a) out-of-window sequence number, + (b) out-of-window acknowledgment number, or + (c) PAWS (Protection Against Wrapped Sequence numbers) check failure + + This can help mitigate simple "ack loop" DoS attacks, wherein + a buggy or malicious middlebox or man-in-the-middle can + rewrite TCP header fields in manner that causes each endpoint + to think that the other is sending invalid TCP segments, thus + causing each side to send an unterminating stream of duplicate + acknowledgments for invalid segments. + + Using 0 disables rate-limiting of dupacks in response to + invalid segments; otherwise this value specifies the minimal + space between sending such dupacks, in milliseconds. + + Default: 500 (milliseconds). + +tcp_keepalive_time - INTEGER + How often TCP sends out keepalive messages when keepalive is enabled. + Default: 2hours. + +tcp_keepalive_probes - INTEGER + How many keepalive probes TCP sends out, until it decides that the + connection is broken. Default value: 9. + +tcp_keepalive_intvl - INTEGER + How frequently the probes are send out. Multiplied by + tcp_keepalive_probes it is time to kill not responding connection, + after probes started. Default value: 75sec i.e. connection + will be aborted after ~11 minutes of retries. + +tcp_l3mdev_accept - BOOLEAN + Enables child sockets to inherit the L3 master device index. + Enabling this option allows a "global" listen socket to work + across L3 master domains (e.g., VRFs) with connected sockets + derived from the listen socket to be bound to the L3 domain in + which the packets originated. Only valid when the kernel was + compiled with CONFIG_NET_L3_MASTER_DEV. + + Default: 0 (disabled) + +tcp_low_latency - BOOLEAN + This is a legacy option, it has no effect anymore. + +tcp_max_orphans - INTEGER + Maximal number of TCP sockets not attached to any user file handle, + held by system. If this number is exceeded orphaned connections are + reset immediately and warning is printed. This limit exists + only to prevent simple DoS attacks, you _must_ not rely on this + or lower the limit artificially, but rather increase it + (probably, after increasing installed memory), + if network conditions require more than default value, + and tune network services to linger and kill such states + more aggressively. Let me to remind again: each orphan eats + up to ~64K of unswappable memory. + +tcp_max_syn_backlog - INTEGER + Maximal number of remembered connection requests (SYN_RECV), + which have not received an acknowledgment from connecting client. + + This is a per-listener limit. + + The minimal value is 128 for low memory machines, and it will + increase in proportion to the memory of machine. + + If server suffers from overload, try increasing this number. + + Remember to also check /proc/sys/net/core/somaxconn + A SYN_RECV request socket consumes about 304 bytes of memory. + +tcp_max_tw_buckets - INTEGER + Maximal number of timewait sockets held by system simultaneously. + If this number is exceeded time-wait socket is immediately destroyed + and warning is printed. This limit exists only to prevent + simple DoS attacks, you _must_ not lower the limit artificially, + but rather increase it (probably, after increasing installed memory), + if network conditions require more than default value. + +tcp_mem - vector of 3 INTEGERs: min, pressure, max + min: below this number of pages TCP is not bothered about its + memory appetite. + + pressure: when amount of memory allocated by TCP exceeds this number + of pages, TCP moderates its memory consumption and enters memory + pressure mode, which is exited when memory consumption falls + under "min". + + max: number of pages allowed for queueing by all TCP sockets. + + Defaults are calculated at boot time from amount of available + memory. + +tcp_min_rtt_wlen - INTEGER + The window length of the windowed min filter to track the minimum RTT. + A shorter window lets a flow more quickly pick up new (higher) + minimum RTT when it is moved to a longer path (e.g., due to traffic + engineering). A longer window makes the filter more resistant to RTT + inflations such as transient congestion. The unit is seconds. + + Possible values: 0 - 86400 (1 day) + + Default: 300 + +tcp_moderate_rcvbuf - BOOLEAN + If set, TCP performs receive buffer auto-tuning, attempting to + automatically size the buffer (no greater than tcp_rmem[2]) to + match the size required by the path for full throughput. Enabled by + default. + +tcp_mtu_probing - INTEGER + Controls TCP Packetization-Layer Path MTU Discovery. Takes three + values: + + - 0 - Disabled + - 1 - Disabled by default, enabled when an ICMP black hole detected + - 2 - Always enabled, use initial MSS of tcp_base_mss. + +tcp_probe_interval - UNSIGNED INTEGER + Controls how often to start TCP Packetization-Layer Path MTU + Discovery reprobe. The default is reprobing every 10 minutes as + per RFC4821. + +tcp_probe_threshold - INTEGER + Controls when TCP Packetization-Layer Path MTU Discovery probing + will stop in respect to the width of search range in bytes. Default + is 8 bytes. + +tcp_no_metrics_save - BOOLEAN + By default, TCP saves various connection metrics in the route cache + when the connection closes, so that connections established in the + near future can use these to set initial conditions. Usually, this + increases overall performance, but may sometimes cause performance + degradation. If set, TCP will not cache metrics on closing + connections. + +tcp_no_ssthresh_metrics_save - BOOLEAN + Controls whether TCP saves ssthresh metrics in the route cache. + + Default is 1, which disables ssthresh metrics. + +tcp_orphan_retries - INTEGER + This value influences the timeout of a locally closed TCP connection, + when RTO retransmissions remain unacknowledged. + See tcp_retries2 for more details. + + The default value is 8. + + If your machine is a loaded WEB server, + you should think about lowering this value, such sockets + may consume significant resources. Cf. tcp_max_orphans. + +tcp_recovery - INTEGER + This value is a bitmap to enable various experimental loss recovery + features. + + ========= ============================================================= + RACK: 0x1 enables the RACK loss detection for fast detection of lost + retransmissions and tail drops. It also subsumes and disables + RFC6675 recovery for SACK connections. + + RACK: 0x2 makes RACK's reordering window static (min_rtt/4). + + RACK: 0x4 disables RACK's DUPACK threshold heuristic + ========= ============================================================= + + Default: 0x1 + +tcp_reordering - INTEGER + Initial reordering level of packets in a TCP stream. + TCP stack can then dynamically adjust flow reordering level + between this initial value and tcp_max_reordering + + Default: 3 + +tcp_max_reordering - INTEGER + Maximal reordering level of packets in a TCP stream. + 300 is a fairly conservative value, but you might increase it + if paths are using per packet load balancing (like bonding rr mode) + + Default: 300 + +tcp_retrans_collapse - BOOLEAN + Bug-to-bug compatibility with some broken printers. + On retransmit try to send bigger packets to work around bugs in + certain TCP stacks. + +tcp_retries1 - INTEGER + This value influences the time, after which TCP decides, that + something is wrong due to unacknowledged RTO retransmissions, + and reports this suspicion to the network layer. + See tcp_retries2 for more details. + + RFC 1122 recommends at least 3 retransmissions, which is the + default. + +tcp_retries2 - INTEGER + This value influences the timeout of an alive TCP connection, + when RTO retransmissions remain unacknowledged. + Given a value of N, a hypothetical TCP connection following + exponential backoff with an initial RTO of TCP_RTO_MIN would + retransmit N times before killing the connection at the (N+1)th RTO. + + The default value of 15 yields a hypothetical timeout of 924.6 + seconds and is a lower bound for the effective timeout. + TCP will effectively time out at the first RTO which exceeds the + hypothetical timeout. + + RFC 1122 recommends at least 100 seconds for the timeout, + which corresponds to a value of at least 8. + +tcp_rfc1337 - BOOLEAN + If set, the TCP stack behaves conforming to RFC1337. If unset, + we are not conforming to RFC, but prevent TCP TIME_WAIT + assassination. + + Default: 0 + +tcp_rmem - vector of 3 INTEGERs: min, default, max + min: Minimal size of receive buffer used by TCP sockets. + It is guaranteed to each TCP socket, even under moderate memory + pressure. + + Default: 4K + + default: initial size of receive buffer used by TCP sockets. + This value overrides net.core.rmem_default used by other protocols. + Default: 87380 bytes. This value results in window of 65535 with + default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit + less for default tcp_app_win. See below about these variables. + + max: maximal size of receive buffer allowed for automatically + selected receiver buffers for TCP socket. This value does not override + net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables + automatic tuning of that socket's receive buffer size, in which + case this value is ignored. + Default: between 87380B and 6MB, depending on RAM size. + +tcp_sack - BOOLEAN + Enable select acknowledgments (SACKS). + +tcp_comp_sack_delay_ns - LONG INTEGER + TCP tries to reduce number of SACK sent, using a timer + based on 5% of SRTT, capped by this sysctl, in nano seconds. + The default is 1ms, based on TSO autosizing period. + + Default : 1,000,000 ns (1 ms) + +tcp_comp_sack_nr - INTEGER + Max number of SACK that can be compressed. + Using 0 disables SACK compression. + + Default : 44 + +tcp_slow_start_after_idle - BOOLEAN + If set, provide RFC2861 behavior and time out the congestion + window after an idle period. An idle period is defined at + the current RTO. If unset, the congestion window will not + be timed out after an idle period. + + Default: 1 + +tcp_stdurg - BOOLEAN + Use the Host requirements interpretation of the TCP urgent pointer field. + Most hosts use the older BSD interpretation, so if you turn this on + Linux might not communicate correctly with them. + + Default: FALSE + +tcp_synack_retries - INTEGER + Number of times SYNACKs for a passive TCP connection attempt will + be retransmitted. Should not be higher than 255. Default value + is 5, which corresponds to 31seconds till the last retransmission + with the current initial RTO of 1second. With this the final timeout + for a passive TCP connection will happen after 63seconds. + +tcp_syncookies - INTEGER + Only valid when the kernel was compiled with CONFIG_SYN_COOKIES + Send out syncookies when the syn backlog queue of a socket + overflows. This is to prevent against the common 'SYN flood attack' + Default: 1 + + Note, that syncookies is fallback facility. + It MUST NOT be used to help highly loaded servers to stand + against legal connection rate. If you see SYN flood warnings + in your logs, but investigation shows that they occur + because of overload with legal connections, you should tune + another parameters until this warning disappear. + See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow. + + syncookies seriously violate TCP protocol, do not allow + to use TCP extensions, can result in serious degradation + of some services (f.e. SMTP relaying), visible not by you, + but your clients and relays, contacting you. While you see + SYN flood warnings in logs not being really flooded, your server + is seriously misconfigured. + + If you want to test which effects syncookies have to your + network connections you can set this knob to 2 to enable + unconditionally generation of syncookies. + +tcp_fastopen - INTEGER + Enable TCP Fast Open (RFC7413) to send and accept data in the opening + SYN packet. + + The client support is enabled by flag 0x1 (on by default). The client + then must use sendmsg() or sendto() with the MSG_FASTOPEN flag, + rather than connect() to send data in SYN. + + The server support is enabled by flag 0x2 (off by default). Then + either enable for all listeners with another flag (0x400) or + enable individual listeners via TCP_FASTOPEN socket option with + the option value being the length of the syn-data backlog. + + The values (bitmap) are + + ===== ======== ====================================================== + 0x1 (client) enables sending data in the opening SYN on the client. + 0x2 (server) enables the server support, i.e., allowing data in + a SYN packet to be accepted and passed to the + application before 3-way handshake finishes. + 0x4 (client) send data in the opening SYN regardless of cookie + availability and without a cookie option. + 0x200 (server) accept data-in-SYN w/o any cookie option present. + 0x400 (server) enable all listeners to support Fast Open by + default without explicit TCP_FASTOPEN socket option. + ===== ======== ====================================================== + + Default: 0x1 + + Note that that additional client or server features are only + effective if the basic support (0x1 and 0x2) are enabled respectively. + +tcp_fastopen_blackhole_timeout_sec - INTEGER + Initial time period in second to disable Fastopen on active TCP sockets + when a TFO firewall blackhole issue happens. + This time period will grow exponentially when more blackhole issues + get detected right after Fastopen is re-enabled and will reset to + initial value when the blackhole issue goes away. + 0 to disable the blackhole detection. + + By default, it is set to 1hr. + +tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs + The list consists of a primary key and an optional backup key. The + primary key is used for both creating and validating cookies, while the + optional backup key is only used for validating cookies. The purpose of + the backup key is to maximize TFO validation when keys are rotated. + + A randomly chosen primary key may be configured by the kernel if + the tcp_fastopen sysctl is set to 0x400 (see above), or if the + TCP_FASTOPEN setsockopt() optname is set and a key has not been + previously configured via sysctl. If keys are configured via + setsockopt() by using the TCP_FASTOPEN_KEY optname, then those + per-socket keys will be used instead of any keys that are specified via + sysctl. + + A key is specified as 4 8-digit hexadecimal integers which are separated + by a '-' as: xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxxxx. Leading zeros may be + omitted. A primary and a backup key may be specified by separating them + by a comma. If only one key is specified, it becomes the primary key and + any previously configured backup keys are removed. + +tcp_syn_retries - INTEGER + Number of times initial SYNs for an active TCP connection attempt + will be retransmitted. Should not be higher than 127. Default value + is 6, which corresponds to 63seconds till the last retransmission + with the current initial RTO of 1second. With this the final timeout + for an active TCP connection attempt will happen after 127seconds. + +tcp_timestamps - INTEGER + Enable timestamps as defined in RFC1323. + + - 0: Disabled. + - 1: Enable timestamps as defined in RFC1323 and use random offset for + each connection rather than only using the current time. + - 2: Like 1, but without random offsets. + + Default: 1 + +tcp_min_tso_segs - INTEGER + Minimal number of segments per TSO frame. + + Since linux-3.12, TCP does an automatic sizing of TSO frames, + depending on flow rate, instead of filling 64Kbytes packets. + For specific usages, it's possible to force TCP to build big + TSO frames. Note that TCP stack might split too big TSO packets + if available window is too small. + + Default: 2 + +tcp_pacing_ss_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in slow start, tcp_pacing_ss_ratio is applied + to let TCP probe for bigger speeds, assuming cwnd can be + doubled every other RTT. + + Default: 200 + +tcp_pacing_ca_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio + is applied to conservatively probe for bigger throughput. + + Default: 120 + +tcp_tso_win_divisor - INTEGER + This allows control over what percentage of the congestion window + can be consumed by a single TSO frame. + The setting of this parameter is a choice between burstiness and + building larger TSO frames. + + Default: 3 + +tcp_tw_reuse - INTEGER + Enable reuse of TIME-WAIT sockets for new connections when it is + safe from protocol viewpoint. + + - 0 - disable + - 1 - global enable + - 2 - enable for loopback traffic only + + It should not be changed without advice/request of technical + experts. + + Default: 2 + +tcp_window_scaling - BOOLEAN + Enable window scaling as defined in RFC1323. + +tcp_wmem - vector of 3 INTEGERs: min, default, max + min: Amount of memory reserved for send buffers for TCP sockets. + Each TCP socket has rights to use it due to fact of its birth. + + Default: 4K + + default: initial size of send buffer used by TCP sockets. This + value overrides net.core.wmem_default used by other protocols. + + It is usually lower than net.core.wmem_default. + + Default: 16K + + max: Maximal amount of memory allowed for automatically tuned + send buffers for TCP sockets. This value does not override + net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables + automatic tuning of that socket's send buffer size, in which case + this value is ignored. + + Default: between 64K and 4MB, depending on RAM size. + +tcp_notsent_lowat - UNSIGNED INTEGER + A TCP socket can control the amount of unsent bytes in its write queue, + thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll() + reports POLLOUT events if the amount of unsent bytes is below a per + socket value, and if the write queue is not full. sendmsg() will + also not add new buffers if the limit is hit. + + This global variable controls the amount of unsent data for + sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change + to the global variable has immediate effect. + + Default: UINT_MAX (0xFFFFFFFF) + +tcp_workaround_signed_windows - BOOLEAN + If set, assume no receipt of a window scaling option means the + remote TCP is broken and treats the window as a signed quantity. + If unset, assume the remote TCP is not broken even if we do + not receive a window scaling option from them. + + Default: 0 + +tcp_thin_linear_timeouts - BOOLEAN + Enable dynamic triggering of linear timeouts for thin streams. + If set, a check is performed upon retransmission by timeout to + determine if the stream is thin (less than 4 packets in flight). + As long as the stream is found to be thin, up to 6 linear + timeouts may be performed before exponential backoff mode is + initiated. This improves retransmission latency for + non-aggressive thin streams, often found to be time-dependent. + For more information on thin streams, see + Documentation/networking/tcp-thin.txt + + Default: 0 + +tcp_limit_output_bytes - INTEGER + Controls TCP Small Queue limit per tcp socket. + TCP bulk sender tends to increase packets in flight until it + gets losses notifications. With SNDBUF autotuning, this can + result in a large amount of packets queued on the local machine + (e.g.: qdiscs, CPU backlog, or device) hurting latency of other + flows, for typical pfifo_fast qdiscs. tcp_limit_output_bytes + limits the number of bytes on qdisc or device to reduce artificial + RTT/cwnd and reduce bufferbloat. + + Default: 1048576 (16 * 65536) + +tcp_challenge_ack_limit - INTEGER + Limits number of Challenge ACK sent per second, as recommended + in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks) + Default: 1000 + +tcp_rx_skb_cache - BOOLEAN + Controls a per TCP socket cache of one skb, that might help + performance of some workloads. This might be dangerous + on systems with a lot of TCP sockets, since it increases + memory usage. + + Default: 0 (disabled) + +UDP variables +============= + +udp_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + + Default: 0 (disabled) + +udp_mem - vector of 3 INTEGERs: min, pressure, max + Number of pages allowed for queueing by all UDP sockets. + + min: Below this number of pages UDP is not bothered about its + memory appetite. When amount of memory allocated by UDP exceeds + this number, UDP starts to moderate memory usage. + + pressure: This value was introduced to follow format of tcp_mem. + + max: Number of pages allowed for queueing by all UDP sockets. + + Default is calculated at boot time from amount of available memory. + +udp_rmem_min - INTEGER + Minimal size of receive buffer used by UDP sockets in moderation. + Each UDP socket is able to use the size for receiving data, even if + total pages of UDP sockets exceed udp_mem pressure. The unit is byte. + + Default: 4K + +udp_wmem_min - INTEGER + Minimal size of send buffer used by UDP sockets in moderation. + Each UDP socket is able to use the size for sending data, even if + total pages of UDP sockets exceed udp_mem pressure. The unit is byte. + + Default: 4K + +RAW variables +============= + +raw_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + + Default: 1 (enabled) + +CIPSOv4 Variables +================= + +cipso_cache_enable - BOOLEAN + If set, enable additions to and lookups from the CIPSO label mapping + cache. If unset, additions are ignored and lookups always result in a + miss. However, regardless of the setting the cache is still + invalidated when required when means you can safely toggle this on and + off and the cache will always be "safe". + + Default: 1 + +cipso_cache_bucket_size - INTEGER + The CIPSO label cache consists of a fixed size hash table with each + hash bucket containing a number of cache entries. This variable limits + the number of entries in each hash bucket; the larger the value the + more CIPSO label mappings that can be cached. When the number of + entries in a given hash bucket reaches this limit adding new entries + causes the oldest entry in the bucket to be removed to make room. + + Default: 10 + +cipso_rbm_optfmt - BOOLEAN + Enable the "Optimized Tag 1 Format" as defined in section 3.4.2.6 of + the CIPSO draft specification (see Documentation/netlabel for details). + This means that when set the CIPSO tag will be padded with empty + categories in order to make the packet data 32-bit aligned. + + Default: 0 + +cipso_rbm_structvalid - BOOLEAN + If set, do a very strict check of the CIPSO option when + ip_options_compile() is called. If unset, relax the checks done during + ip_options_compile(). Either way is "safe" as errors are caught else + where in the CIPSO processing code but setting this to 0 (False) should + result in less work (i.e. it should be faster) but could cause problems + with other implementations that require strict checking. + + Default: 0 + +IP Variables +============ + +ip_local_port_range - 2 INTEGERS + Defines the local port range that is used by TCP and UDP to + choose the local port. The first number is the first, the + second the last local port number. + If possible, it is better these numbers have different parity + (one even and one odd value). + Must be greater than or equal to ip_unprivileged_port_start. + The default values are 32768 and 60999 respectively. + +ip_local_reserved_ports - list of comma separated ranges + Specify the ports which are reserved for known third-party + applications. These ports will not be used by automatic port + assignments (e.g. when calling connect() or bind() with port + number 0). Explicit port allocation behavior is unchanged. + + The format used for both input and output is a comma separated + list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and + 10). Writing to the file will clear all previously reserved + ports and update the current list with the one given in the + input. + + Note that ip_local_port_range and ip_local_reserved_ports + settings are independent and both are considered by the kernel + when determining which ports are available for automatic port + assignments. + + You can reserve ports which are not in the current + ip_local_port_range, e.g.:: + + $ cat /proc/sys/net/ipv4/ip_local_port_range + 32000 60999 + $ cat /proc/sys/net/ipv4/ip_local_reserved_ports + 8080,9148 + + although this is redundant. However such a setting is useful + if later the port range is changed to a value that will + include the reserved ports. + + Default: Empty + +ip_unprivileged_port_start - INTEGER + This is a per-namespace sysctl. It defines the first + unprivileged port in the network namespace. Privileged ports + require root or CAP_NET_BIND_SERVICE in order to bind to them. + To disable all privileged ports, set this to 0. They must not + overlap with the ip_local_port_range. + + Default: 1024 + +ip_nonlocal_bind - BOOLEAN + If set, allows processes to bind() to non-local IP addresses, + which can be quite useful - but may break some applications. + + Default: 0 + +ip_autobind_reuse - BOOLEAN + By default, bind() does not select the ports automatically even if + the new socket and all sockets bound to the port have SO_REUSEADDR. + ip_autobind_reuse allows bind() to reuse the port and this is useful + when you use bind()+connect(), but may break some applications. + The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this + option should only be set by experts. + Default: 0 + +ip_dynaddr - BOOLEAN + If set non-zero, enables support for dynamic addresses. + If set to a non-zero value larger than 1, a kernel log + message will be printed when dynamic address rewriting + occurs. + + Default: 0 + +ip_early_demux - BOOLEAN + Optimize input packet processing down to one demux for + certain kinds of local sockets. Currently we only do this + for established TCP and connected UDP sockets. + + It may add an additional cost for pure routing workloads that + reduces overall throughput, in such case you should disable it. + + Default: 1 + +ping_group_range - 2 INTEGERS + Restrict ICMP_PROTO datagram sockets to users in the group range. + The default is "1 0", meaning, that nobody (not even root) may + create ping sockets. Setting it to "100 100" would grant permissions + to the single group. "0 4294967295" would enable it for the world, "100 + 4294967295" would enable it for the users, but not daemons. + +tcp_early_demux - BOOLEAN + Enable early demux for established TCP sockets. + + Default: 1 + +udp_early_demux - BOOLEAN + Enable early demux for connected UDP sockets. Disable this if + your system could experience more unconnected load. + + Default: 1 + +icmp_echo_ignore_all - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it. + + Default: 0 + +icmp_echo_ignore_broadcasts - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO and + TIMESTAMP requests sent to it via broadcast/multicast. + + Default: 1 + +icmp_ratelimit - INTEGER + Limit the maximal rates for sending ICMP packets whose type matches + icmp_ratemask (see below) to specific targets. + 0 to disable any limiting, + otherwise the minimal space between responses in milliseconds. + Note that another sysctl, icmp_msgs_per_sec limits the number + of ICMP packets sent on all targets. + + Default: 1000 + +icmp_msgs_per_sec - INTEGER + Limit maximal number of ICMP packets sent per second from this host. + Only messages whose type matches icmp_ratemask (see below) are + controlled by this limit. + + Default: 1000 + +icmp_msgs_burst - INTEGER + icmp_msgs_per_sec controls number of ICMP packets sent per second, + while icmp_msgs_burst controls the burst size of these packets. + + Default: 50 + +icmp_ratemask - INTEGER + Mask made of ICMP types for which rates are being limited. + + Significant bits: IHGFEDCBA9876543210 + + Default mask: 0000001100000011000 (6168) + + Bit definitions (see include/linux/icmp.h): + + = ========================= + 0 Echo Reply + 3 Destination Unreachable [1]_ + 4 Source Quench [1]_ + 5 Redirect + 8 Echo Request + B Time Exceeded [1]_ + C Parameter Problem [1]_ + D Timestamp Request + E Timestamp Reply + F Info Request + G Info Reply + H Address Mask Request + I Address Mask Reply + = ========================= + + .. [1] These are rate limited by default (see default mask above) + +icmp_ignore_bogus_error_responses - BOOLEAN + Some routers violate RFC1122 by sending bogus responses to broadcast + frames. Such violations are normally logged via a kernel warning. + If this is set to TRUE, the kernel will not give such warnings, which + will avoid log file clutter. + + Default: 1 + +icmp_errors_use_inbound_ifaddr - BOOLEAN + + If zero, icmp error messages are sent with the primary address of + the exiting interface. + + If non-zero, the message will be sent with the primary address of + the interface that received the packet that caused the icmp error. + This is the behaviour network many administrators will expect from + a router. And it can make debugging complicated network layouts + much easier. + + Note that if no primary address exists for the interface selected, + then the primary address of the first non-loopback interface that + has one will be used regardless of this setting. + + Default: 0 + +igmp_max_memberships - INTEGER + Change the maximum number of multicast groups we can subscribe to. + Default: 20 + + Theoretical maximum value is bounded by having to send a membership + report in a single datagram (i.e. the report can't span multiple + datagrams, or risk confusing the switch and leaving groups you don't + intend to). + + The number of supported groups 'M' is bounded by the number of group + report entries you can fit into a single datagram of 65535 bytes. + + M = 65536-sizeof (ip header)/(sizeof(Group record)) + + Group records are variable length, with a minimum of 12 bytes. + So net.ipv4.igmp_max_memberships should not be set higher than: + + (65536-24) / 12 = 5459 + + The value 5459 assumes no IP header options, so in practice + this number may be lower. + +igmp_max_msf - INTEGER + Maximum number of addresses allowed in the source filter list for a + multicast group. + + Default: 10 + +igmp_qrv - INTEGER + Controls the IGMP query robustness variable (see RFC2236 8.1). + + Default: 2 (as specified by RFC2236 8.1) + + Minimum: 1 (as specified by RFC6636 4.5) + +force_igmp_version - INTEGER + - 0 - (default) No enforcement of a IGMP version, IGMPv1/v2 fallback + allowed. Will back to IGMPv3 mode again if all IGMPv1/v2 Querier + Present timer expires. + - 1 - Enforce to use IGMP version 1. Will also reply IGMPv1 report if + receive IGMPv2/v3 query. + - 2 - Enforce to use IGMP version 2. Will fallback to IGMPv1 if receive + IGMPv1 query message. Will reply report if receive IGMPv3 query. + - 3 - Enforce to use IGMP version 3. The same react with default 0. + + .. note:: + + this is not the same with force_mld_version because IGMPv3 RFC3376 + Security Considerations does not have clear description that we could + ignore other version messages completely as MLDv2 RFC3810. So make + this value as default 0 is recommended. + +``conf/interface/*`` + changes special settings per interface (where + interface" is the name of your network interface) + +``conf/all/*`` + is special, changes the settings for all interfaces + +log_martians - BOOLEAN + Log packets with impossible addresses to kernel log. + log_martians for the interface will be enabled if at least one of + conf/{all,interface}/log_martians is set to TRUE, + it will be disabled otherwise + +accept_redirects - BOOLEAN + Accept ICMP redirect messages. + accept_redirects for the interface will be enabled if: + + - both conf/{all,interface}/accept_redirects are TRUE in the case + forwarding for the interface is enabled + + or + + - at least one of conf/{all,interface}/accept_redirects is TRUE in the + case forwarding for the interface is disabled + + accept_redirects for the interface will be disabled otherwise + + default: + + - TRUE (host) + - FALSE (router) + +forwarding - BOOLEAN + Enable IP forwarding on this interface. This controls whether packets + received _on_ this interface can be forwarded. + +mc_forwarding - BOOLEAN + Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE + and a multicast routing daemon is required. + conf/all/mc_forwarding must also be set to TRUE to enable multicast + routing for the interface + +medium_id - INTEGER + Integer value used to differentiate the devices by the medium they + are attached to. Two devices can have different id values when + the broadcast packets are received only on one of them. + The default value 0 means that the device is the only interface + to its medium, value of -1 means that medium is not known. + + Currently, it is used to change the proxy_arp behavior: + the proxy_arp feature is enabled for packets forwarded between + two devices attached to different media. + +proxy_arp - BOOLEAN + Do proxy arp. + + proxy_arp for the interface will be enabled if at least one of + conf/{all,interface}/proxy_arp is set to TRUE, + it will be disabled otherwise + +proxy_arp_pvlan - BOOLEAN + Private VLAN proxy arp. + + Basically allow proxy arp replies back to the same interface + (from which the ARP request/solicitation was received). + + This is done to support (ethernet) switch features, like RFC + 3069, where the individual ports are NOT allowed to + communicate with each other, but they are allowed to talk to + the upstream router. As described in RFC 3069, it is possible + to allow these hosts to communicate through the upstream + router by proxy_arp'ing. Don't need to be used together with + proxy_arp. + + This technology is known by different names: + + In RFC 3069 it is called VLAN Aggregation. + Cisco and Allied Telesyn call it Private VLAN. + Hewlett-Packard call it Source-Port filtering or port-isolation. + Ericsson call it MAC-Forced Forwarding (RFC Draft). + +shared_media - BOOLEAN + Send(router) or accept(host) RFC1620 shared media redirects. + Overrides secure_redirects. + + shared_media for the interface will be enabled if at least one of + conf/{all,interface}/shared_media is set to TRUE, + it will be disabled otherwise + + default TRUE + +secure_redirects - BOOLEAN + Accept ICMP redirect messages only to gateways listed in the + interface's current gateway list. Even if disabled, RFC1122 redirect + rules still apply. + + Overridden by shared_media. + + secure_redirects for the interface will be enabled if at least one of + conf/{all,interface}/secure_redirects is set to TRUE, + it will be disabled otherwise + + default TRUE + +send_redirects - BOOLEAN + Send redirects, if router. + + send_redirects for the interface will be enabled if at least one of + conf/{all,interface}/send_redirects is set to TRUE, + it will be disabled otherwise + + Default: TRUE + +bootp_relay - BOOLEAN + Accept packets with source address 0.b.c.d destined + not to this host as local ones. It is supposed, that + BOOTP relay daemon will catch and forward such packets. + conf/all/bootp_relay must also be set to TRUE to enable BOOTP relay + for the interface + + default FALSE + + Not Implemented Yet. + +accept_source_route - BOOLEAN + Accept packets with SRR option. + conf/all/accept_source_route must also be set to TRUE to accept packets + with SRR option on the interface + + default + + - TRUE (router) + - FALSE (host) + +accept_local - BOOLEAN + Accept packets with local source addresses. In combination with + suitable routing, this can be used to direct packets between two + local interfaces over the wire and have them accepted properly. + default FALSE + +route_localnet - BOOLEAN + Do not consider loopback addresses as martian source or destination + while routing. This enables the use of 127/8 for local routing purposes. + + default FALSE + +rp_filter - INTEGER + - 0 - No source validation. + - 1 - Strict mode as defined in RFC3704 Strict Reverse Path + Each incoming packet is tested against the FIB and if the interface + is not the best reverse path the packet check will fail. + By default failed packets are discarded. + - 2 - Loose mode as defined in RFC3704 Loose Reverse Path + Each incoming packet's source address is also tested against the FIB + and if the source address is not reachable via any interface + the packet check will fail. + + Current recommended practice in RFC3704 is to enable strict mode + to prevent IP spoofing from DDos attacks. If using asymmetric routing + or other complicated routing, then loose mode is recommended. + + The max value from conf/{all,interface}/rp_filter is used + when doing source validation on the {interface}. + + Default value is 0. Note that some distributions enable it + in startup scripts. + +arp_filter - BOOLEAN + - 1 - Allows you to have multiple network interfaces on the same + subnet, and have the ARPs for each interface be answered + based on whether or not the kernel would route a packet from + the ARP'd IP out that interface (therefore you must use source + based routing for this to work). In other words it allows control + of which cards (usually 1) will respond to an arp request. + + - 0 - (default) The kernel can respond to arp requests with addresses + from other interfaces. This may seem wrong but it usually makes + sense, because it increases the chance of successful communication. + IP addresses are owned by the complete host on Linux, not by + particular interfaces. Only for more complex setups like load- + balancing, does this behaviour cause problems. + + arp_filter for the interface will be enabled if at least one of + conf/{all,interface}/arp_filter is set to TRUE, + it will be disabled otherwise + +arp_announce - INTEGER + Define different restriction levels for announcing the local + source IP address from IP packets in ARP requests sent on + interface: + + - 0 - (default) Use any local address, configured on any interface + - 1 - Try to avoid local addresses that are not in the target's + subnet for this interface. This mode is useful when target + hosts reachable via this interface require the source IP + address in ARP requests to be part of their logical network + configured on the receiving interface. When we generate the + request we will check all our subnets that include the + target IP and will preserve the source address if it is from + such subnet. If there is no such subnet we select source + address according to the rules for level 2. + - 2 - Always use the best local address for this target. + In this mode we ignore the source address in the IP packet + and try to select local address that we prefer for talks with + the target host. Such local address is selected by looking + for primary IP addresses on all our subnets on the outgoing + interface that include the target IP address. If no suitable + local address is found we select the first local address + we have on the outgoing interface or on all other interfaces, + with the hope we will receive reply for our request and + even sometimes no matter the source IP address we announce. + + The max value from conf/{all,interface}/arp_announce is used. + + Increasing the restriction level gives more chance for + receiving answer from the resolved target while decreasing + the level announces more valid sender's information. + +arp_ignore - INTEGER + Define different modes for sending replies in response to + received ARP requests that resolve local target IP addresses: + + - 0 - (default): reply for any local target IP address, configured + on any interface + - 1 - reply only if the target IP address is local address + configured on the incoming interface + - 2 - reply only if the target IP address is local address + configured on the incoming interface and both with the + sender's IP address are part from same subnet on this interface + - 3 - do not reply for local addresses configured with scope host, + only resolutions for global and link addresses are replied + - 4-7 - reserved + - 8 - do not reply for all local addresses + + The max value from conf/{all,interface}/arp_ignore is used + when ARP request is received on the {interface} + +arp_notify - BOOLEAN + Define mode for notification of address and device changes. + + == ========================================================== + 0 (default): do nothing + 1 Generate gratuitous arp requests when device is brought up + or hardware address changes. + == ========================================================== + +arp_accept - BOOLEAN + Define behavior for gratuitous ARP frames who's IP is not + already present in the ARP table: + + - 0 - don't create new entries in the ARP table + - 1 - create new entries in the ARP table + + Both replies and requests type gratuitous arp will trigger the + ARP table to be updated, if this setting is on. + + If the ARP table already contains the IP address of the + gratuitous arp frame, the arp table will be updated regardless + if this setting is on or off. + +mcast_solicit - INTEGER + The maximum number of multicast probes in INCOMPLETE state, + when the associated hardware address is unknown. Defaults + to 3. + +ucast_solicit - INTEGER + The maximum number of unicast probes in PROBE state, when + the hardware address is being reconfirmed. Defaults to 3. + +app_solicit - INTEGER + The maximum number of probes to send to the user space ARP daemon + via netlink before dropping back to multicast probes (see + mcast_resolicit). Defaults to 0. + +mcast_resolicit - INTEGER + The maximum number of multicast probes after unicast and + app probes in PROBE state. Defaults to 0. + +disable_policy - BOOLEAN + Disable IPSEC policy (SPD) for this interface + +disable_xfrm - BOOLEAN + Disable IPSEC encryption on this interface, whatever the policy + +igmpv2_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + IGMPv1 or IGMPv2 report retransmit will take place. + + Default: 10000 (10 seconds) + +igmpv3_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + IGMPv3 report retransmit will take place. + + Default: 1000 (1 seconds) + +promote_secondaries - BOOLEAN + When a primary IP address is removed from this interface + promote a corresponding secondary IP address instead of + removing all the corresponding secondary IP addresses. + +drop_unicast_in_l2_multicast - BOOLEAN + Drop any unicast IP packets that are received in link-layer + multicast (or broadcast) frames. + + This behavior (for multicast) is actually a SHOULD in RFC + 1122, but is disabled by default for compatibility reasons. + + Default: off (0) + +drop_gratuitous_arp - BOOLEAN + Drop all gratuitous ARP frames, for example if there's a known + good ARP proxy on the network and such frames need not be used + (or in the case of 802.11, must not be used to prevent attacks.) + + Default: off (0) + + +tag - INTEGER + Allows you to write a number, which can be used as required. + + Default value is 0. + +xfrm4_gc_thresh - INTEGER + (Obsolete since linux-4.14) + The threshold at which we will start garbage collecting for IPv4 + destination cache entries. At twice this value the system will + refuse new allocations. + +igmp_link_local_mcast_reports - BOOLEAN + Enable IGMP reports for link local multicast groups in the + 224.0.0.X range. + + Default TRUE + +Alexey Kuznetsov. +kuznet@ms2.inr.ac.ru + +Updated by: + +- Andi Kleen + ak@muc.de +- Nicolas Delon + delon.nicolas@wanadoo.fr + + + + +/proc/sys/net/ipv6/* Variables +============================== + +IPv6 has no global variables such as tcp_*. tcp_* settings under ipv4/ also +apply to IPv6 [XXX?]. + +bindv6only - BOOLEAN + Default value for IPV6_V6ONLY socket option, + which restricts use of the IPv6 socket to IPv6 communication + only. + + - TRUE: disable IPv4-mapped address feature + - FALSE: enable IPv4-mapped address feature + + Default: FALSE (as specified in RFC3493) + +flowlabel_consistency - BOOLEAN + Protect the consistency (and unicity) of flow label. + You have to disable it to use IPV6_FL_F_REFLECT flag on the + flow label manager. + + - TRUE: enabled + - FALSE: disabled + + Default: TRUE + +auto_flowlabels - INTEGER + Automatically generate flow labels based on a flow hash of the + packet. This allows intermediate devices, such as routers, to + identify packet flows for mechanisms like Equal Cost Multipath + Routing (see RFC 6438). + + = =========================================================== + 0 automatic flow labels are completely disabled + 1 automatic flow labels are enabled by default, they can be + disabled on a per socket basis using the IPV6_AUTOFLOWLABEL + socket option + 2 automatic flow labels are allowed, they may be enabled on a + per socket basis using the IPV6_AUTOFLOWLABEL socket option + 3 automatic flow labels are enabled and enforced, they cannot + be disabled by the socket option + = =========================================================== + + Default: 1 + +flowlabel_state_ranges - BOOLEAN + Split the flow label number space into two ranges. 0-0x7FFFF is + reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF + is reserved for stateless flow labels as described in RFC6437. + + - TRUE: enabled + - FALSE: disabled + + Default: true + +flowlabel_reflect - INTEGER + Control flow label reflection. Needed for Path MTU + Discovery to work with Equal Cost Multipath Routing in anycast + environments. See RFC 7690 and: + https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 + + This is a bitmask. + + - 1: enabled for established flows + + Note that this prevents automatic flowlabel changes, as done + in "tcp: change IPv6 flow-label upon receiving spurious retransmission" + and "tcp: Change txhash on every SYN and RTO retransmit" + + - 2: enabled for TCP RESET packets (no active listener) + If set, a RST packet sent in response to a SYN packet on a closed + port will reflect the incoming flow label. + + - 4: enabled for ICMPv6 echo reply messages. + + Default: 0 + +fib_multipath_hash_policy - INTEGER + Controls which hash policy to use for multipath routes. + + Default: 0 (Layer 3) + + Possible values: + + - 0 - Layer 3 (source and destination addresses plus flow label) + - 1 - Layer 4 (standard 5-tuple) + - 2 - Layer 3 or inner Layer 3 if present + +anycast_src_echo_reply - BOOLEAN + Controls the use of anycast addresses as source addresses for ICMPv6 + echo reply + + - TRUE: enabled + - FALSE: disabled + + Default: FALSE + +idgen_delay - INTEGER + Controls the delay in seconds after which time to retry + privacy stable address generation if a DAD conflict is + detected. + + Default: 1 (as specified in RFC7217) + +idgen_retries - INTEGER + Controls the number of retries to generate a stable privacy + address if a DAD conflict is detected. + + Default: 3 (as specified in RFC7217) + +mld_qrv - INTEGER + Controls the MLD query robustness variable (see RFC3810 9.1). + + Default: 2 (as specified by RFC3810 9.1) + + Minimum: 1 (as specified by RFC6636 4.5) + +max_dst_opts_number - INTEGER + Maximum number of non-padding TLVs allowed in a Destination + options extension header. If this value is less than zero + then unknown options are disallowed and the number of known + TLVs allowed is the absolute value of this number. + + Default: 8 + +max_hbh_opts_number - INTEGER + Maximum number of non-padding TLVs allowed in a Hop-by-Hop + options extension header. If this value is less than zero + then unknown options are disallowed and the number of known + TLVs allowed is the absolute value of this number. + + Default: 8 + +max_dst_opts_length - INTEGER + Maximum length allowed for a Destination options extension + header. + + Default: INT_MAX (unlimited) + +max_hbh_length - INTEGER + Maximum length allowed for a Hop-by-Hop options extension + header. + + Default: INT_MAX (unlimited) + +skip_notify_on_dev_down - BOOLEAN + Controls whether an RTM_DELROUTE message is generated for routes + removed when a device is taken down or deleted. IPv4 does not + generate this message; IPv6 does by default. Setting this sysctl + to true skips the message, making IPv4 and IPv6 on par in relying + on userspace caches to track link events and evict routes. + + Default: false (generate message) + +nexthop_compat_mode - BOOLEAN + New nexthop API provides a means for managing nexthops independent of + prefixes. Backwards compatibilty with old route format is enabled by + default which means route dumps and notifications contain the new + nexthop attribute but also the full, expanded nexthop definition. + Further, updates or deletes of a nexthop configuration generate route + notifications for each fib entry using the nexthop. Once a system + understands the new API, this sysctl can be disabled to achieve full + performance benefits of the new API by disabling the nexthop expansion + and extraneous notifications. + Default: true (backward compat mode) + +IPv6 Fragmentation: + +ip6frag_high_thresh - INTEGER + Maximum memory used to reassemble IPv6 fragments. When + ip6frag_high_thresh bytes of memory is allocated for this purpose, + the fragment handler will toss packets until ip6frag_low_thresh + is reached. + +ip6frag_low_thresh - INTEGER + See ip6frag_high_thresh + +ip6frag_time - INTEGER + Time in seconds to keep an IPv6 fragment in memory. + +IPv6 Segment Routing: + +seg6_flowlabel - INTEGER + Controls the behaviour of computing the flowlabel of outer + IPv6 header in case of SR T.encaps + + == ======================================================= + -1 set flowlabel to zero. + 0 copy flowlabel from Inner packet in case of Inner IPv6 + (Set flowlabel to 0 in case IPv4/L2) + 1 Compute the flowlabel using seg6_make_flowlabel() + == ======================================================= + + Default is 0. + +``conf/default/*``: + Change the interface-specific default settings. + + +``conf/all/*``: + Change all the interface-specific settings. + + [XXX: Other special features than forwarding?] + +conf/all/forwarding - BOOLEAN + Enable global IPv6 forwarding between all interfaces. + + IPv4 and IPv6 work differently here; e.g. netfilter must be used + to control which interfaces may forward packets and which not. + + This also sets all interfaces' Host/Router setting + 'forwarding' to the specified value. See below for details. + + This referred to as global forwarding. + +proxy_ndp - BOOLEAN + Do proxy ndp. + +fwmark_reflect - BOOLEAN + Controls the fwmark of kernel-generated IPv6 reply packets that are not + associated with a socket for example, TCP RSTs or ICMPv6 echo replies). + If unset, these packets have a fwmark of zero. If set, they have the + fwmark of the packet they are replying to. + + Default: 0 + +``conf/interface/*``: + Change special settings per interface. + + The functional behaviour for certain settings is different + depending on whether local forwarding is enabled or not. + +accept_ra - INTEGER + Accept Router Advertisements; autoconfigure using them. + + It also determines whether or not to transmit Router + Solicitations. If and only if the functional setting is to + accept Router Advertisements, Router Solicitations will be + transmitted. + + Possible values are: + + == =========================================================== + 0 Do not accept Router Advertisements. + 1 Accept Router Advertisements if forwarding is disabled. + 2 Overrule forwarding behaviour. Accept Router Advertisements + even if forwarding is enabled. + == =========================================================== + + Functional default: + + - enabled if local forwarding is disabled. + - disabled if local forwarding is enabled. + +accept_ra_defrtr - BOOLEAN + Learn default router in Router Advertisement. + + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. + +accept_ra_from_local - BOOLEAN + Accept RA with source-address that is found on local machine + if the RA is otherwise proper and able to be accepted. + + Default is to NOT accept these as it may be an un-intended + network loop. + + Functional default: + + - enabled if accept_ra_from_local is enabled + on a specific interface. + - disabled if accept_ra_from_local is disabled + on a specific interface. + +accept_ra_min_hop_limit - INTEGER + Minimum hop limit Information in Router Advertisement. + + Hop limit Information in Router Advertisement less than this + variable shall be ignored. + + Default: 1 + +accept_ra_pinfo - BOOLEAN + Learn Prefix Information in Router Advertisement. + + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. + +accept_ra_rt_info_min_plen - INTEGER + Minimum prefix length of Route Information in RA. + + Route Information w/ prefix smaller than this variable shall + be ignored. + + Functional default: + + * 0 if accept_ra_rtr_pref is enabled. + * -1 if accept_ra_rtr_pref is disabled. + +accept_ra_rt_info_max_plen - INTEGER + Maximum prefix length of Route Information in RA. + + Route Information w/ prefix larger than this variable shall + be ignored. + + Functional default: + + * 0 if accept_ra_rtr_pref is enabled. + * -1 if accept_ra_rtr_pref is disabled. + +accept_ra_rtr_pref - BOOLEAN + Accept Router Preference in RA. + + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. + +accept_ra_mtu - BOOLEAN + Apply the MTU value specified in RA option 5 (RFC4861). If + disabled, the MTU specified in the RA will be ignored. + + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. + +accept_redirects - BOOLEAN + Accept Redirects. + + Functional default: + + - enabled if local forwarding is disabled. + - disabled if local forwarding is enabled. + +accept_source_route - INTEGER + Accept source routing (routing extension header). + + - >= 0: Accept only routing header type 2. + - < 0: Do not accept routing header. + + Default: 0 + +autoconf - BOOLEAN + Autoconfigure addresses using Prefix Information in Router + Advertisements. + + Functional default: + + - enabled if accept_ra_pinfo is enabled. + - disabled if accept_ra_pinfo is disabled. + +dad_transmits - INTEGER + The amount of Duplicate Address Detection probes to send. + + Default: 1 + +forwarding - INTEGER + Configure interface-specific Host/Router behaviour. + + .. note:: + + It is recommended to have the same setting on all + interfaces; mixed router/host scenarios are rather uncommon. + + Possible values are: + + - 0 Forwarding disabled + - 1 Forwarding enabled + + **FALSE (0)**: + + By default, Host behaviour is assumed. This means: + + 1. IsRouter flag is not set in Neighbour Advertisements. + 2. If accept_ra is TRUE (default), transmit Router + Solicitations. + 3. If accept_ra is TRUE (default), accept Router + Advertisements (and do autoconfiguration). + 4. If accept_redirects is TRUE (default), accept Redirects. + + **TRUE (1)**: + + If local forwarding is enabled, Router behaviour is assumed. + This means exactly the reverse from the above: + + 1. IsRouter flag is set in Neighbour Advertisements. + 2. Router Solicitations are not sent unless accept_ra is 2. + 3. Router Advertisements are ignored unless accept_ra is 2. + 4. Redirects are ignored. + + Default: 0 (disabled) if global forwarding is disabled (default), + otherwise 1 (enabled). + +hop_limit - INTEGER + Default Hop Limit to set. + + Default: 64 + +mtu - INTEGER + Default Maximum Transfer Unit + + Default: 1280 (IPv6 required minimum) + +ip_nonlocal_bind - BOOLEAN + If set, allows processes to bind() to non-local IPv6 addresses, + which can be quite useful - but may break some applications. + + Default: 0 + +router_probe_interval - INTEGER + Minimum interval (in seconds) between Router Probing described + in RFC4191. + + Default: 60 + +router_solicitation_delay - INTEGER + Number of seconds to wait after interface is brought up + before sending Router Solicitations. + + Default: 1 + +router_solicitation_interval - INTEGER + Number of seconds to wait between Router Solicitations. + + Default: 4 + +router_solicitations - INTEGER + Number of Router Solicitations to send until assuming no + routers are present. + + Default: 3 + +use_oif_addrs_only - BOOLEAN + When enabled, the candidate source addresses for destinations + routed via this interface are restricted to the set of addresses + configured on this interface (vis. RFC 6724, section 4). + + Default: false + +use_tempaddr - INTEGER + Preference for Privacy Extensions (RFC3041). + + * <= 0 : disable Privacy Extensions + * == 1 : enable Privacy Extensions, but prefer public + addresses over temporary addresses. + * > 1 : enable Privacy Extensions and prefer temporary + addresses over public addresses. + + Default: + + * 0 (for most devices) + * -1 (for point-to-point devices and loopback devices) + +temp_valid_lft - INTEGER + valid lifetime (in seconds) for temporary addresses. + + Default: 604800 (7 days) + +temp_prefered_lft - INTEGER + Preferred lifetime (in seconds) for temporary addresses. + + Default: 86400 (1 day) + +keep_addr_on_down - INTEGER + Keep all IPv6 addresses on an interface down event. If set static + global addresses with no expiration time are not flushed. + + * >0 : enabled + * 0 : system default + * <0 : disabled + + Default: 0 (addresses are removed) + +max_desync_factor - INTEGER + Maximum value for DESYNC_FACTOR, which is a random value + that ensures that clients don't synchronize with each + other and generate new addresses at exactly the same time. + value is in seconds. + + Default: 600 + +regen_max_retry - INTEGER + Number of attempts before give up attempting to generate + valid temporary addresses. + + Default: 5 + +max_addresses - INTEGER + Maximum number of autoconfigured addresses per interface. Setting + to zero disables the limitation. It is not recommended to set this + value too large (or to zero) because it would be an easy way to + crash the kernel by allowing too many addresses to be created. + + Default: 16 + +disable_ipv6 - BOOLEAN + Disable IPv6 operation. If accept_dad is set to 2, this value + will be dynamically set to TRUE if DAD fails for the link-local + address. + + Default: FALSE (enable IPv6 operation) + + When this value is changed from 1 to 0 (IPv6 is being enabled), + it will dynamically create a link-local address on the given + interface and start Duplicate Address Detection, if necessary. + + When this value is changed from 0 to 1 (IPv6 is being disabled), + it will dynamically delete all addresses and routes on the given + interface. From now on it will not possible to add addresses/routes + to the selected interface. + +accept_dad - INTEGER + Whether to accept DAD (Duplicate Address Detection). + + == ============================================================== + 0 Disable DAD + 1 Enable DAD (default) + 2 Enable DAD, and disable IPv6 operation if MAC-based duplicate + link-local address has been found. + == ============================================================== + + DAD operation and mode on a given interface will be selected according + to the maximum value of conf/{all,interface}/accept_dad. + +force_tllao - BOOLEAN + Enable sending the target link-layer address option even when + responding to a unicast neighbor solicitation. + + Default: FALSE + + Quoting from RFC 2461, section 4.4, Target link-layer address: + + "The option MUST be included for multicast solicitations in order to + avoid infinite Neighbor Solicitation "recursion" when the peer node + does not have a cache entry to return a Neighbor Advertisements + message. When responding to unicast solicitations, the option can be + omitted since the sender of the solicitation has the correct link- + layer address; otherwise it would not have be able to send the unicast + solicitation in the first place. However, including the link-layer + address in this case adds little overhead and eliminates a potential + race condition where the sender deletes the cached link-layer address + prior to receiving a response to a previous solicitation." + +ndisc_notify - BOOLEAN + Define mode for notification of address and device changes. + + * 0 - (default): do nothing + * 1 - Generate unsolicited neighbour advertisements when device is brought + up or hardware address changes. + +ndisc_tclass - INTEGER + The IPv6 Traffic Class to use by default when sending IPv6 Neighbor + Discovery (Router Solicitation, Router Advertisement, Neighbor + Solicitation, Neighbor Advertisement, Redirect) messages. + These 8 bits can be interpreted as 6 high order bits holding the DSCP + value and 2 low order bits representing ECN (which you probably want + to leave cleared). + + * 0 - (default) + +mldv1_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + MLDv1 report retransmit will take place. + + Default: 10000 (10 seconds) + +mldv2_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + MLDv2 report retransmit will take place. + + Default: 1000 (1 second) + +force_mld_version - INTEGER + * 0 - (default) No enforcement of a MLD version, MLDv1 fallback allowed + * 1 - Enforce to use MLD version 1 + * 2 - Enforce to use MLD version 2 + +suppress_frag_ndisc - INTEGER + Control RFC 6980 (Security Implications of IPv6 Fragmentation + with IPv6 Neighbor Discovery) behavior: + + * 1 - (default) discard fragmented neighbor discovery packets + * 0 - allow fragmented neighbor discovery packets + +optimistic_dad - BOOLEAN + Whether to perform Optimistic Duplicate Address Detection (RFC 4429). + + * 0: disabled (default) + * 1: enabled + + Optimistic Duplicate Address Detection for the interface will be enabled + if at least one of conf/{all,interface}/optimistic_dad is set to 1, + it will be disabled otherwise. + +use_optimistic - BOOLEAN + If enabled, do not classify optimistic addresses as deprecated during + source address selection. Preferred addresses will still be chosen + before optimistic addresses, subject to other ranking in the source + address selection algorithm. + + * 0: disabled (default) + * 1: enabled + + This will be enabled if at least one of + conf/{all,interface}/use_optimistic is set to 1, disabled otherwise. + +stable_secret - IPv6 address + This IPv6 address will be used as a secret to generate IPv6 + addresses for link-local addresses and autoconfigured + ones. All addresses generated after setting this secret will + be stable privacy ones by default. This can be changed via the + addrgenmode ip-link. conf/default/stable_secret is used as the + secret for the namespace, the interface specific ones can + overwrite that. Writes to conf/all/stable_secret are refused. + + It is recommended to generate this secret during installation + of a system and keep it stable after that. + + By default the stable secret is unset. + +addr_gen_mode - INTEGER + Defines how link-local and autoconf addresses are generated. + + = ================================================================= + 0 generate address based on EUI64 (default) + 1 do no generate a link-local address, use EUI64 for addresses + generated from autoconf + 2 generate stable privacy addresses, using the secret from + stable_secret (RFC7217) + 3 generate stable privacy addresses, using a random secret if unset + = ================================================================= + +drop_unicast_in_l2_multicast - BOOLEAN + Drop any unicast IPv6 packets that are received in link-layer + multicast (or broadcast) frames. + + By default this is turned off. + +drop_unsolicited_na - BOOLEAN + Drop all unsolicited neighbor advertisements, for example if there's + a known good NA proxy on the network and such frames need not be used + (or in the case of 802.11, must not be used to prevent attacks.) + + By default this is turned off. + +enhanced_dad - BOOLEAN + Include a nonce option in the IPv6 neighbor solicitation messages used for + duplicate address detection per RFC7527. A received DAD NS will only signal + a duplicate address if the nonce is different. This avoids any false + detection of duplicates due to loopback of the NS messages that we send. + The nonce option will be sent on an interface unless both of + conf/{all,interface}/enhanced_dad are set to FALSE. + + Default: TRUE + +``icmp/*``: +=========== + +ratelimit - INTEGER + Limit the maximal rates for sending ICMPv6 messages. + + 0 to disable any limiting, + otherwise the minimal space between responses in milliseconds. + + Default: 1000 + +ratemask - list of comma separated ranges + For ICMPv6 message types matching the ranges in the ratemask, limit + the sending of the message according to ratelimit parameter. + + The format used for both input and output is a comma separated + list of ranges (e.g. "0-127,129" for ICMPv6 message type 0 to 127 and + 129). Writing to the file will clear all previous ranges of ICMPv6 + message types and update the current list with the input. + + Refer to: https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml + for numerical values of ICMPv6 message types, e.g. echo request is 128 + and echo reply is 129. + + Default: 0-1,3-127 (rate limit ICMPv6 errors except Packet Too Big) + +echo_ignore_all - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol. + + Default: 0 + +echo_ignore_multicast - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol via multicast. + + Default: 0 + +echo_ignore_anycast - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol destined to anycast address. + + Default: 0 + +xfrm6_gc_thresh - INTEGER + (Obsolete since linux-4.14) + The threshold at which we will start garbage collecting for IPv6 + destination cache entries. At twice this value the system will + refuse new allocations. + + +IPv6 Update by: +Pekka Savola +YOSHIFUJI Hideaki / USAGI Project + + +/proc/sys/net/bridge/* Variables: +================================= + +bridge-nf-call-arptables - BOOLEAN + - 1 : pass bridged ARP traffic to arptables' FORWARD chain. + - 0 : disable this. + + Default: 1 + +bridge-nf-call-iptables - BOOLEAN + - 1 : pass bridged IPv4 traffic to iptables' chains. + - 0 : disable this. + + Default: 1 + +bridge-nf-call-ip6tables - BOOLEAN + - 1 : pass bridged IPv6 traffic to ip6tables' chains. + - 0 : disable this. + + Default: 1 + +bridge-nf-filter-vlan-tagged - BOOLEAN + - 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables. + - 0 : disable this. + + Default: 0 + +bridge-nf-filter-pppoe-tagged - BOOLEAN + - 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables. + - 0 : disable this. + + Default: 0 + +bridge-nf-pass-vlan-input-dev - BOOLEAN + - 1: if bridge-nf-filter-vlan-tagged is enabled, try to find a vlan + interface on the bridge and set the netfilter input device to the + vlan. This allows use of e.g. "iptables -i br0.1" and makes the + REDIRECT target work with vlan-on-top-of-bridge interfaces. When no + matching vlan interface is found, or this switch is off, the input + device is set to the bridge interface. + + - 0: disable bridge netfilter vlan interface lookup. + + Default: 0 + +``proc/sys/net/sctp/*`` Variables: +================================== + +addip_enable - BOOLEAN + Enable or disable extension of Dynamic Address Reconfiguration + (ADD-IP) functionality specified in RFC5061. This extension provides + the ability to dynamically add and remove new addresses for the SCTP + associations. + + 1: Enable extension. + + 0: Disable extension. + + Default: 0 + +pf_enable - INTEGER + Enable or disable pf (pf is short for potentially failed) state. A value + of pf_retrans > path_max_retrans also disables pf state. That is, one of + both pf_enable and pf_retrans > path_max_retrans can disable pf state. + Since pf_retrans and path_max_retrans can be changed by userspace + application, sometimes user expects to disable pf state by the value of + pf_retrans > path_max_retrans, but occasionally the value of pf_retrans + or path_max_retrans is changed by the user application, this pf state is + enabled. As such, it is necessary to add this to dynamically enable + and disable pf state. See: + https://datatracker.ietf.org/doc/draft-ietf-tsvwg-sctp-failover for + details. + + 1: Enable pf. + + 0: Disable pf. + + Default: 1 + +pf_expose - INTEGER + Unset or enable/disable pf (pf is short for potentially failed) state + exposure. Applications can control the exposure of the PF path state + in the SCTP_PEER_ADDR_CHANGE event and the SCTP_GET_PEER_ADDR_INFO + sockopt. When it's unset, no SCTP_PEER_ADDR_CHANGE event with + SCTP_ADDR_PF state will be sent and a SCTP_PF-state transport info + can be got via SCTP_GET_PEER_ADDR_INFO sockopt; When it's enabled, + a SCTP_PEER_ADDR_CHANGE event will be sent for a transport becoming + SCTP_PF state and a SCTP_PF-state transport info can be got via + SCTP_GET_PEER_ADDR_INFO sockopt; When it's diabled, no + SCTP_PEER_ADDR_CHANGE event will be sent and it returns -EACCES when + trying to get a SCTP_PF-state transport info via SCTP_GET_PEER_ADDR_INFO + sockopt. + + 0: Unset pf state exposure, Compatible with old applications. + + 1: Disable pf state exposure. + + 2: Enable pf state exposure. + + Default: 0 + +addip_noauth_enable - BOOLEAN + Dynamic Address Reconfiguration (ADD-IP) requires the use of + authentication to protect the operations of adding or removing new + addresses. This requirement is mandated so that unauthorized hosts + would not be able to hijack associations. However, older + implementations may not have implemented this requirement while + allowing the ADD-IP extension. For reasons of interoperability, + we provide this variable to control the enforcement of the + authentication requirement. + + == =============================================================== + 1 Allow ADD-IP extension to be used without authentication. This + should only be set in a closed environment for interoperability + with older implementations. + + 0 Enforce the authentication requirement + == =============================================================== + + Default: 0 + +auth_enable - BOOLEAN + Enable or disable Authenticated Chunks extension. This extension + provides the ability to send and receive authenticated chunks and is + required for secure operation of Dynamic Address Reconfiguration + (ADD-IP) extension. + + - 1: Enable this extension. + - 0: Disable this extension. + + Default: 0 + +prsctp_enable - BOOLEAN + Enable or disable the Partial Reliability extension (RFC3758) which + is used to notify peers that a given DATA should no longer be expected. + + - 1: Enable extension + - 0: Disable + + Default: 1 + +max_burst - INTEGER + The limit of the number of new packets that can be initially sent. It + controls how bursty the generated traffic can be. + + Default: 4 + +association_max_retrans - INTEGER + Set the maximum number for retransmissions that an association can + attempt deciding that the remote end is unreachable. If this value + is exceeded, the association is terminated. + + Default: 10 + +max_init_retransmits - INTEGER + The maximum number of retransmissions of INIT and COOKIE-ECHO chunks + that an association will attempt before declaring the destination + unreachable and terminating. + + Default: 8 + +path_max_retrans - INTEGER + The maximum number of retransmissions that will be attempted on a given + path. Once this threshold is exceeded, the path is considered + unreachable, and new traffic will use a different path when the + association is multihomed. + + Default: 5 + +pf_retrans - INTEGER + The number of retransmissions that will be attempted on a given path + before traffic is redirected to an alternate transport (should one + exist). Note this is distinct from path_max_retrans, as a path that + passes the pf_retrans threshold can still be used. Its only + deprioritized when a transmission path is selected by the stack. This + setting is primarily used to enable fast failover mechanisms without + having to reduce path_max_retrans to a very low value. See: + http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt + for details. Note also that a value of pf_retrans > path_max_retrans + disables this feature. Since both pf_retrans and path_max_retrans can + be changed by userspace application, a variable pf_enable is used to + disable pf state. + + Default: 0 + +ps_retrans - INTEGER + Primary.Switchover.Max.Retrans (PSMR), it's a tunable parameter coming + from section-5 "Primary Path Switchover" in rfc7829. The primary path + will be changed to another active path when the path error counter on + the old primary path exceeds PSMR, so that "the SCTP sender is allowed + to continue data transmission on a new working path even when the old + primary destination address becomes active again". Note this feature + is disabled by initializing 'ps_retrans' per netns as 0xffff by default, + and its value can't be less than 'pf_retrans' when changing by sysctl. + + Default: 0xffff + +rto_initial - INTEGER + The initial round trip timeout value in milliseconds that will be used + in calculating round trip times. This is the initial time interval + for retransmissions. + + Default: 3000 + +rto_max - INTEGER + The maximum value (in milliseconds) of the round trip timeout. This + is the largest time interval that can elapse between retransmissions. + + Default: 60000 + +rto_min - INTEGER + The minimum value (in milliseconds) of the round trip timeout. This + is the smallest time interval the can elapse between retransmissions. + + Default: 1000 + +hb_interval - INTEGER + The interval (in milliseconds) between HEARTBEAT chunks. These chunks + are sent at the specified interval on idle paths to probe the state of + a given path between 2 associations. + + Default: 30000 + +sack_timeout - INTEGER + The amount of time (in milliseconds) that the implementation will wait + to send a SACK. + + Default: 200 + +valid_cookie_life - INTEGER + The default lifetime of the SCTP cookie (in milliseconds). The cookie + is used during association establishment. + + Default: 60000 + +cookie_preserve_enable - BOOLEAN + Enable or disable the ability to extend the lifetime of the SCTP cookie + that is used during the establishment phase of SCTP association + + - 1: Enable cookie lifetime extension. + - 0: Disable + + Default: 1 + +cookie_hmac_alg - STRING + Select the hmac algorithm used when generating the cookie value sent by + a listening sctp socket to a connecting client in the INIT-ACK chunk. + Valid values are: + + * md5 + * sha1 + * none + + Ability to assign md5 or sha1 as the selected alg is predicated on the + configuration of those algorithms at build time (CONFIG_CRYPTO_MD5 and + CONFIG_CRYPTO_SHA1). + + Default: Dependent on configuration. MD5 if available, else SHA1 if + available, else none. + +rcvbuf_policy - INTEGER + Determines if the receive buffer is attributed to the socket or to + association. SCTP supports the capability to create multiple + associations on a single socket. When using this capability, it is + possible that a single stalled association that's buffering a lot + of data may block other associations from delivering their data by + consuming all of the receive buffer space. To work around this, + the rcvbuf_policy could be set to attribute the receiver buffer space + to each association instead of the socket. This prevents the described + blocking. + + - 1: rcvbuf space is per association + - 0: rcvbuf space is per socket + + Default: 0 + +sndbuf_policy - INTEGER + Similar to rcvbuf_policy above, this applies to send buffer space. + + - 1: Send buffer is tracked per association + - 0: Send buffer is tracked per socket. + + Default: 0 + +sctp_mem - vector of 3 INTEGERs: min, pressure, max + Number of pages allowed for queueing by all SCTP sockets. + + min: Below this number of pages SCTP is not bothered about its + memory appetite. When amount of memory allocated by SCTP exceeds + this number, SCTP starts to moderate memory usage. + + pressure: This value was introduced to follow format of tcp_mem. + + max: Number of pages allowed for queueing by all SCTP sockets. + + Default is calculated at boot time from amount of available memory. + +sctp_rmem - vector of 3 INTEGERs: min, default, max + Only the first value ("min") is used, "default" and "max" are + ignored. + + min: Minimal size of receive buffer used by SCTP socket. + It is guaranteed to each SCTP socket (but not association) even + under moderate memory pressure. + + Default: 4K + +sctp_wmem - vector of 3 INTEGERs: min, default, max + Currently this tunable has no effect. + +addr_scope_policy - INTEGER + Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00 + + - 0 - Disable IPv4 address scoping + - 1 - Enable IPv4 address scoping + - 2 - Follow draft but allow IPv4 private addresses + - 3 - Follow draft but allow IPv4 link local addresses + + Default: 1 + + +``/proc/sys/net/core/*`` +======================== + + Please see: Documentation/admin-guide/sysctl/net.rst for descriptions of these entries. + + +``/proc/sys/net/unix/*`` +======================== + +max_dgram_qlen - INTEGER + The maximum length of dgram socket receive queue + + Default: 10 + diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt deleted file mode 100644 index 5cdc37c34830..000000000000 --- a/Documentation/networking/ip-sysctl.txt +++ /dev/null @@ -1,2374 +0,0 @@ -/proc/sys/net/ipv4/* Variables: - -ip_forward - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - Forward Packets between interfaces. - - This variable is special, its change resets all configuration - parameters to their default state (RFC1122 for hosts, RFC1812 - for routers) - -ip_default_ttl - INTEGER - Default value of TTL field (Time To Live) for outgoing (but not - forwarded) IP packets. Should be between 1 and 255 inclusive. - Default: 64 (as recommended by RFC1700) - -ip_no_pmtu_disc - INTEGER - Disable Path MTU Discovery. If enabled in mode 1 and a - fragmentation-required ICMP is received, the PMTU to this - destination will be set to min_pmtu (see below). You will need - to raise min_pmtu to the smallest interface MTU on your system - manually if you want to avoid locally generated fragments. - - In mode 2 incoming Path MTU Discovery messages will be - discarded. Outgoing frames are handled the same as in mode 1, - implicitly setting IP_PMTUDISC_DONT on every created socket. - - Mode 3 is a hardened pmtu discover mode. The kernel will only - accept fragmentation-needed errors if the underlying protocol - can verify them besides a plain socket lookup. Current - protocols for which pmtu events will be honored are TCP, SCTP - and DCCP as they verify e.g. the sequence number or the - association. This mode should not be enabled globally but is - only intended to secure e.g. name servers in namespaces where - TCP path mtu must still work but path MTU information of other - protocols should be discarded. If enabled globally this mode - could break other protocols. - - Possible values: 0-3 - Default: FALSE - -min_pmtu - INTEGER - default 552 - minimum discovered Path MTU - -ip_forward_use_pmtu - BOOLEAN - By default we don't trust protocol path MTUs while forwarding - because they could be easily forged and can lead to unwanted - fragmentation by the router. - You only need to enable this if you have user-space software - which tries to discover path mtus by itself and depends on the - kernel honoring this information. This is normally not the - case. - Default: 0 (disabled) - Possible values: - 0 - disabled - 1 - enabled - -fwmark_reflect - BOOLEAN - Controls the fwmark of kernel-generated IPv4 reply packets that are not - associated with a socket for example, TCP RSTs or ICMP echo replies). - If unset, these packets have a fwmark of zero. If set, they have the - fwmark of the packet they are replying to. - Default: 0 - -fib_multipath_use_neigh - BOOLEAN - Use status of existing neighbor entry when determining nexthop for - multipath routes. If disabled, neighbor information is not used and - packets could be directed to a failed nexthop. Only valid for kernels - built with CONFIG_IP_ROUTE_MULTIPATH enabled. - Default: 0 (disabled) - Possible values: - 0 - disabled - 1 - enabled - -fib_multipath_hash_policy - INTEGER - Controls which hash policy to use for multipath routes. Only valid - for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled. - Default: 0 (Layer 3) - Possible values: - 0 - Layer 3 - 1 - Layer 4 - 2 - Layer 3 or inner Layer 3 if present - -fib_sync_mem - UNSIGNED INTEGER - Amount of dirty memory from fib entries that can be backlogged before - synchronize_rcu is forced. - Default: 512kB Minimum: 64kB Maximum: 64MB - -ip_forward_update_priority - INTEGER - Whether to update SKB priority from "TOS" field in IPv4 header after it - is forwarded. The new SKB priority is mapped from TOS field value - according to an rt_tos2priority table (see e.g. man tc-prio). - Default: 1 (Update priority.) - Possible values: - 0 - Do not update priority. - 1 - Update priority. - -route/max_size - INTEGER - Maximum number of routes allowed in the kernel. Increase - this when using large numbers of interfaces and/or routes. - From linux kernel 3.6 onwards, this is deprecated for ipv4 - as route cache is no longer used. - -neigh/default/gc_thresh1 - INTEGER - Minimum number of entries to keep. Garbage collector will not - purge entries if there are fewer than this number. - Default: 128 - -neigh/default/gc_thresh2 - INTEGER - Threshold when garbage collector becomes more aggressive about - purging entries. Entries older than 5 seconds will be cleared - when over this number. - Default: 512 - -neigh/default/gc_thresh3 - INTEGER - Maximum number of non-PERMANENT neighbor entries allowed. Increase - this when using large numbers of interfaces and when communicating - with large numbers of directly-connected peers. - Default: 1024 - -neigh/default/unres_qlen_bytes - INTEGER - The maximum number of bytes which may be used by packets - queued for each unresolved address by other network layers. - (added in linux 3.3) - Setting negative value is meaningless and will return error. - Default: SK_WMEM_MAX, (same as net.core.wmem_default). - Exact value depends on architecture and kernel options, - but should be enough to allow queuing 256 packets - of medium size. - -neigh/default/unres_qlen - INTEGER - The maximum number of packets which may be queued for each - unresolved address by other network layers. - (deprecated in linux 3.3) : use unres_qlen_bytes instead. - Prior to linux 3.3, the default value is 3 which may cause - unexpected packet loss. The current default value is calculated - according to default value of unres_qlen_bytes and true size of - packet. - Default: 101 - -mtu_expires - INTEGER - Time, in seconds, that cached PMTU information is kept. - -min_adv_mss - INTEGER - The advertised MSS depends on the first hop route MTU, but will - never be lower than this setting. - -IP Fragmentation: - -ipfrag_high_thresh - LONG INTEGER - Maximum memory used to reassemble IP fragments. - -ipfrag_low_thresh - LONG INTEGER - (Obsolete since linux-4.17) - Maximum memory used to reassemble IP fragments before the kernel - begins to remove incomplete fragment queues to free up resources. - The kernel still accepts new fragments for defragmentation. - -ipfrag_time - INTEGER - Time in seconds to keep an IP fragment in memory. - -ipfrag_max_dist - INTEGER - ipfrag_max_dist is a non-negative integer value which defines the - maximum "disorder" which is allowed among fragments which share a - common IP source address. Note that reordering of packets is - not unusual, but if a large number of fragments arrive from a source - IP address while a particular fragment queue remains incomplete, it - probably indicates that one or more fragments belonging to that queue - have been lost. When ipfrag_max_dist is positive, an additional check - is done on fragments before they are added to a reassembly queue - if - ipfrag_max_dist (or more) fragments have arrived from a particular IP - address between additions to any IP fragment queue using that source - address, it's presumed that one or more fragments in the queue are - lost. The existing fragment queue will be dropped, and a new one - started. An ipfrag_max_dist value of zero disables this check. - - Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can - result in unnecessarily dropping fragment queues when normal - reordering of packets occurs, which could lead to poor application - performance. Using a very large value, e.g. 50000, increases the - likelihood of incorrectly reassembling IP fragments that originate - from different IP datagrams, which could result in data corruption. - Default: 64 - -INET peer storage: - -inet_peer_threshold - INTEGER - The approximate size of the storage. Starting from this threshold - entries will be thrown aggressively. This threshold also determines - entries' time-to-live and time intervals between garbage collection - passes. More entries, less time-to-live, less GC interval. - -inet_peer_minttl - INTEGER - Minimum time-to-live of entries. Should be enough to cover fragment - time-to-live on the reassembling side. This minimum time-to-live is - guaranteed if the pool size is less than inet_peer_threshold. - Measured in seconds. - -inet_peer_maxttl - INTEGER - Maximum time-to-live of entries. Unused entries will expire after - this period of time if there is no memory pressure on the pool (i.e. - when the number of entries in the pool is very small). - Measured in seconds. - -TCP variables: - -somaxconn - INTEGER - Limit of socket listen() backlog, known in userspace as SOMAXCONN. - Defaults to 4096. (Was 128 before linux-5.4) - See also tcp_max_syn_backlog for additional tuning for TCP sockets. - -tcp_abort_on_overflow - BOOLEAN - If listening service is too slow to accept new connections, - reset them. Default state is FALSE. It means that if overflow - occurred due to a burst, connection will recover. Enable this - option _only_ if you are really sure that listening daemon - cannot be tuned to accept connections faster. Enabling this - option can harm clients of your server. - -tcp_adv_win_scale - INTEGER - Count buffering overhead as bytes/2^tcp_adv_win_scale - (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), - if it is <= 0. - Possible values are [-31, 31], inclusive. - Default: 1 - -tcp_allowed_congestion_control - STRING - Show/set the congestion control choices available to non-privileged - processes. The list is a subset of those listed in - tcp_available_congestion_control. - Default is "reno" and the default setting (tcp_congestion_control). - -tcp_app_win - INTEGER - Reserve max(window/2^tcp_app_win, mss) of window for application - buffer. Value 0 is special, it means that nothing is reserved. - Default: 31 - -tcp_autocorking - BOOLEAN - Enable TCP auto corking : - When applications do consecutive small write()/sendmsg() system calls, - we try to coalesce these small writes as much as possible, to lower - total amount of sent packets. This is done if at least one prior - packet for the flow is waiting in Qdisc queues or device transmit - queue. Applications can still use TCP_CORK for optimal behavior - when they know how/when to uncork their sockets. - Default : 1 - -tcp_available_congestion_control - STRING - Shows the available congestion control choices that are registered. - More congestion control algorithms may be available as modules, - but not loaded. - -tcp_base_mss - INTEGER - The initial value of search_low to be used by the packetization layer - Path MTU discovery (MTU probing). If MTU probing is enabled, - this is the initial MSS used by the connection. - -tcp_mtu_probe_floor - INTEGER - If MTU probing is enabled this caps the minimum MSS used for search_low - for the connection. - - Default : 48 - -tcp_min_snd_mss - INTEGER - TCP SYN and SYNACK messages usually advertise an ADVMSS option, - as described in RFC 1122 and RFC 6691. - If this ADVMSS option is smaller than tcp_min_snd_mss, - it is silently capped to tcp_min_snd_mss. - - Default : 48 (at least 8 bytes of payload per segment) - -tcp_congestion_control - STRING - Set the congestion control algorithm to be used for new - connections. The algorithm "reno" is always available, but - additional choices may be available based on kernel configuration. - Default is set as part of kernel configuration. - For passive connections, the listener congestion control choice - is inherited. - [see setsockopt(listenfd, SOL_TCP, TCP_CONGESTION, "name" ...) ] - -tcp_dsack - BOOLEAN - Allows TCP to send "duplicate" SACKs. - -tcp_early_retrans - INTEGER - Tail loss probe (TLP) converts RTOs occurring due to tail - losses into fast recovery (draft-ietf-tcpm-rack). Note that - TLP requires RACK to function properly (see tcp_recovery below) - Possible values: - 0 disables TLP - 3 or 4 enables TLP - Default: 3 - -tcp_ecn - INTEGER - Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is used only when both ends of the TCP connection indicate - support for it. This feature is useful in avoiding losses due - to congestion by allowing supporting routers to signal - congestion before having to drop packets. - Possible values are: - 0 Disable ECN. Neither initiate nor accept ECN. - 1 Enable ECN when requested by incoming connections and - also request ECN on outgoing connection attempts. - 2 Enable ECN when requested by incoming connections - but do not request ECN on outgoing connections. - Default: 2 - -tcp_ecn_fallback - BOOLEAN - If the kernel detects that ECN connection misbehaves, enable fall - back to non-ECN. Currently, this knob implements the fallback - from RFC3168, section 6.1.1.1., but we reserve that in future, - additional detection mechanisms could be implemented under this - knob. The value is not used, if tcp_ecn or per route (or congestion - control) ECN settings are disabled. - Default: 1 (fallback enabled) - -tcp_fack - BOOLEAN - This is a legacy option, it has no effect anymore. - -tcp_fin_timeout - INTEGER - The length of time an orphaned (no longer referenced by any - application) connection will remain in the FIN_WAIT_2 state - before it is aborted at the local end. While a perfectly - valid "receive only" state for an un-orphaned connection, an - orphaned connection in FIN_WAIT_2 state could otherwise wait - forever for the remote to close its end of the connection. - Cf. tcp_max_orphans - Default: 60 seconds - -tcp_frto - INTEGER - Enables Forward RTO-Recovery (F-RTO) defined in RFC5682. - F-RTO is an enhanced recovery algorithm for TCP retransmission - timeouts. It is particularly beneficial in networks where the - RTT fluctuates (e.g., wireless). F-RTO is sender-side only - modification. It does not require any support from the peer. - - By default it's enabled with a non-zero value. 0 disables F-RTO. - -tcp_fwmark_accept - BOOLEAN - If set, incoming connections to listening sockets that do not have a - socket mark will set the mark of the accepting socket to the fwmark of - the incoming SYN packet. This will cause all packets on that connection - (starting from the first SYNACK) to be sent with that fwmark. The - listening socket's mark is unchanged. Listening sockets that already - have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are - unaffected. - - Default: 0 - -tcp_invalid_ratelimit - INTEGER - Limit the maximal rate for sending duplicate acknowledgments - in response to incoming TCP packets that are for an existing - connection but that are invalid due to any of these reasons: - - (a) out-of-window sequence number, - (b) out-of-window acknowledgment number, or - (c) PAWS (Protection Against Wrapped Sequence numbers) check failure - - This can help mitigate simple "ack loop" DoS attacks, wherein - a buggy or malicious middlebox or man-in-the-middle can - rewrite TCP header fields in manner that causes each endpoint - to think that the other is sending invalid TCP segments, thus - causing each side to send an unterminating stream of duplicate - acknowledgments for invalid segments. - - Using 0 disables rate-limiting of dupacks in response to - invalid segments; otherwise this value specifies the minimal - space between sending such dupacks, in milliseconds. - - Default: 500 (milliseconds). - -tcp_keepalive_time - INTEGER - How often TCP sends out keepalive messages when keepalive is enabled. - Default: 2hours. - -tcp_keepalive_probes - INTEGER - How many keepalive probes TCP sends out, until it decides that the - connection is broken. Default value: 9. - -tcp_keepalive_intvl - INTEGER - How frequently the probes are send out. Multiplied by - tcp_keepalive_probes it is time to kill not responding connection, - after probes started. Default value: 75sec i.e. connection - will be aborted after ~11 minutes of retries. - -tcp_l3mdev_accept - BOOLEAN - Enables child sockets to inherit the L3 master device index. - Enabling this option allows a "global" listen socket to work - across L3 master domains (e.g., VRFs) with connected sockets - derived from the listen socket to be bound to the L3 domain in - which the packets originated. Only valid when the kernel was - compiled with CONFIG_NET_L3_MASTER_DEV. - Default: 0 (disabled) - -tcp_low_latency - BOOLEAN - This is a legacy option, it has no effect anymore. - -tcp_max_orphans - INTEGER - Maximal number of TCP sockets not attached to any user file handle, - held by system. If this number is exceeded orphaned connections are - reset immediately and warning is printed. This limit exists - only to prevent simple DoS attacks, you _must_ not rely on this - or lower the limit artificially, but rather increase it - (probably, after increasing installed memory), - if network conditions require more than default value, - and tune network services to linger and kill such states - more aggressively. Let me to remind again: each orphan eats - up to ~64K of unswappable memory. - -tcp_max_syn_backlog - INTEGER - Maximal number of remembered connection requests (SYN_RECV), - which have not received an acknowledgment from connecting client. - This is a per-listener limit. - The minimal value is 128 for low memory machines, and it will - increase in proportion to the memory of machine. - If server suffers from overload, try increasing this number. - Remember to also check /proc/sys/net/core/somaxconn - A SYN_RECV request socket consumes about 304 bytes of memory. - -tcp_max_tw_buckets - INTEGER - Maximal number of timewait sockets held by system simultaneously. - If this number is exceeded time-wait socket is immediately destroyed - and warning is printed. This limit exists only to prevent - simple DoS attacks, you _must_ not lower the limit artificially, - but rather increase it (probably, after increasing installed memory), - if network conditions require more than default value. - -tcp_mem - vector of 3 INTEGERs: min, pressure, max - min: below this number of pages TCP is not bothered about its - memory appetite. - - pressure: when amount of memory allocated by TCP exceeds this number - of pages, TCP moderates its memory consumption and enters memory - pressure mode, which is exited when memory consumption falls - under "min". - - max: number of pages allowed for queueing by all TCP sockets. - - Defaults are calculated at boot time from amount of available - memory. - -tcp_min_rtt_wlen - INTEGER - The window length of the windowed min filter to track the minimum RTT. - A shorter window lets a flow more quickly pick up new (higher) - minimum RTT when it is moved to a longer path (e.g., due to traffic - engineering). A longer window makes the filter more resistant to RTT - inflations such as transient congestion. The unit is seconds. - Possible values: 0 - 86400 (1 day) - Default: 300 - -tcp_moderate_rcvbuf - BOOLEAN - If set, TCP performs receive buffer auto-tuning, attempting to - automatically size the buffer (no greater than tcp_rmem[2]) to - match the size required by the path for full throughput. Enabled by - default. - -tcp_mtu_probing - INTEGER - Controls TCP Packetization-Layer Path MTU Discovery. Takes three - values: - 0 - Disabled - 1 - Disabled by default, enabled when an ICMP black hole detected - 2 - Always enabled, use initial MSS of tcp_base_mss. - -tcp_probe_interval - UNSIGNED INTEGER - Controls how often to start TCP Packetization-Layer Path MTU - Discovery reprobe. The default is reprobing every 10 minutes as - per RFC4821. - -tcp_probe_threshold - INTEGER - Controls when TCP Packetization-Layer Path MTU Discovery probing - will stop in respect to the width of search range in bytes. Default - is 8 bytes. - -tcp_no_metrics_save - BOOLEAN - By default, TCP saves various connection metrics in the route cache - when the connection closes, so that connections established in the - near future can use these to set initial conditions. Usually, this - increases overall performance, but may sometimes cause performance - degradation. If set, TCP will not cache metrics on closing - connections. - -tcp_no_ssthresh_metrics_save - BOOLEAN - Controls whether TCP saves ssthresh metrics in the route cache. - Default is 1, which disables ssthresh metrics. - -tcp_orphan_retries - INTEGER - This value influences the timeout of a locally closed TCP connection, - when RTO retransmissions remain unacknowledged. - See tcp_retries2 for more details. - - The default value is 8. - If your machine is a loaded WEB server, - you should think about lowering this value, such sockets - may consume significant resources. Cf. tcp_max_orphans. - -tcp_recovery - INTEGER - This value is a bitmap to enable various experimental loss recovery - features. - - RACK: 0x1 enables the RACK loss detection for fast detection of lost - retransmissions and tail drops. It also subsumes and disables - RFC6675 recovery for SACK connections. - RACK: 0x2 makes RACK's reordering window static (min_rtt/4). - RACK: 0x4 disables RACK's DUPACK threshold heuristic - - Default: 0x1 - -tcp_reordering - INTEGER - Initial reordering level of packets in a TCP stream. - TCP stack can then dynamically adjust flow reordering level - between this initial value and tcp_max_reordering - Default: 3 - -tcp_max_reordering - INTEGER - Maximal reordering level of packets in a TCP stream. - 300 is a fairly conservative value, but you might increase it - if paths are using per packet load balancing (like bonding rr mode) - Default: 300 - -tcp_retrans_collapse - BOOLEAN - Bug-to-bug compatibility with some broken printers. - On retransmit try to send bigger packets to work around bugs in - certain TCP stacks. - -tcp_retries1 - INTEGER - This value influences the time, after which TCP decides, that - something is wrong due to unacknowledged RTO retransmissions, - and reports this suspicion to the network layer. - See tcp_retries2 for more details. - - RFC 1122 recommends at least 3 retransmissions, which is the - default. - -tcp_retries2 - INTEGER - This value influences the timeout of an alive TCP connection, - when RTO retransmissions remain unacknowledged. - Given a value of N, a hypothetical TCP connection following - exponential backoff with an initial RTO of TCP_RTO_MIN would - retransmit N times before killing the connection at the (N+1)th RTO. - - The default value of 15 yields a hypothetical timeout of 924.6 - seconds and is a lower bound for the effective timeout. - TCP will effectively time out at the first RTO which exceeds the - hypothetical timeout. - - RFC 1122 recommends at least 100 seconds for the timeout, - which corresponds to a value of at least 8. - -tcp_rfc1337 - BOOLEAN - If set, the TCP stack behaves conforming to RFC1337. If unset, - we are not conforming to RFC, but prevent TCP TIME_WAIT - assassination. - Default: 0 - -tcp_rmem - vector of 3 INTEGERs: min, default, max - min: Minimal size of receive buffer used by TCP sockets. - It is guaranteed to each TCP socket, even under moderate memory - pressure. - Default: 4K - - default: initial size of receive buffer used by TCP sockets. - This value overrides net.core.rmem_default used by other protocols. - Default: 87380 bytes. This value results in window of 65535 with - default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit - less for default tcp_app_win. See below about these variables. - - max: maximal size of receive buffer allowed for automatically - selected receiver buffers for TCP socket. This value does not override - net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables - automatic tuning of that socket's receive buffer size, in which - case this value is ignored. - Default: between 87380B and 6MB, depending on RAM size. - -tcp_sack - BOOLEAN - Enable select acknowledgments (SACKS). - -tcp_comp_sack_delay_ns - LONG INTEGER - TCP tries to reduce number of SACK sent, using a timer - based on 5% of SRTT, capped by this sysctl, in nano seconds. - The default is 1ms, based on TSO autosizing period. - - Default : 1,000,000 ns (1 ms) - -tcp_comp_sack_nr - INTEGER - Max number of SACK that can be compressed. - Using 0 disables SACK compression. - - Default : 44 - -tcp_slow_start_after_idle - BOOLEAN - If set, provide RFC2861 behavior and time out the congestion - window after an idle period. An idle period is defined at - the current RTO. If unset, the congestion window will not - be timed out after an idle period. - Default: 1 - -tcp_stdurg - BOOLEAN - Use the Host requirements interpretation of the TCP urgent pointer field. - Most hosts use the older BSD interpretation, so if you turn this on - Linux might not communicate correctly with them. - Default: FALSE - -tcp_synack_retries - INTEGER - Number of times SYNACKs for a passive TCP connection attempt will - be retransmitted. Should not be higher than 255. Default value - is 5, which corresponds to 31seconds till the last retransmission - with the current initial RTO of 1second. With this the final timeout - for a passive TCP connection will happen after 63seconds. - -tcp_syncookies - INTEGER - Only valid when the kernel was compiled with CONFIG_SYN_COOKIES - Send out syncookies when the syn backlog queue of a socket - overflows. This is to prevent against the common 'SYN flood attack' - Default: 1 - - Note, that syncookies is fallback facility. - It MUST NOT be used to help highly loaded servers to stand - against legal connection rate. If you see SYN flood warnings - in your logs, but investigation shows that they occur - because of overload with legal connections, you should tune - another parameters until this warning disappear. - See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow. - - syncookies seriously violate TCP protocol, do not allow - to use TCP extensions, can result in serious degradation - of some services (f.e. SMTP relaying), visible not by you, - but your clients and relays, contacting you. While you see - SYN flood warnings in logs not being really flooded, your server - is seriously misconfigured. - - If you want to test which effects syncookies have to your - network connections you can set this knob to 2 to enable - unconditionally generation of syncookies. - -tcp_fastopen - INTEGER - Enable TCP Fast Open (RFC7413) to send and accept data in the opening - SYN packet. - - The client support is enabled by flag 0x1 (on by default). The client - then must use sendmsg() or sendto() with the MSG_FASTOPEN flag, - rather than connect() to send data in SYN. - - The server support is enabled by flag 0x2 (off by default). Then - either enable for all listeners with another flag (0x400) or - enable individual listeners via TCP_FASTOPEN socket option with - the option value being the length of the syn-data backlog. - - The values (bitmap) are - 0x1: (client) enables sending data in the opening SYN on the client. - 0x2: (server) enables the server support, i.e., allowing data in - a SYN packet to be accepted and passed to the - application before 3-way handshake finishes. - 0x4: (client) send data in the opening SYN regardless of cookie - availability and without a cookie option. - 0x200: (server) accept data-in-SYN w/o any cookie option present. - 0x400: (server) enable all listeners to support Fast Open by - default without explicit TCP_FASTOPEN socket option. - - Default: 0x1 - - Note that that additional client or server features are only - effective if the basic support (0x1 and 0x2) are enabled respectively. - -tcp_fastopen_blackhole_timeout_sec - INTEGER - Initial time period in second to disable Fastopen on active TCP sockets - when a TFO firewall blackhole issue happens. - This time period will grow exponentially when more blackhole issues - get detected right after Fastopen is re-enabled and will reset to - initial value when the blackhole issue goes away. - 0 to disable the blackhole detection. - By default, it is set to 1hr. - -tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs - The list consists of a primary key and an optional backup key. The - primary key is used for both creating and validating cookies, while the - optional backup key is only used for validating cookies. The purpose of - the backup key is to maximize TFO validation when keys are rotated. - - A randomly chosen primary key may be configured by the kernel if - the tcp_fastopen sysctl is set to 0x400 (see above), or if the - TCP_FASTOPEN setsockopt() optname is set and a key has not been - previously configured via sysctl. If keys are configured via - setsockopt() by using the TCP_FASTOPEN_KEY optname, then those - per-socket keys will be used instead of any keys that are specified via - sysctl. - - A key is specified as 4 8-digit hexadecimal integers which are separated - by a '-' as: xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxxxx. Leading zeros may be - omitted. A primary and a backup key may be specified by separating them - by a comma. If only one key is specified, it becomes the primary key and - any previously configured backup keys are removed. - -tcp_syn_retries - INTEGER - Number of times initial SYNs for an active TCP connection attempt - will be retransmitted. Should not be higher than 127. Default value - is 6, which corresponds to 63seconds till the last retransmission - with the current initial RTO of 1second. With this the final timeout - for an active TCP connection attempt will happen after 127seconds. - -tcp_timestamps - INTEGER -Enable timestamps as defined in RFC1323. - 0: Disabled. - 1: Enable timestamps as defined in RFC1323 and use random offset for - each connection rather than only using the current time. - 2: Like 1, but without random offsets. - Default: 1 - -tcp_min_tso_segs - INTEGER - Minimal number of segments per TSO frame. - Since linux-3.12, TCP does an automatic sizing of TSO frames, - depending on flow rate, instead of filling 64Kbytes packets. - For specific usages, it's possible to force TCP to build big - TSO frames. Note that TCP stack might split too big TSO packets - if available window is too small. - Default: 2 - -tcp_pacing_ss_ratio - INTEGER - sk->sk_pacing_rate is set by TCP stack using a ratio applied - to current rate. (current_rate = cwnd * mss / srtt) - If TCP is in slow start, tcp_pacing_ss_ratio is applied - to let TCP probe for bigger speeds, assuming cwnd can be - doubled every other RTT. - Default: 200 - -tcp_pacing_ca_ratio - INTEGER - sk->sk_pacing_rate is set by TCP stack using a ratio applied - to current rate. (current_rate = cwnd * mss / srtt) - If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio - is applied to conservatively probe for bigger throughput. - Default: 120 - -tcp_tso_win_divisor - INTEGER - This allows control over what percentage of the congestion window - can be consumed by a single TSO frame. - The setting of this parameter is a choice between burstiness and - building larger TSO frames. - Default: 3 - -tcp_tw_reuse - INTEGER - Enable reuse of TIME-WAIT sockets for new connections when it is - safe from protocol viewpoint. - 0 - disable - 1 - global enable - 2 - enable for loopback traffic only - It should not be changed without advice/request of technical - experts. - Default: 2 - -tcp_window_scaling - BOOLEAN - Enable window scaling as defined in RFC1323. - -tcp_wmem - vector of 3 INTEGERs: min, default, max - min: Amount of memory reserved for send buffers for TCP sockets. - Each TCP socket has rights to use it due to fact of its birth. - Default: 4K - - default: initial size of send buffer used by TCP sockets. This - value overrides net.core.wmem_default used by other protocols. - It is usually lower than net.core.wmem_default. - Default: 16K - - max: Maximal amount of memory allowed for automatically tuned - send buffers for TCP sockets. This value does not override - net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables - automatic tuning of that socket's send buffer size, in which case - this value is ignored. - Default: between 64K and 4MB, depending on RAM size. - -tcp_notsent_lowat - UNSIGNED INTEGER - A TCP socket can control the amount of unsent bytes in its write queue, - thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll() - reports POLLOUT events if the amount of unsent bytes is below a per - socket value, and if the write queue is not full. sendmsg() will - also not add new buffers if the limit is hit. - - This global variable controls the amount of unsent data for - sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change - to the global variable has immediate effect. - - Default: UINT_MAX (0xFFFFFFFF) - -tcp_workaround_signed_windows - BOOLEAN - If set, assume no receipt of a window scaling option means the - remote TCP is broken and treats the window as a signed quantity. - If unset, assume the remote TCP is not broken even if we do - not receive a window scaling option from them. - Default: 0 - -tcp_thin_linear_timeouts - BOOLEAN - Enable dynamic triggering of linear timeouts for thin streams. - If set, a check is performed upon retransmission by timeout to - determine if the stream is thin (less than 4 packets in flight). - As long as the stream is found to be thin, up to 6 linear - timeouts may be performed before exponential backoff mode is - initiated. This improves retransmission latency for - non-aggressive thin streams, often found to be time-dependent. - For more information on thin streams, see - Documentation/networking/tcp-thin.txt - Default: 0 - -tcp_limit_output_bytes - INTEGER - Controls TCP Small Queue limit per tcp socket. - TCP bulk sender tends to increase packets in flight until it - gets losses notifications. With SNDBUF autotuning, this can - result in a large amount of packets queued on the local machine - (e.g.: qdiscs, CPU backlog, or device) hurting latency of other - flows, for typical pfifo_fast qdiscs. tcp_limit_output_bytes - limits the number of bytes on qdisc or device to reduce artificial - RTT/cwnd and reduce bufferbloat. - Default: 1048576 (16 * 65536) - -tcp_challenge_ack_limit - INTEGER - Limits number of Challenge ACK sent per second, as recommended - in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks) - Default: 1000 - -tcp_rx_skb_cache - BOOLEAN - Controls a per TCP socket cache of one skb, that might help - performance of some workloads. This might be dangerous - on systems with a lot of TCP sockets, since it increases - memory usage. - - Default: 0 (disabled) - -UDP variables: - -udp_l3mdev_accept - BOOLEAN - Enabling this option allows a "global" bound socket to work - across L3 master domains (e.g., VRFs) with packets capable of - being received regardless of the L3 domain in which they - originated. Only valid when the kernel was compiled with - CONFIG_NET_L3_MASTER_DEV. - Default: 0 (disabled) - -udp_mem - vector of 3 INTEGERs: min, pressure, max - Number of pages allowed for queueing by all UDP sockets. - - min: Below this number of pages UDP is not bothered about its - memory appetite. When amount of memory allocated by UDP exceeds - this number, UDP starts to moderate memory usage. - - pressure: This value was introduced to follow format of tcp_mem. - - max: Number of pages allowed for queueing by all UDP sockets. - - Default is calculated at boot time from amount of available memory. - -udp_rmem_min - INTEGER - Minimal size of receive buffer used by UDP sockets in moderation. - Each UDP socket is able to use the size for receiving data, even if - total pages of UDP sockets exceed udp_mem pressure. The unit is byte. - Default: 4K - -udp_wmem_min - INTEGER - Minimal size of send buffer used by UDP sockets in moderation. - Each UDP socket is able to use the size for sending data, even if - total pages of UDP sockets exceed udp_mem pressure. The unit is byte. - Default: 4K - -RAW variables: - -raw_l3mdev_accept - BOOLEAN - Enabling this option allows a "global" bound socket to work - across L3 master domains (e.g., VRFs) with packets capable of - being received regardless of the L3 domain in which they - originated. Only valid when the kernel was compiled with - CONFIG_NET_L3_MASTER_DEV. - Default: 1 (enabled) - -CIPSOv4 Variables: - -cipso_cache_enable - BOOLEAN - If set, enable additions to and lookups from the CIPSO label mapping - cache. If unset, additions are ignored and lookups always result in a - miss. However, regardless of the setting the cache is still - invalidated when required when means you can safely toggle this on and - off and the cache will always be "safe". - Default: 1 - -cipso_cache_bucket_size - INTEGER - The CIPSO label cache consists of a fixed size hash table with each - hash bucket containing a number of cache entries. This variable limits - the number of entries in each hash bucket; the larger the value the - more CIPSO label mappings that can be cached. When the number of - entries in a given hash bucket reaches this limit adding new entries - causes the oldest entry in the bucket to be removed to make room. - Default: 10 - -cipso_rbm_optfmt - BOOLEAN - Enable the "Optimized Tag 1 Format" as defined in section 3.4.2.6 of - the CIPSO draft specification (see Documentation/netlabel for details). - This means that when set the CIPSO tag will be padded with empty - categories in order to make the packet data 32-bit aligned. - Default: 0 - -cipso_rbm_structvalid - BOOLEAN - If set, do a very strict check of the CIPSO option when - ip_options_compile() is called. If unset, relax the checks done during - ip_options_compile(). Either way is "safe" as errors are caught else - where in the CIPSO processing code but setting this to 0 (False) should - result in less work (i.e. it should be faster) but could cause problems - with other implementations that require strict checking. - Default: 0 - -IP Variables: - -ip_local_port_range - 2 INTEGERS - Defines the local port range that is used by TCP and UDP to - choose the local port. The first number is the first, the - second the last local port number. - If possible, it is better these numbers have different parity - (one even and one odd value). - Must be greater than or equal to ip_unprivileged_port_start. - The default values are 32768 and 60999 respectively. - -ip_local_reserved_ports - list of comma separated ranges - Specify the ports which are reserved for known third-party - applications. These ports will not be used by automatic port - assignments (e.g. when calling connect() or bind() with port - number 0). Explicit port allocation behavior is unchanged. - - The format used for both input and output is a comma separated - list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and - 10). Writing to the file will clear all previously reserved - ports and update the current list with the one given in the - input. - - Note that ip_local_port_range and ip_local_reserved_ports - settings are independent and both are considered by the kernel - when determining which ports are available for automatic port - assignments. - - You can reserve ports which are not in the current - ip_local_port_range, e.g.: - - $ cat /proc/sys/net/ipv4/ip_local_port_range - 32000 60999 - $ cat /proc/sys/net/ipv4/ip_local_reserved_ports - 8080,9148 - - although this is redundant. However such a setting is useful - if later the port range is changed to a value that will - include the reserved ports. - - Default: Empty - -ip_unprivileged_port_start - INTEGER - This is a per-namespace sysctl. It defines the first - unprivileged port in the network namespace. Privileged ports - require root or CAP_NET_BIND_SERVICE in order to bind to them. - To disable all privileged ports, set this to 0. They must not - overlap with the ip_local_port_range. - - Default: 1024 - -ip_nonlocal_bind - BOOLEAN - If set, allows processes to bind() to non-local IP addresses, - which can be quite useful - but may break some applications. - Default: 0 - -ip_autobind_reuse - BOOLEAN - By default, bind() does not select the ports automatically even if - the new socket and all sockets bound to the port have SO_REUSEADDR. - ip_autobind_reuse allows bind() to reuse the port and this is useful - when you use bind()+connect(), but may break some applications. - The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this - option should only be set by experts. - Default: 0 - -ip_dynaddr - BOOLEAN - If set non-zero, enables support for dynamic addresses. - If set to a non-zero value larger than 1, a kernel log - message will be printed when dynamic address rewriting - occurs. - Default: 0 - -ip_early_demux - BOOLEAN - Optimize input packet processing down to one demux for - certain kinds of local sockets. Currently we only do this - for established TCP and connected UDP sockets. - - It may add an additional cost for pure routing workloads that - reduces overall throughput, in such case you should disable it. - Default: 1 - -ping_group_range - 2 INTEGERS - Restrict ICMP_PROTO datagram sockets to users in the group range. - The default is "1 0", meaning, that nobody (not even root) may - create ping sockets. Setting it to "100 100" would grant permissions - to the single group. "0 4294967295" would enable it for the world, "100 - 4294967295" would enable it for the users, but not daemons. - -tcp_early_demux - BOOLEAN - Enable early demux for established TCP sockets. - Default: 1 - -udp_early_demux - BOOLEAN - Enable early demux for connected UDP sockets. Disable this if - your system could experience more unconnected load. - Default: 1 - -icmp_echo_ignore_all - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it. - Default: 0 - -icmp_echo_ignore_broadcasts - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO and - TIMESTAMP requests sent to it via broadcast/multicast. - Default: 1 - -icmp_ratelimit - INTEGER - Limit the maximal rates for sending ICMP packets whose type matches - icmp_ratemask (see below) to specific targets. - 0 to disable any limiting, - otherwise the minimal space between responses in milliseconds. - Note that another sysctl, icmp_msgs_per_sec limits the number - of ICMP packets sent on all targets. - Default: 1000 - -icmp_msgs_per_sec - INTEGER - Limit maximal number of ICMP packets sent per second from this host. - Only messages whose type matches icmp_ratemask (see below) are - controlled by this limit. - Default: 1000 - -icmp_msgs_burst - INTEGER - icmp_msgs_per_sec controls number of ICMP packets sent per second, - while icmp_msgs_burst controls the burst size of these packets. - Default: 50 - -icmp_ratemask - INTEGER - Mask made of ICMP types for which rates are being limited. - Significant bits: IHGFEDCBA9876543210 - Default mask: 0000001100000011000 (6168) - - Bit definitions (see include/linux/icmp.h): - 0 Echo Reply - 3 Destination Unreachable * - 4 Source Quench * - 5 Redirect - 8 Echo Request - B Time Exceeded * - C Parameter Problem * - D Timestamp Request - E Timestamp Reply - F Info Request - G Info Reply - H Address Mask Request - I Address Mask Reply - - * These are rate limited by default (see default mask above) - -icmp_ignore_bogus_error_responses - BOOLEAN - Some routers violate RFC1122 by sending bogus responses to broadcast - frames. Such violations are normally logged via a kernel warning. - If this is set to TRUE, the kernel will not give such warnings, which - will avoid log file clutter. - Default: 1 - -icmp_errors_use_inbound_ifaddr - BOOLEAN - - If zero, icmp error messages are sent with the primary address of - the exiting interface. - - If non-zero, the message will be sent with the primary address of - the interface that received the packet that caused the icmp error. - This is the behaviour network many administrators will expect from - a router. And it can make debugging complicated network layouts - much easier. - - Note that if no primary address exists for the interface selected, - then the primary address of the first non-loopback interface that - has one will be used regardless of this setting. - - Default: 0 - -igmp_max_memberships - INTEGER - Change the maximum number of multicast groups we can subscribe to. - Default: 20 - - Theoretical maximum value is bounded by having to send a membership - report in a single datagram (i.e. the report can't span multiple - datagrams, or risk confusing the switch and leaving groups you don't - intend to). - - The number of supported groups 'M' is bounded by the number of group - report entries you can fit into a single datagram of 65535 bytes. - - M = 65536-sizeof (ip header)/(sizeof(Group record)) - - Group records are variable length, with a minimum of 12 bytes. - So net.ipv4.igmp_max_memberships should not be set higher than: - - (65536-24) / 12 = 5459 - - The value 5459 assumes no IP header options, so in practice - this number may be lower. - -igmp_max_msf - INTEGER - Maximum number of addresses allowed in the source filter list for a - multicast group. - Default: 10 - -igmp_qrv - INTEGER - Controls the IGMP query robustness variable (see RFC2236 8.1). - Default: 2 (as specified by RFC2236 8.1) - Minimum: 1 (as specified by RFC6636 4.5) - -force_igmp_version - INTEGER - 0 - (default) No enforcement of a IGMP version, IGMPv1/v2 fallback - allowed. Will back to IGMPv3 mode again if all IGMPv1/v2 Querier - Present timer expires. - 1 - Enforce to use IGMP version 1. Will also reply IGMPv1 report if - receive IGMPv2/v3 query. - 2 - Enforce to use IGMP version 2. Will fallback to IGMPv1 if receive - IGMPv1 query message. Will reply report if receive IGMPv3 query. - 3 - Enforce to use IGMP version 3. The same react with default 0. - - Note: this is not the same with force_mld_version because IGMPv3 RFC3376 - Security Considerations does not have clear description that we could - ignore other version messages completely as MLDv2 RFC3810. So make - this value as default 0 is recommended. - -conf/interface/* changes special settings per interface (where -"interface" is the name of your network interface) - -conf/all/* is special, changes the settings for all interfaces - -log_martians - BOOLEAN - Log packets with impossible addresses to kernel log. - log_martians for the interface will be enabled if at least one of - conf/{all,interface}/log_martians is set to TRUE, - it will be disabled otherwise - -accept_redirects - BOOLEAN - Accept ICMP redirect messages. - accept_redirects for the interface will be enabled if: - - both conf/{all,interface}/accept_redirects are TRUE in the case - forwarding for the interface is enabled - or - - at least one of conf/{all,interface}/accept_redirects is TRUE in the - case forwarding for the interface is disabled - accept_redirects for the interface will be disabled otherwise - default TRUE (host) - FALSE (router) - -forwarding - BOOLEAN - Enable IP forwarding on this interface. This controls whether packets - received _on_ this interface can be forwarded. - -mc_forwarding - BOOLEAN - Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE - and a multicast routing daemon is required. - conf/all/mc_forwarding must also be set to TRUE to enable multicast - routing for the interface - -medium_id - INTEGER - Integer value used to differentiate the devices by the medium they - are attached to. Two devices can have different id values when - the broadcast packets are received only on one of them. - The default value 0 means that the device is the only interface - to its medium, value of -1 means that medium is not known. - - Currently, it is used to change the proxy_arp behavior: - the proxy_arp feature is enabled for packets forwarded between - two devices attached to different media. - -proxy_arp - BOOLEAN - Do proxy arp. - proxy_arp for the interface will be enabled if at least one of - conf/{all,interface}/proxy_arp is set to TRUE, - it will be disabled otherwise - -proxy_arp_pvlan - BOOLEAN - Private VLAN proxy arp. - Basically allow proxy arp replies back to the same interface - (from which the ARP request/solicitation was received). - - This is done to support (ethernet) switch features, like RFC - 3069, where the individual ports are NOT allowed to - communicate with each other, but they are allowed to talk to - the upstream router. As described in RFC 3069, it is possible - to allow these hosts to communicate through the upstream - router by proxy_arp'ing. Don't need to be used together with - proxy_arp. - - This technology is known by different names: - In RFC 3069 it is called VLAN Aggregation. - Cisco and Allied Telesyn call it Private VLAN. - Hewlett-Packard call it Source-Port filtering or port-isolation. - Ericsson call it MAC-Forced Forwarding (RFC Draft). - -shared_media - BOOLEAN - Send(router) or accept(host) RFC1620 shared media redirects. - Overrides secure_redirects. - shared_media for the interface will be enabled if at least one of - conf/{all,interface}/shared_media is set to TRUE, - it will be disabled otherwise - default TRUE - -secure_redirects - BOOLEAN - Accept ICMP redirect messages only to gateways listed in the - interface's current gateway list. Even if disabled, RFC1122 redirect - rules still apply. - Overridden by shared_media. - secure_redirects for the interface will be enabled if at least one of - conf/{all,interface}/secure_redirects is set to TRUE, - it will be disabled otherwise - default TRUE - -send_redirects - BOOLEAN - Send redirects, if router. - send_redirects for the interface will be enabled if at least one of - conf/{all,interface}/send_redirects is set to TRUE, - it will be disabled otherwise - Default: TRUE - -bootp_relay - BOOLEAN - Accept packets with source address 0.b.c.d destined - not to this host as local ones. It is supposed, that - BOOTP relay daemon will catch and forward such packets. - conf/all/bootp_relay must also be set to TRUE to enable BOOTP relay - for the interface - default FALSE - Not Implemented Yet. - -accept_source_route - BOOLEAN - Accept packets with SRR option. - conf/all/accept_source_route must also be set to TRUE to accept packets - with SRR option on the interface - default TRUE (router) - FALSE (host) - -accept_local - BOOLEAN - Accept packets with local source addresses. In combination with - suitable routing, this can be used to direct packets between two - local interfaces over the wire and have them accepted properly. - default FALSE - -route_localnet - BOOLEAN - Do not consider loopback addresses as martian source or destination - while routing. This enables the use of 127/8 for local routing purposes. - default FALSE - -rp_filter - INTEGER - 0 - No source validation. - 1 - Strict mode as defined in RFC3704 Strict Reverse Path - Each incoming packet is tested against the FIB and if the interface - is not the best reverse path the packet check will fail. - By default failed packets are discarded. - 2 - Loose mode as defined in RFC3704 Loose Reverse Path - Each incoming packet's source address is also tested against the FIB - and if the source address is not reachable via any interface - the packet check will fail. - - Current recommended practice in RFC3704 is to enable strict mode - to prevent IP spoofing from DDos attacks. If using asymmetric routing - or other complicated routing, then loose mode is recommended. - - The max value from conf/{all,interface}/rp_filter is used - when doing source validation on the {interface}. - - Default value is 0. Note that some distributions enable it - in startup scripts. - -arp_filter - BOOLEAN - 1 - Allows you to have multiple network interfaces on the same - subnet, and have the ARPs for each interface be answered - based on whether or not the kernel would route a packet from - the ARP'd IP out that interface (therefore you must use source - based routing for this to work). In other words it allows control - of which cards (usually 1) will respond to an arp request. - - 0 - (default) The kernel can respond to arp requests with addresses - from other interfaces. This may seem wrong but it usually makes - sense, because it increases the chance of successful communication. - IP addresses are owned by the complete host on Linux, not by - particular interfaces. Only for more complex setups like load- - balancing, does this behaviour cause problems. - - arp_filter for the interface will be enabled if at least one of - conf/{all,interface}/arp_filter is set to TRUE, - it will be disabled otherwise - -arp_announce - INTEGER - Define different restriction levels for announcing the local - source IP address from IP packets in ARP requests sent on - interface: - 0 - (default) Use any local address, configured on any interface - 1 - Try to avoid local addresses that are not in the target's - subnet for this interface. This mode is useful when target - hosts reachable via this interface require the source IP - address in ARP requests to be part of their logical network - configured on the receiving interface. When we generate the - request we will check all our subnets that include the - target IP and will preserve the source address if it is from - such subnet. If there is no such subnet we select source - address according to the rules for level 2. - 2 - Always use the best local address for this target. - In this mode we ignore the source address in the IP packet - and try to select local address that we prefer for talks with - the target host. Such local address is selected by looking - for primary IP addresses on all our subnets on the outgoing - interface that include the target IP address. If no suitable - local address is found we select the first local address - we have on the outgoing interface or on all other interfaces, - with the hope we will receive reply for our request and - even sometimes no matter the source IP address we announce. - - The max value from conf/{all,interface}/arp_announce is used. - - Increasing the restriction level gives more chance for - receiving answer from the resolved target while decreasing - the level announces more valid sender's information. - -arp_ignore - INTEGER - Define different modes for sending replies in response to - received ARP requests that resolve local target IP addresses: - 0 - (default): reply for any local target IP address, configured - on any interface - 1 - reply only if the target IP address is local address - configured on the incoming interface - 2 - reply only if the target IP address is local address - configured on the incoming interface and both with the - sender's IP address are part from same subnet on this interface - 3 - do not reply for local addresses configured with scope host, - only resolutions for global and link addresses are replied - 4-7 - reserved - 8 - do not reply for all local addresses - - The max value from conf/{all,interface}/arp_ignore is used - when ARP request is received on the {interface} - -arp_notify - BOOLEAN - Define mode for notification of address and device changes. - 0 - (default): do nothing - 1 - Generate gratuitous arp requests when device is brought up - or hardware address changes. - -arp_accept - BOOLEAN - Define behavior for gratuitous ARP frames who's IP is not - already present in the ARP table: - 0 - don't create new entries in the ARP table - 1 - create new entries in the ARP table - - Both replies and requests type gratuitous arp will trigger the - ARP table to be updated, if this setting is on. - - If the ARP table already contains the IP address of the - gratuitous arp frame, the arp table will be updated regardless - if this setting is on or off. - -mcast_solicit - INTEGER - The maximum number of multicast probes in INCOMPLETE state, - when the associated hardware address is unknown. Defaults - to 3. - -ucast_solicit - INTEGER - The maximum number of unicast probes in PROBE state, when - the hardware address is being reconfirmed. Defaults to 3. - -app_solicit - INTEGER - The maximum number of probes to send to the user space ARP daemon - via netlink before dropping back to multicast probes (see - mcast_resolicit). Defaults to 0. - -mcast_resolicit - INTEGER - The maximum number of multicast probes after unicast and - app probes in PROBE state. Defaults to 0. - -disable_policy - BOOLEAN - Disable IPSEC policy (SPD) for this interface - -disable_xfrm - BOOLEAN - Disable IPSEC encryption on this interface, whatever the policy - -igmpv2_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - IGMPv1 or IGMPv2 report retransmit will take place. - Default: 10000 (10 seconds) - -igmpv3_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - IGMPv3 report retransmit will take place. - Default: 1000 (1 seconds) - -promote_secondaries - BOOLEAN - When a primary IP address is removed from this interface - promote a corresponding secondary IP address instead of - removing all the corresponding secondary IP addresses. - -drop_unicast_in_l2_multicast - BOOLEAN - Drop any unicast IP packets that are received in link-layer - multicast (or broadcast) frames. - This behavior (for multicast) is actually a SHOULD in RFC - 1122, but is disabled by default for compatibility reasons. - Default: off (0) - -drop_gratuitous_arp - BOOLEAN - Drop all gratuitous ARP frames, for example if there's a known - good ARP proxy on the network and such frames need not be used - (or in the case of 802.11, must not be used to prevent attacks.) - Default: off (0) - - -tag - INTEGER - Allows you to write a number, which can be used as required. - Default value is 0. - -xfrm4_gc_thresh - INTEGER - (Obsolete since linux-4.14) - The threshold at which we will start garbage collecting for IPv4 - destination cache entries. At twice this value the system will - refuse new allocations. - -igmp_link_local_mcast_reports - BOOLEAN - Enable IGMP reports for link local multicast groups in the - 224.0.0.X range. - Default TRUE - -Alexey Kuznetsov. -kuznet@ms2.inr.ac.ru - -Updated by: -Andi Kleen -ak@muc.de -Nicolas Delon -delon.nicolas@wanadoo.fr - - - - -/proc/sys/net/ipv6/* Variables: - -IPv6 has no global variables such as tcp_*. tcp_* settings under ipv4/ also -apply to IPv6 [XXX?]. - -bindv6only - BOOLEAN - Default value for IPV6_V6ONLY socket option, - which restricts use of the IPv6 socket to IPv6 communication - only. - TRUE: disable IPv4-mapped address feature - FALSE: enable IPv4-mapped address feature - - Default: FALSE (as specified in RFC3493) - -flowlabel_consistency - BOOLEAN - Protect the consistency (and unicity) of flow label. - You have to disable it to use IPV6_FL_F_REFLECT flag on the - flow label manager. - TRUE: enabled - FALSE: disabled - Default: TRUE - -auto_flowlabels - INTEGER - Automatically generate flow labels based on a flow hash of the - packet. This allows intermediate devices, such as routers, to - identify packet flows for mechanisms like Equal Cost Multipath - Routing (see RFC 6438). - 0: automatic flow labels are completely disabled - 1: automatic flow labels are enabled by default, they can be - disabled on a per socket basis using the IPV6_AUTOFLOWLABEL - socket option - 2: automatic flow labels are allowed, they may be enabled on a - per socket basis using the IPV6_AUTOFLOWLABEL socket option - 3: automatic flow labels are enabled and enforced, they cannot - be disabled by the socket option - Default: 1 - -flowlabel_state_ranges - BOOLEAN - Split the flow label number space into two ranges. 0-0x7FFFF is - reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF - is reserved for stateless flow labels as described in RFC6437. - TRUE: enabled - FALSE: disabled - Default: true - -flowlabel_reflect - INTEGER - Control flow label reflection. Needed for Path MTU - Discovery to work with Equal Cost Multipath Routing in anycast - environments. See RFC 7690 and: - https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 - - This is a bitmask. - 1: enabled for established flows - - Note that this prevents automatic flowlabel changes, as done - in "tcp: change IPv6 flow-label upon receiving spurious retransmission" - and "tcp: Change txhash on every SYN and RTO retransmit" - - 2: enabled for TCP RESET packets (no active listener) - If set, a RST packet sent in response to a SYN packet on a closed - port will reflect the incoming flow label. - - 4: enabled for ICMPv6 echo reply messages. - - Default: 0 - -fib_multipath_hash_policy - INTEGER - Controls which hash policy to use for multipath routes. - Default: 0 (Layer 3) - Possible values: - 0 - Layer 3 (source and destination addresses plus flow label) - 1 - Layer 4 (standard 5-tuple) - 2 - Layer 3 or inner Layer 3 if present - -anycast_src_echo_reply - BOOLEAN - Controls the use of anycast addresses as source addresses for ICMPv6 - echo reply - TRUE: enabled - FALSE: disabled - Default: FALSE - -idgen_delay - INTEGER - Controls the delay in seconds after which time to retry - privacy stable address generation if a DAD conflict is - detected. - Default: 1 (as specified in RFC7217) - -idgen_retries - INTEGER - Controls the number of retries to generate a stable privacy - address if a DAD conflict is detected. - Default: 3 (as specified in RFC7217) - -mld_qrv - INTEGER - Controls the MLD query robustness variable (see RFC3810 9.1). - Default: 2 (as specified by RFC3810 9.1) - Minimum: 1 (as specified by RFC6636 4.5) - -max_dst_opts_number - INTEGER - Maximum number of non-padding TLVs allowed in a Destination - options extension header. If this value is less than zero - then unknown options are disallowed and the number of known - TLVs allowed is the absolute value of this number. - Default: 8 - -max_hbh_opts_number - INTEGER - Maximum number of non-padding TLVs allowed in a Hop-by-Hop - options extension header. If this value is less than zero - then unknown options are disallowed and the number of known - TLVs allowed is the absolute value of this number. - Default: 8 - -max_dst_opts_length - INTEGER - Maximum length allowed for a Destination options extension - header. - Default: INT_MAX (unlimited) - -max_hbh_length - INTEGER - Maximum length allowed for a Hop-by-Hop options extension - header. - Default: INT_MAX (unlimited) - -skip_notify_on_dev_down - BOOLEAN - Controls whether an RTM_DELROUTE message is generated for routes - removed when a device is taken down or deleted. IPv4 does not - generate this message; IPv6 does by default. Setting this sysctl - to true skips the message, making IPv4 and IPv6 on par in relying - on userspace caches to track link events and evict routes. - Default: false (generate message) - -nexthop_compat_mode - BOOLEAN - New nexthop API provides a means for managing nexthops independent of - prefixes. Backwards compatibilty with old route format is enabled by - default which means route dumps and notifications contain the new - nexthop attribute but also the full, expanded nexthop definition. - Further, updates or deletes of a nexthop configuration generate route - notifications for each fib entry using the nexthop. Once a system - understands the new API, this sysctl can be disabled to achieve full - performance benefits of the new API by disabling the nexthop expansion - and extraneous notifications. - Default: true (backward compat mode) - -IPv6 Fragmentation: - -ip6frag_high_thresh - INTEGER - Maximum memory used to reassemble IPv6 fragments. When - ip6frag_high_thresh bytes of memory is allocated for this purpose, - the fragment handler will toss packets until ip6frag_low_thresh - is reached. - -ip6frag_low_thresh - INTEGER - See ip6frag_high_thresh - -ip6frag_time - INTEGER - Time in seconds to keep an IPv6 fragment in memory. - -IPv6 Segment Routing: - -seg6_flowlabel - INTEGER - Controls the behaviour of computing the flowlabel of outer - IPv6 header in case of SR T.encaps - - -1 set flowlabel to zero. - 0 copy flowlabel from Inner packet in case of Inner IPv6 - (Set flowlabel to 0 in case IPv4/L2) - 1 Compute the flowlabel using seg6_make_flowlabel() - - Default is 0. - -conf/default/*: - Change the interface-specific default settings. - - -conf/all/*: - Change all the interface-specific settings. - - [XXX: Other special features than forwarding?] - -conf/all/forwarding - BOOLEAN - Enable global IPv6 forwarding between all interfaces. - - IPv4 and IPv6 work differently here; e.g. netfilter must be used - to control which interfaces may forward packets and which not. - - This also sets all interfaces' Host/Router setting - 'forwarding' to the specified value. See below for details. - - This referred to as global forwarding. - -proxy_ndp - BOOLEAN - Do proxy ndp. - -fwmark_reflect - BOOLEAN - Controls the fwmark of kernel-generated IPv6 reply packets that are not - associated with a socket for example, TCP RSTs or ICMPv6 echo replies). - If unset, these packets have a fwmark of zero. If set, they have the - fwmark of the packet they are replying to. - Default: 0 - -conf/interface/*: - Change special settings per interface. - - The functional behaviour for certain settings is different - depending on whether local forwarding is enabled or not. - -accept_ra - INTEGER - Accept Router Advertisements; autoconfigure using them. - - It also determines whether or not to transmit Router - Solicitations. If and only if the functional setting is to - accept Router Advertisements, Router Solicitations will be - transmitted. - - Possible values are: - 0 Do not accept Router Advertisements. - 1 Accept Router Advertisements if forwarding is disabled. - 2 Overrule forwarding behaviour. Accept Router Advertisements - even if forwarding is enabled. - - Functional default: enabled if local forwarding is disabled. - disabled if local forwarding is enabled. - -accept_ra_defrtr - BOOLEAN - Learn default router in Router Advertisement. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_ra_from_local - BOOLEAN - Accept RA with source-address that is found on local machine - if the RA is otherwise proper and able to be accepted. - Default is to NOT accept these as it may be an un-intended - network loop. - - Functional default: - enabled if accept_ra_from_local is enabled - on a specific interface. - disabled if accept_ra_from_local is disabled - on a specific interface. - -accept_ra_min_hop_limit - INTEGER - Minimum hop limit Information in Router Advertisement. - - Hop limit Information in Router Advertisement less than this - variable shall be ignored. - - Default: 1 - -accept_ra_pinfo - BOOLEAN - Learn Prefix Information in Router Advertisement. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_ra_rt_info_min_plen - INTEGER - Minimum prefix length of Route Information in RA. - - Route Information w/ prefix smaller than this variable shall - be ignored. - - Functional default: 0 if accept_ra_rtr_pref is enabled. - -1 if accept_ra_rtr_pref is disabled. - -accept_ra_rt_info_max_plen - INTEGER - Maximum prefix length of Route Information in RA. - - Route Information w/ prefix larger than this variable shall - be ignored. - - Functional default: 0 if accept_ra_rtr_pref is enabled. - -1 if accept_ra_rtr_pref is disabled. - -accept_ra_rtr_pref - BOOLEAN - Accept Router Preference in RA. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_ra_mtu - BOOLEAN - Apply the MTU value specified in RA option 5 (RFC4861). If - disabled, the MTU specified in the RA will be ignored. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_redirects - BOOLEAN - Accept Redirects. - - Functional default: enabled if local forwarding is disabled. - disabled if local forwarding is enabled. - -accept_source_route - INTEGER - Accept source routing (routing extension header). - - >= 0: Accept only routing header type 2. - < 0: Do not accept routing header. - - Default: 0 - -autoconf - BOOLEAN - Autoconfigure addresses using Prefix Information in Router - Advertisements. - - Functional default: enabled if accept_ra_pinfo is enabled. - disabled if accept_ra_pinfo is disabled. - -dad_transmits - INTEGER - The amount of Duplicate Address Detection probes to send. - Default: 1 - -forwarding - INTEGER - Configure interface-specific Host/Router behaviour. - - Note: It is recommended to have the same setting on all - interfaces; mixed router/host scenarios are rather uncommon. - - Possible values are: - 0 Forwarding disabled - 1 Forwarding enabled - - FALSE (0): - - By default, Host behaviour is assumed. This means: - - 1. IsRouter flag is not set in Neighbour Advertisements. - 2. If accept_ra is TRUE (default), transmit Router - Solicitations. - 3. If accept_ra is TRUE (default), accept Router - Advertisements (and do autoconfiguration). - 4. If accept_redirects is TRUE (default), accept Redirects. - - TRUE (1): - - If local forwarding is enabled, Router behaviour is assumed. - This means exactly the reverse from the above: - - 1. IsRouter flag is set in Neighbour Advertisements. - 2. Router Solicitations are not sent unless accept_ra is 2. - 3. Router Advertisements are ignored unless accept_ra is 2. - 4. Redirects are ignored. - - Default: 0 (disabled) if global forwarding is disabled (default), - otherwise 1 (enabled). - -hop_limit - INTEGER - Default Hop Limit to set. - Default: 64 - -mtu - INTEGER - Default Maximum Transfer Unit - Default: 1280 (IPv6 required minimum) - -ip_nonlocal_bind - BOOLEAN - If set, allows processes to bind() to non-local IPv6 addresses, - which can be quite useful - but may break some applications. - Default: 0 - -router_probe_interval - INTEGER - Minimum interval (in seconds) between Router Probing described - in RFC4191. - - Default: 60 - -router_solicitation_delay - INTEGER - Number of seconds to wait after interface is brought up - before sending Router Solicitations. - Default: 1 - -router_solicitation_interval - INTEGER - Number of seconds to wait between Router Solicitations. - Default: 4 - -router_solicitations - INTEGER - Number of Router Solicitations to send until assuming no - routers are present. - Default: 3 - -use_oif_addrs_only - BOOLEAN - When enabled, the candidate source addresses for destinations - routed via this interface are restricted to the set of addresses - configured on this interface (vis. RFC 6724, section 4). - - Default: false - -use_tempaddr - INTEGER - Preference for Privacy Extensions (RFC3041). - <= 0 : disable Privacy Extensions - == 1 : enable Privacy Extensions, but prefer public - addresses over temporary addresses. - > 1 : enable Privacy Extensions and prefer temporary - addresses over public addresses. - Default: 0 (for most devices) - -1 (for point-to-point devices and loopback devices) - -temp_valid_lft - INTEGER - valid lifetime (in seconds) for temporary addresses. - Default: 604800 (7 days) - -temp_prefered_lft - INTEGER - Preferred lifetime (in seconds) for temporary addresses. - Default: 86400 (1 day) - -keep_addr_on_down - INTEGER - Keep all IPv6 addresses on an interface down event. If set static - global addresses with no expiration time are not flushed. - >0 : enabled - 0 : system default - <0 : disabled - - Default: 0 (addresses are removed) - -max_desync_factor - INTEGER - Maximum value for DESYNC_FACTOR, which is a random value - that ensures that clients don't synchronize with each - other and generate new addresses at exactly the same time. - value is in seconds. - Default: 600 - -regen_max_retry - INTEGER - Number of attempts before give up attempting to generate - valid temporary addresses. - Default: 5 - -max_addresses - INTEGER - Maximum number of autoconfigured addresses per interface. Setting - to zero disables the limitation. It is not recommended to set this - value too large (or to zero) because it would be an easy way to - crash the kernel by allowing too many addresses to be created. - Default: 16 - -disable_ipv6 - BOOLEAN - Disable IPv6 operation. If accept_dad is set to 2, this value - will be dynamically set to TRUE if DAD fails for the link-local - address. - Default: FALSE (enable IPv6 operation) - - When this value is changed from 1 to 0 (IPv6 is being enabled), - it will dynamically create a link-local address on the given - interface and start Duplicate Address Detection, if necessary. - - When this value is changed from 0 to 1 (IPv6 is being disabled), - it will dynamically delete all addresses and routes on the given - interface. From now on it will not possible to add addresses/routes - to the selected interface. - -accept_dad - INTEGER - Whether to accept DAD (Duplicate Address Detection). - 0: Disable DAD - 1: Enable DAD (default) - 2: Enable DAD, and disable IPv6 operation if MAC-based duplicate - link-local address has been found. - - DAD operation and mode on a given interface will be selected according - to the maximum value of conf/{all,interface}/accept_dad. - -force_tllao - BOOLEAN - Enable sending the target link-layer address option even when - responding to a unicast neighbor solicitation. - Default: FALSE - - Quoting from RFC 2461, section 4.4, Target link-layer address: - - "The option MUST be included for multicast solicitations in order to - avoid infinite Neighbor Solicitation "recursion" when the peer node - does not have a cache entry to return a Neighbor Advertisements - message. When responding to unicast solicitations, the option can be - omitted since the sender of the solicitation has the correct link- - layer address; otherwise it would not have be able to send the unicast - solicitation in the first place. However, including the link-layer - address in this case adds little overhead and eliminates a potential - race condition where the sender deletes the cached link-layer address - prior to receiving a response to a previous solicitation." - -ndisc_notify - BOOLEAN - Define mode for notification of address and device changes. - 0 - (default): do nothing - 1 - Generate unsolicited neighbour advertisements when device is brought - up or hardware address changes. - -ndisc_tclass - INTEGER - The IPv6 Traffic Class to use by default when sending IPv6 Neighbor - Discovery (Router Solicitation, Router Advertisement, Neighbor - Solicitation, Neighbor Advertisement, Redirect) messages. - These 8 bits can be interpreted as 6 high order bits holding the DSCP - value and 2 low order bits representing ECN (which you probably want - to leave cleared). - 0 - (default) - -mldv1_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - MLDv1 report retransmit will take place. - Default: 10000 (10 seconds) - -mldv2_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - MLDv2 report retransmit will take place. - Default: 1000 (1 second) - -force_mld_version - INTEGER - 0 - (default) No enforcement of a MLD version, MLDv1 fallback allowed - 1 - Enforce to use MLD version 1 - 2 - Enforce to use MLD version 2 - -suppress_frag_ndisc - INTEGER - Control RFC 6980 (Security Implications of IPv6 Fragmentation - with IPv6 Neighbor Discovery) behavior: - 1 - (default) discard fragmented neighbor discovery packets - 0 - allow fragmented neighbor discovery packets - -optimistic_dad - BOOLEAN - Whether to perform Optimistic Duplicate Address Detection (RFC 4429). - 0: disabled (default) - 1: enabled - - Optimistic Duplicate Address Detection for the interface will be enabled - if at least one of conf/{all,interface}/optimistic_dad is set to 1, - it will be disabled otherwise. - -use_optimistic - BOOLEAN - If enabled, do not classify optimistic addresses as deprecated during - source address selection. Preferred addresses will still be chosen - before optimistic addresses, subject to other ranking in the source - address selection algorithm. - 0: disabled (default) - 1: enabled - - This will be enabled if at least one of - conf/{all,interface}/use_optimistic is set to 1, disabled otherwise. - -stable_secret - IPv6 address - This IPv6 address will be used as a secret to generate IPv6 - addresses for link-local addresses and autoconfigured - ones. All addresses generated after setting this secret will - be stable privacy ones by default. This can be changed via the - addrgenmode ip-link. conf/default/stable_secret is used as the - secret for the namespace, the interface specific ones can - overwrite that. Writes to conf/all/stable_secret are refused. - - It is recommended to generate this secret during installation - of a system and keep it stable after that. - - By default the stable secret is unset. - -addr_gen_mode - INTEGER - Defines how link-local and autoconf addresses are generated. - - 0: generate address based on EUI64 (default) - 1: do no generate a link-local address, use EUI64 for addresses generated - from autoconf - 2: generate stable privacy addresses, using the secret from - stable_secret (RFC7217) - 3: generate stable privacy addresses, using a random secret if unset - -drop_unicast_in_l2_multicast - BOOLEAN - Drop any unicast IPv6 packets that are received in link-layer - multicast (or broadcast) frames. - - By default this is turned off. - -drop_unsolicited_na - BOOLEAN - Drop all unsolicited neighbor advertisements, for example if there's - a known good NA proxy on the network and such frames need not be used - (or in the case of 802.11, must not be used to prevent attacks.) - - By default this is turned off. - -enhanced_dad - BOOLEAN - Include a nonce option in the IPv6 neighbor solicitation messages used for - duplicate address detection per RFC7527. A received DAD NS will only signal - a duplicate address if the nonce is different. This avoids any false - detection of duplicates due to loopback of the NS messages that we send. - The nonce option will be sent on an interface unless both of - conf/{all,interface}/enhanced_dad are set to FALSE. - Default: TRUE - -icmp/*: -ratelimit - INTEGER - Limit the maximal rates for sending ICMPv6 messages. - 0 to disable any limiting, - otherwise the minimal space between responses in milliseconds. - Default: 1000 - -ratemask - list of comma separated ranges - For ICMPv6 message types matching the ranges in the ratemask, limit - the sending of the message according to ratelimit parameter. - - The format used for both input and output is a comma separated - list of ranges (e.g. "0-127,129" for ICMPv6 message type 0 to 127 and - 129). Writing to the file will clear all previous ranges of ICMPv6 - message types and update the current list with the input. - - Refer to: https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml - for numerical values of ICMPv6 message types, e.g. echo request is 128 - and echo reply is 129. - - Default: 0-1,3-127 (rate limit ICMPv6 errors except Packet Too Big) - -echo_ignore_all - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it over the IPv6 protocol. - Default: 0 - -echo_ignore_multicast - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it over the IPv6 protocol via multicast. - Default: 0 - -echo_ignore_anycast - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it over the IPv6 protocol destined to anycast address. - Default: 0 - -xfrm6_gc_thresh - INTEGER - (Obsolete since linux-4.14) - The threshold at which we will start garbage collecting for IPv6 - destination cache entries. At twice this value the system will - refuse new allocations. - - -IPv6 Update by: -Pekka Savola -YOSHIFUJI Hideaki / USAGI Project - - -/proc/sys/net/bridge/* Variables: - -bridge-nf-call-arptables - BOOLEAN - 1 : pass bridged ARP traffic to arptables' FORWARD chain. - 0 : disable this. - Default: 1 - -bridge-nf-call-iptables - BOOLEAN - 1 : pass bridged IPv4 traffic to iptables' chains. - 0 : disable this. - Default: 1 - -bridge-nf-call-ip6tables - BOOLEAN - 1 : pass bridged IPv6 traffic to ip6tables' chains. - 0 : disable this. - Default: 1 - -bridge-nf-filter-vlan-tagged - BOOLEAN - 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables. - 0 : disable this. - Default: 0 - -bridge-nf-filter-pppoe-tagged - BOOLEAN - 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables. - 0 : disable this. - Default: 0 - -bridge-nf-pass-vlan-input-dev - BOOLEAN - 1: if bridge-nf-filter-vlan-tagged is enabled, try to find a vlan - interface on the bridge and set the netfilter input device to the vlan. - This allows use of e.g. "iptables -i br0.1" and makes the REDIRECT - target work with vlan-on-top-of-bridge interfaces. When no matching - vlan interface is found, or this switch is off, the input device is - set to the bridge interface. - 0: disable bridge netfilter vlan interface lookup. - Default: 0 - -proc/sys/net/sctp/* Variables: - -addip_enable - BOOLEAN - Enable or disable extension of Dynamic Address Reconfiguration - (ADD-IP) functionality specified in RFC5061. This extension provides - the ability to dynamically add and remove new addresses for the SCTP - associations. - - 1: Enable extension. - - 0: Disable extension. - - Default: 0 - -pf_enable - INTEGER - Enable or disable pf (pf is short for potentially failed) state. A value - of pf_retrans > path_max_retrans also disables pf state. That is, one of - both pf_enable and pf_retrans > path_max_retrans can disable pf state. - Since pf_retrans and path_max_retrans can be changed by userspace - application, sometimes user expects to disable pf state by the value of - pf_retrans > path_max_retrans, but occasionally the value of pf_retrans - or path_max_retrans is changed by the user application, this pf state is - enabled. As such, it is necessary to add this to dynamically enable - and disable pf state. See: - https://datatracker.ietf.org/doc/draft-ietf-tsvwg-sctp-failover for - details. - - 1: Enable pf. - - 0: Disable pf. - - Default: 1 - -pf_expose - INTEGER - Unset or enable/disable pf (pf is short for potentially failed) state - exposure. Applications can control the exposure of the PF path state - in the SCTP_PEER_ADDR_CHANGE event and the SCTP_GET_PEER_ADDR_INFO - sockopt. When it's unset, no SCTP_PEER_ADDR_CHANGE event with - SCTP_ADDR_PF state will be sent and a SCTP_PF-state transport info - can be got via SCTP_GET_PEER_ADDR_INFO sockopt; When it's enabled, - a SCTP_PEER_ADDR_CHANGE event will be sent for a transport becoming - SCTP_PF state and a SCTP_PF-state transport info can be got via - SCTP_GET_PEER_ADDR_INFO sockopt; When it's diabled, no - SCTP_PEER_ADDR_CHANGE event will be sent and it returns -EACCES when - trying to get a SCTP_PF-state transport info via SCTP_GET_PEER_ADDR_INFO - sockopt. - - 0: Unset pf state exposure, Compatible with old applications. - - 1: Disable pf state exposure. - - 2: Enable pf state exposure. - - Default: 0 - -addip_noauth_enable - BOOLEAN - Dynamic Address Reconfiguration (ADD-IP) requires the use of - authentication to protect the operations of adding or removing new - addresses. This requirement is mandated so that unauthorized hosts - would not be able to hijack associations. However, older - implementations may not have implemented this requirement while - allowing the ADD-IP extension. For reasons of interoperability, - we provide this variable to control the enforcement of the - authentication requirement. - - 1: Allow ADD-IP extension to be used without authentication. This - should only be set in a closed environment for interoperability - with older implementations. - - 0: Enforce the authentication requirement - - Default: 0 - -auth_enable - BOOLEAN - Enable or disable Authenticated Chunks extension. This extension - provides the ability to send and receive authenticated chunks and is - required for secure operation of Dynamic Address Reconfiguration - (ADD-IP) extension. - - 1: Enable this extension. - 0: Disable this extension. - - Default: 0 - -prsctp_enable - BOOLEAN - Enable or disable the Partial Reliability extension (RFC3758) which - is used to notify peers that a given DATA should no longer be expected. - - 1: Enable extension - 0: Disable - - Default: 1 - -max_burst - INTEGER - The limit of the number of new packets that can be initially sent. It - controls how bursty the generated traffic can be. - - Default: 4 - -association_max_retrans - INTEGER - Set the maximum number for retransmissions that an association can - attempt deciding that the remote end is unreachable. If this value - is exceeded, the association is terminated. - - Default: 10 - -max_init_retransmits - INTEGER - The maximum number of retransmissions of INIT and COOKIE-ECHO chunks - that an association will attempt before declaring the destination - unreachable and terminating. - - Default: 8 - -path_max_retrans - INTEGER - The maximum number of retransmissions that will be attempted on a given - path. Once this threshold is exceeded, the path is considered - unreachable, and new traffic will use a different path when the - association is multihomed. - - Default: 5 - -pf_retrans - INTEGER - The number of retransmissions that will be attempted on a given path - before traffic is redirected to an alternate transport (should one - exist). Note this is distinct from path_max_retrans, as a path that - passes the pf_retrans threshold can still be used. Its only - deprioritized when a transmission path is selected by the stack. This - setting is primarily used to enable fast failover mechanisms without - having to reduce path_max_retrans to a very low value. See: - http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt - for details. Note also that a value of pf_retrans > path_max_retrans - disables this feature. Since both pf_retrans and path_max_retrans can - be changed by userspace application, a variable pf_enable is used to - disable pf state. - - Default: 0 - -ps_retrans - INTEGER - Primary.Switchover.Max.Retrans (PSMR), it's a tunable parameter coming - from section-5 "Primary Path Switchover" in rfc7829. The primary path - will be changed to another active path when the path error counter on - the old primary path exceeds PSMR, so that "the SCTP sender is allowed - to continue data transmission on a new working path even when the old - primary destination address becomes active again". Note this feature - is disabled by initializing 'ps_retrans' per netns as 0xffff by default, - and its value can't be less than 'pf_retrans' when changing by sysctl. - - Default: 0xffff - -rto_initial - INTEGER - The initial round trip timeout value in milliseconds that will be used - in calculating round trip times. This is the initial time interval - for retransmissions. - - Default: 3000 - -rto_max - INTEGER - The maximum value (in milliseconds) of the round trip timeout. This - is the largest time interval that can elapse between retransmissions. - - Default: 60000 - -rto_min - INTEGER - The minimum value (in milliseconds) of the round trip timeout. This - is the smallest time interval the can elapse between retransmissions. - - Default: 1000 - -hb_interval - INTEGER - The interval (in milliseconds) between HEARTBEAT chunks. These chunks - are sent at the specified interval on idle paths to probe the state of - a given path between 2 associations. - - Default: 30000 - -sack_timeout - INTEGER - The amount of time (in milliseconds) that the implementation will wait - to send a SACK. - - Default: 200 - -valid_cookie_life - INTEGER - The default lifetime of the SCTP cookie (in milliseconds). The cookie - is used during association establishment. - - Default: 60000 - -cookie_preserve_enable - BOOLEAN - Enable or disable the ability to extend the lifetime of the SCTP cookie - that is used during the establishment phase of SCTP association - - 1: Enable cookie lifetime extension. - 0: Disable - - Default: 1 - -cookie_hmac_alg - STRING - Select the hmac algorithm used when generating the cookie value sent by - a listening sctp socket to a connecting client in the INIT-ACK chunk. - Valid values are: - * md5 - * sha1 - * none - Ability to assign md5 or sha1 as the selected alg is predicated on the - configuration of those algorithms at build time (CONFIG_CRYPTO_MD5 and - CONFIG_CRYPTO_SHA1). - - Default: Dependent on configuration. MD5 if available, else SHA1 if - available, else none. - -rcvbuf_policy - INTEGER - Determines if the receive buffer is attributed to the socket or to - association. SCTP supports the capability to create multiple - associations on a single socket. When using this capability, it is - possible that a single stalled association that's buffering a lot - of data may block other associations from delivering their data by - consuming all of the receive buffer space. To work around this, - the rcvbuf_policy could be set to attribute the receiver buffer space - to each association instead of the socket. This prevents the described - blocking. - - 1: rcvbuf space is per association - 0: rcvbuf space is per socket - - Default: 0 - -sndbuf_policy - INTEGER - Similar to rcvbuf_policy above, this applies to send buffer space. - - 1: Send buffer is tracked per association - 0: Send buffer is tracked per socket. - - Default: 0 - -sctp_mem - vector of 3 INTEGERs: min, pressure, max - Number of pages allowed for queueing by all SCTP sockets. - - min: Below this number of pages SCTP is not bothered about its - memory appetite. When amount of memory allocated by SCTP exceeds - this number, SCTP starts to moderate memory usage. - - pressure: This value was introduced to follow format of tcp_mem. - - max: Number of pages allowed for queueing by all SCTP sockets. - - Default is calculated at boot time from amount of available memory. - -sctp_rmem - vector of 3 INTEGERs: min, default, max - Only the first value ("min") is used, "default" and "max" are - ignored. - - min: Minimal size of receive buffer used by SCTP socket. - It is guaranteed to each SCTP socket (but not association) even - under moderate memory pressure. - - Default: 4K - -sctp_wmem - vector of 3 INTEGERs: min, default, max - Currently this tunable has no effect. - -addr_scope_policy - INTEGER - Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00 - - 0 - Disable IPv4 address scoping - 1 - Enable IPv4 address scoping - 2 - Follow draft but allow IPv4 private addresses - 3 - Follow draft but allow IPv4 link local addresses - - Default: 1 - - -/proc/sys/net/core/* - Please see: Documentation/admin-guide/sysctl/net.rst for descriptions of these entries. - - -/proc/sys/net/unix/* -max_dgram_qlen - INTEGER - The maximum length of dgram socket receive queue - - Default: 10 - diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index 10e11099e74a..4edd0d38779e 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -792,7 +792,7 @@ counters to indicate the ACK is skipped in which scenario. The ACK would only be skipped if the received packet is either a SYN packet or it has no data. -.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt +.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.rst * TcpExtTCPACKSkippedSynRecv diff --git a/net/Kconfig b/net/Kconfig index df8d8c9bd021..8b1f85820a6b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -86,7 +86,7 @@ config INET "Sysctl support" below, you can change various aspects of the behavior of the TCP/IP code by writing to the (virtual) files in /proc/sys/net/ipv4/*; the options are explained in the file - . + . Short answer: say Y. diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 25a8888826b8..5da4733067fb 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -49,7 +49,7 @@ config IP_ADVANCED_ROUTER Note that some distributions enable it in startup scripts. For details about rp_filter strict and loose mode read - . + . If unsure, say N here. diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fc61f51d87a3..956a806649f7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -853,7 +853,7 @@ static bool icmp_unreach(struct sk_buff *skb) case ICMP_FRAG_NEEDED: /* for documentation of the ip_no_pmtu_disc * values please see - * Documentation/networking/ip-sysctl.txt + * Documentation/networking/ip-sysctl.rst */ switch (net->ipv4.sysctl_ip_no_pmtu_disc) { default: -- cgit v1.2.3-59-g8ed1b From 19093313cb0486d568232934bb80dd422d891623 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:50 +0200 Subject: docs: networking: convert ipv6.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - mark a literal as such, in order to avoid a warning; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 6 +- Documentation/networking/index.rst | 1 + Documentation/networking/ipv6.rst | 78 +++++++++++++++++++++++++ Documentation/networking/ipv6.txt | 72 ----------------------- net/ipv6/Kconfig | 2 +- 5 files changed, 83 insertions(+), 76 deletions(-) create mode 100644 Documentation/networking/ipv6.rst delete mode 100644 Documentation/networking/ipv6.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e37db6f1be64..e43f2e1f2958 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -356,7 +356,7 @@ shot down by NMI autoconf= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller Limit apic dumping. The parameter defines the maximal @@ -872,7 +872,7 @@ miss to occur. disable= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. hardened_usercopy= [KNL] Under CONFIG_HARDENED_USERCOPY, whether @@ -912,7 +912,7 @@ to workaround buggy firmware. disable_ipv6= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. disable_mtrr_cleanup [X86] The kernel tries to adjust MTRR layout from continuous diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 7d133d8dbe2a..709675464e51 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -70,6 +70,7 @@ Contents: iphase ipsec ip-sysctl + ipv6 .. only:: subproject and html diff --git a/Documentation/networking/ipv6.rst b/Documentation/networking/ipv6.rst new file mode 100644 index 000000000000..ba09c2f2dcc7 --- /dev/null +++ b/Documentation/networking/ipv6.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +IPv6 +==== + + +Options for the ipv6 module are supplied as parameters at load time. + +Module options may be given as command line arguments to the insmod +or modprobe command, but are usually specified in either +``/etc/modules.d/*.conf`` configuration files, or in a distro-specific +configuration file. + +The available ipv6 module parameters are listed below. If a parameter +is not specified the default value is used. + +The parameters are as follows: + +disable + + Specifies whether to load the IPv6 module, but disable all + its functionality. This might be used when another module + has a dependency on the IPv6 module being loaded, but no + IPv6 addresses or operations are desired. + + The possible values and their effects are: + + 0 + IPv6 is enabled. + + This is the default value. + + 1 + IPv6 is disabled. + + No IPv6 addresses will be added to interfaces, and + it will not be possible to open an IPv6 socket. + + A reboot is required to enable IPv6. + +autoconf + + Specifies whether to enable IPv6 address autoconfiguration + on all interfaces. This might be used when one does not wish + for addresses to be automatically generated from prefixes + received in Router Advertisements. + + The possible values and their effects are: + + 0 + IPv6 address autoconfiguration is disabled on all interfaces. + + Only the IPv6 loopback address (::1) and link-local addresses + will be added to interfaces. + + 1 + IPv6 address autoconfiguration is enabled on all interfaces. + + This is the default value. + +disable_ipv6 + + Specifies whether to disable IPv6 on all interfaces. + This might be used when no IPv6 addresses are desired. + + The possible values and their effects are: + + 0 + IPv6 is enabled on all interfaces. + + This is the default value. + + 1 + IPv6 is disabled on all interfaces. + + No IPv6 addresses will be added to interfaces. + diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.txt deleted file mode 100644 index 6cd74fa55358..000000000000 --- a/Documentation/networking/ipv6.txt +++ /dev/null @@ -1,72 +0,0 @@ - -Options for the ipv6 module are supplied as parameters at load time. - -Module options may be given as command line arguments to the insmod -or modprobe command, but are usually specified in either -/etc/modules.d/*.conf configuration files, or in a distro-specific -configuration file. - -The available ipv6 module parameters are listed below. If a parameter -is not specified the default value is used. - -The parameters are as follows: - -disable - - Specifies whether to load the IPv6 module, but disable all - its functionality. This might be used when another module - has a dependency on the IPv6 module being loaded, but no - IPv6 addresses or operations are desired. - - The possible values and their effects are: - - 0 - IPv6 is enabled. - - This is the default value. - - 1 - IPv6 is disabled. - - No IPv6 addresses will be added to interfaces, and - it will not be possible to open an IPv6 socket. - - A reboot is required to enable IPv6. - -autoconf - - Specifies whether to enable IPv6 address autoconfiguration - on all interfaces. This might be used when one does not wish - for addresses to be automatically generated from prefixes - received in Router Advertisements. - - The possible values and their effects are: - - 0 - IPv6 address autoconfiguration is disabled on all interfaces. - - Only the IPv6 loopback address (::1) and link-local addresses - will be added to interfaces. - - 1 - IPv6 address autoconfiguration is enabled on all interfaces. - - This is the default value. - -disable_ipv6 - - Specifies whether to disable IPv6 on all interfaces. - This might be used when no IPv6 addresses are desired. - - The possible values and their effects are: - - 0 - IPv6 is enabled on all interfaces. - - This is the default value. - - 1 - IPv6 is disabled on all interfaces. - - No IPv6 addresses will be added to interfaces. - diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 2ccaee98fddb..5a6111da26c4 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -13,7 +13,7 @@ menuconfig IPV6 For general information about IPv6, see . For specific information about IPv6 under Linux, see - Documentation/networking/ipv6.txt and read the HOWTO at + Documentation/networking/ipv6.rst and read the HOWTO at To compile this protocol support as a module, choose M here: the -- cgit v1.2.3-59-g8ed1b From 1dc2a785954bf4e562d0c85bea435ee56f705db5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:51 +0200 Subject: docs: networking: convert ipvlan.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/ipvlan.rst | 189 ++++++++++++++++++++++++++++++++++++ Documentation/networking/ipvlan.txt | 146 ---------------------------- 3 files changed, 190 insertions(+), 146 deletions(-) create mode 100644 Documentation/networking/ipvlan.rst delete mode 100644 Documentation/networking/ipvlan.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 709675464e51..54dee1575b54 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -71,6 +71,7 @@ Contents: ipsec ip-sysctl ipv6 + ipvlan .. only:: subproject and html diff --git a/Documentation/networking/ipvlan.rst b/Documentation/networking/ipvlan.rst new file mode 100644 index 000000000000..694adcba36b0 --- /dev/null +++ b/Documentation/networking/ipvlan.rst @@ -0,0 +1,189 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +IPVLAN Driver HOWTO +=================== + +Initial Release: + Mahesh Bandewar + +1. Introduction: +================ +This is conceptually very similar to the macvlan driver with one major +exception of using L3 for mux-ing /demux-ing among slaves. This property makes +the master device share the L2 with it's slave devices. I have developed this +driver in conjunction with network namespaces and not sure if there is use case +outside of it. + + +2. Building and Installation: +============================= + +In order to build the driver, please select the config item CONFIG_IPVLAN. +The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module +(CONFIG_IPVLAN=m). + + +3. Configuration: +================= + +There are no module parameters for this driver and it can be configured +using IProute2/ip utility. +:: + + ip link add link name type ipvlan [ mode MODE ] [ FLAGS ] + where + MODE: l3 (default) | l3s | l2 + FLAGS: bridge (default) | private | vepa + +e.g. + + (a) Following will create IPvlan link with eth0 as master in + L3 bridge mode:: + + bash# ip link add link eth0 name ipvl0 type ipvlan + (b) This command will create IPvlan link in L2 bridge mode:: + + bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge + + (c) This command will create an IPvlan device in L2 private mode:: + + bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private + + (d) This command will create an IPvlan device in L2 vepa mode:: + + bash# ip link add link eth0 name ipvlan type ipvlan mode l2 vepa + + +4. Operating modes: +=================== + +IPvlan has two modes of operation - L2 and L3. For a given master device, +you can select one of these two modes and all slaves on that master will +operate in the same (selected) mode. The RX mode is almost identical except +that in L3 mode the slaves wont receive any multicast / broadcast traffic. +L3 mode is more restrictive since routing is controlled from the other (mostly) +default namespace. + +4.1 L2 mode: +------------ + +In this mode TX processing happens on the stack instance attached to the +slave device and packets are switched and queued to the master device to send +out. In this mode the slaves will RX/TX multicast and broadcast (if applicable) +as well. + +4.2 L3 mode: +------------ + +In this mode TX processing up to L3 happens on the stack instance attached +to the slave device and packets are switched to the stack instance of the +master device for the L2 processing and routing from that instance will be +used before packets are queued on the outbound device. In this mode the slaves +will not receive nor can send multicast / broadcast traffic. + +4.3 L3S mode: +------------- + +This is very similar to the L3 mode except that iptables (conn-tracking) +works in this mode and hence it is L3-symmetric (L3s). This will have slightly less +performance but that shouldn't matter since you are choosing this mode over plain-L3 +mode to make conn-tracking work. + +5. Mode flags: +============== + +At this time following mode flags are available + +5.1 bridge: +----------- +This is the default option. To configure the IPvlan port in this mode, +user can choose to either add this option on the command-line or don't specify +anything. This is the traditional mode where slaves can cross-talk among +themselves apart from talking through the master device. + +5.2 private: +------------ +If this option is added to the command-line, the port is set in private +mode. i.e. port won't allow cross communication between slaves. + +5.3 vepa: +--------- +If this is added to the command-line, the port is set in VEPA mode. +i.e. port will offload switching functionality to the external entity as +described in 802.1Qbg +Note: VEPA mode in IPvlan has limitations. IPvlan uses the mac-address of the +master-device, so the packets which are emitted in this mode for the adjacent +neighbor will have source and destination mac same. This will make the switch / +router send the redirect message. + +6. What to choose (macvlan vs. ipvlan)? +======================================= + +These two devices are very similar in many regards and the specific use +case could very well define which device to choose. if one of the following +situations defines your use case then you can choose to use ipvlan: + + +(a) The Linux host that is connected to the external switch / router has + policy configured that allows only one mac per port. +(b) No of virtual devices created on a master exceed the mac capacity and + puts the NIC in promiscuous mode and degraded performance is a concern. +(c) If the slave device is to be put into the hostile / untrusted network + namespace where L2 on the slave could be changed / misused. + + +6. Example configuration: +========================= + +:: + + +=============================================================+ + | Host: host1 | + | | + | +----------------------+ +----------------------+ | + | | NS:ns0 | | NS:ns1 | | + | | | | | | + | | | | | | + | | ipvl0 | | ipvl1 | | + | +----------#-----------+ +-----------#----------+ | + | # # | + | ################################ | + | # eth0 | + +==============================#==============================+ + + +(a) Create two network namespaces - ns0, ns1:: + + ip netns add ns0 + ip netns add ns1 + +(b) Create two ipvlan slaves on eth0 (master device):: + + ip link add link eth0 ipvl0 type ipvlan mode l2 + ip link add link eth0 ipvl1 type ipvlan mode l2 + +(c) Assign slaves to the respective network namespaces:: + + ip link set dev ipvl0 netns ns0 + ip link set dev ipvl1 netns ns1 + +(d) Now switch to the namespace (ns0 or ns1) to configure the slave devices + + - For ns0:: + + (1) ip netns exec ns0 bash + (2) ip link set dev ipvl0 up + (3) ip link set dev lo up + (4) ip -4 addr add 127.0.0.1 dev lo + (5) ip -4 addr add $IPADDR dev ipvl0 + (6) ip -4 route add default via $ROUTER dev ipvl0 + + - For ns1:: + + (1) ip netns exec ns1 bash + (2) ip link set dev ipvl1 up + (3) ip link set dev lo up + (4) ip -4 addr add 127.0.0.1 dev lo + (5) ip -4 addr add $IPADDR dev ipvl1 + (6) ip -4 route add default via $ROUTER dev ipvl1 diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt deleted file mode 100644 index 27a38e50c287..000000000000 --- a/Documentation/networking/ipvlan.txt +++ /dev/null @@ -1,146 +0,0 @@ - - IPVLAN Driver HOWTO - -Initial Release: - Mahesh Bandewar - -1. Introduction: - This is conceptually very similar to the macvlan driver with one major -exception of using L3 for mux-ing /demux-ing among slaves. This property makes -the master device share the L2 with it's slave devices. I have developed this -driver in conjunction with network namespaces and not sure if there is use case -outside of it. - - -2. Building and Installation: - In order to build the driver, please select the config item CONFIG_IPVLAN. -The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module -(CONFIG_IPVLAN=m). - - -3. Configuration: - There are no module parameters for this driver and it can be configured -using IProute2/ip utility. - - ip link add link name type ipvlan [ mode MODE ] [ FLAGS ] - where - MODE: l3 (default) | l3s | l2 - FLAGS: bridge (default) | private | vepa - - e.g. - (a) Following will create IPvlan link with eth0 as master in - L3 bridge mode - bash# ip link add link eth0 name ipvl0 type ipvlan - (b) This command will create IPvlan link in L2 bridge mode. - bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge - (c) This command will create an IPvlan device in L2 private mode. - bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private - (d) This command will create an IPvlan device in L2 vepa mode. - bash# ip link add link eth0 name ipvlan type ipvlan mode l2 vepa - - -4. Operating modes: - IPvlan has two modes of operation - L2 and L3. For a given master device, -you can select one of these two modes and all slaves on that master will -operate in the same (selected) mode. The RX mode is almost identical except -that in L3 mode the slaves wont receive any multicast / broadcast traffic. -L3 mode is more restrictive since routing is controlled from the other (mostly) -default namespace. - -4.1 L2 mode: - In this mode TX processing happens on the stack instance attached to the -slave device and packets are switched and queued to the master device to send -out. In this mode the slaves will RX/TX multicast and broadcast (if applicable) -as well. - -4.2 L3 mode: - In this mode TX processing up to L3 happens on the stack instance attached -to the slave device and packets are switched to the stack instance of the -master device for the L2 processing and routing from that instance will be -used before packets are queued on the outbound device. In this mode the slaves -will not receive nor can send multicast / broadcast traffic. - -4.3 L3S mode: - This is very similar to the L3 mode except that iptables (conn-tracking) -works in this mode and hence it is L3-symmetric (L3s). This will have slightly less -performance but that shouldn't matter since you are choosing this mode over plain-L3 -mode to make conn-tracking work. - -5. Mode flags: - At this time following mode flags are available - -5.1 bridge: - This is the default option. To configure the IPvlan port in this mode, -user can choose to either add this option on the command-line or don't specify -anything. This is the traditional mode where slaves can cross-talk among -themselves apart from talking through the master device. - -5.2 private: - If this option is added to the command-line, the port is set in private -mode. i.e. port won't allow cross communication between slaves. - -5.3 vepa: - If this is added to the command-line, the port is set in VEPA mode. -i.e. port will offload switching functionality to the external entity as -described in 802.1Qbg -Note: VEPA mode in IPvlan has limitations. IPvlan uses the mac-address of the -master-device, so the packets which are emitted in this mode for the adjacent -neighbor will have source and destination mac same. This will make the switch / -router send the redirect message. - -6. What to choose (macvlan vs. ipvlan)? - These two devices are very similar in many regards and the specific use -case could very well define which device to choose. if one of the following -situations defines your use case then you can choose to use ipvlan - - (a) The Linux host that is connected to the external switch / router has -policy configured that allows only one mac per port. - (b) No of virtual devices created on a master exceed the mac capacity and -puts the NIC in promiscuous mode and degraded performance is a concern. - (c) If the slave device is to be put into the hostile / untrusted network -namespace where L2 on the slave could be changed / misused. - - -6. Example configuration: - - +=============================================================+ - | Host: host1 | - | | - | +----------------------+ +----------------------+ | - | | NS:ns0 | | NS:ns1 | | - | | | | | | - | | | | | | - | | ipvl0 | | ipvl1 | | - | +----------#-----------+ +-----------#----------+ | - | # # | - | ################################ | - | # eth0 | - +==============================#==============================+ - - - (a) Create two network namespaces - ns0, ns1 - ip netns add ns0 - ip netns add ns1 - - (b) Create two ipvlan slaves on eth0 (master device) - ip link add link eth0 ipvl0 type ipvlan mode l2 - ip link add link eth0 ipvl1 type ipvlan mode l2 - - (c) Assign slaves to the respective network namespaces - ip link set dev ipvl0 netns ns0 - ip link set dev ipvl1 netns ns1 - - (d) Now switch to the namespace (ns0 or ns1) to configure the slave devices - - For ns0 - (1) ip netns exec ns0 bash - (2) ip link set dev ipvl0 up - (3) ip link set dev lo up - (4) ip -4 addr add 127.0.0.1 dev lo - (5) ip -4 addr add $IPADDR dev ipvl0 - (6) ip -4 route add default via $ROUTER dev ipvl0 - - For ns1 - (1) ip netns exec ns1 bash - (2) ip link set dev ipvl1 up - (3) ip link set dev lo up - (4) ip -4 addr add 127.0.0.1 dev lo - (5) ip -4 addr add $IPADDR dev ipvl1 - (6) ip -4 route add default via $ROUTER dev ipvl1 -- cgit v1.2.3-59-g8ed1b From 82a07bf33d7d0c3a194f62178e0fea2d68227b89 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:52 +0200 Subject: docs: networking: convert ipvs-sysctl.txt to ReST - add SPDX header; - add a document title; - mark lists as such; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Acked-by: Simon Horman Signed-off-by: David S. Miller --- Documentation/admin-guide/sysctl/net.rst | 4 +- Documentation/networking/index.rst | 1 + Documentation/networking/ipvs-sysctl.rst | 302 +++++++++++++++++++++++++++++++ Documentation/networking/ipvs-sysctl.txt | 294 ------------------------------ MAINTAINERS | 2 +- 5 files changed, 306 insertions(+), 297 deletions(-) create mode 100644 Documentation/networking/ipvs-sysctl.rst delete mode 100644 Documentation/networking/ipvs-sysctl.txt diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 84e3348a9543..2ad1b77a7182 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -353,8 +353,8 @@ socket's buffer. It will not take effect unless PF_UNIX flag is specified. 3. /proc/sys/net/ipv4 - IPV4 settings ------------------------------------- -Please see: Documentation/networking/ip-sysctl.rst and ipvs-sysctl.txt for -descriptions of these entries. +Please see: Documentation/networking/ip-sysctl.rst and +Documentation/admin-guide/sysctl/net.rst for descriptions of these entries. 4. Appletalk diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 54dee1575b54..bbd4e0041457 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -72,6 +72,7 @@ Contents: ip-sysctl ipv6 ipvlan + ipvs-sysctl .. only:: subproject and html diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst new file mode 100644 index 000000000000..be36c4600e8f --- /dev/null +++ b/Documentation/networking/ipvs-sysctl.rst @@ -0,0 +1,302 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========== +IPvs-sysctl +=========== + +/proc/sys/net/ipv4/vs/* Variables: +================================== + +am_droprate - INTEGER + default 10 + + It sets the always mode drop rate, which is used in the mode 3 + of the drop_rate defense. + +amemthresh - INTEGER + default 1024 + + It sets the available memory threshold (in pages), which is + used in the automatic modes of defense. When there is no + enough available memory, the respective strategy will be + enabled and the variable is automatically set to 2, otherwise + the strategy is disabled and the variable is set to 1. + +backup_only - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + If set, disable the director function while the server is + in backup mode to avoid packet loops for DR/TUN methods. + +conn_reuse_mode - INTEGER + 1 - default + + Controls how ipvs will deal with connections that are detected + port reuse. It is a bitmap, with the values being: + + 0: disable any special handling on port reuse. The new + connection will be delivered to the same real server that was + servicing the previous connection. This will effectively + disable expire_nodest_conn. + + bit 1: enable rescheduling of new connections when it is safe. + That is, whenever expire_nodest_conn and for TCP sockets, when + the connection is in TIME_WAIT state (which is only possible if + you use NAT mode). + + bit 2: it is bit 1 plus, for TCP connections, when connections + are in FIN_WAIT state, as this is the last state seen by load + balancer in Direct Routing mode. This bit helps on adding new + real servers to a very busy cluster. + +conntrack - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + If set, maintain connection tracking entries for + connections handled by IPVS. + + This should be enabled if connections handled by IPVS are to be + also handled by stateful firewall rules. That is, iptables rules + that make use of connection tracking. It is a performance + optimisation to disable this setting otherwise. + + Connections handled by the IPVS FTP application module + will have connection tracking entries regardless of this setting. + + Only available when IPVS is compiled with CONFIG_IP_VS_NFCT enabled. + +cache_bypass - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + If it is enabled, forward packets to the original destination + directly when no cache server is available and destination + address is not local (iph->daddr is RTN_UNICAST). It is mostly + used in transparent web cache cluster. + +debug_level - INTEGER + - 0 - transmission error messages (default) + - 1 - non-fatal error messages + - 2 - configuration + - 3 - destination trash + - 4 - drop entry + - 5 - service lookup + - 6 - scheduling + - 7 - connection new/expire, lookup and synchronization + - 8 - state transition + - 9 - binding destination, template checks and applications + - 10 - IPVS packet transmission + - 11 - IPVS packet handling (ip_vs_in/ip_vs_out) + - 12 or more - packet traversal + + Only available when IPVS is compiled with CONFIG_IP_VS_DEBUG enabled. + + Higher debugging levels include the messages for lower debugging + levels, so setting debug level 2, includes level 0, 1 and 2 + messages. Thus, logging becomes more and more verbose the higher + the level. + +drop_entry - INTEGER + - 0 - disabled (default) + + The drop_entry defense is to randomly drop entries in the + connection hash table, just in order to collect back some + memory for new connections. In the current code, the + drop_entry procedure can be activated every second, then it + randomly scans 1/32 of the whole and drops entries that are in + the SYN-RECV/SYNACK state, which should be effective against + syn-flooding attack. + + The valid values of drop_entry are from 0 to 3, where 0 means + that this strategy is always disabled, 1 and 2 mean automatic + modes (when there is no enough available memory, the strategy + is enabled and the variable is automatically set to 2, + otherwise the strategy is disabled and the variable is set to + 1), and 3 means that that the strategy is always enabled. + +drop_packet - INTEGER + - 0 - disabled (default) + + The drop_packet defense is designed to drop 1/rate packets + before forwarding them to real servers. If the rate is 1, then + drop all the incoming packets. + + The value definition is the same as that of the drop_entry. In + the automatic mode, the rate is determined by the follow + formula: rate = amemthresh / (amemthresh - available_memory) + when available memory is less than the available memory + threshold. When the mode 3 is set, the always mode drop rate + is controlled by the /proc/sys/net/ipv4/vs/am_droprate. + +expire_nodest_conn - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + The default value is 0, the load balancer will silently drop + packets when its destination server is not available. It may + be useful, when user-space monitoring program deletes the + destination server (because of server overload or wrong + detection) and add back the server later, and the connections + to the server can continue. + + If this feature is enabled, the load balancer will expire the + connection immediately when a packet arrives and its + destination server is not available, then the client program + will be notified that the connection is closed. This is + equivalent to the feature some people requires to flush + connections when its destination is not available. + +expire_quiescent_template - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + When set to a non-zero value, the load balancer will expire + persistent templates when the destination server is quiescent. + This may be useful, when a user makes a destination server + quiescent by setting its weight to 0 and it is desired that + subsequent otherwise persistent connections are sent to a + different destination server. By default new persistent + connections are allowed to quiescent destination servers. + + If this feature is enabled, the load balancer will expire the + persistence template if it is to be used to schedule a new + connection and the destination server is quiescent. + +ignore_tunneled - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + If set, ipvs will set the ipvs_property on all packets which are of + unrecognized protocols. This prevents us from routing tunneled + protocols like ipip, which is useful to prevent rescheduling + packets that have been tunneled to the ipvs host (i.e. to prevent + ipvs routing loops when ipvs is also acting as a real server). + +nat_icmp_send - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + It controls sending icmp error messages (ICMP_DEST_UNREACH) + for VS/NAT when the load balancer receives packets from real + servers but the connection entries don't exist. + +pmtu_disc - BOOLEAN + - 0 - disabled + - not 0 - enabled (default) + + By default, reject with FRAG_NEEDED all DF packets that exceed + the PMTU, irrespective of the forwarding method. For TUN method + the flag can be disabled to fragment such packets. + +secure_tcp - INTEGER + - 0 - disabled (default) + + The secure_tcp defense is to use a more complicated TCP state + transition table. For VS/NAT, it also delays entering the + TCP ESTABLISHED state until the three way handshake is completed. + + The value definition is the same as that of drop_entry and + drop_packet. + +sync_threshold - vector of 2 INTEGERs: sync_threshold, sync_period + default 3 50 + + It sets synchronization threshold, which is the minimum number + of incoming packets that a connection needs to receive before + the connection will be synchronized. A connection will be + synchronized, every time the number of its incoming packets + modulus sync_period equals the threshold. The range of the + threshold is from 0 to sync_period. + + When sync_period and sync_refresh_period are 0, send sync only + for state changes or only once when pkts matches sync_threshold + +sync_refresh_period - UNSIGNED INTEGER + default 0 + + In seconds, difference in reported connection timer that triggers + new sync message. It can be used to avoid sync messages for the + specified period (or half of the connection timeout if it is lower) + if connection state is not changed since last sync. + + This is useful for normal connections with high traffic to reduce + sync rate. Additionally, retry sync_retries times with period of + sync_refresh_period/8. + +sync_retries - INTEGER + default 0 + + Defines sync retries with period of sync_refresh_period/8. Useful + to protect against loss of sync messages. The range of the + sync_retries is from 0 to 3. + +sync_qlen_max - UNSIGNED LONG + + Hard limit for queued sync messages that are not sent yet. It + defaults to 1/32 of the memory pages but actually represents + number of messages. It will protect us from allocating large + parts of memory when the sending rate is lower than the queuing + rate. + +sync_sock_size - INTEGER + default 0 + + Configuration of SNDBUF (master) or RCVBUF (slave) socket limit. + Default value is 0 (preserve system defaults). + +sync_ports - INTEGER + default 1 + + The number of threads that master and backup servers can use for + sync traffic. Every thread will use single UDP port, thread 0 will + use the default port 8848 while last thread will use port + 8848+sync_ports-1. + +snat_reroute - BOOLEAN + - 0 - disabled + - not 0 - enabled (default) + + If enabled, recalculate the route of SNATed packets from + realservers so that they are routed as if they originate from the + director. Otherwise they are routed as if they are forwarded by the + director. + + If policy routing is in effect then it is possible that the route + of a packet originating from a director is routed differently to a + packet being forwarded by the director. + + If policy routing is not in effect then the recalculated route will + always be the same as the original route so it is an optimisation + to disable snat_reroute and avoid the recalculation. + +sync_persist_mode - INTEGER + default 0 + + Controls the synchronisation of connections when using persistence + + 0: All types of connections are synchronised + + 1: Attempt to reduce the synchronisation traffic depending on + the connection type. For persistent services avoid synchronisation + for normal connections, do it only for persistence templates. + In such case, for TCP and SCTP it may need enabling sloppy_tcp and + sloppy_sctp flags on backup servers. For non-persistent services + such optimization is not applied, mode 0 is assumed. + +sync_version - INTEGER + default 1 + + The version of the synchronisation protocol used when sending + synchronisation messages. + + 0 selects the original synchronisation protocol (version 0). This + should be used when sending synchronisation messages to a legacy + system that only understands the original synchronisation protocol. + + 1 selects the current synchronisation protocol (version 1). This + should be used where possible. + + Kernels with this sync_version entry are able to receive messages + of both version 1 and version 2 of the synchronisation protocol. diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt deleted file mode 100644 index 056898685d40..000000000000 --- a/Documentation/networking/ipvs-sysctl.txt +++ /dev/null @@ -1,294 +0,0 @@ -/proc/sys/net/ipv4/vs/* Variables: - -am_droprate - INTEGER - default 10 - - It sets the always mode drop rate, which is used in the mode 3 - of the drop_rate defense. - -amemthresh - INTEGER - default 1024 - - It sets the available memory threshold (in pages), which is - used in the automatic modes of defense. When there is no - enough available memory, the respective strategy will be - enabled and the variable is automatically set to 2, otherwise - the strategy is disabled and the variable is set to 1. - -backup_only - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - If set, disable the director function while the server is - in backup mode to avoid packet loops for DR/TUN methods. - -conn_reuse_mode - INTEGER - 1 - default - - Controls how ipvs will deal with connections that are detected - port reuse. It is a bitmap, with the values being: - - 0: disable any special handling on port reuse. The new - connection will be delivered to the same real server that was - servicing the previous connection. This will effectively - disable expire_nodest_conn. - - bit 1: enable rescheduling of new connections when it is safe. - That is, whenever expire_nodest_conn and for TCP sockets, when - the connection is in TIME_WAIT state (which is only possible if - you use NAT mode). - - bit 2: it is bit 1 plus, for TCP connections, when connections - are in FIN_WAIT state, as this is the last state seen by load - balancer in Direct Routing mode. This bit helps on adding new - real servers to a very busy cluster. - -conntrack - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - If set, maintain connection tracking entries for - connections handled by IPVS. - - This should be enabled if connections handled by IPVS are to be - also handled by stateful firewall rules. That is, iptables rules - that make use of connection tracking. It is a performance - optimisation to disable this setting otherwise. - - Connections handled by the IPVS FTP application module - will have connection tracking entries regardless of this setting. - - Only available when IPVS is compiled with CONFIG_IP_VS_NFCT enabled. - -cache_bypass - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - If it is enabled, forward packets to the original destination - directly when no cache server is available and destination - address is not local (iph->daddr is RTN_UNICAST). It is mostly - used in transparent web cache cluster. - -debug_level - INTEGER - 0 - transmission error messages (default) - 1 - non-fatal error messages - 2 - configuration - 3 - destination trash - 4 - drop entry - 5 - service lookup - 6 - scheduling - 7 - connection new/expire, lookup and synchronization - 8 - state transition - 9 - binding destination, template checks and applications - 10 - IPVS packet transmission - 11 - IPVS packet handling (ip_vs_in/ip_vs_out) - 12 or more - packet traversal - - Only available when IPVS is compiled with CONFIG_IP_VS_DEBUG enabled. - - Higher debugging levels include the messages for lower debugging - levels, so setting debug level 2, includes level 0, 1 and 2 - messages. Thus, logging becomes more and more verbose the higher - the level. - -drop_entry - INTEGER - 0 - disabled (default) - - The drop_entry defense is to randomly drop entries in the - connection hash table, just in order to collect back some - memory for new connections. In the current code, the - drop_entry procedure can be activated every second, then it - randomly scans 1/32 of the whole and drops entries that are in - the SYN-RECV/SYNACK state, which should be effective against - syn-flooding attack. - - The valid values of drop_entry are from 0 to 3, where 0 means - that this strategy is always disabled, 1 and 2 mean automatic - modes (when there is no enough available memory, the strategy - is enabled and the variable is automatically set to 2, - otherwise the strategy is disabled and the variable is set to - 1), and 3 means that that the strategy is always enabled. - -drop_packet - INTEGER - 0 - disabled (default) - - The drop_packet defense is designed to drop 1/rate packets - before forwarding them to real servers. If the rate is 1, then - drop all the incoming packets. - - The value definition is the same as that of the drop_entry. In - the automatic mode, the rate is determined by the follow - formula: rate = amemthresh / (amemthresh - available_memory) - when available memory is less than the available memory - threshold. When the mode 3 is set, the always mode drop rate - is controlled by the /proc/sys/net/ipv4/vs/am_droprate. - -expire_nodest_conn - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - The default value is 0, the load balancer will silently drop - packets when its destination server is not available. It may - be useful, when user-space monitoring program deletes the - destination server (because of server overload or wrong - detection) and add back the server later, and the connections - to the server can continue. - - If this feature is enabled, the load balancer will expire the - connection immediately when a packet arrives and its - destination server is not available, then the client program - will be notified that the connection is closed. This is - equivalent to the feature some people requires to flush - connections when its destination is not available. - -expire_quiescent_template - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - When set to a non-zero value, the load balancer will expire - persistent templates when the destination server is quiescent. - This may be useful, when a user makes a destination server - quiescent by setting its weight to 0 and it is desired that - subsequent otherwise persistent connections are sent to a - different destination server. By default new persistent - connections are allowed to quiescent destination servers. - - If this feature is enabled, the load balancer will expire the - persistence template if it is to be used to schedule a new - connection and the destination server is quiescent. - -ignore_tunneled - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - If set, ipvs will set the ipvs_property on all packets which are of - unrecognized protocols. This prevents us from routing tunneled - protocols like ipip, which is useful to prevent rescheduling - packets that have been tunneled to the ipvs host (i.e. to prevent - ipvs routing loops when ipvs is also acting as a real server). - -nat_icmp_send - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - It controls sending icmp error messages (ICMP_DEST_UNREACH) - for VS/NAT when the load balancer receives packets from real - servers but the connection entries don't exist. - -pmtu_disc - BOOLEAN - 0 - disabled - not 0 - enabled (default) - - By default, reject with FRAG_NEEDED all DF packets that exceed - the PMTU, irrespective of the forwarding method. For TUN method - the flag can be disabled to fragment such packets. - -secure_tcp - INTEGER - 0 - disabled (default) - - The secure_tcp defense is to use a more complicated TCP state - transition table. For VS/NAT, it also delays entering the - TCP ESTABLISHED state until the three way handshake is completed. - - The value definition is the same as that of drop_entry and - drop_packet. - -sync_threshold - vector of 2 INTEGERs: sync_threshold, sync_period - default 3 50 - - It sets synchronization threshold, which is the minimum number - of incoming packets that a connection needs to receive before - the connection will be synchronized. A connection will be - synchronized, every time the number of its incoming packets - modulus sync_period equals the threshold. The range of the - threshold is from 0 to sync_period. - - When sync_period and sync_refresh_period are 0, send sync only - for state changes or only once when pkts matches sync_threshold - -sync_refresh_period - UNSIGNED INTEGER - default 0 - - In seconds, difference in reported connection timer that triggers - new sync message. It can be used to avoid sync messages for the - specified period (or half of the connection timeout if it is lower) - if connection state is not changed since last sync. - - This is useful for normal connections with high traffic to reduce - sync rate. Additionally, retry sync_retries times with period of - sync_refresh_period/8. - -sync_retries - INTEGER - default 0 - - Defines sync retries with period of sync_refresh_period/8. Useful - to protect against loss of sync messages. The range of the - sync_retries is from 0 to 3. - -sync_qlen_max - UNSIGNED LONG - - Hard limit for queued sync messages that are not sent yet. It - defaults to 1/32 of the memory pages but actually represents - number of messages. It will protect us from allocating large - parts of memory when the sending rate is lower than the queuing - rate. - -sync_sock_size - INTEGER - default 0 - - Configuration of SNDBUF (master) or RCVBUF (slave) socket limit. - Default value is 0 (preserve system defaults). - -sync_ports - INTEGER - default 1 - - The number of threads that master and backup servers can use for - sync traffic. Every thread will use single UDP port, thread 0 will - use the default port 8848 while last thread will use port - 8848+sync_ports-1. - -snat_reroute - BOOLEAN - 0 - disabled - not 0 - enabled (default) - - If enabled, recalculate the route of SNATed packets from - realservers so that they are routed as if they originate from the - director. Otherwise they are routed as if they are forwarded by the - director. - - If policy routing is in effect then it is possible that the route - of a packet originating from a director is routed differently to a - packet being forwarded by the director. - - If policy routing is not in effect then the recalculated route will - always be the same as the original route so it is an optimisation - to disable snat_reroute and avoid the recalculation. - -sync_persist_mode - INTEGER - default 0 - - Controls the synchronisation of connections when using persistence - - 0: All types of connections are synchronised - 1: Attempt to reduce the synchronisation traffic depending on - the connection type. For persistent services avoid synchronisation - for normal connections, do it only for persistence templates. - In such case, for TCP and SCTP it may need enabling sloppy_tcp and - sloppy_sctp flags on backup servers. For non-persistent services - such optimization is not applied, mode 0 is assumed. - -sync_version - INTEGER - default 1 - - The version of the synchronisation protocol used when sending - synchronisation messages. - - 0 selects the original synchronisation protocol (version 0). This - should be used when sending synchronisation messages to a legacy - system that only understands the original synchronisation protocol. - - 1 selects the current synchronisation protocol (version 1). This - should be used where possible. - - Kernels with this sync_version entry are able to receive messages - of both version 1 and version 2 of the synchronisation protocol. diff --git a/MAINTAINERS b/MAINTAINERS index df5e4ccc1ccb..3a5f52a3c055 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8934,7 +8934,7 @@ L: lvs-devel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git -F: Documentation/networking/ipvs-sysctl.txt +F: Documentation/networking/ipvs-sysctl.rst F: include/net/ip_vs.h F: include/uapi/linux/ip_vs.h F: net/netfilter/ipvs/ -- cgit v1.2.3-59-g8ed1b From b9dd2bea2245dd8ba4f68e801af93e4b38bfe6b0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:53 +0200 Subject: docs: networking: convert kcm.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/kcm.rst | 290 +++++++++++++++++++++++++++++++++++++ Documentation/networking/kcm.txt | 285 ------------------------------------ 3 files changed, 291 insertions(+), 285 deletions(-) create mode 100644 Documentation/networking/kcm.rst delete mode 100644 Documentation/networking/kcm.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index bbd4e0041457..e1ff08b94d90 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -73,6 +73,7 @@ Contents: ipv6 ipvlan ipvs-sysctl + kcm .. only:: subproject and html diff --git a/Documentation/networking/kcm.rst b/Documentation/networking/kcm.rst new file mode 100644 index 000000000000..db0f5560ac1c --- /dev/null +++ b/Documentation/networking/kcm.rst @@ -0,0 +1,290 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================= +Kernel Connection Multiplexor +============================= + +Kernel Connection Multiplexor (KCM) is a mechanism that provides a message based +interface over TCP for generic application protocols. With KCM an application +can efficiently send and receive application protocol messages over TCP using +datagram sockets. + +KCM implements an NxM multiplexor in the kernel as diagrammed below:: + + +------------+ +------------+ +------------+ +------------+ + | KCM socket | | KCM socket | | KCM socket | | KCM socket | + +------------+ +------------+ +------------+ +------------+ + | | | | + +-----------+ | | +----------+ + | | | | + +----------------------------------+ + | Multiplexor | + +----------------------------------+ + | | | | | + +---------+ | | | ------------+ + | | | | | + +----------+ +----------+ +----------+ +----------+ +----------+ + | Psock | | Psock | | Psock | | Psock | | Psock | + +----------+ +----------+ +----------+ +----------+ +----------+ + | | | | | + +----------+ +----------+ +----------+ +----------+ +----------+ + | TCP sock | | TCP sock | | TCP sock | | TCP sock | | TCP sock | + +----------+ +----------+ +----------+ +----------+ +----------+ + +KCM sockets +=========== + +The KCM sockets provide the user interface to the multiplexor. All the KCM sockets +bound to a multiplexor are considered to have equivalent function, and I/O +operations in different sockets may be done in parallel without the need for +synchronization between threads in userspace. + +Multiplexor +=========== + +The multiplexor provides the message steering. In the transmit path, messages +written on a KCM socket are sent atomically on an appropriate TCP socket. +Similarly, in the receive path, messages are constructed on each TCP socket +(Psock) and complete messages are steered to a KCM socket. + +TCP sockets & Psocks +==================== + +TCP sockets may be bound to a KCM multiplexor. A Psock structure is allocated +for each bound TCP socket, this structure holds the state for constructing +messages on receive as well as other connection specific information for KCM. + +Connected mode semantics +======================== + +Each multiplexor assumes that all attached TCP connections are to the same +destination and can use the different connections for load balancing when +transmitting. The normal send and recv calls (include sendmmsg and recvmmsg) +can be used to send and receive messages from the KCM socket. + +Socket types +============ + +KCM supports SOCK_DGRAM and SOCK_SEQPACKET socket types. + +Message delineation +------------------- + +Messages are sent over a TCP stream with some application protocol message +format that typically includes a header which frames the messages. The length +of a received message can be deduced from the application protocol header +(often just a simple length field). + +A TCP stream must be parsed to determine message boundaries. Berkeley Packet +Filter (BPF) is used for this. When attaching a TCP socket to a multiplexor a +BPF program must be specified. The program is called at the start of receiving +a new message and is given an skbuff that contains the bytes received so far. +It parses the message header and returns the length of the message. Given this +information, KCM will construct the message of the stated length and deliver it +to a KCM socket. + +TCP socket management +--------------------- + +When a TCP socket is attached to a KCM multiplexor data ready (POLLIN) and +write space available (POLLOUT) events are handled by the multiplexor. If there +is a state change (disconnection) or other error on a TCP socket, an error is +posted on the TCP socket so that a POLLERR event happens and KCM discontinues +using the socket. When the application gets the error notification for a +TCP socket, it should unattach the socket from KCM and then handle the error +condition (the typical response is to close the socket and create a new +connection if necessary). + +KCM limits the maximum receive message size to be the size of the receive +socket buffer on the attached TCP socket (the socket buffer size can be set by +SO_RCVBUF). If the length of a new message reported by the BPF program is +greater than this limit a corresponding error (EMSGSIZE) is posted on the TCP +socket. The BPF program may also enforce a maximum messages size and report an +error when it is exceeded. + +A timeout may be set for assembling messages on a receive socket. The timeout +value is taken from the receive timeout of the attached TCP socket (this is set +by SO_RCVTIMEO). If the timer expires before assembly is complete an error +(ETIMEDOUT) is posted on the socket. + +User interface +============== + +Creating a multiplexor +---------------------- + +A new multiplexor and initial KCM socket is created by a socket call:: + + socket(AF_KCM, type, protocol) + +- type is either SOCK_DGRAM or SOCK_SEQPACKET +- protocol is KCMPROTO_CONNECTED + +Cloning KCM sockets +------------------- + +After the first KCM socket is created using the socket call as described +above, additional sockets for the multiplexor can be created by cloning +a KCM socket. This is accomplished by an ioctl on a KCM socket:: + + /* From linux/kcm.h */ + struct kcm_clone { + int fd; + }; + + struct kcm_clone info; + + memset(&info, 0, sizeof(info)); + + err = ioctl(kcmfd, SIOCKCMCLONE, &info); + + if (!err) + newkcmfd = info.fd; + +Attach transport sockets +------------------------ + +Attaching of transport sockets to a multiplexor is performed by calling an +ioctl on a KCM socket for the multiplexor. e.g.:: + + /* From linux/kcm.h */ + struct kcm_attach { + int fd; + int bpf_fd; + }; + + struct kcm_attach info; + + memset(&info, 0, sizeof(info)); + + info.fd = tcpfd; + info.bpf_fd = bpf_prog_fd; + + ioctl(kcmfd, SIOCKCMATTACH, &info); + +The kcm_attach structure contains: + + - fd: file descriptor for TCP socket being attached + - bpf_prog_fd: file descriptor for compiled BPF program downloaded + +Unattach transport sockets +-------------------------- + +Unattaching a transport socket from a multiplexor is straightforward. An +"unattach" ioctl is done with the kcm_unattach structure as the argument:: + + /* From linux/kcm.h */ + struct kcm_unattach { + int fd; + }; + + struct kcm_unattach info; + + memset(&info, 0, sizeof(info)); + + info.fd = cfd; + + ioctl(fd, SIOCKCMUNATTACH, &info); + +Disabling receive on KCM socket +------------------------------- + +A setsockopt is used to disable or enable receiving on a KCM socket. +When receive is disabled, any pending messages in the socket's +receive buffer are moved to other sockets. This feature is useful +if an application thread knows that it will be doing a lot of +work on a request and won't be able to service new messages for a +while. Example use:: + + int val = 1; + + setsockopt(kcmfd, SOL_KCM, KCM_RECV_DISABLE, &val, sizeof(val)) + +BFP programs for message delineation +------------------------------------ + +BPF programs can be compiled using the BPF LLVM backend. For example, +the BPF program for parsing Thrift is:: + + #include "bpf.h" /* for __sk_buff */ + #include "bpf_helpers.h" /* for load_word intrinsic */ + + SEC("socket_kcm") + int bpf_prog1(struct __sk_buff *skb) + { + return load_word(skb, 0) + 4; + } + + char _license[] SEC("license") = "GPL"; + +Use in applications +=================== + +KCM accelerates application layer protocols. Specifically, it allows +applications to use a message based interface for sending and receiving +messages. The kernel provides necessary assurances that messages are sent +and received atomically. This relieves much of the burden applications have +in mapping a message based protocol onto the TCP stream. KCM also make +application layer messages a unit of work in the kernel for the purposes of +steering and scheduling, which in turn allows a simpler networking model in +multithreaded applications. + +Configurations +-------------- + +In an Nx1 configuration, KCM logically provides multiple socket handles +to the same TCP connection. This allows parallelism between in I/O +operations on the TCP socket (for instance copyin and copyout of data is +parallelized). In an application, a KCM socket can be opened for each +processing thread and inserted into the epoll (similar to how SO_REUSEPORT +is used to allow multiple listener sockets on the same port). + +In a MxN configuration, multiple connections are established to the +same destination. These are used for simple load balancing. + +Message batching +---------------- + +The primary purpose of KCM is load balancing between KCM sockets and hence +threads in a nominal use case. Perfect load balancing, that is steering +each received message to a different KCM socket or steering each sent +message to a different TCP socket, can negatively impact performance +since this doesn't allow for affinities to be established. Balancing +based on groups, or batches of messages, can be beneficial for performance. + +On transmit, there are three ways an application can batch (pipeline) +messages on a KCM socket. + + 1) Send multiple messages in a single sendmmsg. + 2) Send a group of messages each with a sendmsg call, where all messages + except the last have MSG_BATCH in the flags of sendmsg call. + 3) Create "super message" composed of multiple messages and send this + with a single sendmsg. + +On receive, the KCM module attempts to queue messages received on the +same KCM socket during each TCP ready callback. The targeted KCM socket +changes at each receive ready callback on the KCM socket. The application +does not need to configure this. + +Error handling +-------------- + +An application should include a thread to monitor errors raised on +the TCP connection. Normally, this will be done by placing each +TCP socket attached to a KCM multiplexor in epoll set for POLLERR +event. If an error occurs on an attached TCP socket, KCM sets an EPIPE +on the socket thus waking up the application thread. When the application +sees the error (which may just be a disconnect) it should unattach the +socket from KCM and then close it. It is assumed that once an error is +posted on the TCP socket the data stream is unrecoverable (i.e. an error +may have occurred in the middle of receiving a message). + +TCP connection monitoring +------------------------- + +In KCM there is no means to correlate a message to the TCP socket that +was used to send or receive the message (except in the case there is +only one attached TCP socket). However, the application does retain +an open file descriptor to the socket so it will be able to get statistics +from the socket which can be used in detecting issues (such as high +retransmissions on the socket). diff --git a/Documentation/networking/kcm.txt b/Documentation/networking/kcm.txt deleted file mode 100644 index b773a5278ac4..000000000000 --- a/Documentation/networking/kcm.txt +++ /dev/null @@ -1,285 +0,0 @@ -Kernel Connection Multiplexor ------------------------------ - -Kernel Connection Multiplexor (KCM) is a mechanism that provides a message based -interface over TCP for generic application protocols. With KCM an application -can efficiently send and receive application protocol messages over TCP using -datagram sockets. - -KCM implements an NxM multiplexor in the kernel as diagrammed below: - -+------------+ +------------+ +------------+ +------------+ -| KCM socket | | KCM socket | | KCM socket | | KCM socket | -+------------+ +------------+ +------------+ +------------+ - | | | | - +-----------+ | | +----------+ - | | | | - +----------------------------------+ - | Multiplexor | - +----------------------------------+ - | | | | | - +---------+ | | | ------------+ - | | | | | -+----------+ +----------+ +----------+ +----------+ +----------+ -| Psock | | Psock | | Psock | | Psock | | Psock | -+----------+ +----------+ +----------+ +----------+ +----------+ - | | | | | -+----------+ +----------+ +----------+ +----------+ +----------+ -| TCP sock | | TCP sock | | TCP sock | | TCP sock | | TCP sock | -+----------+ +----------+ +----------+ +----------+ +----------+ - -KCM sockets ------------ - -The KCM sockets provide the user interface to the multiplexor. All the KCM sockets -bound to a multiplexor are considered to have equivalent function, and I/O -operations in different sockets may be done in parallel without the need for -synchronization between threads in userspace. - -Multiplexor ------------ - -The multiplexor provides the message steering. In the transmit path, messages -written on a KCM socket are sent atomically on an appropriate TCP socket. -Similarly, in the receive path, messages are constructed on each TCP socket -(Psock) and complete messages are steered to a KCM socket. - -TCP sockets & Psocks --------------------- - -TCP sockets may be bound to a KCM multiplexor. A Psock structure is allocated -for each bound TCP socket, this structure holds the state for constructing -messages on receive as well as other connection specific information for KCM. - -Connected mode semantics ------------------------- - -Each multiplexor assumes that all attached TCP connections are to the same -destination and can use the different connections for load balancing when -transmitting. The normal send and recv calls (include sendmmsg and recvmmsg) -can be used to send and receive messages from the KCM socket. - -Socket types ------------- - -KCM supports SOCK_DGRAM and SOCK_SEQPACKET socket types. - -Message delineation -------------------- - -Messages are sent over a TCP stream with some application protocol message -format that typically includes a header which frames the messages. The length -of a received message can be deduced from the application protocol header -(often just a simple length field). - -A TCP stream must be parsed to determine message boundaries. Berkeley Packet -Filter (BPF) is used for this. When attaching a TCP socket to a multiplexor a -BPF program must be specified. The program is called at the start of receiving -a new message and is given an skbuff that contains the bytes received so far. -It parses the message header and returns the length of the message. Given this -information, KCM will construct the message of the stated length and deliver it -to a KCM socket. - -TCP socket management ---------------------- - -When a TCP socket is attached to a KCM multiplexor data ready (POLLIN) and -write space available (POLLOUT) events are handled by the multiplexor. If there -is a state change (disconnection) or other error on a TCP socket, an error is -posted on the TCP socket so that a POLLERR event happens and KCM discontinues -using the socket. When the application gets the error notification for a -TCP socket, it should unattach the socket from KCM and then handle the error -condition (the typical response is to close the socket and create a new -connection if necessary). - -KCM limits the maximum receive message size to be the size of the receive -socket buffer on the attached TCP socket (the socket buffer size can be set by -SO_RCVBUF). If the length of a new message reported by the BPF program is -greater than this limit a corresponding error (EMSGSIZE) is posted on the TCP -socket. The BPF program may also enforce a maximum messages size and report an -error when it is exceeded. - -A timeout may be set for assembling messages on a receive socket. The timeout -value is taken from the receive timeout of the attached TCP socket (this is set -by SO_RCVTIMEO). If the timer expires before assembly is complete an error -(ETIMEDOUT) is posted on the socket. - -User interface -============== - -Creating a multiplexor ----------------------- - -A new multiplexor and initial KCM socket is created by a socket call: - - socket(AF_KCM, type, protocol) - - - type is either SOCK_DGRAM or SOCK_SEQPACKET - - protocol is KCMPROTO_CONNECTED - -Cloning KCM sockets -------------------- - -After the first KCM socket is created using the socket call as described -above, additional sockets for the multiplexor can be created by cloning -a KCM socket. This is accomplished by an ioctl on a KCM socket: - - /* From linux/kcm.h */ - struct kcm_clone { - int fd; - }; - - struct kcm_clone info; - - memset(&info, 0, sizeof(info)); - - err = ioctl(kcmfd, SIOCKCMCLONE, &info); - - if (!err) - newkcmfd = info.fd; - -Attach transport sockets ------------------------- - -Attaching of transport sockets to a multiplexor is performed by calling an -ioctl on a KCM socket for the multiplexor. e.g.: - - /* From linux/kcm.h */ - struct kcm_attach { - int fd; - int bpf_fd; - }; - - struct kcm_attach info; - - memset(&info, 0, sizeof(info)); - - info.fd = tcpfd; - info.bpf_fd = bpf_prog_fd; - - ioctl(kcmfd, SIOCKCMATTACH, &info); - -The kcm_attach structure contains: - fd: file descriptor for TCP socket being attached - bpf_prog_fd: file descriptor for compiled BPF program downloaded - -Unattach transport sockets --------------------------- - -Unattaching a transport socket from a multiplexor is straightforward. An -"unattach" ioctl is done with the kcm_unattach structure as the argument: - - /* From linux/kcm.h */ - struct kcm_unattach { - int fd; - }; - - struct kcm_unattach info; - - memset(&info, 0, sizeof(info)); - - info.fd = cfd; - - ioctl(fd, SIOCKCMUNATTACH, &info); - -Disabling receive on KCM socket -------------------------------- - -A setsockopt is used to disable or enable receiving on a KCM socket. -When receive is disabled, any pending messages in the socket's -receive buffer are moved to other sockets. This feature is useful -if an application thread knows that it will be doing a lot of -work on a request and won't be able to service new messages for a -while. Example use: - - int val = 1; - - setsockopt(kcmfd, SOL_KCM, KCM_RECV_DISABLE, &val, sizeof(val)) - -BFP programs for message delineation ------------------------------------- - -BPF programs can be compiled using the BPF LLVM backend. For example, -the BPF program for parsing Thrift is: - - #include "bpf.h" /* for __sk_buff */ - #include "bpf_helpers.h" /* for load_word intrinsic */ - - SEC("socket_kcm") - int bpf_prog1(struct __sk_buff *skb) - { - return load_word(skb, 0) + 4; - } - - char _license[] SEC("license") = "GPL"; - -Use in applications -=================== - -KCM accelerates application layer protocols. Specifically, it allows -applications to use a message based interface for sending and receiving -messages. The kernel provides necessary assurances that messages are sent -and received atomically. This relieves much of the burden applications have -in mapping a message based protocol onto the TCP stream. KCM also make -application layer messages a unit of work in the kernel for the purposes of -steering and scheduling, which in turn allows a simpler networking model in -multithreaded applications. - -Configurations --------------- - -In an Nx1 configuration, KCM logically provides multiple socket handles -to the same TCP connection. This allows parallelism between in I/O -operations on the TCP socket (for instance copyin and copyout of data is -parallelized). In an application, a KCM socket can be opened for each -processing thread and inserted into the epoll (similar to how SO_REUSEPORT -is used to allow multiple listener sockets on the same port). - -In a MxN configuration, multiple connections are established to the -same destination. These are used for simple load balancing. - -Message batching ----------------- - -The primary purpose of KCM is load balancing between KCM sockets and hence -threads in a nominal use case. Perfect load balancing, that is steering -each received message to a different KCM socket or steering each sent -message to a different TCP socket, can negatively impact performance -since this doesn't allow for affinities to be established. Balancing -based on groups, or batches of messages, can be beneficial for performance. - -On transmit, there are three ways an application can batch (pipeline) -messages on a KCM socket. - 1) Send multiple messages in a single sendmmsg. - 2) Send a group of messages each with a sendmsg call, where all messages - except the last have MSG_BATCH in the flags of sendmsg call. - 3) Create "super message" composed of multiple messages and send this - with a single sendmsg. - -On receive, the KCM module attempts to queue messages received on the -same KCM socket during each TCP ready callback. The targeted KCM socket -changes at each receive ready callback on the KCM socket. The application -does not need to configure this. - -Error handling --------------- - -An application should include a thread to monitor errors raised on -the TCP connection. Normally, this will be done by placing each -TCP socket attached to a KCM multiplexor in epoll set for POLLERR -event. If an error occurs on an attached TCP socket, KCM sets an EPIPE -on the socket thus waking up the application thread. When the application -sees the error (which may just be a disconnect) it should unattach the -socket from KCM and then close it. It is assumed that once an error is -posted on the TCP socket the data stream is unrecoverable (i.e. an error -may have occurred in the middle of receiving a message). - -TCP connection monitoring -------------------------- - -In KCM there is no means to correlate a message to the TCP socket that -was used to send or receive the message (except in the case there is -only one attached TCP socket). However, the application does retain -an open file descriptor to the socket so it will be able to get statistics -from the socket which can be used in detecting issues (such as high -retransmissions on the socket). -- cgit v1.2.3-59-g8ed1b From 9b329d0dbe413bf46eb5010edd06b3076960a60a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 28 Apr 2020 15:30:48 -0700 Subject: selftests/bpf: fix test_sysctl_prog with alu32 Similar to commit b7a0d65d80a0 ("bpf, testing: Workaround a verifier failure for test_progs") fix test_sysctl_prog.c as well. Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_sysctl_prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c index 2d0b0b82a78a..50525235380e 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c @@ -45,7 +45,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx) unsigned long tcp_mem[3] = {0, 0, 0}; char value[MAX_VALUE_STR_LEN]; unsigned char i, off = 0; - int ret; + volatile int ret; if (ctx->write) return 0; -- cgit v1.2.3-59-g8ed1b From f9d041271cf44ca02eed0cc82e1a6d8c814c53ed Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:05 -0700 Subject: bpf: Refactor bpf_link update handling Make bpf_link update support more generic by making it into another bpf_link_ops methods. This allows generic syscall handling code to be agnostic to various conditionally compiled features (e.g., the case of CONFIG_CGROUP_BPF). This also allows to keep link type-specific code to remain static within respective code base. Refactor existing bpf_cgroup_link code and take advantage of this. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-2-andriin@fb.com --- include/linux/bpf-cgroup.h | 12 ------------ include/linux/bpf.h | 3 ++- kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 11 ++++------- kernel/cgroup/cgroup.c | 27 --------------------------- 5 files changed, 34 insertions(+), 49 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 0b41fd5fc96b..a9cb9a5bf8e9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -100,8 +100,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type); -int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, - struct bpf_prog *new_prog); int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -112,8 +110,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type); -int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, - struct bpf_prog *new_prog); int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -353,7 +349,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, #else struct bpf_prog; -struct bpf_link; struct cgroup_bpf {}; static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} @@ -377,13 +372,6 @@ static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, return -EINVAL; } -static inline int cgroup_bpf_replace(struct bpf_link *link, - struct bpf_prog *old_prog, - struct bpf_prog *new_prog) -{ - return -EINVAL; -} - static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 10960cfabea4..81c8620cb4c4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1093,7 +1093,8 @@ struct bpf_link { struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); - + int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog); }; void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index bf634959885c..da6e48e802b2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -557,8 +557,9 @@ found: * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, - struct bpf_prog *new_prog) +static int __cgroup_bpf_replace(struct cgroup *cgrp, + struct bpf_cgroup_link *link, + struct bpf_prog *new_prog) { struct list_head *progs = &cgrp->bpf.progs[link->type]; struct bpf_prog *old_prog; @@ -583,6 +584,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, return 0; } +static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_cgroup_link *cg_link; + int ret; + + cg_link = container_of(link, struct bpf_cgroup_link, link); + + mutex_lock(&cgroup_mutex); + /* link might have been auto-released by dying cgroup, so fail */ + if (!cg_link->cgroup) { + ret = -EINVAL; + goto out_unlock; + } + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); +out_unlock: + mutex_unlock(&cgroup_mutex); + return ret; +} + static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, @@ -811,6 +836,7 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .update_prog = cgroup_bpf_replace, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7626b8024471..f5358e1462eb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3645,13 +3645,10 @@ static int link_update(union bpf_attr *attr) goto out_put_progs; } -#ifdef CONFIG_CGROUP_BPF - if (link->ops == &bpf_cgroup_link_lops) { - ret = cgroup_bpf_replace(link, old_prog, new_prog); - goto out_put_progs; - } -#endif - ret = -EINVAL; + if (link->ops->update_prog) + ret = link->ops->update_prog(link, new_prog, old_prog); + else + ret = EINVAL; out_put_progs: if (old_prog) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 06b5ea9d899d..557a9b9d2244 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6508,33 +6508,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp, return ret; } -int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, - struct bpf_prog *new_prog) -{ - struct bpf_cgroup_link *cg_link; - int ret; - - if (link->ops != &bpf_cgroup_link_lops) - return -EINVAL; - - cg_link = container_of(link, struct bpf_cgroup_link, link); - - mutex_lock(&cgroup_mutex); - /* link might have been auto-released by dying cgroup, so fail */ - if (!cg_link->cgroup) { - ret = -EINVAL; - goto out_unlock; - } - if (old_prog && link->prog != old_prog) { - ret = -EPERM; - goto out_unlock; - } - ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); -out_unlock: - mutex_unlock(&cgroup_mutex); - return ret; -} - int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type) { -- cgit v1.2.3-59-g8ed1b From a3b80e1078943dc12553166fb08e258463dec013 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:06 -0700 Subject: bpf: Allocate ID for bpf_link Generate ID for each bpf_link using IDR, similarly to bpf_map and bpf_prog. bpf_link creation, initialization, attachment, and exposing to user-space through FD and ID is a complicated multi-step process, abstract it away through bpf_link_primer and bpf_link_prime(), bpf_link_settle(), and bpf_link_cleanup() internal API. They guarantee that until bpf_link is properly attached, user-space won't be able to access partially-initialized bpf_link either from FD or ID. All this allows to simplify bpf_link attachment and error handling code. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-3-andriin@fb.com --- include/linux/bpf.h | 17 ++++-- include/uapi/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 14 ++--- kernel/bpf/syscall.c | 143 ++++++++++++++++++++++++++++++++--------------- 4 files changed, 118 insertions(+), 57 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 81c8620cb4c4..875d1f0af803 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1085,11 +1085,19 @@ int bpf_prog_new_fd(struct bpf_prog *prog); struct bpf_link { atomic64_t refcnt; + u32 id; const struct bpf_link_ops *ops; struct bpf_prog *prog; struct work_struct work; }; +struct bpf_link_primer { + struct bpf_link *link; + struct file *file; + int fd; + u32 id; +}; + struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); @@ -1097,10 +1105,11 @@ struct bpf_link_ops { struct bpf_prog *old_prog); }; -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog); -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd); +void bpf_link_init(struct bpf_link *link, + const struct bpf_link_ops *ops, struct bpf_prog *prog); +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); +int bpf_link_settle(struct bpf_link_primer *primer); +void bpf_link_cleanup(struct bpf_link_primer *primer); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a6c47f3febe..6121aa487465 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -523,6 +523,7 @@ union bpf_attr { __u32 prog_id; __u32 map_id; __u32 btf_id; + __u32 link_id; }; __u32 next_id; __u32 open_flags; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da6e48e802b2..1bdf37fca879 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -841,10 +841,10 @@ const struct bpf_link_ops bpf_cgroup_link_lops = { int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_cgroup_link *link; - struct file *link_file; struct cgroup *cgrp; - int err, link_fd; + int err; if (attr->link_create.flags) return -EINVAL; @@ -862,22 +862,20 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_cgroup; } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, BPF_F_ALLOW_MULTI); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_cgroup; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_cgroup: cgroup_put(cgrp); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f5358e1462eb..5439e05e3d25 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -42,6 +42,8 @@ static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); +static DEFINE_IDR(link_idr); +static DEFINE_SPINLOCK(link_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -2181,25 +2183,38 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog) +void bpf_link_init(struct bpf_link *link, + const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->id = 0; link->ops = ops; link->prog = prog; } +static void bpf_link_free_id(int id) +{ + if (!id) + return; + + spin_lock_bh(&link_idr_lock); + idr_remove(&link_idr, id); + spin_unlock_bh(&link_idr_lock); +} + /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper manages marking bpf_link as - * defunct, releases anon_inode file and puts reserved FD. + * anon_inode's release() call. This helper marksbpf_link as + * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt + * is not decremented, it's the responsibility of a calling code that failed + * to complete bpf_link initialization. */ -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link_primer *primer) { - link->prog = NULL; - fput(link_file); - put_unused_fd(link_fd); + primer->link->prog = NULL; + bpf_link_free_id(primer->id); + fput(primer->file); + put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) @@ -2210,6 +2225,7 @@ void bpf_link_inc(struct bpf_link *link) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bpf_link_free_id(link->id); if (link->prog) { /* detach BPF program, clean up used resources */ link->ops->release(link); @@ -2275,9 +2291,11 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" + "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", link_type, + link->id, prog_tag, prog->aux->id); } @@ -2292,36 +2310,76 @@ static const struct file_operations bpf_link_fops = { .write = bpf_dummy_write, }; -int bpf_link_new_fd(struct bpf_link *link) +static int bpf_link_alloc_id(struct bpf_link *link) { - return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); -} + int id; + + idr_preload(GFP_KERNEL); + spin_lock_bh(&link_idr_lock); + id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); + spin_unlock_bh(&link_idr_lock); + idr_preload_end(); -/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but - * instead of immediately installing fd in fdtable, just reserve it and - * return. Caller then need to either install it with fd_install(fd, file) or - * release with put_unused_fd(fd). - * This is useful for cases when bpf_link attachment/detachment are - * complicated and expensive operations and should be delayed until all the fd - * reservation and anon_inode creation succeeds. + return id; +} + +/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, + * reserving unused FD and allocating ID from link_idr. This is to be paired + * with bpf_link_settle() to install FD and ID and expose bpf_link to + * user-space, if bpf_link is successfully attached. If not, bpf_link and + * pre-allocated resources are to be freed with bpf_cleanup() call. All the + * transient state is passed around in struct bpf_link_primer. + * This is preferred way to create and initialize bpf_link, especially when + * there are complicated and expensive operations inbetween creating bpf_link + * itself and attaching it to BPF hook. By using bpf_link_prime() and + * bpf_link_settle() kernel code using bpf_link doesn't have to perform + * expensive (and potentially failing) roll back operations in a rare case + * that file, FD, or ID can't be allocated. */ -struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; - int fd; + int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - return ERR_PTR(fd); + return fd; file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { put_unused_fd(fd); - return file; + return PTR_ERR(file); } - *reserved_fd = fd; - return file; + id = bpf_link_alloc_id(link); + if (id < 0) { + put_unused_fd(fd); + fput(file); + return id; + } + + primer->link = link; + primer->file = file; + primer->fd = fd; + primer->id = id; + return 0; +} + +int bpf_link_settle(struct bpf_link_primer *primer) +{ + /* make bpf_link fetchable by ID */ + spin_lock_bh(&link_idr_lock); + primer->link->id = primer->id; + spin_unlock_bh(&link_idr_lock); + /* make bpf_link fetchable by FD */ + fd_install(primer->fd, primer->file); + /* pass through installed FD */ + return primer->fd; +} + +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) @@ -2367,9 +2425,9 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_tracing_link *link; - struct file *link_file; - int link_fd, err; + int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: @@ -2404,22 +2462,19 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) } bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_prog; } err = bpf_trampoline_link_prog(prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_prog; } - fd_install(link_fd, link_file); - return link_fd; - + return bpf_link_settle(&link_primer); out_put_prog: bpf_prog_put(prog); return err; @@ -2447,7 +2502,7 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } -static const struct bpf_link_ops bpf_raw_tp_lops = { +static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, }; @@ -2456,13 +2511,13 @@ static const struct bpf_link_ops bpf_raw_tp_lops = { static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { + struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int link_fd, err; + int err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2515,24 +2570,22 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); link->btp = btp; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_btp; } err = bpf_probe_register(link->btp, prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_btp; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_btp: bpf_put_raw_tracepoint(btp); @@ -3464,7 +3517,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (file->f_op == &bpf_link_fops) { struct bpf_link *link = file->private_data; - if (link->ops == &bpf_raw_tp_lops) { + if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; -- cgit v1.2.3-59-g8ed1b From 2d602c8cf40d65d4a7ac34fe18648d8778e6e594 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:07 -0700 Subject: bpf: Support GET_FD_BY_ID and GET_NEXT_ID for bpf_link Add support to look up bpf_link by ID and iterate over all existing bpf_links in the system. GET_FD_BY_ID code handles not-yet-ready bpf_link by checking that its ID hasn't been set to non-zero value yet. Setting bpf_link's ID is done as the very last step in finalizing bpf_link, together with installing FD. This approach allows users of bpf_link in kernel code to not worry about races between user-space and kernel code that hasn't finished attaching and initializing bpf_link. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-4-andriin@fb.com --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6121aa487465..7e6541fceade 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -113,6 +113,8 @@ enum bpf_cmd { BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, }; enum bpf_map_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5439e05e3d25..1c213a730502 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3713,6 +3713,48 @@ out_put_link: return ret; } +static int bpf_link_inc_not_zero(struct bpf_link *link) +{ + return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; +} + +#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id + +static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_link *link; + u32 id = attr->link_id; + int fd, err; + + if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&link_idr_lock); + link = idr_find(&link_idr, id); + /* before link is "settled", ID is 0, pretend it doesn't exist yet */ + if (link) { + if (link->id) + err = bpf_link_inc_not_zero(link); + else + err = -EAGAIN; + } else { + err = -ENOENT; + } + spin_unlock_bh(&link_idr_lock); + + if (err) + return err; + + fd = bpf_link_new_fd(link); + if (fd < 0) + bpf_link_put(link); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3830,6 +3872,13 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_UPDATE: err = link_update(&attr); break; + case BPF_LINK_GET_FD_BY_ID: + err = bpf_link_get_fd_by_id(&attr); + break; + case BPF_LINK_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &link_idr, &link_idr_lock); + break; default: err = -EINVAL; break; -- cgit v1.2.3-59-g8ed1b From f2e10bff16a0fdd41ba278c84da9813700e356af Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:08 -0700 Subject: bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link Add ability to fetch bpf_link details through BPF_OBJ_GET_INFO_BY_FD command. Also enhance show_fdinfo to potentially include bpf_link type-specific information (similarly to obj_info). Also introduce enum bpf_link_type stored in bpf_link itself and expose it in UAPI. bpf_link_tracing also now will store and return bpf_attach_type. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-5-andriin@fb.com --- include/linux/bpf-cgroup.h | 2 - include/linux/bpf.h | 8 ++- include/linux/bpf_types.h | 6 ++ include/uapi/linux/bpf.h | 28 ++++++++ kernel/bpf/btf.c | 2 + kernel/bpf/cgroup.c | 43 +++++++++++- kernel/bpf/syscall.c | 155 ++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 2 + tools/include/uapi/linux/bpf.h | 31 +++++++++ 9 files changed, 253 insertions(+), 24 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a9cb9a5bf8e9..272626cc3fc9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -57,8 +57,6 @@ struct bpf_cgroup_link { enum bpf_attach_type type; }; -extern const struct bpf_link_ops bpf_cgroup_link_lops; - struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 875d1f0af803..c07b1d2f3824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1026,9 +1026,11 @@ extern const struct file_operations bpf_prog_fops; extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; @@ -1086,6 +1088,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog); struct bpf_link { atomic64_t refcnt; u32 id; + enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; struct work_struct work; @@ -1103,9 +1106,12 @@ struct bpf_link_ops { void (*dealloc)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); + void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); + int (*fill_link_info)(const struct bpf_link *link, + struct bpf_link_info *info); }; -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ba0c2d56f8a3..8345cdf553b8 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -118,3 +118,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) #if defined(CONFIG_BPF_JIT) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif + +BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) +BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) +#ifdef CONFIG_CGROUP_BPF +BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) +#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7e6541fceade..0eccafae55bb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -222,6 +222,15 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + + MAX_BPF_LINK_TYPE, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -3612,6 +3621,25 @@ struct bpf_btf_info { __u32 id; } __attribute__((aligned(8))); +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + }; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d65c6912bdaf..a2cfba89a8e1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ @@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = { 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 1bdf37fca879..5c0e964105ac 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -833,10 +833,48 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } -const struct bpf_link_ops bpf_cgroup_link_lops = { +static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + seq_printf(seq, + "cgroup_id:\t%llu\n" + "attach_type:\t%d\n", + cg_id, + cg_link->type); +} + +static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + info->cgroup.cgroup_id = cg_id; + info->cgroup.attach_type = cg_link->type; + return 0; +} + +static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, .update_prog = cgroup_bpf_replace, + .show_fdinfo = bpf_cgroup_link_show_fdinfo, + .fill_link_info = bpf_cgroup_link_fill_link_info, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -858,7 +896,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = -ENOMEM; goto out_put_cgroup; } - bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, + prog); link->cgroup = cgrp; link->type = attr->link_create.attach_type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1c213a730502..d23c04cbe14f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -51,9 +51,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* @@ -1548,9 +1550,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) @@ -2183,10 +2187,11 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->type = type; link->id = 0; link->ops = ops; link->prog = prog; @@ -2266,27 +2271,23 @@ static int bpf_link_release(struct inode *inode, struct file *filp) return 0; } -#ifdef CONFIG_PROC_FS -static const struct bpf_link_ops bpf_raw_tp_lops; -static const struct bpf_link_ops bpf_tracing_link_lops; +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, +static const char *bpf_link_type_strs[] = { + [BPF_LINK_TYPE_UNSPEC] = "", +#include +}; +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE +#ifdef CONFIG_PROC_FS static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - const char *link_type; - - if (link->ops == &bpf_raw_tp_lops) - link_type = "raw_tracepoint"; - else if (link->ops == &bpf_tracing_link_lops) - link_type = "tracing"; -#ifdef CONFIG_CGROUP_BPF - else if (link->ops == &bpf_cgroup_link_lops) - link_type = "cgroup"; -#endif - else - link_type = "unknown"; bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -2294,10 +2295,12 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", - link_type, + bpf_link_type_strs[link->type], link->id, prog_tag, prog->aux->id); + if (link->ops->show_fdinfo) + link->ops->show_fdinfo(link, m); } #endif @@ -2403,6 +2406,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; + enum bpf_attach_type attach_type; }; static void bpf_tracing_link_release(struct bpf_link *link) @@ -2418,9 +2422,33 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) kfree(tr_link); } +static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + seq_printf(seq, + "attach_type:\t%d\n", + tr_link->attach_type); +} + +static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + info->tracing.attach_type = tr_link->attach_type; + + return 0; +} + static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, + .show_fdinfo = bpf_tracing_link_show_fdinfo, + .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) @@ -2460,7 +2488,9 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog); + link->attach_type = prog->expected_attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -2502,9 +2532,56 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } +static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + + seq_printf(seq, + "tp_name:\t%s\n", + raw_tp_link->btp->tp->name); +} + +static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); + const char *tp_name = raw_tp_link->btp->tp->name; + u32 ulen = info->raw_tracepoint.tp_name_len; + size_t tp_len = strlen(tp_name); + + if (ulen && !ubuf) + return -EINVAL; + + info->raw_tracepoint.tp_name_len = tp_len + 1; + + if (!ubuf) + return 0; + + if (ulen >= tp_len + 1) { + if (copy_to_user(ubuf, tp_name, tp_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, tp_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, + .show_fdinfo = bpf_raw_tp_link_show_fdinfo, + .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -2570,7 +2647,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog); link->btp = btp; err = bpf_link_prime(&link->link, &link_primer); @@ -3366,6 +3444,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } +static int bpf_link_get_info_by_fd(struct bpf_link *link, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_link_info info; + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + + info.type = link->type; + info.id = link->id; + info.prog_id = link->prog->aux->id; + + if (link->ops->fill_link_info) { + err = link->ops->fill_link_info(link, &info); + if (err) + return err; + } + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -3390,6 +3504,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, uattr); else if (f.file->f_op == &btf_fops) err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + else if (f.file->f_op == &bpf_link_fops) + err = bpf_link_get_info_by_fd(f.file->private_data, + attr, uattr); else err = -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 91728e0f27eb..2b337e32aa94 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* bpf_check() is a static code analyzer that walks eBPF program diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4a6c47f3febe..0eccafae55bb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -113,6 +113,8 @@ enum bpf_cmd { BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, }; enum bpf_map_type { @@ -220,6 +222,15 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + + MAX_BPF_LINK_TYPE, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -523,6 +534,7 @@ union bpf_attr { __u32 prog_id; __u32 map_id; __u32 btf_id; + __u32 link_id; }; __u32 next_id; __u32 open_flags; @@ -3609,6 +3621,25 @@ struct bpf_btf_info { __u32 id; } __attribute__((aligned(8))); +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + }; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). -- cgit v1.2.3-59-g8ed1b From 0dbc866832a0fbf9f2b98d412da44c5cfd1b7756 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:09 -0700 Subject: libbpf: Add low-level APIs for new bpf_link commands Add low-level API calls for bpf_link_get_next_id() and bpf_link_get_fd_by_id(). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-6-andriin@fb.com --- tools/lib/bpf/bpf.c | 19 +++++++++++++++++-- tools/lib/bpf/bpf.h | 4 +++- tools/lib/bpf/libbpf.map | 6 ++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 5cc1b0785d18..8f2f0958d446 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -721,6 +721,11 @@ int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id) return bpf_obj_get_next_id(start_id, next_id, BPF_BTF_GET_NEXT_ID); } +int bpf_link_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_LINK_GET_NEXT_ID); +} + int bpf_prog_get_fd_by_id(__u32 id) { union bpf_attr attr; @@ -751,13 +756,23 @@ int bpf_btf_get_fd_by_id(__u32 id) return sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr)); } -int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) +int bpf_link_get_fd_by_id(__u32 id) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.link_id = id; + + return sys_bpf(BPF_LINK_GET_FD_BY_ID, &attr, sizeof(attr)); +} + +int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) { union bpf_attr attr; int err; memset(&attr, 0, sizeof(attr)); - attr.info.bpf_fd = prog_fd; + attr.info.bpf_fd = bpf_fd; attr.info.info_len = *info_len; attr.info.info = ptr_to_u64(info); diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 46d47afdd887..335b457b3a25 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -216,10 +216,12 @@ LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data, LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_link_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); -LIBBPF_API int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len); +LIBBPF_API int bpf_link_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index bb8831605b25..7cd49aa38005 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -254,3 +254,9 @@ LIBBPF_0.0.8 { bpf_program__set_lsm; bpf_set_link_xdp_fd_opts; } LIBBPF_0.0.7; + +LIBBPF_0.0.9 { + global: + bpf_link_get_fd_by_id; + bpf_link_get_next_id; +} LIBBPF_0.0.8; -- cgit v1.2.3-59-g8ed1b From 2c2837b09e9ab4874353186599609fa2e1ccabce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:10 -0700 Subject: selftests/bpf: Test bpf_link's get_next_id, get_fd_by_id, and get_obj_info Extend bpf_obj_id selftest to verify bpf_link's observability APIs. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-7-andriin@fb.com --- .../testing/selftests/bpf/prog_tests/bpf_obj_id.c | 110 +++++++++++++++++++-- tools/testing/selftests/bpf/progs/test_obj_id.c | 14 +-- 2 files changed, 104 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c index f10029821e16..7afa4160416f 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c @@ -1,26 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 #include +#define nr_iters 2 + void test_bpf_obj_id(void) { const __u64 array_magic_value = 0xfaceb00c; const __u32 array_key = 0; - const int nr_iters = 2; const char *file = "./test_obj_id.o"; const char *expected_prog_name = "test_obj_id"; const char *expected_map_name = "test_map_id"; const __u64 nsec_per_sec = 1000000000; - struct bpf_object *objs[nr_iters]; + struct bpf_object *objs[nr_iters] = {}; + struct bpf_link *links[nr_iters] = {}; + struct bpf_program *prog; int prog_fds[nr_iters], map_fds[nr_iters]; /* +1 to test for the info_len returned by kernel */ struct bpf_prog_info prog_infos[nr_iters + 1]; struct bpf_map_info map_infos[nr_iters + 1]; + struct bpf_link_info link_infos[nr_iters + 1]; /* Each prog only uses one map. +1 to test nr_map_ids * returned by kernel. */ __u32 map_ids[nr_iters + 1]; - char jited_insns[128], xlated_insns[128], zeros[128]; + char jited_insns[128], xlated_insns[128], zeros[128], tp_name[128]; __u32 i, next_id, info_len, nr_id_found, duration = 0; struct timespec real_time_ts, boot_time_ts; int err = 0; @@ -36,14 +40,15 @@ void test_bpf_obj_id(void) CHECK(err >= 0 || errno != ENOENT, "get-fd-by-notexist-map-id", "err %d errno %d\n", err, errno); - for (i = 0; i < nr_iters; i++) - objs[i] = NULL; + err = bpf_link_get_fd_by_id(0); + CHECK(err >= 0 || errno != ENOENT, + "get-fd-by-notexist-link-id", "err %d errno %d\n", err, errno); /* Check bpf_obj_get_info_by_fd() */ bzero(zeros, sizeof(zeros)); for (i = 0; i < nr_iters; i++) { now = time(NULL); - err = bpf_prog_load(file, BPF_PROG_TYPE_SOCKET_FILTER, + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &objs[i], &prog_fds[i]); /* test_obj_id.o is a dumb prog. It should never fail * to load. @@ -60,6 +65,17 @@ void test_bpf_obj_id(void) if (CHECK_FAIL(err)) goto done; + prog = bpf_object__find_program_by_title(objs[i], + "raw_tp/sys_enter"); + if (CHECK_FAIL(!prog)) + goto done; + links[i] = bpf_program__attach(prog); + err = libbpf_get_error(links[i]); + if (CHECK(err, "prog_attach", "prog #%d, err %d\n", i, err)) { + links[i] = NULL; + goto done; + } + /* Check getting map info */ info_len = sizeof(struct bpf_map_info) * 2; bzero(&map_infos[i], info_len); @@ -107,7 +123,7 @@ void test_bpf_obj_id(void) load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + (prog_infos[i].load_time / nsec_per_sec); if (CHECK(err || - prog_infos[i].type != BPF_PROG_TYPE_SOCKET_FILTER || + prog_infos[i].type != BPF_PROG_TYPE_RAW_TRACEPOINT || info_len != sizeof(struct bpf_prog_info) || (env.jit_enabled && !prog_infos[i].jited_prog_len) || (env.jit_enabled && @@ -120,7 +136,11 @@ void test_bpf_obj_id(void) *(int *)(long)prog_infos[i].map_ids != map_infos[i].id || strcmp((char *)prog_infos[i].name, expected_prog_name), "get-prog-info(fd)", - "err %d errno %d i %d type %d(%d) info_len %u(%zu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n", + "err %d errno %d i %d type %d(%d) info_len %u(%zu) " + "jit_enabled %d jited_prog_len %u xlated_prog_len %u " + "jited_prog %d xlated_prog %d load_time %lu(%lu) " + "uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) " + "name %s(%s)\n", err, errno, i, prog_infos[i].type, BPF_PROG_TYPE_SOCKET_FILTER, info_len, sizeof(struct bpf_prog_info), @@ -135,6 +155,33 @@ void test_bpf_obj_id(void) *(int *)(long)prog_infos[i].map_ids, map_infos[i].id, prog_infos[i].name, expected_prog_name)) goto done; + + /* Check getting link info */ + info_len = sizeof(struct bpf_link_info) * 2; + bzero(&link_infos[i], info_len); + link_infos[i].raw_tracepoint.tp_name = (__u64)&tp_name; + link_infos[i].raw_tracepoint.tp_name_len = sizeof(tp_name); + err = bpf_obj_get_info_by_fd(bpf_link__fd(links[i]), + &link_infos[i], &info_len); + if (CHECK(err || + link_infos[i].type != BPF_LINK_TYPE_RAW_TRACEPOINT || + link_infos[i].prog_id != prog_infos[i].id || + link_infos[i].raw_tracepoint.tp_name != (__u64)&tp_name || + strcmp((char *)link_infos[i].raw_tracepoint.tp_name, + "sys_enter") || + info_len != sizeof(struct bpf_link_info), + "get-link-info(fd)", + "err %d errno %d info_len %u(%zu) type %d(%d) id %d " + "prog_id %d (%d) tp_name %s(%s)\n", + err, errno, + info_len, sizeof(struct bpf_link_info), + link_infos[i].type, BPF_LINK_TYPE_RAW_TRACEPOINT, + link_infos[i].id, + link_infos[i].prog_id, prog_infos[i].id, + (char *)link_infos[i].raw_tracepoint.tp_name, + "sys_enter")) + goto done; + } /* Check bpf_prog_get_next_id() */ @@ -247,7 +294,52 @@ void test_bpf_obj_id(void) "nr_id_found %u(%u)\n", nr_id_found, nr_iters); + /* Check bpf_link_get_next_id() */ + nr_id_found = 0; + next_id = 0; + while (!bpf_link_get_next_id(next_id, &next_id)) { + struct bpf_link_info link_info; + int link_fd, cmp_res; + + info_len = sizeof(link_info); + memset(&link_info, 0, info_len); + + link_fd = bpf_link_get_fd_by_id(next_id); + if (link_fd < 0 && errno == ENOENT) + /* The bpf_link is in the dead row */ + continue; + if (CHECK(link_fd < 0, "get-link-fd(next_id)", + "link_fd %d next_id %u errno %d\n", + link_fd, next_id, errno)) + break; + + for (i = 0; i < nr_iters; i++) + if (link_infos[i].id == next_id) + break; + + if (i == nr_iters) + continue; + + nr_id_found++; + + err = bpf_obj_get_info_by_fd(link_fd, &link_info, &info_len); + cmp_res = memcmp(&link_info, &link_infos[i], + offsetof(struct bpf_link_info, raw_tracepoint)); + CHECK(err || info_len != sizeof(link_info) || cmp_res, + "check get-link-info(next_id->fd)", + "err %d errno %d info_len %u(%zu) memcmp %d\n", + err, errno, info_len, sizeof(struct bpf_link_info), + cmp_res); + + close(link_fd); + } + CHECK(nr_id_found != nr_iters, + "check total link id found by get_next_id", + "nr_id_found %u(%u)\n", nr_id_found, nr_iters); + done: - for (i = 0; i < nr_iters; i++) + for (i = 0; i < nr_iters; i++) { + bpf_link__destroy(links[i]); bpf_object__close(objs[i]); + } } diff --git a/tools/testing/selftests/bpf/progs/test_obj_id.c b/tools/testing/selftests/bpf/progs/test_obj_id.c index 98b9de2fafd0..ded71b3ff6b4 100644 --- a/tools/testing/selftests/bpf/progs/test_obj_id.c +++ b/tools/testing/selftests/bpf/progs/test_obj_id.c @@ -3,16 +3,8 @@ */ #include #include -#include #include -/* It is a dumb bpf program such that it must have no - * issue to be loaded since testing the verifier is - * not the focus here. - */ - -int _version SEC("version") = 1; - struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); @@ -20,13 +12,13 @@ struct { __type(value, __u64); } test_map_id SEC(".maps"); -SEC("test_obj_id_dummy") -int test_obj_id(struct __sk_buff *skb) +SEC("raw_tp/sys_enter") +int test_obj_id(void *ctx) { __u32 key = 0; __u64 *value; value = bpf_map_lookup_elem(&test_map_id, &key); - return TC_ACT_OK; + return 0; } -- cgit v1.2.3-59-g8ed1b From 50325b1761e31ad17d252e795af72a9af8c5a7d7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:11 -0700 Subject: bpftool: Expose attach_type-to-string array to non-cgroup code Move attach_type_strings into main.h for access in non-cgroup code. bpf_attach_type is used for non-cgroup attach types quite widely now. So also complete missing string translations for non-cgroup attach types. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200429001614.1544-8-andriin@fb.com --- tools/bpf/bpftool/cgroup.c | 48 +++++++++++++++------------------------------- tools/bpf/bpftool/main.h | 32 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 33 deletions(-) diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 62c6a1d7cd18..1693c802bb20 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -31,42 +31,20 @@ static unsigned int query_flags; -static const char * const attach_type_strings[] = { - [BPF_CGROUP_INET_INGRESS] = "ingress", - [BPF_CGROUP_INET_EGRESS] = "egress", - [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create", - [BPF_CGROUP_SOCK_OPS] = "sock_ops", - [BPF_CGROUP_DEVICE] = "device", - [BPF_CGROUP_INET4_BIND] = "bind4", - [BPF_CGROUP_INET6_BIND] = "bind6", - [BPF_CGROUP_INET4_CONNECT] = "connect4", - [BPF_CGROUP_INET6_CONNECT] = "connect6", - [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", - [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", - [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", - [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", - [BPF_CGROUP_SYSCTL] = "sysctl", - [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4", - [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6", - [BPF_CGROUP_GETSOCKOPT] = "getsockopt", - [BPF_CGROUP_SETSOCKOPT] = "setsockopt", - [__MAX_BPF_ATTACH_TYPE] = NULL, -}; - static enum bpf_attach_type parse_attach_type(const char *str) { enum bpf_attach_type type; for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { - if (attach_type_strings[type] && - is_prefix(str, attach_type_strings[type])) + if (attach_type_name[type] && + is_prefix(str, attach_type_name[type])) return type; } return __MAX_BPF_ATTACH_TYPE; } -static int show_bpf_prog(int id, const char *attach_type_str, +static int show_bpf_prog(int id, enum bpf_attach_type attach_type, const char *attach_flags_str, int level) { @@ -86,18 +64,22 @@ static int show_bpf_prog(int id, const char *attach_type_str, if (json_output) { jsonw_start_object(json_wtr); jsonw_uint_field(json_wtr, "id", info.id); - jsonw_string_field(json_wtr, "attach_type", - attach_type_str); + if (attach_type < ARRAY_SIZE(attach_type_name)) + jsonw_string_field(json_wtr, "attach_type", + attach_type_name[attach_type]); + else + jsonw_uint_field(json_wtr, "attach_type", attach_type); jsonw_string_field(json_wtr, "attach_flags", attach_flags_str); jsonw_string_field(json_wtr, "name", info.name); jsonw_end_object(json_wtr); } else { - printf("%s%-8u %-15s %-15s %-15s\n", level ? " " : "", - info.id, - attach_type_str, - attach_flags_str, - info.name); + printf("%s%-8u ", level ? " " : "", info.id); + if (attach_type < ARRAY_SIZE(attach_type_name)) + printf("%-15s", attach_type_name[attach_type]); + else + printf("type %-10u", attach_type); + printf(" %-15s %-15s\n", attach_flags_str, info.name); } close(prog_fd); @@ -171,7 +153,7 @@ static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, } for (iter = 0; iter < prog_cnt; iter++) - show_bpf_prog(prog_ids[iter], attach_type_strings[type], + show_bpf_prog(prog_ids[iter], type, attach_flags_str, level); return 0; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 86f14ce26fd7..99d84bd1d5b2 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -83,6 +83,38 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_EXT] = "ext", }; +static const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { + [BPF_CGROUP_INET_INGRESS] = "ingress", + [BPF_CGROUP_INET_EGRESS] = "egress", + [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create", + [BPF_CGROUP_SOCK_OPS] = "sock_ops", + [BPF_CGROUP_DEVICE] = "device", + [BPF_CGROUP_INET4_BIND] = "bind4", + [BPF_CGROUP_INET6_BIND] = "bind6", + [BPF_CGROUP_INET4_CONNECT] = "connect4", + [BPF_CGROUP_INET6_CONNECT] = "connect6", + [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", + [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", + [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", + [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", + [BPF_CGROUP_SYSCTL] = "sysctl", + [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4", + [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6", + [BPF_CGROUP_GETSOCKOPT] = "getsockopt", + [BPF_CGROUP_SETSOCKOPT] = "setsockopt", + + [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", + [BPF_LIRC_MODE2] = "lirc_mode2", + [BPF_FLOW_DISSECTOR] = "flow_dissector", + [BPF_TRACE_RAW_TP] = "raw_tp", + [BPF_TRACE_FENTRY] = "fentry", + [BPF_TRACE_FEXIT] = "fexit", + [BPF_MODIFY_RETURN] = "mod_ret", + [BPF_LSM_MAC] = "lsm_mac", +}; + extern const char * const map_type_name[]; extern const size_t map_type_name_size; -- cgit v1.2.3-59-g8ed1b From c5481f9a954f27b8730c1dfeebbc9b3b5b2b2481 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:12 -0700 Subject: bpftool: Add bpf_link show and pin support Add `bpftool link show` and `bpftool link pin` commands. Example plain output for `link show` (with showing pinned paths): [vmuser@archvm bpf]$ sudo ~/local/linux/tools/bpf/bpftool/bpftool -f link 1: tracing prog 12 prog_type tracing attach_type fentry pinned /sys/fs/bpf/my_test_link pinned /sys/fs/bpf/my_test_link2 2: tracing prog 13 prog_type tracing attach_type fentry 3: tracing prog 14 prog_type tracing attach_type fentry 4: tracing prog 15 prog_type tracing attach_type fentry 5: tracing prog 16 prog_type tracing attach_type fentry 6: tracing prog 17 prog_type tracing attach_type fentry 7: raw_tracepoint prog 21 tp 'sys_enter' 8: cgroup prog 25 cgroup_id 584 attach_type egress 9: cgroup prog 25 cgroup_id 599 attach_type egress 10: cgroup prog 25 cgroup_id 614 attach_type egress 11: cgroup prog 25 cgroup_id 629 attach_type egress Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200429001614.1544-9-andriin@fb.com --- tools/bpf/bpftool/common.c | 2 + tools/bpf/bpftool/link.c | 333 +++++++++++++++++++++++++++++++++++++++++++++ tools/bpf/bpftool/main.c | 6 +- tools/bpf/bpftool/main.h | 5 + 4 files changed, 345 insertions(+), 1 deletion(-) create mode 100644 tools/bpf/bpftool/link.c diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index f2223dbdfb0a..c47bdc65de8e 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -262,6 +262,8 @@ int get_fd_type(int fd) return BPF_OBJ_MAP; else if (strstr(buf, "bpf-prog")) return BPF_OBJ_PROG; + else if (strstr(buf, "bpf-link")) + return BPF_OBJ_LINK; return BPF_OBJ_UNKNOWN; } diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c new file mode 100644 index 000000000000..adc7dc431ed8 --- /dev/null +++ b/tools/bpf/bpftool/link.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2020 Facebook */ + +#include +#include +#include +#include + +#include + +#include "json_writer.h" +#include "main.h" + +static const char * const link_type_name[] = { + [BPF_LINK_TYPE_UNSPEC] = "unspec", + [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_LINK_TYPE_TRACING] = "tracing", + [BPF_LINK_TYPE_CGROUP] = "cgroup", +}; + +static int link_parse_fd(int *argc, char ***argv) +{ + if (is_prefix(**argv, "id")) { + unsigned int id; + char *endptr; + + NEXT_ARGP(); + + id = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as ID", **argv); + return -1; + } + NEXT_ARGP(); + + return bpf_link_get_fd_by_id(id); + } else if (is_prefix(**argv, "pinned")) { + char *path; + + NEXT_ARGP(); + + path = **argv; + NEXT_ARGP(); + + return open_obj_pinned_any(path, BPF_OBJ_LINK); + } + + p_err("expected 'id' or 'pinned', got: '%s'?", **argv); + return -1; +} + +static void +show_link_header_json(struct bpf_link_info *info, json_writer_t *wtr) +{ + jsonw_uint_field(wtr, "id", info->id); + if (info->type < ARRAY_SIZE(link_type_name)) + jsonw_string_field(wtr, "type", link_type_name[info->type]); + else + jsonw_uint_field(wtr, "type", info->type); + + jsonw_uint_field(json_wtr, "prog_id", info->prog_id); +} + +static int get_prog_info(int prog_id, struct bpf_prog_info *info) +{ + __u32 len = sizeof(*info); + int err, prog_fd; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) + return prog_fd; + + memset(info, 0, sizeof(*info)); + err = bpf_obj_get_info_by_fd(prog_fd, info, &len); + if (err) + p_err("can't get prog info: %s", strerror(errno)); + close(prog_fd); + return err; +} + +static int show_link_close_json(int fd, struct bpf_link_info *info) +{ + struct bpf_prog_info prog_info; + int err; + + jsonw_start_object(json_wtr); + + show_link_header_json(info, json_wtr); + + switch (info->type) { + case BPF_LINK_TYPE_RAW_TRACEPOINT: + jsonw_string_field(json_wtr, "tp_name", + (const char *)info->raw_tracepoint.tp_name); + break; + case BPF_LINK_TYPE_TRACING: + err = get_prog_info(info->prog_id, &prog_info); + if (err) + return err; + + if (prog_info.type < ARRAY_SIZE(prog_type_name)) + jsonw_string_field(json_wtr, "prog_type", + prog_type_name[prog_info.type]); + else + jsonw_uint_field(json_wtr, "prog_type", + prog_info.type); + + if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name)) + jsonw_string_field(json_wtr, "attach_type", + attach_type_name[info->tracing.attach_type]); + else + jsonw_uint_field(json_wtr, "attach_type", + info->tracing.attach_type); + break; + case BPF_LINK_TYPE_CGROUP: + jsonw_lluint_field(json_wtr, "cgroup_id", + info->cgroup.cgroup_id); + if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name)) + jsonw_string_field(json_wtr, "attach_type", + attach_type_name[info->cgroup.attach_type]); + else + jsonw_uint_field(json_wtr, "attach_type", + info->cgroup.attach_type); + break; + default: + break; + } + + if (!hash_empty(link_table.table)) { + struct pinned_obj *obj; + + jsonw_name(json_wtr, "pinned"); + jsonw_start_array(json_wtr); + hash_for_each_possible(link_table.table, obj, hash, info->id) { + if (obj->id == info->id) + jsonw_string(json_wtr, obj->path); + } + jsonw_end_array(json_wtr); + } + jsonw_end_object(json_wtr); + + return 0; +} + +static void show_link_header_plain(struct bpf_link_info *info) +{ + printf("%u: ", info->id); + if (info->type < ARRAY_SIZE(link_type_name)) + printf("%s ", link_type_name[info->type]); + else + printf("type %u ", info->type); + + printf("prog %u ", info->prog_id); +} + +static int show_link_close_plain(int fd, struct bpf_link_info *info) +{ + struct bpf_prog_info prog_info; + int err; + + show_link_header_plain(info); + + switch (info->type) { + case BPF_LINK_TYPE_RAW_TRACEPOINT: + printf("\n\ttp '%s' ", + (const char *)info->raw_tracepoint.tp_name); + break; + case BPF_LINK_TYPE_TRACING: + err = get_prog_info(info->prog_id, &prog_info); + if (err) + return err; + + if (prog_info.type < ARRAY_SIZE(prog_type_name)) + printf("\n\tprog_type %s ", + prog_type_name[prog_info.type]); + else + printf("\n\tprog_type %u ", prog_info.type); + + if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name)) + printf("attach_type %s ", + attach_type_name[info->tracing.attach_type]); + else + printf("attach_type %u ", info->tracing.attach_type); + break; + case BPF_LINK_TYPE_CGROUP: + printf("\n\tcgroup_id %zu ", (size_t)info->cgroup.cgroup_id); + if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name)) + printf("attach_type %s ", + attach_type_name[info->cgroup.attach_type]); + else + printf("attach_type %u ", info->cgroup.attach_type); + break; + default: + break; + } + + if (!hash_empty(link_table.table)) { + struct pinned_obj *obj; + + hash_for_each_possible(link_table.table, obj, hash, info->id) { + if (obj->id == info->id) + printf("\n\tpinned %s", obj->path); + } + } + + printf("\n"); + + return 0; +} + +static int do_show_link(int fd) +{ + struct bpf_link_info info; + __u32 len = sizeof(info); + char raw_tp_name[256]; + int err; + + memset(&info, 0, sizeof(info)); +again: + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get link info: %s", + strerror(errno)); + close(fd); + return err; + } + if (info.type == BPF_LINK_TYPE_RAW_TRACEPOINT && + !info.raw_tracepoint.tp_name) { + info.raw_tracepoint.tp_name = (unsigned long)&raw_tp_name; + info.raw_tracepoint.tp_name_len = sizeof(raw_tp_name); + goto again; + } + + if (json_output) + show_link_close_json(fd, &info); + else + show_link_close_plain(fd, &info); + + close(fd); + return 0; +} + +static int do_show(int argc, char **argv) +{ + __u32 id = 0; + int err, fd; + + if (show_pinned) + build_pinned_obj_table(&link_table, BPF_OBJ_LINK); + + if (argc == 2) { + fd = link_parse_fd(&argc, &argv); + if (fd < 0) + return fd; + return do_show_link(fd); + } + + if (argc) + return BAD_ARG(); + + if (json_output) + jsonw_start_array(json_wtr); + while (true) { + err = bpf_link_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + p_err("can't get next link: %s%s", strerror(errno), + errno == EINVAL ? " -- kernel too old?" : ""); + break; + } + + fd = bpf_link_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + p_err("can't get link by id (%u): %s", + id, strerror(errno)); + break; + } + + err = do_show_link(fd); + if (err) + break; + } + if (json_output) + jsonw_end_array(json_wtr); + + return errno == ENOENT ? 0 : -1; +} + +static int do_pin(int argc, char **argv) +{ + int err; + + err = do_pin_any(argc, argv, link_parse_fd); + if (!err && json_output) + jsonw_null(json_wtr); + return err; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list } [LINK]\n" + " %1$s %2$s pin LINK FILE\n" + " %1$s %2$s help\n" + "\n" + " " HELP_SPEC_LINK "\n" + " " HELP_SPEC_PROGRAM "\n" + " " HELP_SPEC_OPTIONS "\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { "pin", do_pin }, + { 0 } +}; + +int do_link(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 466c269eabdd..1413a154806e 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -30,6 +30,7 @@ bool verifier_logs; bool relaxed_maps; struct pinned_obj_table prog_table; struct pinned_obj_table map_table; +struct pinned_obj_table link_table; static void __noreturn clean_and_exit(int i) { @@ -58,7 +59,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf | net | feature | btf | gen | struct_ops }\n" + " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -215,6 +216,7 @@ static const struct cmd cmds[] = { { "batch", do_batch }, { "prog", do_prog }, { "map", do_map }, + { "link", do_link }, { "cgroup", do_cgroup }, { "perf", do_perf }, { "net", do_net }, @@ -364,6 +366,7 @@ int main(int argc, char **argv) hash_init(prog_table.table); hash_init(map_table.table); + hash_init(link_table.table); opterr = 0; while ((opt = getopt_long(argc, argv, "Vhpjfmnd", @@ -422,6 +425,7 @@ int main(int argc, char **argv) if (show_pinned) { delete_pinned_obj_table(&prog_table); delete_pinned_obj_table(&map_table); + delete_pinned_obj_table(&link_table); } return ret; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 99d84bd1d5b2..9b1fb81a8331 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -50,6 +50,8 @@ "\t {-m|--mapcompat} | {-n|--nomount} }" #define HELP_SPEC_MAP \ "MAP := { id MAP_ID | pinned FILE | name MAP_NAME }" +#define HELP_SPEC_LINK \ + "LINK := { id LINK_ID | pinned FILE }" static const char * const prog_type_name[] = { [BPF_PROG_TYPE_UNSPEC] = "unspec", @@ -122,6 +124,7 @@ enum bpf_obj_type { BPF_OBJ_UNKNOWN, BPF_OBJ_PROG, BPF_OBJ_MAP, + BPF_OBJ_LINK, }; extern const char *bin_name; @@ -134,6 +137,7 @@ extern bool verifier_logs; extern bool relaxed_maps; extern struct pinned_obj_table prog_table; extern struct pinned_obj_table map_table; +extern struct pinned_obj_table link_table; void __printf(1, 2) p_err(const char *fmt, ...); void __printf(1, 2) p_info(const char *fmt, ...); @@ -185,6 +189,7 @@ int do_pin_fd(int fd, const char *name); int do_prog(int argc, char **arg); int do_map(int argc, char **arg); +int do_link(int argc, char **arg); int do_event_pipe(int argc, char **argv); int do_cgroup(int argc, char **arg); int do_perf(int argc, char **arg); -- cgit v1.2.3-59-g8ed1b From 7464d013ccd4db8544df5eddb05ddd509b9c46e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:13 -0700 Subject: bpftool: Add bpftool-link manpage Add bpftool-link manpage with information and examples of link-related commands. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200429001614.1544-10-andriin@fb.com --- tools/bpf/bpftool/Documentation/bpftool-link.rst | 118 +++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 tools/bpf/bpftool/Documentation/bpftool-link.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst new file mode 100644 index 000000000000..ee6500d6e6e4 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -0,0 +1,118 @@ +================ +bpftool-link +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF links +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **link** *COMMAND* + + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + + *COMMANDS* := { **show** | **list** | **pin** | **help** } + +LINK COMMANDS +============= + +| **bpftool** **link { show | list }** [*LINK*] +| **bpftool** **link pin** *LINK* *FILE* +| **bpftool** **link help** +| +| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } + + +DESCRIPTION +=========== + **bpftool link { show | list }** [*LINK*] + Show information about active links. If *LINK* is + specified show information only about given link, + otherwise list all links currently active on the system. + + Output will start with link ID followed by link type and + zero or more named attributes, some of which depend on type + of link. + + **bpftool link pin** *LINK* *FILE* + Pin link *LINK* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. It must not + contain a dot character ('.'), which is reserved for future + extensions of *bpffs*. + + **bpftool link help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -V, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + + -f, --bpffs + When showing BPF links, show file names of pinned + links. + + -n, --nomount + Do not automatically attempt to mount any virtual file system + (such as tracefs or BPF virtual file system) when necessary. + + -d, --debug + Print all logs available, even debug-level information. This + includes logs from libbpf. + +EXAMPLES +======== +**# bpftool link show** + +:: + + 10: cgroup prog 25 + cgroup_id 614 attach_type egress + +**# bpftool --json --pretty link show** + +:: + + [{ + "type": "cgroup", + "prog_id": 25, + "cgroup_id": 614, + "attach_type": "egress" + } + ] + +| +| **# bpftool link pin id 10 /sys/fs/bpf/link** +| **# ls -l /sys/fs/bpf/** + +:: + + -rw------- 1 root root 0 Apr 23 21:39 link + + +SEE ALSO +======== + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-prog\ (8), + **bpftool-map**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) -- cgit v1.2.3-59-g8ed1b From 5d085ad2e68cceec8332b23ea8f630a28b506366 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:14 -0700 Subject: bpftool: Add link bash completions Extend bpftool's bash-completion script to handle new link command and its sub-commands. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200429001614.1544-11-andriin@fb.com --- tools/bpf/bpftool/bash-completion/bpftool | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 45ee99b159e2..c033c3329f73 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -98,6 +98,12 @@ _bpftool_get_btf_ids() command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) } +_bpftool_get_link_ids() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp link 2>&1 | \ + command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + _bpftool_get_obj_map_names() { local obj @@ -1082,6 +1088,39 @@ _bpftool() ;; esac ;; + link) + case $command in + show|list|pin) + case $prev in + id) + _bpftool_get_link_ids + return 0 + ;; + esac + ;; + esac + + local LINK_TYPE='id pinned' + case $command in + show|list) + [[ $prev != "$command" ]] && return 0 + COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) + return 0 + ;; + pin) + if [[ $prev == "$command" ]]; then + COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) + else + _filedir + fi + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help pin show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool -- cgit v1.2.3-59-g8ed1b From 41017e56af6cf99122c86655f60fe4e1b75ecf48 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:27:37 -0700 Subject: libbpf: Refactor BTF-defined map definition parsing logic Factor out BTF map definition logic into stand-alone routine for easier reuse for map-in-map case. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429002739.48006-2-andriin@fb.com --- tools/lib/bpf/libbpf.c | 195 ++++++++++++++++++++++++++----------------------- 1 file changed, 103 insertions(+), 92 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8e1dc6980fac..7d10436d7b58 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1914,109 +1914,54 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) return 0; } -static int bpf_object__init_user_btf_map(struct bpf_object *obj, - const struct btf_type *sec, - int var_idx, int sec_idx, - const Elf_Data *data, bool strict, - const char *pin_root_path) + +static int parse_btf_map_def(struct bpf_object *obj, + struct bpf_map *map, + const struct btf_type *def, + bool strict, + const char *pin_root_path) { - const struct btf_type *var, *def, *t; - const struct btf_var_secinfo *vi; - const struct btf_var *var_extra; + const struct btf_type *t; const struct btf_member *m; - const char *map_name; - struct bpf_map *map; int vlen, i; - vi = btf_var_secinfos(sec) + var_idx; - var = btf__type_by_id(obj->btf, vi->type); - var_extra = btf_var(var); - map_name = btf__name_by_offset(obj->btf, var->name_off); - vlen = btf_vlen(var); - - if (map_name == NULL || map_name[0] == '\0') { - pr_warn("map #%d: empty name.\n", var_idx); - return -EINVAL; - } - if ((__u64)vi->offset + vi->size > data->d_size) { - pr_warn("map '%s' BTF data is corrupted.\n", map_name); - return -EINVAL; - } - if (!btf_is_var(var)) { - pr_warn("map '%s': unexpected var kind %u.\n", - map_name, btf_kind(var)); - return -EINVAL; - } - if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED && - var_extra->linkage != BTF_VAR_STATIC) { - pr_warn("map '%s': unsupported var linkage %u.\n", - map_name, var_extra->linkage); - return -EOPNOTSUPP; - } - - def = skip_mods_and_typedefs(obj->btf, var->type, NULL); - if (!btf_is_struct(def)) { - pr_warn("map '%s': unexpected def kind %u.\n", - map_name, btf_kind(var)); - return -EINVAL; - } - if (def->size > vi->size) { - pr_warn("map '%s': invalid def size.\n", map_name); - return -EINVAL; - } - - map = bpf_object__add_map(obj); - if (IS_ERR(map)) - return PTR_ERR(map); - map->name = strdup(map_name); - if (!map->name) { - pr_warn("map '%s': failed to alloc map name.\n", map_name); - return -ENOMEM; - } - map->libbpf_type = LIBBPF_MAP_UNSPEC; - map->def.type = BPF_MAP_TYPE_UNSPEC; - map->sec_idx = sec_idx; - map->sec_offset = vi->offset; - pr_debug("map '%s': at sec_idx %d, offset %zu.\n", - map_name, map->sec_idx, map->sec_offset); - vlen = btf_vlen(def); m = btf_members(def); for (i = 0; i < vlen; i++, m++) { const char *name = btf__name_by_offset(obj->btf, m->name_off); if (!name) { - pr_warn("map '%s': invalid field #%d.\n", map_name, i); + pr_warn("map '%s': invalid field #%d.\n", map->name, i); return -EINVAL; } if (strcmp(name, "type") == 0) { - if (!get_map_field_int(map_name, obj->btf, m, + if (!get_map_field_int(map->name, obj->btf, m, &map->def.type)) return -EINVAL; pr_debug("map '%s': found type = %u.\n", - map_name, map->def.type); + map->name, map->def.type); } else if (strcmp(name, "max_entries") == 0) { - if (!get_map_field_int(map_name, obj->btf, m, + if (!get_map_field_int(map->name, obj->btf, m, &map->def.max_entries)) return -EINVAL; pr_debug("map '%s': found max_entries = %u.\n", - map_name, map->def.max_entries); + map->name, map->def.max_entries); } else if (strcmp(name, "map_flags") == 0) { - if (!get_map_field_int(map_name, obj->btf, m, + if (!get_map_field_int(map->name, obj->btf, m, &map->def.map_flags)) return -EINVAL; pr_debug("map '%s': found map_flags = %u.\n", - map_name, map->def.map_flags); + map->name, map->def.map_flags); } else if (strcmp(name, "key_size") == 0) { __u32 sz; - if (!get_map_field_int(map_name, obj->btf, m, &sz)) + if (!get_map_field_int(map->name, obj->btf, m, &sz)) return -EINVAL; pr_debug("map '%s': found key_size = %u.\n", - map_name, sz); + map->name, sz); if (map->def.key_size && map->def.key_size != sz) { pr_warn("map '%s': conflicting key size %u != %u.\n", - map_name, map->def.key_size, sz); + map->name, map->def.key_size, sz); return -EINVAL; } map->def.key_size = sz; @@ -2026,25 +1971,25 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, t = btf__type_by_id(obj->btf, m->type); if (!t) { pr_warn("map '%s': key type [%d] not found.\n", - map_name, m->type); + map->name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': key spec is not PTR: %u.\n", - map_name, btf_kind(t)); + map->name, btf_kind(t)); return -EINVAL; } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", - map_name, t->type, (ssize_t)sz); + map->name, t->type, (ssize_t)sz); return sz; } pr_debug("map '%s': found key [%u], sz = %zd.\n", - map_name, t->type, (ssize_t)sz); + map->name, t->type, (ssize_t)sz); if (map->def.key_size && map->def.key_size != sz) { pr_warn("map '%s': conflicting key size %u != %zd.\n", - map_name, map->def.key_size, (ssize_t)sz); + map->name, map->def.key_size, (ssize_t)sz); return -EINVAL; } map->def.key_size = sz; @@ -2052,13 +1997,13 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, } else if (strcmp(name, "value_size") == 0) { __u32 sz; - if (!get_map_field_int(map_name, obj->btf, m, &sz)) + if (!get_map_field_int(map->name, obj->btf, m, &sz)) return -EINVAL; pr_debug("map '%s': found value_size = %u.\n", - map_name, sz); + map->name, sz); if (map->def.value_size && map->def.value_size != sz) { pr_warn("map '%s': conflicting value size %u != %u.\n", - map_name, map->def.value_size, sz); + map->name, map->def.value_size, sz); return -EINVAL; } map->def.value_size = sz; @@ -2068,25 +2013,25 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, t = btf__type_by_id(obj->btf, m->type); if (!t) { pr_warn("map '%s': value type [%d] not found.\n", - map_name, m->type); + map->name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': value spec is not PTR: %u.\n", - map_name, btf_kind(t)); + map->name, btf_kind(t)); return -EINVAL; } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", - map_name, t->type, (ssize_t)sz); + map->name, t->type, (ssize_t)sz); return sz; } pr_debug("map '%s': found value [%u], sz = %zd.\n", - map_name, t->type, (ssize_t)sz); + map->name, t->type, (ssize_t)sz); if (map->def.value_size && map->def.value_size != sz) { pr_warn("map '%s': conflicting value size %u != %zd.\n", - map_name, map->def.value_size, (ssize_t)sz); + map->name, map->def.value_size, (ssize_t)sz); return -EINVAL; } map->def.value_size = sz; @@ -2095,44 +2040,110 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, __u32 val; int err; - if (!get_map_field_int(map_name, obj->btf, m, &val)) + if (!get_map_field_int(map->name, obj->btf, m, &val)) return -EINVAL; pr_debug("map '%s': found pinning = %u.\n", - map_name, val); + map->name, val); if (val != LIBBPF_PIN_NONE && val != LIBBPF_PIN_BY_NAME) { pr_warn("map '%s': invalid pinning value %u.\n", - map_name, val); + map->name, val); return -EINVAL; } if (val == LIBBPF_PIN_BY_NAME) { err = build_map_pin_path(map, pin_root_path); if (err) { pr_warn("map '%s': couldn't build pin path.\n", - map_name); + map->name); return err; } } } else { if (strict) { pr_warn("map '%s': unknown field '%s'.\n", - map_name, name); + map->name, name); return -ENOTSUP; } pr_debug("map '%s': ignoring unknown field '%s'.\n", - map_name, name); + map->name, name); } } if (map->def.type == BPF_MAP_TYPE_UNSPEC) { - pr_warn("map '%s': map type isn't specified.\n", map_name); + pr_warn("map '%s': map type isn't specified.\n", map->name); return -EINVAL; } return 0; } +static int bpf_object__init_user_btf_map(struct bpf_object *obj, + const struct btf_type *sec, + int var_idx, int sec_idx, + const Elf_Data *data, bool strict, + const char *pin_root_path) +{ + const struct btf_type *var, *def; + const struct btf_var_secinfo *vi; + const struct btf_var *var_extra; + const char *map_name; + struct bpf_map *map; + + vi = btf_var_secinfos(sec) + var_idx; + var = btf__type_by_id(obj->btf, vi->type); + var_extra = btf_var(var); + map_name = btf__name_by_offset(obj->btf, var->name_off); + + if (map_name == NULL || map_name[0] == '\0') { + pr_warn("map #%d: empty name.\n", var_idx); + return -EINVAL; + } + if ((__u64)vi->offset + vi->size > data->d_size) { + pr_warn("map '%s' BTF data is corrupted.\n", map_name); + return -EINVAL; + } + if (!btf_is_var(var)) { + pr_warn("map '%s': unexpected var kind %u.\n", + map_name, btf_kind(var)); + return -EINVAL; + } + if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED && + var_extra->linkage != BTF_VAR_STATIC) { + pr_warn("map '%s': unsupported var linkage %u.\n", + map_name, var_extra->linkage); + return -EOPNOTSUPP; + } + + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (!btf_is_struct(def)) { + pr_warn("map '%s': unexpected def kind %u.\n", + map_name, btf_kind(var)); + return -EINVAL; + } + if (def->size > vi->size) { + pr_warn("map '%s': invalid def size.\n", map_name); + return -EINVAL; + } + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + map->name = strdup(map_name); + if (!map->name) { + pr_warn("map '%s': failed to alloc map name.\n", map_name); + return -ENOMEM; + } + map->libbpf_type = LIBBPF_MAP_UNSPEC; + map->def.type = BPF_MAP_TYPE_UNSPEC; + map->sec_idx = sec_idx; + map->sec_offset = vi->offset; + pr_debug("map '%s': at sec_idx %d, offset %zu.\n", + map_name, map->sec_idx, map->sec_offset); + + return parse_btf_map_def(obj, map, def, strict, pin_root_path); +} + static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, const char *pin_root_path) { -- cgit v1.2.3-59-g8ed1b From 2d39d7c56f115148b05d1d8c6b8698a5730c8b53 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:27:38 -0700 Subject: libbpf: Refactor map creation logic and fix cleanup leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Factor out map creation and destruction logic to simplify code and especially error handling. Also fix map FD leak in case of partially successful map creation during bpf_object load operation. Fixes: 57a00f41644f ("libbpf: Add auto-pinning of maps when loading BPF objects") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200429002739.48006-3-andriin@fb.com --- tools/lib/bpf/libbpf.c | 226 ++++++++++++++++++++++++++----------------------- 1 file changed, 121 insertions(+), 105 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7d10436d7b58..9c845cf4cfcf 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3493,107 +3493,111 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) return 0; } +static void bpf_map__destroy(struct bpf_map *map); + +static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) +{ + struct bpf_create_map_attr create_attr; + struct bpf_map_def *def = &map->def; + + memset(&create_attr, 0, sizeof(create_attr)); + + if (obj->caps.name) + create_attr.name = map->name; + create_attr.map_ifindex = map->map_ifindex; + create_attr.map_type = def->type; + create_attr.map_flags = def->map_flags; + create_attr.key_size = def->key_size; + create_attr.value_size = def->value_size; + + if (def->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && !def->max_entries) { + int nr_cpus; + + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + pr_warn("map '%s': failed to determine number of system CPUs: %d\n", + map->name, nr_cpus); + return nr_cpus; + } + pr_debug("map '%s': setting size to %d\n", map->name, nr_cpus); + create_attr.max_entries = nr_cpus; + } else { + create_attr.max_entries = def->max_entries; + } + + if (bpf_map__is_struct_ops(map)) + create_attr.btf_vmlinux_value_type_id = + map->btf_vmlinux_value_type_id; + + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + if (obj->btf && !bpf_map_find_btf_info(obj, map)) { + create_attr.btf_fd = btf__fd(obj->btf); + create_attr.btf_key_type_id = map->btf_key_type_id; + create_attr.btf_value_type_id = map->btf_value_type_id; + } + + map->fd = bpf_create_map_xattr(&create_attr); + if (map->fd < 0 && (create_attr.btf_key_type_id || + create_attr.btf_value_type_id)) { + char *cp, errmsg[STRERR_BUFSIZE]; + int err = -errno; + + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", + map->name, cp, err); + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; + map->fd = bpf_create_map_xattr(&create_attr); + } + + if (map->fd < 0) + return -errno; + + return 0; +} + static int bpf_object__create_maps(struct bpf_object *obj) { - struct bpf_create_map_attr create_attr = {}; - int nr_cpus = 0; - unsigned int i; + struct bpf_map *map; + char *cp, errmsg[STRERR_BUFSIZE]; + unsigned int i, j; int err; for (i = 0; i < obj->nr_maps; i++) { - struct bpf_map *map = &obj->maps[i]; - struct bpf_map_def *def = &map->def; - char *cp, errmsg[STRERR_BUFSIZE]; - int *pfd = &map->fd; + map = &obj->maps[i]; if (map->pin_path) { err = bpf_object__reuse_map(map); if (err) { - pr_warn("error reusing pinned map %s\n", + pr_warn("map '%s': error reusing pinned map\n", map->name); - return err; + goto err_out; } } if (map->fd >= 0) { - pr_debug("skip map create (preset) %s: fd=%d\n", + pr_debug("map '%s': skipping creation (preset fd=%d)\n", map->name, map->fd); continue; } - if (obj->caps.name) - create_attr.name = map->name; - create_attr.map_ifindex = map->map_ifindex; - create_attr.map_type = def->type; - create_attr.map_flags = def->map_flags; - create_attr.key_size = def->key_size; - create_attr.value_size = def->value_size; - if (def->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && - !def->max_entries) { - if (!nr_cpus) - nr_cpus = libbpf_num_possible_cpus(); - if (nr_cpus < 0) { - pr_warn("failed to determine number of system CPUs: %d\n", - nr_cpus); - err = nr_cpus; - goto err_out; - } - pr_debug("map '%s': setting size to %d\n", - map->name, nr_cpus); - create_attr.max_entries = nr_cpus; - } else { - create_attr.max_entries = def->max_entries; - } - create_attr.btf_fd = 0; - create_attr.btf_key_type_id = 0; - create_attr.btf_value_type_id = 0; - if (bpf_map_type__is_map_in_map(def->type) && - map->inner_map_fd >= 0) - create_attr.inner_map_fd = map->inner_map_fd; - if (bpf_map__is_struct_ops(map)) - create_attr.btf_vmlinux_value_type_id = - map->btf_vmlinux_value_type_id; - - if (obj->btf && !bpf_map_find_btf_info(obj, map)) { - create_attr.btf_fd = btf__fd(obj->btf); - create_attr.btf_key_type_id = map->btf_key_type_id; - create_attr.btf_value_type_id = map->btf_value_type_id; - } - - *pfd = bpf_create_map_xattr(&create_attr); - if (*pfd < 0 && (create_attr.btf_key_type_id || - create_attr.btf_value_type_id)) { - err = -errno; - cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); - pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", - map->name, cp, err); - create_attr.btf_fd = 0; - create_attr.btf_key_type_id = 0; - create_attr.btf_value_type_id = 0; - map->btf_key_type_id = 0; - map->btf_value_type_id = 0; - *pfd = bpf_create_map_xattr(&create_attr); - } - - if (*pfd < 0) { - size_t j; + err = bpf_object__create_map(obj, map); + if (err) + goto err_out; - err = -errno; -err_out: - cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); - pr_warn("failed to create map (name: '%s'): %s(%d)\n", - map->name, cp, err); - pr_perm_msg(err); - for (j = 0; j < i; j++) - zclose(obj->maps[j].fd); - return err; - } + pr_debug("map '%s': created successfully, fd=%d\n", map->name, + map->fd); if (bpf_map__is_internal(map)) { err = bpf_object__populate_internal_map(obj, map); if (err < 0) { - zclose(*pfd); + zclose(map->fd); goto err_out; } } @@ -3601,16 +3605,23 @@ err_out: if (map->pin_path && !map->pinned) { err = bpf_map__pin(map, NULL); if (err) { - pr_warn("failed to auto-pin map name '%s' at '%s'\n", - map->name, map->pin_path); - return err; + pr_warn("map '%s': failed to auto-pin at '%s': %d\n", + map->name, map->pin_path, err); + zclose(map->fd); + goto err_out; } } - - pr_debug("created map %s: fd=%d\n", map->name, *pfd); } return 0; + +err_out: + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("map '%s': failed to create: %s(%d)\n", map->name, cp, err); + pr_perm_msg(err); + for (j = 0; j < i; j++) + zclose(obj->maps[j].fd); + return err; } static int @@ -5966,6 +5977,32 @@ int bpf_object__pin(struct bpf_object *obj, const char *path) return 0; } +static void bpf_map__destroy(struct bpf_map *map) +{ + if (map->clear_priv) + map->clear_priv(map, map->priv); + map->priv = NULL; + map->clear_priv = NULL; + + if (map->mmaped) { + munmap(map->mmaped, bpf_map_mmap_sz(map)); + map->mmaped = NULL; + } + + if (map->st_ops) { + zfree(&map->st_ops->data); + zfree(&map->st_ops->progs); + zfree(&map->st_ops->kern_func_off); + zfree(&map->st_ops); + } + + zfree(&map->name); + zfree(&map->pin_path); + + if (map->fd >= 0) + zclose(map->fd); +} + void bpf_object__close(struct bpf_object *obj) { size_t i; @@ -5981,29 +6018,8 @@ void bpf_object__close(struct bpf_object *obj) btf__free(obj->btf); btf_ext__free(obj->btf_ext); - for (i = 0; i < obj->nr_maps; i++) { - struct bpf_map *map = &obj->maps[i]; - - if (map->clear_priv) - map->clear_priv(map, map->priv); - map->priv = NULL; - map->clear_priv = NULL; - - if (map->mmaped) { - munmap(map->mmaped, bpf_map_mmap_sz(map)); - map->mmaped = NULL; - } - - if (map->st_ops) { - zfree(&map->st_ops->data); - zfree(&map->st_ops->progs); - zfree(&map->st_ops->kern_func_off); - zfree(&map->st_ops); - } - - zfree(&map->name); - zfree(&map->pin_path); - } + for (i = 0; i < obj->nr_maps; i++) + bpf_map__destroy(&obj->maps[i]); zfree(&obj->kconfig); zfree(&obj->externs); -- cgit v1.2.3-59-g8ed1b From 646f02ffdd49c466cb81642c2b013beb80092d01 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:27:39 -0700 Subject: libbpf: Add BTF-defined map-in-map support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As discussed at LPC 2019 ([0]), this patch brings (a quite belated) support for declarative BTF-defined map-in-map support in libbpf. It allows to define ARRAY_OF_MAPS and HASH_OF_MAPS BPF maps without any user-space initialization code involved. Additionally, it allows to initialize outer map's slots with references to respective inner maps at load time, also completely declaratively. Despite a weak type system of C, the way BTF-defined map-in-map definition works, it's actually quite hard to accidentally initialize outer map with incompatible inner maps. This being C, of course, it's still possible, but even that would be caught at load time and error returned with helpful debug log pointing exactly to the slot that failed to be initialized. As an example, here's a rather advanced HASH_OF_MAPS declaration and initialization example, filling slots #0 and #4 with two inner maps: #include struct inner_map { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); __type(key, int); __type(value, int); } inner_map1 SEC(".maps"), inner_map2 SEC(".maps"); struct outer_hash { __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); __uint(max_entries, 5); __uint(key_size, sizeof(int)); __array(values, struct inner_map); } outer_hash SEC(".maps") = { .values = { [0] = &inner_map2, [4] = &inner_map1, }, }; Here's the relevant part of libbpf debug log showing pretty clearly of what's going on with map-in-map initialization: libbpf: .maps relo #0: for 6 value 0 rel.r_offset 96 name 260 ('inner_map1') libbpf: .maps relo #0: map 'outer_arr' slot [0] points to map 'inner_map1' libbpf: .maps relo #1: for 7 value 32 rel.r_offset 112 name 249 ('inner_map2') libbpf: .maps relo #1: map 'outer_arr' slot [2] points to map 'inner_map2' libbpf: .maps relo #2: for 7 value 32 rel.r_offset 144 name 249 ('inner_map2') libbpf: .maps relo #2: map 'outer_hash' slot [0] points to map 'inner_map2' libbpf: .maps relo #3: for 6 value 0 rel.r_offset 176 name 260 ('inner_map1') libbpf: .maps relo #3: map 'outer_hash' slot [4] points to map 'inner_map1' libbpf: map 'inner_map1': created successfully, fd=4 libbpf: map 'inner_map2': created successfully, fd=5 libbpf: map 'outer_hash': created successfully, fd=7 libbpf: map 'outer_hash': slot [0] set to map 'inner_map2' fd=5 libbpf: map 'outer_hash': slot [4] set to map 'inner_map1' fd=4 Notice from the log above that fd=6 (not logged explicitly) is used for inner "prototype" map, necessary for creation of outer map. It is destroyed immediately after outer map is created. See also included selftest with some extra comments explaining extra details of usage. Additionally, similar initialization syntax and libbpf functionality can be used to do initialization of BPF_PROG_ARRAY with references to BPF sub-programs. This can be done in follow up patches, if there will be a demand for this. [0] https://linuxplumbersconf.org/event/4/contributions/448/ Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200429002739.48006-4-andriin@fb.com --- tools/lib/bpf/bpf_helpers.h | 1 + tools/lib/bpf/libbpf.c | 281 +++++++++++++++++++-- .../selftests/bpf/prog_tests/btf_map_in_map.c | 49 ++++ .../selftests/bpf/progs/test_btf_map_in_map.c | 76 ++++++ 4 files changed, 384 insertions(+), 23 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c create mode 100644 tools/testing/selftests/bpf/progs/test_btf_map_in_map.c diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 60aad054eea1..da00b87aa199 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -12,6 +12,7 @@ #define __uint(name, val) int (*name)[val] #define __type(name, val) typeof(val) *name +#define __array(name, val) typeof(val) *name[] /* Helper macro to print out debug messages */ #define bpf_printk(fmt, ...) \ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9c845cf4cfcf..445ee903f9cd 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -310,6 +310,7 @@ struct bpf_map { int map_ifindex; int inner_map_fd; struct bpf_map_def def; + __u32 btf_var_idx; __u32 btf_key_type_id; __u32 btf_value_type_id; __u32 btf_vmlinux_value_type_id; @@ -318,6 +319,9 @@ struct bpf_map { enum libbpf_map_type libbpf_type; void *mmaped; struct bpf_struct_ops *st_ops; + struct bpf_map *inner_map; + void **init_slots; + int init_slots_sz; char *pin_path; bool pinned; bool reused; @@ -389,6 +393,7 @@ struct bpf_object { int nr_reloc_sects; int maps_shndx; int btf_maps_shndx; + __u32 btf_maps_sec_btf_id; int text_shndx; int symbols_shndx; int data_shndx; @@ -1918,7 +1923,7 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) static int parse_btf_map_def(struct bpf_object *obj, struct bpf_map *map, const struct btf_type *def, - bool strict, + bool strict, bool is_inner, const char *pin_root_path) { const struct btf_type *t; @@ -2036,10 +2041,79 @@ static int parse_btf_map_def(struct bpf_object *obj, } map->def.value_size = sz; map->btf_value_type_id = t->type; + } + else if (strcmp(name, "values") == 0) { + int err; + + if (is_inner) { + pr_warn("map '%s': multi-level inner maps not supported.\n", + map->name); + return -ENOTSUP; + } + if (i != vlen - 1) { + pr_warn("map '%s': '%s' member should be last.\n", + map->name, name); + return -EINVAL; + } + if (!bpf_map_type__is_map_in_map(map->def.type)) { + pr_warn("map '%s': should be map-in-map.\n", + map->name); + return -ENOTSUP; + } + if (map->def.value_size && map->def.value_size != 4) { + pr_warn("map '%s': conflicting value size %u != 4.\n", + map->name, map->def.value_size); + return -EINVAL; + } + map->def.value_size = 4; + t = btf__type_by_id(obj->btf, m->type); + if (!t) { + pr_warn("map '%s': map-in-map inner type [%d] not found.\n", + map->name, m->type); + return -EINVAL; + } + if (!btf_is_array(t) || btf_array(t)->nelems) { + pr_warn("map '%s': map-in-map inner spec is not a zero-sized array.\n", + map->name); + return -EINVAL; + } + t = skip_mods_and_typedefs(obj->btf, btf_array(t)->type, + NULL); + if (!btf_is_ptr(t)) { + pr_warn("map '%s': map-in-map inner def is of unexpected kind %u.\n", + map->name, btf_kind(t)); + return -EINVAL; + } + t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + if (!btf_is_struct(t)) { + pr_warn("map '%s': map-in-map inner def is of unexpected kind %u.\n", + map->name, btf_kind(t)); + return -EINVAL; + } + + map->inner_map = calloc(1, sizeof(*map->inner_map)); + if (!map->inner_map) + return -ENOMEM; + map->inner_map->sec_idx = obj->efile.btf_maps_shndx; + map->inner_map->name = malloc(strlen(map->name) + + sizeof(".inner") + 1); + if (!map->inner_map->name) + return -ENOMEM; + sprintf(map->inner_map->name, "%s.inner", map->name); + + err = parse_btf_map_def(obj, map->inner_map, t, strict, + true /* is_inner */, NULL); + if (err) + return err; } else if (strcmp(name, "pinning") == 0) { __u32 val; int err; + if (is_inner) { + pr_debug("map '%s': inner def can't be pinned.\n", + map->name); + return -EINVAL; + } if (!get_map_field_int(map->name, obj->btf, m, &val)) return -EINVAL; pr_debug("map '%s': found pinning = %u.\n", @@ -2138,10 +2212,11 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, map->def.type = BPF_MAP_TYPE_UNSPEC; map->sec_idx = sec_idx; map->sec_offset = vi->offset; + map->btf_var_idx = var_idx; pr_debug("map '%s': at sec_idx %d, offset %zu.\n", map_name, map->sec_idx, map->sec_offset); - return parse_btf_map_def(obj, map, def, strict, pin_root_path); + return parse_btf_map_def(obj, map, def, strict, false, pin_root_path); } static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, @@ -2174,6 +2249,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, name = btf__name_by_offset(obj->btf, t->name_off); if (strcmp(name, MAPS_ELF_SEC) == 0) { sec = t; + obj->efile.btf_maps_sec_btf_id = i; break; } } @@ -2560,7 +2636,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) /* Only do relo for section with exec instructions */ if (!section_have_execinstr(obj, sec) && - strcmp(name, ".rel" STRUCT_OPS_SEC)) { + strcmp(name, ".rel" STRUCT_OPS_SEC) && + strcmp(name, ".rel" MAPS_ELF_SEC)) { pr_debug("skip relo %s(%d) for section(%d)\n", name, idx, sec); continue; @@ -3538,6 +3615,22 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) create_attr.btf_value_type_id = map->btf_value_type_id; } + if (bpf_map_type__is_map_in_map(def->type)) { + if (map->inner_map) { + int err; + + err = bpf_object__create_map(obj, map->inner_map); + if (err) { + pr_warn("map '%s': failed to create inner map: %d\n", + map->name, err); + return err; + } + map->inner_map_fd = bpf_map__fd(map->inner_map); + } + if (map->inner_map_fd >= 0) + create_attr.inner_map_fd = map->inner_map_fd; + } + map->fd = bpf_create_map_xattr(&create_attr); if (map->fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { @@ -3558,6 +3651,11 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) if (map->fd < 0) return -errno; + if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + return 0; } @@ -3602,6 +3700,31 @@ bpf_object__create_maps(struct bpf_object *obj) } } + if (map->init_slots_sz) { + for (j = 0; j < map->init_slots_sz; j++) { + const struct bpf_map *targ_map; + int fd; + + if (!map->init_slots[j]) + continue; + + targ_map = map->init_slots[j]; + fd = bpf_map__fd(targ_map); + err = bpf_map_update_elem(map->fd, &j, &fd, 0); + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", + map->name, j, targ_map->name, + fd, err); + goto err_out; + } + pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", + map->name, j, targ_map->name, fd); + } + zfree(&map->init_slots); + map->init_slots_sz = 0; + } + if (map->pin_path && !map->pinned) { err = bpf_map__pin(map, NULL); if (err) { @@ -4873,9 +4996,118 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) return 0; } -static int bpf_object__collect_struct_ops_map_reloc(struct bpf_object *obj, - GElf_Shdr *shdr, - Elf_Data *data); +static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, + GElf_Shdr *shdr, Elf_Data *data); + +static int bpf_object__collect_map_relos(struct bpf_object *obj, + GElf_Shdr *shdr, Elf_Data *data) +{ + int i, j, nrels, new_sz, ptr_sz = sizeof(void *); + const struct btf_type *sec, *var, *def; + const struct btf_var_secinfo *vi; + const struct btf_member *member; + struct bpf_map *map, *targ_map; + const char *name, *mname; + Elf_Data *symbols; + unsigned int moff; + GElf_Sym sym; + GElf_Rel rel; + void *tmp; + + if (!obj->efile.btf_maps_sec_btf_id || !obj->btf) + return -EINVAL; + sec = btf__type_by_id(obj->btf, obj->efile.btf_maps_sec_btf_id); + if (!sec) + return -EINVAL; + + symbols = obj->efile.symbols; + nrels = shdr->sh_size / shdr->sh_entsize; + for (i = 0; i < nrels; i++) { + if (!gelf_getrel(data, i, &rel)) { + pr_warn(".maps relo #%d: failed to get ELF relo\n", i); + return -LIBBPF_ERRNO__FORMAT; + } + if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) { + pr_warn(".maps relo #%d: symbol %zx not found\n", + i, (size_t)GELF_R_SYM(rel.r_info)); + return -LIBBPF_ERRNO__FORMAT; + } + name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, + sym.st_name) ? : ""; + if (sym.st_shndx != obj->efile.btf_maps_shndx) { + pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + + pr_debug(".maps relo #%d: for %zd value %zd rel.r_offset %zu name %d ('%s')\n", + i, (ssize_t)(rel.r_info >> 32), (size_t)sym.st_value, + (size_t)rel.r_offset, sym.st_name, name); + + for (j = 0; j < obj->nr_maps; j++) { + map = &obj->maps[j]; + if (map->sec_idx != obj->efile.btf_maps_shndx) + continue; + + vi = btf_var_secinfos(sec) + map->btf_var_idx; + if (vi->offset <= rel.r_offset && + rel.r_offset + sizeof(void *) <= vi->offset + vi->size) + break; + } + if (j == obj->nr_maps) { + pr_warn(".maps relo #%d: cannot find map '%s' at rel.r_offset %zu\n", + i, name, (size_t)rel.r_offset); + return -EINVAL; + } + + if (!bpf_map_type__is_map_in_map(map->def.type)) + return -EINVAL; + if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS && + map->def.key_size != sizeof(int)) { + pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n", + i, map->name, sizeof(int)); + return -EINVAL; + } + + targ_map = bpf_object__find_map_by_name(obj, name); + if (!targ_map) + return -ESRCH; + + var = btf__type_by_id(obj->btf, vi->type); + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (btf_vlen(def) == 0) + return -EINVAL; + member = btf_members(def) + btf_vlen(def) - 1; + mname = btf__name_by_offset(obj->btf, member->name_off); + if (strcmp(mname, "values")) + return -EINVAL; + + moff = btf_member_bit_offset(def, btf_vlen(def) - 1) / 8; + if (rel.r_offset - vi->offset < moff) + return -EINVAL; + + moff = rel.r_offset - vi->offset - moff; + if (moff % ptr_sz) + return -EINVAL; + moff /= ptr_sz; + if (moff >= map->init_slots_sz) { + new_sz = moff + 1; + tmp = realloc(map->init_slots, new_sz * ptr_sz); + if (!tmp) + return -ENOMEM; + map->init_slots = tmp; + memset(map->init_slots + map->init_slots_sz, 0, + (new_sz - map->init_slots_sz) * ptr_sz); + map->init_slots_sz = new_sz; + } + map->init_slots[moff] = targ_map; + + pr_debug(".maps relo #%d: map '%s' slot [%d] points to map '%s'\n", + i, map->name, moff, name); + } + + return 0; +} static int bpf_object__collect_reloc(struct bpf_object *obj) { @@ -4898,21 +5130,17 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) } if (idx == obj->efile.st_ops_shndx) { - err = bpf_object__collect_struct_ops_map_reloc(obj, - shdr, - data); - if (err) - return err; - continue; - } - - prog = bpf_object__find_prog_by_idx(obj, idx); - if (!prog) { - pr_warn("relocation failed: no section(%d)\n", idx); - return -LIBBPF_ERRNO__RELOC; + err = bpf_object__collect_st_ops_relos(obj, shdr, data); + } else if (idx == obj->efile.btf_maps_shndx) { + err = bpf_object__collect_map_relos(obj, shdr, data); + } else { + prog = bpf_object__find_prog_by_idx(obj, idx); + if (!prog) { + pr_warn("relocation failed: no prog in section(%d)\n", idx); + return -LIBBPF_ERRNO__RELOC; + } + err = bpf_program__collect_reloc(prog, shdr, data, obj); } - - err = bpf_program__collect_reloc(prog, shdr, data, obj); if (err) return err; } @@ -5984,6 +6212,14 @@ static void bpf_map__destroy(struct bpf_map *map) map->priv = NULL; map->clear_priv = NULL; + if (map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + if (map->mmaped) { munmap(map->mmaped, bpf_map_mmap_sz(map)); map->mmaped = NULL; @@ -6543,9 +6779,8 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, } /* Collect the reloc from ELF and populate the st_ops->progs[] */ -static int bpf_object__collect_struct_ops_map_reloc(struct bpf_object *obj, - GElf_Shdr *shdr, - Elf_Data *data) +static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, + GElf_Shdr *shdr, Elf_Data *data) { const struct btf_member *member; struct bpf_struct_ops *st_ops; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c new file mode 100644 index 000000000000..f7ee8fa377ad --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include + +#include "test_btf_map_in_map.skel.h" + +void test_btf_map_in_map(void) +{ + int duration = 0, err, key = 0, val; + struct test_btf_map_in_map* skel; + + skel = test_btf_map_in_map__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) + return; + + err = test_btf_map_in_map__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* inner1 = input, inner2 = input + 1 */ + val = bpf_map__fd(skel->maps.inner_map1); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0); + val = bpf_map__fd(skel->maps.inner_map2); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0); + skel->bss->input = 1; + usleep(1); + + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val); + CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val); + CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2); + + /* inner1 = input + 1, inner2 = input */ + val = bpf_map__fd(skel->maps.inner_map2); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0); + val = bpf_map__fd(skel->maps.inner_map1); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0); + skel->bss->input = 3; + usleep(1); + + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val); + CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val); + CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3); + +cleanup: + test_btf_map_in_map__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c new file mode 100644 index 000000000000..e5093796be97 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#include +#include + +struct inner_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} inner_map1 SEC(".maps"), + inner_map2 SEC(".maps"); + +struct outer_arr { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 3); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + /* it's possible to use anonymous struct as inner map definition here */ + __array(values, struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + /* changing max_entries to 2 will fail during load + * due to incompatibility with inner_map definition */ + __uint(max_entries, 1); + __type(key, int); + __type(value, int); + }); +} outer_arr SEC(".maps") = { + /* (void *) cast is necessary because we didn't use `struct inner_map` + * in __inner(values, ...) + * Actually, a conscious effort is required to screw up initialization + * of inner map slots, which is a great thing! + */ + .values = { (void *)&inner_map1, 0, (void *)&inner_map2 }, +}; + +struct outer_hash { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 5); + __uint(key_size, sizeof(int)); + /* Here everything works flawlessly due to reuse of struct inner_map + * and compiler will complain at the attempt to use non-inner_map + * references below. This is great experience. + */ + __array(values, struct inner_map); +} outer_hash SEC(".maps") = { + .values = { + [0] = &inner_map2, + [4] = &inner_map1, + }, +}; + +int input = 0; + +SEC("raw_tp/sys_enter") +int handle__sys_enter(void *ctx) +{ + struct inner_map *inner_map; + int key = 0, val; + + inner_map = bpf_map_lookup_elem(&outer_arr, &key); + if (!inner_map) + return 1; + val = input; + bpf_map_update_elem(inner_map, &key, &val, 0); + + inner_map = bpf_map_lookup_elem(&outer_hash, &key); + if (!inner_map) + return 1; + val = input + 1; + bpf_map_update_elem(inner_map, &key, &val, 0); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3-59-g8ed1b From 76148faa161e7cfb2d7719f35b37d7db4f3f8596 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:01 -0700 Subject: selftests/bpf: Ensure test flavors use correct skeletons Ensure that test runner flavors include their own skeletons from / directory. Previously, skeletons generated for no-flavor test_progs were used. Apart from fixing correctness, this also makes it possible to compile only flavors individually: $ make clean && make test_progs-no_alu32 ... now succeeds ... Fixes: 74b5a5968fe8 ("selftests/bpf: Replace test_progs and test_maps w/ general rule") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-2-andriin@fb.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4e654d41c7af..01c95f8278c7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -324,7 +324,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_BPF_SKELS) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) - cd $$(@D) && $$(CC) $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ -- cgit v1.2.3-59-g8ed1b From 02995dd4bb02a5359a08e44abb3c18c2f456bd19 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:02 -0700 Subject: selftests/bpf: Add SAN_CFLAGS param to selftests build to allow sanitizers Add ability to specify extra compiler flags with SAN_CFLAGS for compilation of all user-space C files. This allows to build all of selftest programs with, e.g., custom sanitizer flags, without requiring support for such sanitizers from anyone compiling selftest/bpf. As an example, to compile everything with AddressSanitizer, one would do: $ make clean && make SAN_CFLAGS="-fsanitize=address" For AddressSanitizer to work, one needs appropriate libasan shared library installed in the system, with version of libasan matching what GCC links against. E.g., GCC8 needs libasan5, while GCC7 uses libasan4. For CentOS 7, to build everything successfully one would need to: $ sudo yum install devtoolset-8-gcc devtoolset-libasan-devel $ scl enable devtoolset-8 bash # set up environment For Arch Linux to run selftests, one would need to install gcc-libs package to get libasan.so.5: $ sudo pacman -S gcc-libs N.B. EXTRA_CFLAGS name wasn't used, because it's also used by libbpf's Makefile and this causes few issues: 1. default "-g -Wall" flags are overriden; 2. compiling shared library with AddressSanitizer generates a bunch of symbols like: "_GLOBAL__sub_D_00099_0_btf_dump.c", "_GLOBAL__sub_D_00099_0_bpf.c", etc, which screws up versioned symbols check. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: Julia Kartseva Link: https://lore.kernel.org/bpf/20200429012111.277390-3-andriin@fb.com --- tools/testing/selftests/bpf/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 01c95f8278c7..887f06a514ee 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -20,9 +20,10 @@ CLANG ?= clang LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) -CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) -I$(CURDIR) \ - -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \ - -I$(APIDIR) \ +SAN_CFLAGS ?= +CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \ + -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread -- cgit v1.2.3-59-g8ed1b From 42fce2cfb405e613f0355c4f92429d651bf0a5b3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:03 -0700 Subject: selftests/bpf: Convert test_hashmap into test_progs test Fold stand-alone test_hashmap test into test_progs. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-4-andriin@fb.com --- tools/testing/selftests/bpf/.gitignore | 2 - tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/prog_tests/hashmap.c | 380 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_hashmap.c | 382 ----------------------- 4 files changed, 381 insertions(+), 385 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/hashmap.c delete mode 100644 tools/testing/selftests/bpf/test_hashmap.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index c30079c86998..16b9774d8b68 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -30,8 +30,6 @@ test_tcpnotify_user test_libbpf test_tcp_check_syncookie_user test_sysctl -test_hashmap -test_btf_dump test_current_pid_tgid_new_ns xdping test_cpp diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 887f06a514ee..10f12a5aac20 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -33,7 +33,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ - test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \ + test_netcnt test_tcpnotify_user test_sock_fields test_sysctl \ test_progs-no_alu32 \ test_current_pid_tgid_new_ns diff --git a/tools/testing/selftests/bpf/prog_tests/hashmap.c b/tools/testing/selftests/bpf/prog_tests/hashmap.c new file mode 100644 index 000000000000..428d488830c6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/hashmap.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Tests for libbpf's hashmap. + * + * Copyright (c) 2019 Facebook + */ +#include "test_progs.h" +#include "bpf/hashmap.h" + +static int duration = 0; + +static size_t hash_fn(const void *k, void *ctx) +{ + return (long)k; +} + +static bool equal_fn(const void *a, const void *b, void *ctx) +{ + return (long)a == (long)b; +} + +static inline size_t next_pow_2(size_t n) +{ + size_t r = 1; + + while (r < n) + r <<= 1; + return r; +} + +static inline size_t exp_cap(size_t sz) +{ + size_t r = next_pow_2(sz); + + if (sz * 4 / 3 > r) + r <<= 1; + return r; +} + +#define ELEM_CNT 62 + +static void test_hashmap_generic(void) +{ + struct hashmap_entry *entry, *tmp; + int err, bkt, found_cnt, i; + long long found_msk; + struct hashmap *map; + + map = hashmap__new(hash_fn, equal_fn, NULL); + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + return; + + for (i = 0; i < ELEM_CNT; i++) { + const void *oldk, *k = (const void *)(long)i; + void *oldv, *v = (void *)(long)(1024 + i); + + err = hashmap__update(map, k, v, &oldk, &oldv); + if (CHECK(err != -ENOENT, "hashmap__update", + "unexpected result: %d\n", err)) + goto cleanup; + + if (i % 2) { + err = hashmap__add(map, k, v); + } else { + err = hashmap__set(map, k, v, &oldk, &oldv); + if (CHECK(oldk != NULL || oldv != NULL, "check_kv", + "unexpected k/v: %p=%p\n", oldk, oldv)) + goto cleanup; + } + + if (CHECK(err, "elem_add", "failed to add k/v %ld = %ld: %d\n", + (long)k, (long)v, err)) + goto cleanup; + + if (CHECK(!hashmap__find(map, k, &oldv), "elem_find", + "failed to find key %ld\n", (long)k)) + goto cleanup; + if (CHECK(oldv != v, "elem_val", + "found value is wrong: %ld\n", (long)oldv)) + goto cleanup; + } + + if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size", + "invalid map size: %zu\n", hashmap__size(map))) + goto cleanup; + if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap_cap", + "unexpected map capacity: %zu\n", hashmap__capacity(map))) + goto cleanup; + + found_msk = 0; + hashmap__for_each_entry(map, entry, bkt) { + long k = (long)entry->key; + long v = (long)entry->value; + + found_msk |= 1ULL << k; + if (CHECK(v - k != 1024, "check_kv", + "invalid k/v pair: %ld = %ld\n", k, v)) + goto cleanup; + } + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt", + "not all keys iterated: %llx\n", found_msk)) + goto cleanup; + + for (i = 0; i < ELEM_CNT; i++) { + const void *oldk, *k = (const void *)(long)i; + void *oldv, *v = (void *)(long)(256 + i); + + err = hashmap__add(map, k, v); + if (CHECK(err != -EEXIST, "hashmap__add", + "unexpected add result: %d\n", err)) + goto cleanup; + + if (i % 2) + err = hashmap__update(map, k, v, &oldk, &oldv); + else + err = hashmap__set(map, k, v, &oldk, &oldv); + + if (CHECK(err, "elem_upd", + "failed to update k/v %ld = %ld: %d\n", + (long)k, (long)v, err)) + goto cleanup; + if (CHECK(!hashmap__find(map, k, &oldv), "elem_find", + "failed to find key %ld\n", (long)k)) + goto cleanup; + if (CHECK(oldv != v, "elem_val", + "found value is wrong: %ld\n", (long)oldv)) + goto cleanup; + } + + if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size", + "invalid updated map size: %zu\n", hashmap__size(map))) + goto cleanup; + if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap__capacity", + "unexpected map capacity: %zu\n", hashmap__capacity(map))) + goto cleanup; + + found_msk = 0; + hashmap__for_each_entry_safe(map, entry, tmp, bkt) { + long k = (long)entry->key; + long v = (long)entry->value; + + found_msk |= 1ULL << k; + if (CHECK(v - k != 256, "elem_check", + "invalid updated k/v pair: %ld = %ld\n", k, v)) + goto cleanup; + } + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt", + "not all keys iterated after update: %llx\n", found_msk)) + goto cleanup; + + found_cnt = 0; + hashmap__for_each_key_entry(map, entry, (void *)0) { + found_cnt++; + } + if (CHECK(!found_cnt, "found_cnt", + "didn't find any entries for key 0\n")) + goto cleanup; + + found_msk = 0; + found_cnt = 0; + hashmap__for_each_key_entry_safe(map, entry, tmp, (void *)0) { + const void *oldk, *k; + void *oldv, *v; + + k = entry->key; + v = entry->value; + + found_cnt++; + found_msk |= 1ULL << (long)k; + + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del", + "failed to delete k/v %ld = %ld\n", + (long)k, (long)v)) + goto cleanup; + if (CHECK(oldk != k || oldv != v, "check_old", + "invalid deleted k/v: expected %ld = %ld, got %ld = %ld\n", + (long)k, (long)v, (long)oldk, (long)oldv)) + goto cleanup; + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del", + "unexpectedly deleted k/v %ld = %ld\n", + (long)oldk, (long)oldv)) + goto cleanup; + } + + if (CHECK(!found_cnt || !found_msk, "found_entries", + "didn't delete any key entries\n")) + goto cleanup; + if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, "elem_cnt", + "invalid updated map size (already deleted: %d): %zu\n", + found_cnt, hashmap__size(map))) + goto cleanup; + if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap__capacity", + "unexpected map capacity: %zu\n", hashmap__capacity(map))) + goto cleanup; + + hashmap__for_each_entry_safe(map, entry, tmp, bkt) { + const void *oldk, *k; + void *oldv, *v; + + k = entry->key; + v = entry->value; + + found_cnt++; + found_msk |= 1ULL << (long)k; + + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del", + "failed to delete k/v %ld = %ld\n", + (long)k, (long)v)) + goto cleanup; + if (CHECK(oldk != k || oldv != v, "elem_check", + "invalid old k/v: expect %ld = %ld, got %ld = %ld\n", + (long)k, (long)v, (long)oldk, (long)oldv)) + goto cleanup; + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del", + "unexpectedly deleted k/v %ld = %ld\n", + (long)k, (long)v)) + goto cleanup; + } + + if (CHECK(found_cnt != ELEM_CNT || found_msk != (1ULL << ELEM_CNT) - 1, + "found_cnt", + "not all keys were deleted: found_cnt:%d, found_msk:%llx\n", + found_cnt, found_msk)) + goto cleanup; + if (CHECK(hashmap__size(map) != 0, "hashmap__size", + "invalid updated map size (already deleted: %d): %zu\n", + found_cnt, hashmap__size(map))) + goto cleanup; + + found_cnt = 0; + hashmap__for_each_entry(map, entry, bkt) { + CHECK(false, "elem_exists", + "unexpected map entries left: %ld = %ld\n", + (long)entry->key, (long)entry->value); + goto cleanup; + } + + hashmap__clear(map); + hashmap__for_each_entry(map, entry, bkt) { + CHECK(false, "elem_exists", + "unexpected map entries left: %ld = %ld\n", + (long)entry->key, (long)entry->value); + goto cleanup; + } + +cleanup: + hashmap__free(map); +} + +static size_t collision_hash_fn(const void *k, void *ctx) +{ + return 0; +} + +static void test_hashmap_multimap(void) +{ + void *k1 = (void *)0, *k2 = (void *)1; + struct hashmap_entry *entry; + struct hashmap *map; + long found_msk; + int err, bkt; + + /* force collisions */ + map = hashmap__new(collision_hash_fn, equal_fn, NULL); + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + return; + + /* set up multimap: + * [0] -> 1, 2, 4; + * [1] -> 8, 16, 32; + */ + err = hashmap__append(map, k1, (void *)1); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + err = hashmap__append(map, k1, (void *)2); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + err = hashmap__append(map, k1, (void *)4); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + + err = hashmap__append(map, k2, (void *)8); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + err = hashmap__append(map, k2, (void *)16); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + err = hashmap__append(map, k2, (void *)32); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + + if (CHECK(hashmap__size(map) != 6, "hashmap_size", + "invalid map size: %zu\n", hashmap__size(map))) + goto cleanup; + + /* verify global iteration still works and sees all values */ + found_msk = 0; + hashmap__for_each_entry(map, entry, bkt) { + found_msk |= (long)entry->value; + } + if (CHECK(found_msk != (1 << 6) - 1, "found_msk", + "not all keys iterated: %lx\n", found_msk)) + goto cleanup; + + /* iterate values for key 1 */ + found_msk = 0; + hashmap__for_each_key_entry(map, entry, k1) { + found_msk |= (long)entry->value; + } + if (CHECK(found_msk != (1 | 2 | 4), "found_msk", + "invalid k1 values: %lx\n", found_msk)) + goto cleanup; + + /* iterate values for key 2 */ + found_msk = 0; + hashmap__for_each_key_entry(map, entry, k2) { + found_msk |= (long)entry->value; + } + if (CHECK(found_msk != (8 | 16 | 32), "found_msk", + "invalid k2 values: %lx\n", found_msk)) + goto cleanup; + +cleanup: + hashmap__free(map); +} + +static void test_hashmap_empty() +{ + struct hashmap_entry *entry; + int bkt; + struct hashmap *map; + void *k = (void *)0; + + /* force collisions */ + map = hashmap__new(hash_fn, equal_fn, NULL); + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + goto cleanup; + + if (CHECK(hashmap__size(map) != 0, "hashmap__size", + "invalid map size: %zu\n", hashmap__size(map))) + goto cleanup; + if (CHECK(hashmap__capacity(map) != 0, "hashmap__capacity", + "invalid map capacity: %zu\n", hashmap__capacity(map))) + goto cleanup; + if (CHECK(hashmap__find(map, k, NULL), "elem_find", + "unexpected find\n")) + goto cleanup; + if (CHECK(hashmap__delete(map, k, NULL, NULL), "elem_del", + "unexpected delete\n")) + goto cleanup; + + hashmap__for_each_entry(map, entry, bkt) { + CHECK(false, "elem_found", "unexpected iterated entry\n"); + goto cleanup; + } + hashmap__for_each_key_entry(map, entry, k) { + CHECK(false, "key_found", "unexpected key entry\n"); + goto cleanup; + } + +cleanup: + hashmap__free(map); +} + +void test_hashmap() +{ + if (test__start_subtest("generic")) + test_hashmap_generic(); + if (test__start_subtest("multimap")) + test_hashmap_multimap(); + if (test__start_subtest("empty")) + test_hashmap_empty(); +} diff --git a/tools/testing/selftests/bpf/test_hashmap.c b/tools/testing/selftests/bpf/test_hashmap.c deleted file mode 100644 index c490e012c23f..000000000000 --- a/tools/testing/selftests/bpf/test_hashmap.c +++ /dev/null @@ -1,382 +0,0 @@ -// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) - -/* - * Tests for libbpf's hashmap. - * - * Copyright (c) 2019 Facebook - */ -#include -#include -#include -#include "bpf/hashmap.h" - -#define CHECK(condition, format...) ({ \ - int __ret = !!(condition); \ - if (__ret) { \ - fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \ - fprintf(stderr, format); \ - } \ - __ret; \ -}) - -size_t hash_fn(const void *k, void *ctx) -{ - return (long)k; -} - -bool equal_fn(const void *a, const void *b, void *ctx) -{ - return (long)a == (long)b; -} - -static inline size_t next_pow_2(size_t n) -{ - size_t r = 1; - - while (r < n) - r <<= 1; - return r; -} - -static inline size_t exp_cap(size_t sz) -{ - size_t r = next_pow_2(sz); - - if (sz * 4 / 3 > r) - r <<= 1; - return r; -} - -#define ELEM_CNT 62 - -int test_hashmap_generic(void) -{ - struct hashmap_entry *entry, *tmp; - int err, bkt, found_cnt, i; - long long found_msk; - struct hashmap *map; - - fprintf(stderr, "%s: ", __func__); - - map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; - - for (i = 0; i < ELEM_CNT; i++) { - const void *oldk, *k = (const void *)(long)i; - void *oldv, *v = (void *)(long)(1024 + i); - - err = hashmap__update(map, k, v, &oldk, &oldv); - if (CHECK(err != -ENOENT, "unexpected result: %d\n", err)) - return 1; - - if (i % 2) { - err = hashmap__add(map, k, v); - } else { - err = hashmap__set(map, k, v, &oldk, &oldv); - if (CHECK(oldk != NULL || oldv != NULL, - "unexpected k/v: %p=%p\n", oldk, oldv)) - return 1; - } - - if (CHECK(err, "failed to add k/v %ld = %ld: %d\n", - (long)k, (long)v, err)) - return 1; - - if (CHECK(!hashmap__find(map, k, &oldv), - "failed to find key %ld\n", (long)k)) - return 1; - if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) - return 1; - } - - if (CHECK(hashmap__size(map) != ELEM_CNT, - "invalid map size: %zu\n", hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), - "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; - - found_msk = 0; - hashmap__for_each_entry(map, entry, bkt) { - long k = (long)entry->key; - long v = (long)entry->value; - - found_msk |= 1ULL << k; - if (CHECK(v - k != 1024, "invalid k/v pair: %ld = %ld\n", k, v)) - return 1; - } - if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, - "not all keys iterated: %llx\n", found_msk)) - return 1; - - for (i = 0; i < ELEM_CNT; i++) { - const void *oldk, *k = (const void *)(long)i; - void *oldv, *v = (void *)(long)(256 + i); - - err = hashmap__add(map, k, v); - if (CHECK(err != -EEXIST, "unexpected add result: %d\n", err)) - return 1; - - if (i % 2) - err = hashmap__update(map, k, v, &oldk, &oldv); - else - err = hashmap__set(map, k, v, &oldk, &oldv); - - if (CHECK(err, "failed to update k/v %ld = %ld: %d\n", - (long)k, (long)v, err)) - return 1; - if (CHECK(!hashmap__find(map, k, &oldv), - "failed to find key %ld\n", (long)k)) - return 1; - if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) - return 1; - } - - if (CHECK(hashmap__size(map) != ELEM_CNT, - "invalid updated map size: %zu\n", hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), - "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; - - found_msk = 0; - hashmap__for_each_entry_safe(map, entry, tmp, bkt) { - long k = (long)entry->key; - long v = (long)entry->value; - - found_msk |= 1ULL << k; - if (CHECK(v - k != 256, - "invalid updated k/v pair: %ld = %ld\n", k, v)) - return 1; - } - if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, - "not all keys iterated after update: %llx\n", found_msk)) - return 1; - - found_cnt = 0; - hashmap__for_each_key_entry(map, entry, (void *)0) { - found_cnt++; - } - if (CHECK(!found_cnt, "didn't find any entries for key 0\n")) - return 1; - - found_msk = 0; - found_cnt = 0; - hashmap__for_each_key_entry_safe(map, entry, tmp, (void *)0) { - const void *oldk, *k; - void *oldv, *v; - - k = entry->key; - v = entry->value; - - found_cnt++; - found_msk |= 1ULL << (long)k; - - if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), - "failed to delete k/v %ld = %ld\n", - (long)k, (long)v)) - return 1; - if (CHECK(oldk != k || oldv != v, - "invalid deleted k/v: expected %ld = %ld, got %ld = %ld\n", - (long)k, (long)v, (long)oldk, (long)oldv)) - return 1; - if (CHECK(hashmap__delete(map, k, &oldk, &oldv), - "unexpectedly deleted k/v %ld = %ld\n", - (long)oldk, (long)oldv)) - return 1; - } - - if (CHECK(!found_cnt || !found_msk, - "didn't delete any key entries\n")) - return 1; - if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, - "invalid updated map size (already deleted: %d): %zu\n", - found_cnt, hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), - "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; - - hashmap__for_each_entry_safe(map, entry, tmp, bkt) { - const void *oldk, *k; - void *oldv, *v; - - k = entry->key; - v = entry->value; - - found_cnt++; - found_msk |= 1ULL << (long)k; - - if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), - "failed to delete k/v %ld = %ld\n", - (long)k, (long)v)) - return 1; - if (CHECK(oldk != k || oldv != v, - "invalid old k/v: expect %ld = %ld, got %ld = %ld\n", - (long)k, (long)v, (long)oldk, (long)oldv)) - return 1; - if (CHECK(hashmap__delete(map, k, &oldk, &oldv), - "unexpectedly deleted k/v %ld = %ld\n", - (long)k, (long)v)) - return 1; - } - - if (CHECK(found_cnt != ELEM_CNT || found_msk != (1ULL << ELEM_CNT) - 1, - "not all keys were deleted: found_cnt:%d, found_msk:%llx\n", - found_cnt, found_msk)) - return 1; - if (CHECK(hashmap__size(map) != 0, - "invalid updated map size (already deleted: %d): %zu\n", - found_cnt, hashmap__size(map))) - return 1; - - found_cnt = 0; - hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected map entries left: %ld = %ld\n", - (long)entry->key, (long)entry->value); - return 1; - } - - hashmap__free(map); - hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected map entries left: %ld = %ld\n", - (long)entry->key, (long)entry->value); - return 1; - } - - fprintf(stderr, "OK\n"); - return 0; -} - -size_t collision_hash_fn(const void *k, void *ctx) -{ - return 0; -} - -int test_hashmap_multimap(void) -{ - void *k1 = (void *)0, *k2 = (void *)1; - struct hashmap_entry *entry; - struct hashmap *map; - long found_msk; - int err, bkt; - - fprintf(stderr, "%s: ", __func__); - - /* force collisions */ - map = hashmap__new(collision_hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; - - - /* set up multimap: - * [0] -> 1, 2, 4; - * [1] -> 8, 16, 32; - */ - err = hashmap__append(map, k1, (void *)1); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - err = hashmap__append(map, k1, (void *)2); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - err = hashmap__append(map, k1, (void *)4); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - - err = hashmap__append(map, k2, (void *)8); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - err = hashmap__append(map, k2, (void *)16); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - err = hashmap__append(map, k2, (void *)32); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - - if (CHECK(hashmap__size(map) != 6, - "invalid map size: %zu\n", hashmap__size(map))) - return 1; - - /* verify global iteration still works and sees all values */ - found_msk = 0; - hashmap__for_each_entry(map, entry, bkt) { - found_msk |= (long)entry->value; - } - if (CHECK(found_msk != (1 << 6) - 1, - "not all keys iterated: %lx\n", found_msk)) - return 1; - - /* iterate values for key 1 */ - found_msk = 0; - hashmap__for_each_key_entry(map, entry, k1) { - found_msk |= (long)entry->value; - } - if (CHECK(found_msk != (1 | 2 | 4), - "invalid k1 values: %lx\n", found_msk)) - return 1; - - /* iterate values for key 2 */ - found_msk = 0; - hashmap__for_each_key_entry(map, entry, k2) { - found_msk |= (long)entry->value; - } - if (CHECK(found_msk != (8 | 16 | 32), - "invalid k2 values: %lx\n", found_msk)) - return 1; - - fprintf(stderr, "OK\n"); - return 0; -} - -int test_hashmap_empty() -{ - struct hashmap_entry *entry; - int bkt; - struct hashmap *map; - void *k = (void *)0; - - fprintf(stderr, "%s: ", __func__); - - /* force collisions */ - map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; - - if (CHECK(hashmap__size(map) != 0, - "invalid map size: %zu\n", hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != 0, - "invalid map capacity: %zu\n", hashmap__capacity(map))) - return 1; - if (CHECK(hashmap__find(map, k, NULL), "unexpected find\n")) - return 1; - if (CHECK(hashmap__delete(map, k, NULL, NULL), "unexpected delete\n")) - return 1; - - hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected iterated entry\n"); - return 1; - } - hashmap__for_each_key_entry(map, entry, k) { - CHECK(false, "unexpected key entry\n"); - return 1; - } - - fprintf(stderr, "OK\n"); - return 0; -} - -int main(int argc, char **argv) -{ - bool failed = false; - - if (test_hashmap_generic()) - failed = true; - if (test_hashmap_multimap()) - failed = true; - if (test_hashmap_empty()) - failed = true; - - return failed; -} -- cgit v1.2.3-59-g8ed1b From 229bf8bf4d910510bc1a2fd0b89bd467cd71050d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:04 -0700 Subject: libbpf: Fix memory leak and possible double-free in hashmap__clear Fix memory leak in hashmap_clear() not freeing hashmap_entry structs for each of the remaining entries. Also NULL-out bucket list to prevent possible double-free between hashmap__clear() and hashmap__free(). Running test_progs-asan flavor clearly showed this problem. Reported-by: Alston Tang Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-5-andriin@fb.com --- tools/lib/bpf/hashmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/lib/bpf/hashmap.c b/tools/lib/bpf/hashmap.c index 54c30c802070..cffb96202e0d 100644 --- a/tools/lib/bpf/hashmap.c +++ b/tools/lib/bpf/hashmap.c @@ -59,7 +59,14 @@ struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, void hashmap__clear(struct hashmap *map) { + struct hashmap_entry *cur, *tmp; + int bkt; + + hashmap__for_each_entry_safe(map, cur, tmp, bkt) { + free(cur); + } free(map->buckets); + map->buckets = NULL; map->cap = map->cap_bits = map->sz = 0; } -- cgit v1.2.3-59-g8ed1b From f25d5416d64c796aa639136eb0b076c8bd579b54 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:05 -0700 Subject: selftests/bpf: Fix memory leak in test selector Free test selector substrings, which were strdup()'ed. Fixes: b65053cd94f4 ("selftests/bpf: Add whitelist/blacklist of test names to test_progs") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-6-andriin@fb.com --- tools/testing/selftests/bpf/test_progs.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index b521e0a512b6..86d0020c9eec 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -420,6 +420,18 @@ static int libbpf_print_fn(enum libbpf_print_level level, return 0; } +static void free_str_set(const struct str_set *set) +{ + int i; + + if (!set) + return; + + for (i = 0; i < set->cnt; i++) + free((void *)set->strs[i]); + free(set->strs); +} + static int parse_str_list(const char *s, struct str_set *set) { char *input, *state = NULL, *next, **tmp, **strs = NULL; @@ -756,11 +768,11 @@ int main(int argc, char **argv) fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); - free(env.test_selector.blacklist.strs); - free(env.test_selector.whitelist.strs); + free_str_set(&env.test_selector.blacklist); + free_str_set(&env.test_selector.whitelist); free(env.test_selector.num_set); - free(env.subtest_selector.blacklist.strs); - free(env.subtest_selector.whitelist.strs); + free_str_set(&env.subtest_selector.blacklist); + free_str_set(&env.subtest_selector.whitelist); free(env.subtest_selector.num_set); return env.fail_cnt ? EXIT_FAILURE : EXIT_SUCCESS; -- cgit v1.2.3-59-g8ed1b From 9f56bb531a809ecaa7f0ddca61d2cf3adc1cb81a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:06 -0700 Subject: selftests/bpf: Fix memory leak in extract_build_id() getline() allocates string, which has to be freed. Fixes: 81f77fd0deeb ("bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: Song Liu Link: https://lore.kernel.org/bpf/20200429012111.277390-7-andriin@fb.com --- tools/testing/selftests/bpf/test_progs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 86d0020c9eec..93970ec1c9e9 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -351,6 +351,7 @@ int extract_build_id(char *build_id, size_t size) len = size; memcpy(build_id, line, len); build_id[len] = '\0'; + free(line); return 0; err: fclose(fp); -- cgit v1.2.3-59-g8ed1b From 13c908495e5d51718a6da84ae925fa2aac056380 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:07 -0700 Subject: selftests/bpf: Fix invalid memory reads in core_relo selftest Another one found by AddressSanitizer. input_len is bigger than actually initialized data size. Fixes: c7566a69695c ("selftests/bpf: Add field existence CO-RE relocs tests") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-8-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 31e177adbdf1..084ed26a7d78 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -392,7 +392,7 @@ static struct core_reloc_test_case test_cases[] = { .input = STRUCT_TO_CHAR_PTR(core_reloc_existence___minimal) { .a = 42, }, - .input_len = sizeof(struct core_reloc_existence), + .input_len = sizeof(struct core_reloc_existence___minimal), .output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) { .a_exists = 1, .b_exists = 0, -- cgit v1.2.3-59-g8ed1b From 3521ffa2ee9a48c3236c93f54ae11c074490ebce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:08 -0700 Subject: libbpf: Fix huge memory leak in libbpf_find_vmlinux_btf_id() BTF object wasn't freed. Fixes: a6ed02cac690 ("libbpf: Load btf_vmlinux only once per object.") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: KP Singh Link: https://lore.kernel.org/bpf/20200429012111.277390-9-andriin@fb.com --- tools/lib/bpf/libbpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 445ee903f9cd..d86ff8214b96 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6934,6 +6934,7 @@ int libbpf_find_vmlinux_btf_id(const char *name, enum bpf_attach_type attach_type) { struct btf *btf; + int err; btf = libbpf_find_kernel_btf(); if (IS_ERR(btf)) { @@ -6941,7 +6942,9 @@ int libbpf_find_vmlinux_btf_id(const char *name, return -EINVAL; } - return __find_vmlinux_btf_id(btf, name, attach_type); + err = __find_vmlinux_btf_id(btf, name, attach_type); + btf__free(btf); + return err; } static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) -- cgit v1.2.3-59-g8ed1b From 36d0b6159f6a6f51f600bf1777702f7036fb9839 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:09 -0700 Subject: selftests/bpf: Disable ASAN instrumentation for mmap()'ed memory read AddressSanitizer assumes that all memory dereferences are done against memory allocated by sanitizer's malloc()/free() code and not touched by anyone else. Seems like this doesn't hold for perf buffer memory. Disable instrumentation on perf buffer callback function. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-10-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/perf_buffer.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index 1450ea2dd4cc..a122ce3b360e 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -6,6 +6,11 @@ #include #include "bpf/libbpf_internal.h" +/* AddressSanitizer sometimes crashes due to data dereference below, due to + * this being mmap()'ed memory. Disable instrumentation with + * no_sanitize_address attribute + */ +__attribute__((no_sanitize_address)) static void on_sample(void *ctx, int cpu, void *data, __u32 size) { int cpu_data = *(int *)data, duration = 0; -- cgit v1.2.3-59-g8ed1b From 8d30e80a049ad699264e4a12911e349f93c7279a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:10 -0700 Subject: selftests/bpf: Fix bpf_link leak in ns_current_pid_tgid selftest If condition is inverted, but it's also just not necessary. Fixes: 1c1052e0140a ("tools/testing/selftests/bpf: Add self-tests for new helper bpf_get_ns_current_pid_tgid.") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: Carlos Neira Link: https://lore.kernel.org/bpf/20200429012111.277390-11-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 542240e16564..e74dc501b27f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -80,9 +80,6 @@ void test_ns_current_pid_tgid(void) "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid)) goto cleanup; cleanup: - if (!link) { - bpf_link__destroy(link); - link = NULL; - } + bpf_link__destroy(link); bpf_object__close(obj); } -- cgit v1.2.3-59-g8ed1b From e4e8f4d047fdcf7ac7d944e266e85d8041f16cd6 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:11 -0700 Subject: selftests/bpf: Add runqslower binary to .gitignore With recent changes, runqslower is being copied into selftests/bpf root directory. So add it into .gitignore. Fixes: b26d1e2b6028 ("selftests/bpf: Copy runqslower to OUTPUT directory") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: Veronika Kabatova Link: https://lore.kernel.org/bpf/20200429012111.277390-12-andriin@fb.com --- tools/testing/selftests/bpf/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 16b9774d8b68..3ff031972975 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -37,4 +37,4 @@ test_cpp /no_alu32 /bpf_gcc /tools - +/runqslower -- cgit v1.2.3-59-g8ed1b From 2e410da6a098a9ff25d22a56ecb724b3c36fd528 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 28 Apr 2020 10:14:02 +0200 Subject: staging: rtl8723bs: remove mgmt_frame_register method This was changed in cfg80211, so having it broke things, but there's no need to adjust since it's an empty implementation. Just remove it. Fixes: 6cd536fe62ef ("cfg80211: change internal management frame registration API") Signed-off-by: Johannes Berg Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20200428101400.ae19d651ec38.Ieb15844bb5ab93b3d7931d6561f42e3316ef8251@changeid Signed-off-by: Johannes Berg --- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 24 ----------------------- 1 file changed, 24 deletions(-) diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index 1ba85a43f05a..cd31ad2b8a7b 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -3163,29 +3163,6 @@ exit: return ret; } -static void cfg80211_rtw_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) -{ - struct net_device *ndev = wdev_to_ndev(wdev); - struct adapter *adapter; - - if (ndev == NULL) - goto exit; - - adapter = (struct adapter *)rtw_netdev_priv(ndev); - -#ifdef DEBUG_CFG80211 - DBG_871X(FUNC_ADPT_FMT" frame_type:%x, reg:%d\n", FUNC_ADPT_ARG(adapter), - frame_type, reg); -#endif - - if (frame_type != (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ)) - return; -exit: - return; -} - #if defined(CONFIG_PNO_SUPPORT) static int cfg80211_rtw_sched_scan_start(struct wiphy *wiphy, struct net_device *dev, @@ -3397,7 +3374,6 @@ static struct cfg80211_ops rtw_cfg80211_ops = { .change_bss = cfg80211_rtw_change_bss, .mgmt_tx = cfg80211_rtw_mgmt_tx, - .mgmt_frame_register = cfg80211_rtw_mgmt_frame_register, #if defined(CONFIG_PNO_SUPPORT) .sched_scan_start = cfg80211_rtw_sched_scan_start, -- cgit v1.2.3-59-g8ed1b From d530b9864073a714a6b6dcddee77c9b24074071f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 28 Apr 2020 10:14:03 +0200 Subject: staging: wilc1000: adjust for management frame register API changes Adjust to the API changes in cfg80211 for management frame registration. Fixes: 6cd536fe62ef ("cfg80211: change internal management frame registration API") Signed-off-by: Johannes Berg Reviewed-by: Sergey Matyukevich Acked-by: Greg Kroah-Hartman Acked-by: Ajay Singh Link: https://lore.kernel.org/r/20200428101400.bac7e94c2bf8.I6a2287b9f68f35aff5f6de409c5ffa388de760e2@changeid Signed-off-by: Johannes Berg --- drivers/staging/wilc1000/cfg80211.c | 36 +++++++++++++++++------------------- drivers/staging/wilc1000/cfg80211.h | 5 +++-- drivers/staging/wilc1000/netdev.c | 21 +++++++++------------ drivers/staging/wilc1000/netdev.h | 9 +-------- 4 files changed, 30 insertions(+), 41 deletions(-) diff --git a/drivers/staging/wilc1000/cfg80211.c b/drivers/staging/wilc1000/cfg80211.c index 4bdcbc5fd2fd..b6065a0d660f 100644 --- a/drivers/staging/wilc1000/cfg80211.c +++ b/drivers/staging/wilc1000/cfg80211.c @@ -1217,33 +1217,31 @@ static int mgmt_tx_cancel_wait(struct wiphy *wiphy, return 0; } -void wilc_mgmt_frame_register(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg) +void wilc_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct wilc *wl = wiphy_priv(wiphy); struct wilc_vif *vif = netdev_priv(wdev->netdev); + u32 presp_bit = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); + u32 action_bit = BIT(IEEE80211_STYPE_ACTION >> 4); - if (!frame_type) - return; + if (wl->initialized) { + bool prev = vif->mgmt_reg_stypes & presp_bit; + bool now = upd->interface_stypes & presp_bit; - switch (frame_type) { - case IEEE80211_STYPE_PROBE_REQ: - vif->frame_reg[0].type = frame_type; - vif->frame_reg[0].reg = reg; - break; + if (now != prev) + wilc_frame_register(vif, IEEE80211_STYPE_PROBE_REQ, now); - case IEEE80211_STYPE_ACTION: - vif->frame_reg[1].type = frame_type; - vif->frame_reg[1].reg = reg; - break; + prev = vif->mgmt_reg_stypes & action_bit; + now = upd->interface_stypes & action_bit; - default: - break; + if (now != prev) + wilc_frame_register(vif, IEEE80211_STYPE_ACTION, now); } - if (!wl->initialized) - return; - wilc_frame_register(vif, frame_type, reg); + vif->mgmt_reg_stypes = + upd->interface_stypes & (presp_bit | action_bit); } static int set_cqm_rssi_config(struct wiphy *wiphy, struct net_device *dev, @@ -1665,7 +1663,7 @@ static const struct cfg80211_ops wilc_cfg80211_ops = { .cancel_remain_on_channel = cancel_remain_on_channel, .mgmt_tx_cancel_wait = mgmt_tx_cancel_wait, .mgmt_tx = mgmt_tx, - .mgmt_frame_register = wilc_mgmt_frame_register, + .update_mgmt_frame_registrations = wilc_update_mgmt_frame_registrations, .set_power_mgmt = set_power_mgmt, .set_cqm_rssi_config = set_cqm_rssi_config, diff --git a/drivers/staging/wilc1000/cfg80211.h b/drivers/staging/wilc1000/cfg80211.h index 5e5d63f70df2..37b294cb3b37 100644 --- a/drivers/staging/wilc1000/cfg80211.h +++ b/drivers/staging/wilc1000/cfg80211.h @@ -21,8 +21,9 @@ void wilc_wfi_deinit_mon_interface(struct wilc *wl, bool rtnl_locked); struct net_device *wilc_wfi_init_mon_interface(struct wilc *wl, const char *name, struct net_device *real_dev); -void wilc_mgmt_frame_register(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg); +void wilc_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd); struct wilc_vif *wilc_get_interface(struct wilc *wl); struct wilc_vif *wilc_get_wl_to_vif(struct wilc *wl); void wlan_deinit_locks(struct wilc *wilc); diff --git a/drivers/staging/wilc1000/netdev.c b/drivers/staging/wilc1000/netdev.c index f94a17babd12..fda0ab97b02c 100644 --- a/drivers/staging/wilc1000/netdev.c +++ b/drivers/staging/wilc1000/netdev.c @@ -571,6 +571,7 @@ static int wilc_mac_open(struct net_device *ndev) struct wilc *wl = vif->wilc; unsigned char mac_add[ETH_ALEN] = {0}; int ret = 0; + struct mgmt_frame_regs mgmt_regs = {}; if (!wl || !wl->dev) { netdev_err(ndev, "device not ready\n"); @@ -602,14 +603,12 @@ static int wilc_mac_open(struct net_device *ndev) return -EINVAL; } - wilc_mgmt_frame_register(vif->ndev->ieee80211_ptr->wiphy, - vif->ndev->ieee80211_ptr, - vif->frame_reg[0].type, - vif->frame_reg[0].reg); - wilc_mgmt_frame_register(vif->ndev->ieee80211_ptr->wiphy, - vif->ndev->ieee80211_ptr, - vif->frame_reg[1].type, - vif->frame_reg[1].reg); + mgmt_regs.interface_stypes = vif->mgmt_reg_stypes; + /* so we detect a change */ + vif->mgmt_reg_stypes = 0; + wilc_update_mgmt_frame_registrations(vif->ndev->ieee80211_ptr->wiphy, + vif->ndev->ieee80211_ptr, + &mgmt_regs); netif_wake_queue(ndev); wl->open_ifcs++; vif->mac_opened = 1; @@ -792,12 +791,10 @@ void wilc_wfi_mgmt_rx(struct wilc *wilc, u8 *buff, u32 size) srcu_idx = srcu_read_lock(&wilc->srcu); list_for_each_entry_rcu(vif, &wilc->vif_list, list) { u16 type = le16_to_cpup((__le16 *)buff); + u32 type_bit = BIT(type >> 4); if (vif->priv.p2p_listen_state && - ((type == vif->frame_reg[0].type && - vif->frame_reg[0].reg) || - (type == vif->frame_reg[1].type && - vif->frame_reg[1].reg))) + vif->mgmt_reg_stypes & type_bit) wilc_wfi_p2p_rx(vif, buff, size); if (vif->monitor_flag) diff --git a/drivers/staging/wilc1000/netdev.h b/drivers/staging/wilc1000/netdev.h index 61cbec674a62..d0a006b68d08 100644 --- a/drivers/staging/wilc1000/netdev.h +++ b/drivers/staging/wilc1000/netdev.h @@ -24,8 +24,6 @@ #define PMKID_FOUND 1 #define NUM_STA_ASSOCIATED 8 -#define NUM_REG_FRAME 2 - #define TCP_ACK_FILTER_LINK_SPEED_THRESH 54 #define DEFAULT_LINK_SPEED 72 @@ -151,11 +149,6 @@ struct wilc_priv { u64 inc_roc_cookie; }; -struct frame_reg { - u16 type; - bool reg; -}; - #define MAX_TCP_SESSION 25 #define MAX_PENDING_ACKS 256 @@ -187,7 +180,7 @@ struct wilc_vif { u8 iftype; int monitor_flag; int mac_opened; - struct frame_reg frame_reg[NUM_REG_FRAME]; + u32 mgmt_reg_stypes; struct net_device_stats netstats; struct wilc *wilc; u8 bssid[ETH_ALEN]; -- cgit v1.2.3-59-g8ed1b From bedd7904e86c02ae80513c212ea25789d8bf4fb4 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 25 Apr 2020 18:57:11 +0300 Subject: mac80211_hwsim: Advertise support for multicast RX registration While mac80211_hwsim does not need this to configure RX filters, it is convenient to have this enabled for testing purposes. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200425155713.25687-3-jouni@codeaurora.org Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 05e8203aa6d9..bd1f4c249d11 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1827,6 +1827,8 @@ static void mac80211_hwsim_configure_filter(struct ieee80211_hw *hw, data->rx_filter = 0; if (*total_flags & FIF_ALLMULTI) data->rx_filter |= FIF_ALLMULTI; + if (*total_flags & FIF_MCAST_ACTION) + data->rx_filter |= FIF_MCAST_ACTION; *total_flags = data->rx_filter; } @@ -3060,6 +3062,8 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR; wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_VHT_IBSS); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION); + wiphy_ext_feature_set(hw->wiphy, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS); hw->wiphy->interface_modes = param->iftypes; -- cgit v1.2.3-59-g8ed1b From 08fad438bed0ada1a3308987862327286fcbb5f5 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 25 Apr 2020 18:57:12 +0300 Subject: mac80211: TX legacy rate control for Beacon frames Use the Beacon frame specific legacy rate configuration, if specified for AP or mesh, instead of the generic rate mask when selecting the TX rate for Beacon frames. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200425155713.25687-4-jouni@codeaurora.org Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 26 +++++++++++++++++++++++++- net/mac80211/ieee80211_i.h | 4 ++++ net/mac80211/mesh.c | 1 + net/mac80211/tx.c | 5 ++++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ae3e06375a28..548a384b0509 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -994,7 +994,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_TWT | BSS_CHANGED_HE_OBSS_PD | BSS_CHANGED_HE_BSS_COLOR; - int err; + int i, err; int prev_beacon_int; old = sdata_dereference(sdata->u.ap.beacon, sdata); @@ -1085,6 +1085,17 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; + sdata->beacon_rate_set = false; + if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sdata->beacon_rateidx_mask[i] = + params->beacon_rate.control[i].legacy; + if (sdata->beacon_rateidx_mask[i]) + sdata->beacon_rate_set = true; + } + } + err = ieee80211_assign_beacon(sdata, ¶ms->beacon, NULL); if (err < 0) { ieee80211_vif_release_channel(sdata); @@ -1189,6 +1200,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) ieee80211_free_keys(sdata, true); sdata->vif.bss_conf.enable_beacon = false; + sdata->beacon_rate_set = false; sdata->vif.bss_conf.ssid_len = 0; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); @@ -1949,6 +1961,7 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, const u8 *old_ie; struct ieee80211_sub_if_data *sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); + int i; /* allocate information elements */ new_ie = NULL; @@ -1987,6 +2000,17 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, sdata->vif.bss_conf.beacon_int = setup->beacon_interval; sdata->vif.bss_conf.dtim_period = setup->dtim_period; + sdata->beacon_rate_set = false; + if (wiphy_ext_feature_isset(sdata->local->hw.wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sdata->beacon_rateidx_mask[i] = + setup->beacon_rate.control[i].legacy; + if (sdata->beacon_rateidx_mask[i]) + sdata->beacon_rate_set = true; + } + } + return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9407cf44305c..8cbae66b5cdb 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -962,6 +962,10 @@ struct ieee80211_sub_if_data { bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS]; u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX]; + /* Beacon frame (non-MCS) rate (as a bitmap) */ + u32 beacon_rateidx_mask[NUM_NL80211_BANDS]; + bool beacon_rate_set; + union { struct ieee80211_if_ap ap; struct ieee80211_if_wds wds; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 36978a0e5000..5930d07b1e43 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -994,6 +994,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) /* stop the beacon */ ifmsh->mesh_id_len = 0; sdata->vif.bss_conf.enable_beacon = false; + sdata->beacon_rate_set = false; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3dc1990e15c5..6dad67eb60b2 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4883,7 +4883,10 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, txrc.bss_conf = &sdata->vif.bss_conf; txrc.skb = skb; txrc.reported_rate.idx = -1; - txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; + if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) + txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; + else + txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; txrc.bss = true; rate_control_get_rate(sdata, NULL, &txrc); -- cgit v1.2.3-59-g8ed1b From 1512bc076e5ba2c4e8a189a4dbc883d59b4c37ef Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 25 Apr 2020 18:57:13 +0300 Subject: mac80211_hwsim: Claim support for setting Beacon frame TX legacy rate mac80211 takes care of rate control for the Beacon frames, so all mac80211_hwsim needs to do here is advertise support for this. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200425155713.25687-5-jouni@codeaurora.org Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index bd1f4c249d11..29084096044e 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3064,6 +3064,8 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS); + wiphy_ext_feature_set(hw->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY); hw->wiphy->interface_modes = param->iftypes; -- cgit v1.2.3-59-g8ed1b From 60689de46c7f6a0028c8b37b6f03db68cbfad8ed Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Fri, 24 Apr 2020 15:41:39 -0700 Subject: mac80211: fix memory overlap due to variable length param As of now HE operation element in bss_conf includes variable length optional field followed by other HE variable. Though the optional field never be used, actually it is referring to next member of the bss_conf structure which is not correct. Fix it by declaring needed HE operation fields within bss_conf itself. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1587768108-25248-2-git-send-email-rmanohar@codeaurora.org Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 3 +-- include/net/mac80211.h | 7 +++++-- net/mac80211/he.c | 13 +++++-------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 9f8bc19cc5ae..06d063274eea 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1168,8 +1168,7 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, sizeof(arg->peer_he_cap_macinfo)); memcpy(&arg->peer_he_cap_phyinfo, he_cap->he_cap_elem.phy_cap_info, sizeof(arg->peer_he_cap_phyinfo)); - memcpy(&arg->peer_he_ops, &vif->bss_conf.he_operation, - sizeof(arg->peer_he_ops)); + arg->peer_he_ops = vif->bss_conf.he_oper.params; /* the top most byte is used to indicate BSS color info */ arg->peer_he_ops &= 0xffffff; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ecb219e3ec4f..78f7ce586287 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -604,7 +604,7 @@ struct ieee80211_ftm_responder_params { * nontransmitted BSSIDs * @profile_periodicity: the least number of beacon frames need to be received * in order to discover all the nontransmitted BSSIDs in the set. - * @he_operation: HE operation information of the AP we are connected to + * @he_oper: HE operation information of the AP we are connected to * @he_obss_pd: OBSS Packet Detection parameters. * @he_bss_color: BSS coloring settings, if BSS supports HE */ @@ -668,7 +668,10 @@ struct ieee80211_bss_conf { u8 bssid_indicator; bool ema_ap; u8 profile_periodicity; - struct ieee80211_he_operation he_operation; + struct { + u32 params; + u16 nss_set; + } he_oper; struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; }; diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 1087f715338b..f520552b22be 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -57,17 +57,14 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, void ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, - const struct ieee80211_he_operation *he_op_ie_elem) + const struct ieee80211_he_operation *he_op_ie) { - struct ieee80211_he_operation *he_operation = - &vif->bss_conf.he_operation; - - if (!he_op_ie_elem) { - memset(he_operation, 0, sizeof(*he_operation)); + memset(&vif->bss_conf.he_oper, 0, sizeof(vif->bss_conf.he_oper)); + if (!he_op_ie) return; - } - vif->bss_conf.he_operation = *he_op_ie_elem; + vif->bss_conf.he_oper.params = __le32_to_cpu(he_op_ie->he_oper_params); + vif->bss_conf.he_oper.nss_set = __le16_to_cpu(he_op_ie->he_mcs_nss_set); } void -- cgit v1.2.3-59-g8ed1b From cb10228d234c49e2035bfce7bdb42c29e1049c5c Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Wed, 29 Apr 2020 11:46:24 +0800 Subject: net: hns3: adds support for reading module eeprom info This patch adds support for reading the optical module eeprom info via "ethtool -m". Signed-off-by: Yonglong Liu Signed-off-by: Huazhong Tan Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 4 + drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 75 +++++++++++++++ .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 15 +++ .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 102 +++++++++++++++++++++ 4 files changed, 196 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 6291aa9f06b0..5602bf226687 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -374,6 +374,8 @@ struct hnae3_ae_dev { * Set the max tx rate of specified vf. * set_vf_mac * Configure the default MAC for specified VF + * get_module_eeprom + * Get the optical module eeprom info. */ struct hnae3_ae_ops { int (*init_ae_dev)(struct hnae3_ae_dev *ae_dev); @@ -548,6 +550,8 @@ struct hnae3_ae_ops { int (*set_vf_rate)(struct hnae3_handle *handle, int vf, int min_tx_rate, int max_tx_rate, bool force); int (*set_vf_mac)(struct hnae3_handle *handle, int vf, u8 *p); + int (*get_module_eeprom)(struct hnae3_handle *handle, u32 offset, + u32 len, u8 *data); }; struct hnae3_dcb_ops { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 4d9c85f049dc..1a105f2f87a4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "hns3_enet.h" @@ -12,6 +13,11 @@ struct hns3_stats { int stats_offset; }; +struct hns3_sfp_type { + u8 type; + u8 ext_type; +}; + /* tqp related stats */ #define HNS3_TQP_STAT(_string, _member) { \ .stats_string = _string, \ @@ -1386,6 +1392,73 @@ static int hns3_set_fecparam(struct net_device *netdev, return ops->set_fec(handle, fec_mode); } +static int hns3_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ +#define HNS3_SFF_8636_V1_3 0x03 + + struct hnae3_handle *handle = hns3_get_handle(netdev); + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + struct hns3_sfp_type sfp_type; + int ret; + + if (handle->pdev->revision == 0x20 || !ops->get_module_eeprom) + return -EOPNOTSUPP; + + memset(&sfp_type, 0, sizeof(sfp_type)); + ret = ops->get_module_eeprom(handle, 0, sizeof(sfp_type) / sizeof(u8), + (u8 *)&sfp_type); + if (ret) + return ret; + + switch (sfp_type.type) { + case SFF8024_ID_SFP: + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + break; + case SFF8024_ID_QSFP_8438: + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_MAX_LEN; + break; + case SFF8024_ID_QSFP_8436_8636: + if (sfp_type.ext_type < HNS3_SFF_8636_V1_3) { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_MAX_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_MAX_LEN; + } + break; + case SFF8024_ID_QSFP28_8636: + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_MAX_LEN; + break; + default: + netdev_err(netdev, "Optical module unknown: %#x\n", + sfp_type.type); + return -EINVAL; + } + + return 0; +} + +static int hns3_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, u8 *data) +{ + struct hnae3_handle *handle = hns3_get_handle(netdev); + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + + if (handle->pdev->revision == 0x20 || !ops->get_module_eeprom) + return -EOPNOTSUPP; + + if (!ee->len) + return -EINVAL; + + memset(data, 0, ee->len); + + return ops->get_module_eeprom(handle, ee->offset, ee->len, data); +} + #define HNS3_ETHTOOL_COALESCE (ETHTOOL_COALESCE_USECS | \ ETHTOOL_COALESCE_USE_ADAPTIVE | \ ETHTOOL_COALESCE_RX_USECS_HIGH | \ @@ -1449,6 +1522,8 @@ static const struct ethtool_ops hns3_ethtool_ops = { .set_msglevel = hns3_set_msglevel, .get_fecparam = hns3_get_fecparam, .set_fecparam = hns3_set_fecparam, + .get_module_info = hns3_get_module_info, + .get_module_eeprom = hns3_get_module_eeprom, }; void hns3_ethtool_set_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 90e422efe590..9a9d752aedc5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -270,6 +270,8 @@ enum hclge_opcode_type { HCLGE_OPC_M7_COMPAT_CFG = 0x701A, /* SFP command */ + HCLGE_OPC_GET_SFP_EEPROM = 0x7100, + HCLGE_OPC_GET_SFP_EXIST = 0x7101, HCLGE_OPC_GET_SFP_INFO = 0x7104, /* Error INT commands */ @@ -1054,6 +1056,19 @@ struct hclge_firmware_compat_cmd { u8 rsv[20]; }; +#define HCLGE_SFP_INFO_CMD_NUM 6 +#define HCLGE_SFP_INFO_BD0_LEN 20 +#define HCLGE_SFP_INFO_BDX_LEN 24 +#define HCLGE_SFP_INFO_MAX_LEN \ + (HCLGE_SFP_INFO_BD0_LEN + \ + (HCLGE_SFP_INFO_CMD_NUM - 1) * HCLGE_SFP_INFO_BDX_LEN) + +struct hclge_sfp_info_bd0_cmd { + __le16 offset; + __le16 read_len; + u8 data[HCLGE_SFP_INFO_BD0_LEN]; +}; + int hclge_cmd_init(struct hclge_dev *hdev); static inline void hclge_write_reg(void __iomem *base, u32 reg, u32 value) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e2fec832fdf0..71a54ddb51f5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11119,6 +11119,107 @@ static void hclge_sync_promisc_mode(struct hclge_dev *hdev) } } +static bool hclge_module_existed(struct hclge_dev *hdev) +{ + struct hclge_desc desc; + u32 existed; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_GET_SFP_EXIST, true); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get SFP exist state, ret = %d\n", ret); + return false; + } + + existed = le32_to_cpu(desc.data[0]); + + return existed != 0; +} + +/* need 6 bds(total 140 bytes) in one reading + * return the number of bytes actually read, 0 means read failed. + */ +static u16 hclge_get_sfp_eeprom_info(struct hclge_dev *hdev, u32 offset, + u32 len, u8 *data) +{ + struct hclge_desc desc[HCLGE_SFP_INFO_CMD_NUM]; + struct hclge_sfp_info_bd0_cmd *sfp_info_bd0; + u16 read_len; + u16 copy_len; + int ret; + int i; + + /* setup all 6 bds to read module eeprom info. */ + for (i = 0; i < HCLGE_SFP_INFO_CMD_NUM; i++) { + hclge_cmd_setup_basic_desc(&desc[i], HCLGE_OPC_GET_SFP_EEPROM, + true); + + /* bd0~bd4 need next flag */ + if (i < HCLGE_SFP_INFO_CMD_NUM - 1) + desc[i].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT); + } + + /* setup bd0, this bd contains offset and read length. */ + sfp_info_bd0 = (struct hclge_sfp_info_bd0_cmd *)desc[0].data; + sfp_info_bd0->offset = cpu_to_le16((u16)offset); + read_len = min_t(u16, len, HCLGE_SFP_INFO_MAX_LEN); + sfp_info_bd0->read_len = cpu_to_le16(read_len); + + ret = hclge_cmd_send(&hdev->hw, desc, i); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get SFP eeprom info, ret = %d\n", ret); + return 0; + } + + /* copy sfp info from bd0 to out buffer. */ + copy_len = min_t(u16, len, HCLGE_SFP_INFO_BD0_LEN); + memcpy(data, sfp_info_bd0->data, copy_len); + read_len = copy_len; + + /* copy sfp info from bd1~bd5 to out buffer if needed. */ + for (i = 1; i < HCLGE_SFP_INFO_CMD_NUM; i++) { + if (read_len >= len) + return read_len; + + copy_len = min_t(u16, len - read_len, HCLGE_SFP_INFO_BDX_LEN); + memcpy(data + read_len, desc[i].data, copy_len); + read_len += copy_len; + } + + return read_len; +} + +static int hclge_get_module_eeprom(struct hnae3_handle *handle, u32 offset, + u32 len, u8 *data) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + u32 read_len = 0; + u16 data_len; + + if (hdev->hw.mac.media_type != HNAE3_MEDIA_TYPE_FIBER) + return -EOPNOTSUPP; + + if (!hclge_module_existed(hdev)) + return -ENXIO; + + while (read_len < len) { + data_len = hclge_get_sfp_eeprom_info(hdev, + offset + read_len, + len - read_len, + data + read_len); + if (!data_len) + return -EIO; + + read_len += data_len; + } + + return 0; +} + static const struct hnae3_ae_ops hclge_ops = { .init_ae_dev = hclge_init_ae_dev, .uninit_ae_dev = hclge_uninit_ae_dev, @@ -11211,6 +11312,7 @@ static const struct hnae3_ae_ops hclge_ops = { .set_vf_trust = hclge_set_vf_trust, .set_vf_rate = hclge_set_vf_rate, .set_vf_mac = hclge_set_vf_mac, + .get_module_eeprom = hclge_get_module_eeprom, }; static struct hnae3_ae_algo ae_algo = { -- cgit v1.2.3-59-g8ed1b From 00b5aac59966ec3d608dc150aba552121e7de4f0 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 29 Apr 2020 07:58:20 +0000 Subject: ptp: ptp_ines: convert to devm_platform_ioremap_resource Use the helper function that wraps the calls to platform_get_resource() and devm_ioremap_resource() together. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/ptp/ptp_ines.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/ptp/ptp_ines.c b/drivers/ptp/ptp_ines.c index 52d77db39829..7711651ff19e 100644 --- a/drivers/ptp/ptp_ines.c +++ b/drivers/ptp/ptp_ines.c @@ -783,16 +783,10 @@ static struct mii_timestamping_ctrl ines_ctrl = { static int ines_ptp_ctrl_probe(struct platform_device *pld) { struct ines_clock *clock; - struct resource *res; void __iomem *addr; int err = 0; - res = platform_get_resource(pld, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pld->dev, "missing memory resource\n"); - return -EINVAL; - } - addr = devm_ioremap_resource(&pld->dev, res); + addr = devm_platform_ioremap_resource(pld, 0); if (IS_ERR(addr)) { err = PTR_ERR(addr); goto out; -- cgit v1.2.3-59-g8ed1b From a54776f2c4939bdee084c9ecd00a4a5a25b7c429 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Wed, 29 Apr 2020 18:20:58 +0800 Subject: netpoll: Fix use correct return type for ndo_start_xmit() The method ndo_start_xmit() returns a value of type netdev_tx_t. Fix the ndo function to use the correct type. Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 849380a622ef..15b366a1a958 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -69,10 +69,11 @@ module_param(carrier_timeout, uint, 0644); #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) -static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, + struct net_device *dev, + struct netdev_queue *txq) { - int status = NETDEV_TX_OK; + netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); @@ -307,7 +308,7 @@ static int netpoll_owner_active(struct net_device *dev) void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { - int status = NETDEV_TX_BUSY; + netdev_tx_t status = NETDEV_TX_BUSY; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; -- cgit v1.2.3-59-g8ed1b From ad56623119fda623ade9bc570294a1c2b203d374 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 29 Apr 2020 21:24:30 +0800 Subject: net: hsr: remove unused inline functions There's no callers in-tree anymore. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/hsr/hsr_main.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 7321cf8d6d2c..f74193465bf5 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -62,15 +62,6 @@ struct hsr_tag { * with the path field in-between, which seems strange. I'm guessing the MAC * address definition is in error. */ -static inline u16 get_hsr_tag_path(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) >> 12; -} - -static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) & 0x0FFF; -} static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path) { @@ -103,16 +94,6 @@ struct hsr_sup_payload { unsigned char macaddress_A[ETH_ALEN]; } __packed; -static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_path((struct hsr_tag *)hst); -} - -static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_LSDU_size((struct hsr_tag *)hst); -} - static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path) { set_hsr_tag_path((struct hsr_tag *)hst, path); -- cgit v1.2.3-59-g8ed1b From 0477e032a9ea34780180d9d5289f9da06714de43 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 29 Apr 2020 21:25:30 +0800 Subject: ila: remove unused inline function ila_addr_is_ila There's no callers in-tree anymore since commit 84287bb32856 ("ila: add checksum neutral map auto"). Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv6/ila/ila.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index bb6fc0d54dae..ad5f6f6ba333 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -68,11 +68,6 @@ static inline struct ila_addr *ila_a2i(struct in6_addr *addr) return (struct ila_addr *)addr; } -static inline bool ila_addr_is_ila(struct ila_addr *iaddr) -{ - return (iaddr->ident.type != ILA_ATYPE_IID); -} - struct ila_params { struct ila_locator locator; struct ila_locator locator_match; -- cgit v1.2.3-59-g8ed1b From 21615efa6a69891fa287bade979d56dd68b09878 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 29 Apr 2020 12:15:42 -0700 Subject: Revert "net: ethernet: fec: Prevent MII event after MII_SPEED write" This reverts commit 790ab249b55d75fdb427b92f81964cd7cb525eec. This change needs more work. Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index aa5e744ec098..1ae075a246a3 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -996,9 +996,6 @@ fec_restart(struct net_device *ndev) writel(0x0, fep->hwp + FEC_X_CNTRL); } - /* Prevent an MII event being report when changing speed */ - writel(0, fep->hwp + FEC_MII_DATA); - /* Set MII speed */ writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); @@ -1185,10 +1182,6 @@ fec_stop(struct net_device *ndev) writel(val, fep->hwp + FEC_ECNTRL); fec_enet_stop_mode(fep, true); } - - /* Prevent an MII event being report when changing speed */ - writel(0, fep->hwp + FEC_MII_DATA); - writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); /* We have to keep ENET enabled to have MII interrupt stay working */ @@ -2149,16 +2142,6 @@ static int fec_enet_mii_init(struct platform_device *pdev) if (suppress_preamble) fep->phy_speed |= BIT(7); - /* Clear MMFR to avoid to generate MII event by writing MSCR. - * MII event generation condition: - * - writing MSCR: - * - mmfr[31:0]_not_zero & mscr[7:0]_is_zero & - * mscr_reg_data_in[7:0] != 0 - * - writing MMFR: - * - mscr[7:0]_not_zero - */ - writel(0, fep->hwp + FEC_MII_DATA); - writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); /* Clear any pending transaction complete indication */ -- cgit v1.2.3-59-g8ed1b From fdff704dc60418e9a1bac78ae09c857d05c65aa3 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:37 +0200 Subject: net/smc: rework pnet table to support SMC-R failover The pnet table stored pnet ids in the smc device structures. When a device is going down its smc device structure is freed, and when the device is brought online again it no longer has a pnet id set. Rework the pnet table implementation to store the device name with their assigned pnet id and apply the pnet id to devices when they are registered. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 5 +- net/smc/smc_ism.c | 3 +- net/smc/smc_pnet.c | 539 +++++++++++++++++++++++++++++++---------------------- net/smc/smc_pnet.h | 2 + 4 files changed, 319 insertions(+), 230 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 04b6fefb8bce..440f9e319a38 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -579,8 +579,9 @@ static void smc_ib_add_dev(struct ib_device *ibdev) i++) { set_bit(i, &smcibdev->port_event_mask); /* determine pnetids of the port */ - smc_pnetid_by_dev_port(ibdev->dev.parent, i, - smcibdev->pnetid[i]); + if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i])) + smc_pnetid_by_table_ib(smcibdev, i + 1); } schedule_work(&smcibdev->port_event_work); } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5c4727d5066e..32be2da2cb85 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -296,7 +296,8 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; - smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); + if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 2a5ed47c3e08..bd01c71b827a 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -50,29 +50,26 @@ static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { static struct genl_family smc_pnet_nl_family; -/** - * struct smc_user_pnetentry - pnet identifier name entry for/from user - * @list: List node. - * @pnet_name: Pnet identifier name - * @ndev: pointer to network device. - * @smcibdev: Pointer to IB device. - * @ib_port: Port of IB device. - * @smcd_dev: Pointer to smcd device. - */ -struct smc_user_pnetentry { - struct list_head list; - char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; - struct smc_ib_device *smcibdev; - u8 ib_port; - struct smcd_dev *smcd_dev; +enum smc_pnet_nametype { + SMC_PNET_ETH = 1, + SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; + enum smc_pnet_nametype type; + union { + struct { + char eth_name[IFNAMSIZ + 1]; + struct net_device *ndev; + }; + struct { + char ib_name[IB_DEVICE_NAME_MAX + 1]; + u8 ib_port; + }; + }; }; /* Check if two given pnetids match */ @@ -106,14 +103,15 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* remove netdevices */ + /* remove table entry */ write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); - dev_put(pnetelem->ndev); + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) + dev_put(pnetelem->ndev); kfree(pnetelem); rc = 0; } @@ -155,9 +153,9 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) return rc; } -/* Remove a pnet entry mentioning a given network device from the pnet table. +/* Add the reference to a given network device to the pnet table. */ -static int smc_pnet_remove_by_ndev(struct net_device *ndev) +static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; @@ -171,10 +169,10 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { - if (pnetelem->ndev == ndev) { - list_del(&pnetelem->list); - dev_put(pnetelem->ndev); - kfree(pnetelem); + if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && + !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { + dev_hold(ndev); + pnetelem->ndev = ndev; rc = 0; break; } @@ -183,80 +181,67 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) return rc; } -/* Append a pnetid to the end of the pnet table if not already on this list. +/* Remove the reference to a given network device from the pnet table. */ -static int smc_pnet_enter(struct smc_pnettable *pnettable, - struct smc_user_pnetentry *new_pnetelem) +static int smc_pnet_remove_by_ndev(struct net_device *ndev) { - u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; - u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_pnetentry *tmp_pnetelem; - struct smc_pnetentry *pnetelem; - bool new_smcddev = false; - struct net_device *ndev; - bool new_netdev = true; - bool new_ibdev = false; - - if (new_pnetelem->smcibdev) { - struct smc_ib_device *ib_dev = new_pnetelem->smcibdev; - int ib_port = new_pnetelem->ib_port; + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_net *sn; + int rc = -ENOENT; - spin_lock(&smc_ib_devices.lock); - if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { - memcpy(ib_dev->pnetid[ib_port - 1], - new_pnetelem->pnet_name, SMC_MAX_PNETID_LEN); - ib_dev->pnetid_by_user[ib_port - 1] = true; - new_ibdev = true; - } - spin_unlock(&smc_ib_devices.lock); - } - if (new_pnetelem->smcd_dev) { - struct smcd_dev *smcd_dev = new_pnetelem->smcd_dev; + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; - spin_lock(&smcd_dev_list.lock); - if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { - memcpy(smcd_dev->pnetid, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = true; - new_smcddev = true; + write_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { + dev_put(pnetelem->ndev); + pnetelem->ndev = NULL; + rc = 0; + break; } - spin_unlock(&smcd_dev_list.lock); } + write_unlock(&pnettable->lock); + return rc; +} - if (!new_pnetelem->ndev) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; +/* Apply pnetid to ib device when no pnetid is set. + */ +static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, + char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - /* check if (base) netdev already has a pnetid. If there is one, we do - * not want to add a pnet table entry - */ - ndev = pnet_find_base_ndev(new_pnetelem->ndev); - if (!smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, - ndev_pnetid)) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_lock(&smc_ib_devices.lock); + if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { + memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, + SMC_MAX_PNETID_LEN); + ib_dev->pnetid_by_user[ib_port - 1] = true; + applied = true; + } + spin_unlock(&smc_ib_devices.lock); + return applied; +} - /* add a new netdev entry to the pnet table if there isn't one */ - tmp_pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); - if (!tmp_pnetelem) - return -ENOMEM; - memcpy(tmp_pnetelem->pnet_name, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_pnetelem->ndev = new_pnetelem->ndev; +/* Apply pnetid to smcd device when no pnetid is set. + */ +static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - write_lock(&pnettable->lock); - list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (pnetelem->ndev == new_pnetelem->ndev) - new_netdev = false; - } - if (new_netdev) { - dev_hold(tmp_pnetelem->ndev); - list_add_tail(&tmp_pnetelem->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); - } else { - write_unlock(&pnettable->lock); - kfree(tmp_pnetelem); + spin_lock(&smcd_dev_list.lock); + if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { + memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); + smcd_dev->pnetid_by_user = true; + applied = true; } - - return (new_netdev || new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_unlock(&smcd_dev_list.lock); + return applied; } /* The limit for pnetid is 16 characters. @@ -323,57 +308,167 @@ out: return smcd_dev; } -/* Parse the supplied netlink attributes and fill a pnetentry structure. - * For ethernet and infiniband device names verify that the devices exist. +static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, + char *eth_name, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct net_device *ndev, *base_ndev; + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + bool new_netdev; + int rc; + + /* check if (base) netdev already has a pnetid. If there is one, we do + * not want to add a pnet table entry + */ + rc = -EEXIST; + ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ + if (ndev) { + base_ndev = pnet_find_base_ndev(ndev); + if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, + base_ndev->dev_port, ndev_pnetid)) + goto out_put; + } + + /* add a new netdev entry to the pnet table if there isn't one */ + rc = -ENOMEM; + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + goto out_put; + new_pe->type = SMC_PNET_ETH; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); + new_pe->ndev = ndev; + + rc = -EEXIST; + new_netdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_ETH && + !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { + new_netdev = false; + break; + } + } + if (new_netdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + goto out_put; + } + return 0; + +out_put: + if (ndev) + dev_put(ndev); + return rc; +} + +static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, + u8 ib_port, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct smc_ib_device *ib_dev; + bool smcddev_applied = true; + bool ibdev_applied = true; + struct smcd_dev *smcd_dev; + bool new_ibdev; + + /* try to apply the pnetid to active devices */ + ib_dev = smc_pnet_find_ib(ib_name); + if (ib_dev) + ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + smcd_dev = smc_pnet_find_smcd(ib_name); + if (smcd_dev) + smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); + /* Apply fails when a device has a hardware-defined pnetid set, do not + * add a pnet table entry in that case. + */ + if (!ibdev_applied || !smcddev_applied) + return -EEXIST; + + /* add a new ib entry to the pnet table if there isn't one */ + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + return -ENOMEM; + new_pe->type = SMC_PNET_IB; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + new_pe->ib_port = ib_port; + + new_ibdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + new_ibdev = false; + break; + } + } + if (new_ibdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + } + return (new_ibdev) ? 0 : -EEXIST; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. */ -static int smc_pnet_fill_entry(struct net *net, - struct smc_user_pnetentry *pnetelem, - struct nlattr *tb[]) +static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { - char *string, *ibname; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; + struct smc_pnettable *pnettable; + bool new_netdev = false; + bool new_ibdev = false; + struct smc_net *sn; + u8 ibport = 1; + char *string; int rc; - memset(pnetelem, 0, sizeof(*pnetelem)); - INIT_LIST_HEAD(&pnetelem->list); + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); - if (!smc_pnetid_valid(string, pnetelem->pnet_name)) + if (!smc_pnetid_valid(string, pnet_name)) goto error; - rc = -EINVAL; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); - pnetelem->ndev = dev_get_by_name(net, string); - if (!pnetelem->ndev) + rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); + if (!rc) + new_netdev = true; + else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) - return 0; + return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { - ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); - ibname = strim(ibname); - pnetelem->smcibdev = smc_pnet_find_ib(ibname); - pnetelem->smcd_dev = smc_pnet_find_smcd(ibname); - if (!pnetelem->smcibdev && !pnetelem->smcd_dev) - goto error; - if (pnetelem->smcibdev) { - if (!tb[SMC_PNETID_IBPORT]) - goto error; - pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); - if (pnetelem->ib_port < 1 || - pnetelem->ib_port > SMC_MAX_PORTS) + string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + string = strim(string); + if (tb[SMC_PNETID_IBPORT]) { + ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } + rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); + if (!rc) + new_ibdev = true; + else if (rc != -EEXIST) + goto error; } - - return 0; + return (new_netdev || new_ibdev) ? 0 : -EEXIST; error: return rc; @@ -381,28 +476,22 @@ error: /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; - if (pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, - pnetelem->ndev->name)) + pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } - if (pnetelem->smcibdev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(pnetelem->smcibdev->ibdev->dev.parent)) || + if (pnetelem->type == SMC_PNET_IB) { + if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; - } else if (pnetelem->smcd_dev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(&pnetelem->smcd_dev->dev)) || - nla_put_u8(msg, SMC_PNETID_IBPORT, 1)) - return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) @@ -415,21 +504,8 @@ static int smc_pnet_set_nla(struct sk_buff *msg, static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct smc_user_pnetentry pnetelem; - struct smc_pnettable *pnettable; - struct smc_net *sn; - int rc; - - /* get pnettable for namespace */ - sn = net_generic(net, smc_net_id); - pnettable = &sn->pnettable; - rc = smc_pnet_fill_entry(net, &pnetelem, info->attrs); - if (!rc) - rc = smc_pnet_enter(pnettable, &pnetelem); - if (pnetelem.ndev) - dev_put(pnetelem.ndev); - return rc; + return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) @@ -450,7 +526,7 @@ static int smc_pnet_dump_start(struct netlink_callback *cb) static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { void *hdr; @@ -469,91 +545,32 @@ static int smc_pnet_dumpinfo(struct sk_buff *skb, static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { - struct smc_user_pnetentry tmp_entry; struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; - struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; struct smc_net *sn; int idx = 0; - int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* dump netdevices */ + /* dump pnettable entries */ read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_entry.ndev = pnetelem->ndev; + /* if this is not the initial namespace, dump only netdev */ + if (net != &init_net && pnetelem->type != SMC_PNET_ETH) + continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { + pnetelem)) { --idx; break; } } read_unlock(&pnettable->lock); - - /* if this is not the initial namespace, stop here */ - if (net != &init_net) - return idx; - - /* dump ib devices */ - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { - if (ibdev->pnetid_by_user[ibport]) { - if (pnetid && - !smc_pnet_match(ibdev->pnetid[ibport], - pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, - ibdev->pnetid[ibport], - SMC_MAX_PNETID_LEN); - tmp_entry.smcibdev = ibdev; - tmp_entry.ib_port = ibport + 1; - if (smc_pnet_dumpinfo(skb, portid, seq, - NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - } - spin_unlock(&smc_ib_devices.lock); - - /* dump smcd devices */ - spin_lock(&smcd_dev_list.lock); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user) { - if (pnetid && !smc_pnet_match(smcd_dev->pnetid, pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, smcd_dev->pnetid, - SMC_MAX_PNETID_LEN); - tmp_entry.smcd_dev = smcd_dev; - if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - spin_unlock(&smcd_dev_list.lock); - return idx; } @@ -659,6 +676,9 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); return NOTIFY_OK; + case NETDEV_REGISTER: + smc_pnet_add_by_ndev(event_dev); + return NOTIFY_OK; default: return NOTIFY_DONE; } @@ -744,7 +764,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (ndev == pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; @@ -755,6 +775,34 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, return rc; } +/* find a roce device for the given pnetid */ +static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, + struct smc_init_info *ini) +{ + struct smc_ib_device *ibdev; + int i; + + ini->ib_dev = NULL; + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && + smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + goto out; + } + } + } +out: + spin_unlock(&smc_ib_devices.lock); +} + /* if handshake network device belongs to a roce device, return its * IB device and port */ @@ -801,8 +849,6 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_ib_device *ibdev; - int i; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, @@ -811,25 +857,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (i = 1; i <= SMC_MAX_PORTS; i++) { - if (!rdma_is_port_valid(ibdev->ibdev, i)) - continue; - if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && - smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - goto out; - } - } - } -out: - spin_unlock(&smc_ib_devices.lock); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, @@ -895,3 +923,60 @@ out_rel: out: return; } + +/* Lookup and apply a pnet table entry to the given ib device. + */ +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) +{ + char *ib_name = smcibdev->ibdev->name; + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && + tmp_pe->ib_port == ib_port) { + smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} + +/* Lookup and apply a pnet table entry to the given smcd device. + */ +int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) +{ + const char *ib_name = dev_name(&smcddev->dev); + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 4564e4d69c2e..ea207f8fc6f7 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -46,5 +46,7 @@ void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); +int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); #endif -- cgit v1.2.3-59-g8ed1b From f3c1deddb21c19fb0eec3c61e80567ef4a79b58b Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:38 +0200 Subject: net/smc: separate function for link initialization Move the initialization of a new link into its own function, separate from smc_lgr_create, to allow more than one link per link group. Do an extra check if the IB device initialization was successful, and reset the link state if any error occurs during smcr_link_init(). And rename two existing functions to use the prefix smcr_ to indicate that they belong to the SMC-R code path. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 114 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 48 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 824c5211b027..3bb45c33db22 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -179,7 +179,7 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). */ -static int smc_link_send_delete(struct smc_link *lnk, bool orderly) +static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { @@ -219,7 +219,7 @@ static void smc_lgr_free_work(struct work_struct *work) if (!lgr->is_smcd && !lgr->terminating) { /* try to send del link msg, on error free lgr immediately */ if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk, true)) { + !smcr_link_send_delete(lnk, true)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); spin_unlock_bh(lgr_lock); @@ -245,6 +245,64 @@ static void smc_lgr_terminate_work(struct work_struct *work) __smc_lgr_terminate(lgr, true); } +static int smcr_link_init(struct smc_link *lnk, u8 link_id, + struct smc_init_info *ini) +{ + u8 rndvec[3]; + int rc; + + get_device(&ini->ib_dev->ibdev->dev); + atomic_inc(&ini->ib_dev->lnk_cnt); + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = link_id; + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + if (!ini->ib_dev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); + if (rc) + goto out; + } + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + ini->vlan_id, lnk->gid, &lnk->sgid_index); + if (rc) + goto out; + rc = smc_llc_link_init(lnk); + if (rc) + goto out; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk); +out: + put_device(&ini->ib_dev->ibdev->dev); + memset(lnk, 0, sizeof(struct smc_link)); + if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) + wake_up(&ini->ib_dev->lnks_deleted); + return rc; +} + /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -252,7 +310,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; - u8 rndvec[3]; int rc = 0; int i; @@ -297,48 +354,17 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ - get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = ini->ib_dev; - lnk->ibport = ini->ib_port; - lgr_list = &smc_lgr_list.list; - lgr_lock = &smc_lgr_list.lock; - lnk->path_mtu = - ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; - if (!ini->ib_dev->initialized) - smc_ib_setup_per_ibdev(ini->ib_dev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + - (rndvec[2] << 16); - rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - ini->vlan_id, lnk->gid, - &lnk->sgid_index); - if (rc) - goto free_lgr; - rc = smc_llc_link_init(lnk); + rc = smcr_link_init(lnk, SMC_SINGLE_LINK, ini); if (rc) goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); - atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); @@ -346,14 +372,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_unlock_bh(lgr_lock); return 0; -destroy_qp: - smc_ib_destroy_queue_pair(lnk); -dealloc_pd: - smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); -clear_llc_lnk: - smc_llc_link_clear(lnk); free_lgr: kfree(lgr); ism_put_vlan: @@ -417,7 +435,7 @@ void smc_conn_free(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } -static void smc_link_clear(struct smc_link *lnk) +static void smcr_link_clear(struct smc_link *lnk) { lnk->peer_qpn = 0; smc_llc_link_clear(lnk); @@ -426,6 +444,7 @@ static void smc_link_clear(struct smc_link *lnk) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + put_device(&lnk->smcibdev->ibdev->dev); if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) wake_up(&lnk->smcibdev->lnks_deleted); } @@ -512,8 +531,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); - put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + smcr_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } -- cgit v1.2.3-59-g8ed1b From 026c381fb4778d0d44af57b7ff674f31f04af221 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:39 +0200 Subject: net/smc: introduce link_idx for link group array The link_id is the index of the link in the array of the link group. When a link in the array is reused for a new link, a different unique link_id should be used, otherwise the index in the array could collide with the previous link at this array position. Use a new variable link_idx as array index, and make link_id an increasing unique id value. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 34 +++++++++++++++++++++++++++++----- net/smc/smc_core.h | 2 ++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3bb45c33db22..d233112ced2a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -245,8 +245,28 @@ static void smc_lgr_terminate_work(struct work_struct *work) __smc_lgr_terminate(lgr, true); } -static int smcr_link_init(struct smc_link *lnk, u8 link_id, - struct smc_init_info *ini) +/* return next unique link id for the lgr */ +static u8 smcr_next_link_id(struct smc_link_group *lgr) +{ + u8 link_id; + int i; + + while (1) { + link_id = ++lgr->next_link_id; + if (!link_id) /* skip zero as link_id */ + link_id = ++lgr->next_link_id; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_INACTIVE && + lgr->lnk[i].link_id == link_id) + continue; + } + break; + } + return link_id; +} + +static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) { u8 rndvec[3]; int rc; @@ -254,7 +274,8 @@ static int smcr_link_init(struct smc_link *lnk, u8 link_id, get_device(&ini->ib_dev->ibdev->dev); atomic_inc(&ini->ib_dev->lnk_cnt); lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = link_id; + lnk->link_id = smcr_next_link_id(lgr); + lnk->link_idx = link_idx; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; @@ -310,6 +331,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; + u8 link_idx; int rc = 0; int i; @@ -338,6 +360,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } + lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); @@ -358,8 +381,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = smcr_link_init(lnk, SMC_SINGLE_LINK, ini); + link_idx = SMC_SINGLE_LINK; + lnk = &lgr->lnk[link_idx]; + rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) goto free_lgr; lgr_list = &smc_lgr_list.list; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8041db20c753..c459b0639bf3 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -115,6 +115,7 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + u8 link_idx; /* index in lgr link array */ enum smc_link_state state; /* state of link */ struct workqueue_struct *llc_wq; /* single thread work queue */ @@ -222,6 +223,7 @@ struct smc_link_group { /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ + u8 next_link_id; }; struct { /* SMC-D */ u64 peer_gid; -- cgit v1.2.3-59-g8ed1b From 387707fdf48697c667dd5e9715ac4feb41602d15 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:40 +0200 Subject: net/smc: convert static link ID to dynamic references As a preparation for the support of multiple links remove the usage of a static link id (SMC_SINGLE_LINK) and allow dynamic link ids. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 17 +++++--------- net/smc/smc.h | 1 + net/smc/smc_cdc.c | 8 +++---- net/smc/smc_clc.c | 12 +++++----- net/smc/smc_core.c | 66 +++++++++++++++++++++++++++--------------------------- net/smc/smc_core.h | 7 +++--- net/smc/smc_ib.c | 58 +++++++++++++++++++++++------------------------ net/smc/smc_ib.h | 10 ++++----- net/smc/smc_llc.c | 10 ++++----- net/smc/smc_tx.c | 13 ++++++----- 10 files changed, 99 insertions(+), 103 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6fd44bdb0fc3..6e4bad8c64a8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -343,7 +343,7 @@ static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, { if (!rmb_desc->wr_reg) { /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { + if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { rmb_desc->regerr = 1; return -EFAULT; } @@ -362,12 +362,10 @@ static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, static int smc_clnt_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; + struct smc_link *link = smc->conn.lnk; int rest; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; /* receive CONFIRM LINK request from server over RoCE fabric */ rest = wait_for_completion_interruptible_timeout( &link->llc_confirm, @@ -610,7 +608,7 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + link = smc->conn.lnk; smc_conn_save_peer_info(smc, aclc); @@ -1002,13 +1000,10 @@ void smc_close_non_accepted(struct sock *sk) static int smc_serv_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; + struct smc_link *link = smc->conn.lnk; int rest; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; @@ -1194,7 +1189,7 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; if (local_contact != SMC_FIRST_CONTACT) { if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) @@ -1210,7 +1205,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_contact == SMC_FIRST_CONTACT) diff --git a/net/smc/smc.h b/net/smc/smc.h index be11ba41190f..1a084afa7372 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -121,6 +121,7 @@ enum smc_urg_state { struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ + struct smc_link *lnk; /* assigned SMC-R link */ u32 alert_token_local; /* unique conn. id */ u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 164f1584861b..f64589d823aa 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -57,7 +57,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, @@ -91,12 +91,10 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend) { + struct smc_link *link = conn->lnk; union smc_host_cursor cfed; - struct smc_link *link; int rc; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; @@ -165,7 +163,7 @@ static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, smc_cdc_tx_filter, smc_cdc_tx_dismisser, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ea0068f0173c..d5627df24215 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -496,7 +496,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) sizeof(SMCD_EYECATCHER)); } else { /* SMC-R specific settings */ - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); cclc.hdr.path = SMC_TYPE_R; @@ -508,13 +508,13 @@ int smc_clc_send_confirm(struct smc_sock *smc) ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(cclc.psn, link->psn_initial); memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -572,7 +572,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); aclc.hdr.path = SMC_TYPE_R; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); @@ -580,13 +580,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(aclc.psn, link->psn_initial); memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d233112ced2a..1d695093f205 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -131,6 +131,11 @@ static void smc_lgr_register_conn(struct smc_connection *conn) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); + + /* assign the new connection to a link */ + if (!conn->lgr->is_smcd) + conn->lnk = &conn->lgr->lnk[SMC_SINGLE_LINK]; + conn->lgr->conns_num++; } @@ -275,6 +280,7 @@ static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, atomic_inc(&ini->ib_dev->lnk_cnt); lnk->state = SMC_LNK_ACTIVATING; lnk->link_id = smcr_next_link_id(lgr); + lnk->lgr = lgr; lnk->link_idx = link_idx; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; @@ -421,7 +427,7 @@ static void smc_buf_unuse(struct smc_connection *conn, if (!lgr->is_smcd && !list_empty(&lgr->list)) { /* unregister rmb with peer */ smc_llc_do_delete_rkey( - &lgr->lnk[SMC_SINGLE_LINK], + conn->lnk, conn->rmb_desc); } conn->rmb_desc->used = 0; @@ -479,16 +485,15 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (is_rmb) { - if (buf_desc->mr_rx[SMC_SINGLE_LINK]) + if (buf_desc->mr_rx[lnk->link_idx]) smc_ib_put_memory_region( - buf_desc->mr_rx[SMC_SINGLE_LINK]); - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_FROM_DEVICE); + buf_desc->mr_rx[lnk->link_idx]); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); } else { - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_TO_DEVICE); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); } - sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); kfree(buf_desc); @@ -1026,17 +1031,16 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, /* build the sg table from the pages */ lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, - GFP_KERNEL); + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); if (rc) { smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } - sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, buf_desc->cpu_addr, bufsize); /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, + rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != 1) { @@ -1049,7 +1053,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, rc = smc_ib_get_memory_region(lnk->roce_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE, - buf_desc); + buf_desc, lnk->link_idx); if (rc) { smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); @@ -1174,22 +1178,16 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) @@ -1198,7 +1196,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + smc_ib_sync_sg_for_cpu(&lgr->lnk[SMC_SINGLE_LINK], conn->rmb_desc, DMA_FROM_DEVICE); } @@ -1208,7 +1206,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + smc_ib_sync_sg_for_device(&lgr->lnk[SMC_SINGLE_LINK], conn->rmb_desc, DMA_FROM_DEVICE); } @@ -1245,15 +1243,16 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) } /* add a new rtoken from peer */ -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && - (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && + lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; @@ -1262,22 +1261,23 @@ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; + lgr->rtokens[i][lnk->link_idx].rkey = rkey; + lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } /* delete an rtoken */ -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; + lgr->rtokens[i][lnk->link_idx].rkey = 0; + lgr->rtokens[i][lnk->link_idx].dma_addr = 0; clear_bit(i, lgr->rtokens_used_mask); return 0; @@ -1290,7 +1290,7 @@ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, + conn->rtoken_idx = smc_rtoken_add(conn->lnk, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c459b0639bf3..c71c35a3596c 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -116,6 +116,7 @@ struct smc_link { u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_idx; /* index in lgr link array */ + struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ struct workqueue_struct *llc_wq; /* single thread work queue */ @@ -303,8 +304,8 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey); +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); @@ -319,6 +320,6 @@ void smc_core_exit(void); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { - return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + return link->lgr; } #endif diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 440f9e319a38..c090678a3e5a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -389,15 +389,15 @@ void smc_ib_put_memory_region(struct ib_mr *mr) ib_dereg_mr(mr); } -static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) +static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) { unsigned int offset = 0; int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK], - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + buf_slot->sgt[link_idx].sgl, + buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); return sg_num; @@ -405,29 +405,29 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) /* Allocate a memory region and map the dma mapped SG list of buf_slot */ int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot) + struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[SMC_SINGLE_LINK]) + if (buf_slot->mr_rx[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[SMC_SINGLE_LINK] = + buf_slot->mr_rx[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) { + if (IS_ERR(buf_slot->mr_rx[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]); - buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL; + rc = PTR_ERR(buf_slot->mr_rx[link_idx]); + buf_slot->mr_rx[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) return -EINVAL; return 0; } /* synchronize buffer usage for cpu access */ -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -435,11 +435,11 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_cpu(smcibdev->ibdev, + ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -447,7 +447,7 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, } /* synchronize buffer usage for device access */ -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -455,11 +455,11 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_device(smcibdev->ibdev, + ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -467,15 +467,15 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, } /* Map a new TX or RX buffer SG-table to DMA */ -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { int mapped_nents; - mapped_nents = ib_dma_map_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); if (!mapped_nents) return -ENOMEM; @@ -483,18 +483,18 @@ int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, return mapped_nents; } -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { - if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address) + if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) return; /* already unmapped */ - ib_dma_unmap_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + ib_dma_unmap_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); - buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; + buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 5c2b115d36da..e6a696ae15f3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -59,10 +59,10 @@ struct smc_link; int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); void smc_ib_dealloc_protection_domain(struct smc_link *lnk); @@ -74,12 +74,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot); + struct smc_buf_desc *buf_slot, u8 link_idx); void smc_ib_put_memory_region(struct ib_mr *mr); -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0e52aab53d97..34d0752ba6af 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -231,9 +231,9 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(rmb_desc->mr_rx[link->link_idx]->rkey); rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (u64)sg_dma_address(rmb_desc->sgt[link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -256,7 +256,7 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -501,7 +501,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, SMC_LLC_FLAG_RKEY_NEG; complete(&link->llc_confirm_rkey); } else { - rc = smc_rtoken_add(smc_get_lgr(link), + rc = smc_rtoken_add(link, llc->rtoken[0].rmb_vaddr, llc->rtoken[0].rmb_key); @@ -539,7 +539,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } else { max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { - if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i])) + if (smc_rtoken_delete(link, llc->rkey[i])) err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 9f1ade86d70e..d74bfe6a90f1 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -269,19 +269,18 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; - struct smc_link *link; + struct smc_link *link = conn->lnk; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + + lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; - rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smc_lgr_terminate_sched(lgr); @@ -310,8 +309,10 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { + struct smc_link *link = conn->lnk; + dma_addr_t dma_addr = - sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -507,7 +508,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { - smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + smc_wr_tx_put_slot(conn->lnk, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } -- cgit v1.2.3-59-g8ed1b From b9247544c1bccfe1b74ddf1dade719a69946cbb1 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:41 +0200 Subject: net/smc: convert static link ID instances to support multiple links As a preparation for the support of multiple links remove the usage of a static link id (SMC_SINGLE_LINK) and allow dynamic link ids. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 54 ++++++--- net/smc/smc_clc.h | 1 + net/smc/smc_core.c | 332 ++++++++++++++++++++++++++++++++++++----------------- net/smc/smc_core.h | 37 +++--- net/smc/smc_llc.c | 2 + 5 files changed, 291 insertions(+), 135 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6e4bad8c64a8..890dc6422f8c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -338,28 +338,48 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) } /* register a new rmb, send confirm_rkey msg to register with peer */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, - bool conf_rkey) +static int smcr_link_reg_rmb(struct smc_link *link, + struct smc_buf_desc *rmb_desc, bool conf_rkey) { - if (!rmb_desc->wr_reg) { + if (!rmb_desc->is_reg_mr[link->link_idx]) { /* register memory region for new rmb */ if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->regerr = 1; + rmb_desc->is_reg_err = true; return -EFAULT; } - rmb_desc->wr_reg = 1; + rmb_desc->is_reg_mr[link->link_idx] = true; } if (!conf_rkey) return 0; + /* exchange confirm_rkey msg with peer */ - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->regerr = 1; - return -EFAULT; + if (!rmb_desc->is_conf_rkey) { + if (smc_llc_do_confirm_rkey(link, rmb_desc)) { + rmb_desc->is_reg_err = true; + return -EFAULT; + } + rmb_desc->is_conf_rkey = true; + } + return 0; +} + +/* register the new rmb on all links */ +static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, + struct smc_buf_desc *rmb_desc) +{ + int i, rc; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc, true); + if (rc) + return rc; } return 0; } -static int smc_clnt_conf_first_link(struct smc_sock *smc) +static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; @@ -387,7 +407,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK response over RoCE fabric */ @@ -632,7 +652,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, ini->cln_first_contact); } else { - if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(smc->conn.lgr, smc->conn.rmb_desc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, ini->cln_first_contact); } @@ -647,7 +667,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link(smc); + reason_code = smcr_clnt_conf_first_link(smc); if (reason_code) return smc_connect_abort(smc, reason_code, ini->cln_first_contact); @@ -997,14 +1017,14 @@ void smc_close_non_accepted(struct sock *sk) sock_put(sk); /* final sock_put */ } -static int smc_serv_conf_first_link(struct smc_sock *smc) +static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; int rest; int rc; - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -1189,10 +1209,10 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { - struct smc_link *link = new_smc->conn.lnk; + struct smc_connection *conn = &new_smc->conn; if (local_contact != SMC_FIRST_CONTACT) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(conn->lgr, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1222,7 +1242,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, goto decline; } /* QP confirmation over RoCE fabric */ - reason_code = smc_serv_conf_first_link(new_smc); + reason_code = smcr_serv_conf_first_link(new_smc); if (reason_code) goto decline; } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ca209272e5fa..4f2e150a2be1 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -44,6 +44,7 @@ #define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ +#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1d695093f205..5df3f8f41d19 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -116,7 +116,7 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn) * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ -static void smc_lgr_register_conn(struct smc_connection *conn) +static int smc_lgr_register_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); @@ -133,10 +133,22 @@ static void smc_lgr_register_conn(struct smc_connection *conn) smc_lgr_add_alert_token(conn); /* assign the new connection to a link */ - if (!conn->lgr->is_smcd) - conn->lnk = &conn->lgr->lnk[SMC_SINGLE_LINK]; + if (!conn->lgr->is_smcd) { + struct smc_link *lnk; + int i; + /* tbd - link balancing */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &conn->lgr->lnk[i]; + if (lnk->state == SMC_LNK_ACTIVATING || + lnk->state == SMC_LNK_ACTIVE) + conn->lnk = lnk; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + } conn->lgr->conns_num++; + return 0; } /* Unregister connection and reset the alert token of the given connection< @@ -202,8 +214,8 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group, free_work); spinlock_t *lgr_lock; - struct smc_link *lnk; bool conns; + int i; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); @@ -220,25 +232,38 @@ static void smc_lgr_free_work(struct work_struct *work) } list_del_init(&lgr->list); /* remove from smc_lgr_list */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && !lgr->terminating) { - /* try to send del link msg, on error free lgr immediately */ - if (lnk->state == SMC_LNK_ACTIVE && - !smcr_link_send_delete(lnk, true)) { - /* reschedule in case we never receive a response */ - smc_lgr_schedule_free_work(lgr); + bool do_wait = false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + /* try to send del link msg, on err free immediately */ + if (lnk->state == SMC_LNK_ACTIVE && + !smcr_link_send_delete(lnk, true)) { + /* reschedule in case we never receive a resp */ + smc_lgr_schedule_free_work(lgr); + do_wait = true; + } + } + if (do_wait) { spin_unlock_bh(lgr_lock); - return; + return; /* wait for resp, see smc_llc_rx_delete_link */ } } lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); + if (!lgr->is_smcd) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + } + } smc_lgr_free(lgr); } @@ -417,29 +442,37 @@ out: return rc; } +static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, + struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + + if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + /* unregister rmb with peer */ + smc_llc_do_delete_rkey(lnk, rmb_desc); + rmb_desc->is_conf_rkey = false; + } + if (rmb_desc->is_reg_err) { + /* buf registration failed, reuse not possible */ + write_lock_bh(&lgr->rmbs_lock); + list_del(&rmb_desc->list); + write_unlock_bh(&lgr->rmbs_lock); + + smc_buf_free(lgr, true, rmb_desc); + } else { + rmb_desc->used = 0; + } +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; - if (conn->rmb_desc) { - if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd && !list_empty(&lgr->list)) { - /* unregister rmb with peer */ - smc_llc_do_delete_rkey( - conn->lnk, - conn->rmb_desc); - } - conn->rmb_desc->used = 0; - } else { - /* buf registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); - list_del(&conn->rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); - - smc_buf_free(lgr, true, conn->rmb_desc); - } - } + if (conn->rmb_desc && lgr->is_smcd) + conn->rmb_desc->used = 0; + else if (conn->rmb_desc) + smcr_buf_unuse(conn->rmb_desc, conn->lnk); } /* remove a finished connection from its link group */ @@ -467,6 +500,8 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_link_clear(struct smc_link *lnk) { + if (lnk->peer_qpn == 0) + return; lnk->peer_qpn = 0; smc_llc_link_clear(lnk); smc_ib_modify_qp_reset(lnk); @@ -482,17 +517,23 @@ static void smcr_link_clear(struct smc_link *lnk) static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *lnk; + int i; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + if (!buf_desc->is_map_ib[lnk->link_idx]) + continue; + if (is_rmb) { + if (buf_desc->mr_rx[lnk->link_idx]) + smc_ib_put_memory_region( + buf_desc->mr_rx[lnk->link_idx]); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + } else { + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); } - sg_free_table(&buf_desc->sgt[lnk->link_idx]); if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); @@ -551,6 +592,8 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { + int i; + smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!lgr->terminating) { @@ -560,7 +603,11 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smcr_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_INACTIVE) + continue; + smcr_link_clear(&lgr->lnk[i]); + } if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } @@ -628,16 +675,20 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) static void smc_lgr_cleanup(struct smc_link_group *lgr) { + int i; + if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); smcd_unregister_all_dmbs(lgr); smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); } else { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); + if (lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + } } } @@ -650,6 +701,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; + int i; if (lgr->terminating) return; /* lgr already terminating */ @@ -657,7 +709,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; if (!lgr->is_smcd) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + smc_llc_link_inactive(&lgr->lnk[i]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); @@ -703,14 +756,22 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (!lgr->is_smcd && - lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + if (lgr->is_smcd) + continue; + /* tbd - terminate only when no more links are active */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_INACTIVE || + lgr->lnk[i].state == SMC_LNK_DELETING) + continue; + if (lgr->lnk[i].smcibdev == smcibdev && + lgr->lnk[i].ibport == ibport) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } } } spin_unlock_bh(&smc_lgr_list.lock); @@ -775,6 +836,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { @@ -783,9 +845,12 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + break; + } } } } @@ -857,15 +922,21 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, enum smc_lgr_role role, u32 clcqpn) { - return !memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && - lgr->role == role && - (lgr->role == SMC_SERV || - lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn); + int i; + + if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) || + lgr->role != role) + return false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && + !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) && + !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac))) + return true; + } + return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, @@ -906,15 +977,17 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) /* link group found */ ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; - smc_lgr_register_conn(conn); /* add smc conn to lgr */ - if (delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + rc = smc_lgr_register_conn(conn); /* add conn to lgr */ write_unlock_bh(&lgr->conns_lock); + if (!rc && delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); + if (rc) + return rc; if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -932,8 +1005,10 @@ create: goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); - smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_lgr_register_conn(conn); /* add smc conn to lgr */ write_unlock_bh(&lgr->conns_lock); + if (rc) + goto out; } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; @@ -1006,12 +1081,55 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } +/* map an rmb buf to a link */ +static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + int rc; + + if (buf_desc->is_map_ib[lnk->link_idx]) + return 0; + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (rc) + return rc; + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + + /* map sg table to DMA address */ + rc = smc_ib_buf_map_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + /* SMC protocol depends on mapping to one DMA address only */ + if (rc != 1) { + rc = -EAGAIN; + goto free_table; + } + + /* create a new memory region for the RMB */ + if (is_rmb) { + rc = smc_ib_get_memory_region(lnk->roce_pd, + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE, + buf_desc, lnk->link_idx); + if (rc) + goto buf_unmap; + smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE); + } + buf_desc->is_map_ib[lnk->link_idx] = true; + return 0; + +buf_unmap: + smc_ib_buf_unmap_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); +free_table: + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + return rc; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; - struct smc_link *lnk; - int rc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); @@ -1028,40 +1146,32 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, return ERR_PTR(-EAGAIN); } buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + return buf_desc; +} - /* build the sg table from the pages */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); - } - sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, - buf_desc->cpu_addr, bufsize); +/* map buf_desc on all usable links, + * unused buffers stay mapped as long as the link is up + */ +static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + struct smc_buf_desc *buf_desc, bool is_rmb) +{ + int i, rc = 0; - /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk, buf_desc, - is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(-EAGAIN); - } + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - buf_desc, lnk->link_idx); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); + if (lnk->state != SMC_LNK_ACTIVE && + lnk->state != SMC_LNK_ACTIVATING) + continue; + if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { + smcr_buf_unuse(buf_desc, lnk); + rc = -ENOMEM; + goto out; } } - - buf_desc->len = bufsize; - return buf_desc; +out: + return rc; } #define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ @@ -1159,6 +1269,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (IS_ERR(buf_desc)) return -ENOMEM; + if (!is_smcd) { + if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + return -ENOMEM; + } + } + if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; @@ -1192,22 +1308,32 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(&lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && + conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + continue; + smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(&lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && + conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + continue; + smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } /* create the send and receive buffer for an SMC socket; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c71c35a3596c..66753ba23bc6 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -152,25 +152,32 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 wr_reg : 1; /* mem region registered */ - u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; - /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. rkey provided to peer + */ + u32 order; /* allocation order */ + + u8 is_conf_rkey; + /* confirm_rkey done */ + u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; + /* mem region registered */ + u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; + /* mem region mapped to lnk */ + u8 is_reg_err; + /* buffer registration err */ }; struct { /* SMC-D */ - unsigned short sba_idx; - /* SBA index number */ - u64 token; - /* DMB token number */ - dma_addr_t dma_addr; - /* DMA address */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ }; }; }; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 34d0752ba6af..903ae068da3a 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -662,6 +662,8 @@ void smc_llc_link_deleting(struct smc_link *link) /* called in tasklet context */ void smc_llc_link_inactive(struct smc_link *link) { + if (link->state == SMC_LNK_INACTIVE) + return; link->state = SMC_LNK_INACTIVE; cancel_delayed_work(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); -- cgit v1.2.3-59-g8ed1b From e07d31dc16b0d77ff6b3f71cafe3a825fb80bed4 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:42 +0200 Subject: net/smc: multi-link support for smc_rmb_rtoken_handling() Extend smc_rmb_rtoken_handling() and smc_rtoken_delete() to support multiple links. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ++-- net/smc/smc_core.c | 14 ++++++++------ net/smc/smc_core.h | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 890dc6422f8c..e39f6aedd3bd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -640,7 +640,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); - if (smc_rmb_rtoken_handling(&smc->conn, aclc)) + if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, ini->cln_first_contact); @@ -1231,7 +1231,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, cclc); - if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; goto decline; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5df3f8f41d19..e8897d60b27f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1392,19 +1392,20 @@ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) return i; } -/* delete an rtoken */ +/* delete an rtoken from all links */ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); - int i; + int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][lnk->link_idx].rkey = 0; - lgr->rtokens[i][lnk->link_idx].dma_addr = 0; - + for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { + lgr->rtokens[i][j].rkey = 0; + lgr->rtokens[i][j].dma_addr = 0; + } clear_bit(i, lgr->rtokens_used_mask); return 0; } @@ -1414,9 +1415,10 @@ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx = smc_rtoken_add(conn->lnk, clc->rmb_dma_addr, + conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 66753ba23bc6..f68ba187ecf8 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -309,7 +309,7 @@ void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); -int smc_rmb_rtoken_handling(struct smc_connection *conn, +int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); -- cgit v1.2.3-59-g8ed1b From d854fcbfaeda9748c85de296fbe07b7763a1939c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:43 +0200 Subject: net/smc: add new link state and related helpers Before a link can be reused it must have been cleared. Lowest current link state is INACTIVE, which does not mean that the link is already cleared. Add a new state UNUSED that is set when the link is cleared and can be reused. Add helper smc_llc_usable_link() to find an active link in a link group, and smc_link_usable() to determine if a link is usable. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 36 +++++++++++++++++++----------------- net/smc/smc_core.h | 9 +++++++++ net/smc/smc_llc.c | 4 ++-- net/smc/smc_llc.h | 11 +++++++++++ net/smc/smc_wr.c | 2 +- 5 files changed, 42 insertions(+), 20 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e8897d60b27f..57890cbd4e8a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -260,7 +260,7 @@ static void smc_lgr_free_work(struct work_struct *work) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) + if (smc_link_usable(lnk)) smc_llc_link_inactive(lnk); } } @@ -286,7 +286,7 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr) if (!link_id) /* skip zero as link_id */ link_id = ++lgr->next_link_id; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state != SMC_LNK_INACTIVE && + if (smc_link_usable(&lgr->lnk[i]) && lgr->lnk[i].link_id == link_id) continue; } @@ -350,6 +350,7 @@ clear_llc_lnk: out: put_device(&ini->ib_dev->ibdev->dev); memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) wake_up(&ini->ib_dev->lnks_deleted); return rc; @@ -500,6 +501,8 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_link_clear(struct smc_link *lnk) { + struct smc_ib_device *smcibdev; + if (lnk->peer_qpn == 0) return; lnk->peer_qpn = 0; @@ -510,8 +513,11 @@ static void smcr_link_clear(struct smc_link *lnk) smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); put_device(&lnk->smcibdev->ibdev->dev); - if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) - wake_up(&lnk->smcibdev->lnks_deleted); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -604,9 +610,8 @@ static void smc_lgr_free(struct smc_link_group *lgr) wake_up(&lgr->smcd->lgrs_deleted); } else { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_INACTIVE) - continue; - smcr_link_clear(&lgr->lnk[i]); + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i]); } if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); @@ -686,7 +691,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) + if (smc_link_usable(lnk)) smc_llc_link_inactive(lnk); } } @@ -764,7 +769,7 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) continue; /* tbd - terminate only when no more links are active */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_INACTIVE || + if (!smc_link_usable(&lgr->lnk[i]) || lgr->lnk[i].state == SMC_LNK_DELETING) continue; if (lgr->lnk[i].smcibdev == smcibdev && @@ -1161,8 +1166,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_ACTIVE && - lnk->state != SMC_LNK_ACTIVATING) + if (!smc_link_usable(lnk)) continue; if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { smcr_buf_unuse(buf_desc, lnk); @@ -1294,14 +1298,14 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -1313,8 +1317,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && - conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + if (!smc_link_usable(&conn->lgr->lnk[i])) continue; smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, DMA_FROM_DEVICE); @@ -1328,8 +1331,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && - conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + if (!smc_link_usable(&conn->lgr->lnk[i])) continue; smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, DMA_FROM_DEVICE); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f68ba187ecf8..2b1960c8c8ce 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -32,6 +32,7 @@ enum smc_lgr_role { /* possible roles of a link group */ }; enum smc_link_state { /* possible states of a link */ + SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ @@ -295,6 +296,14 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +/* returns true if the specified link is usable */ +static inline bool smc_link_usable(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) + return false; + return true; +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 903ae068da3a..c267f5006faa 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -372,7 +372,7 @@ static void smc_llc_send_message_work(struct work_struct *work) struct smc_wr_buf *wr_buf; int rc; - if (llcwrk->link->state == SMC_LNK_INACTIVE) + if (!smc_link_usable(llcwrk->link)) goto out; rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); if (rc) @@ -562,7 +562,7 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) return; /* short message */ if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ - if (link->state == SMC_LNK_INACTIVE) + if (!smc_link_usable(link)) return; /* link not active, drop msg */ switch (llc->raw.hdr.common.type) { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 461c0c3ef76e..08171131110c 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -35,6 +35,17 @@ enum smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +/* returns a usable link of the link group, or NULL */ +static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + return &lgr->lnk[i]; + return NULL; +} + /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 337ee52ad3d3..93223628c002 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -207,7 +207,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { rc = wait_event_interruptible_timeout( link->wr_tx_wait, - link->state == SMC_LNK_INACTIVE || + !smc_link_usable(link) || lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); -- cgit v1.2.3-59-g8ed1b From 1020e1ef53ceef715f2bc144eebbfe01e88effcf Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:44 +0200 Subject: net/smc: move testlink work to system work queue The testlink work waits for a response to the testlink request and blocks the single threaded llc_wq. This type of work does not have to be serialized and can be moved to the system work queue. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index c267f5006faa..69cc0d65b437 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -613,14 +613,15 @@ static void smc_llc_testlink_work(struct work_struct *work) /* receive TEST LINK response over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); + if (link->state != SMC_LNK_ACTIVE) + return; /* link state changed */ if (rc <= 0) { smc_lgr_terminate_sched(smc_get_lgr(link)); return; } next_interval = link->llc_testlink_time; out: - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - next_interval); + schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } int smc_llc_link_init(struct smc_link *link) @@ -648,8 +649,8 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time) link->state = SMC_LNK_ACTIVE; if (testlink_time) { link->llc_testlink_time = testlink_time * HZ; - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - link->llc_testlink_time); + schedule_delayed_work(&link->llc_testlink_wrk, + link->llc_testlink_time); } } @@ -665,7 +666,7 @@ void smc_llc_link_inactive(struct smc_link *link) if (link->state == SMC_LNK_INACTIVE) return; link->state = SMC_LNK_INACTIVE; - cancel_delayed_work(&link->llc_testlink_wrk); + cancel_delayed_work_sync(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); smc_wr_wakeup_tx_wait(link); } -- cgit v1.2.3-59-g8ed1b From 2140ac26f8f501d3cc8f1575e6419f1a50779496 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:45 +0200 Subject: net/smc: simplify link deactivation Cancel the testlink worker during link clear processing and remove the extra function smc_llc_link_inactive(). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 8 ++------ net/smc/smc_llc.c | 15 ++++----------- net/smc/smc_llc.h | 1 - 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 57890cbd4e8a..78ccfbf6e4af 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -261,7 +261,7 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk)) - smc_llc_link_inactive(lnk); + lnk->state = SMC_LNK_INACTIVE; } } smc_lgr_free(lgr); @@ -692,7 +692,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk)) - smc_llc_link_inactive(lnk); + lnk->state = SMC_LNK_INACTIVE; } } } @@ -706,16 +706,12 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; - int i; if (lgr->terminating) return; /* lgr already terminating */ if (!soft) cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!lgr->is_smcd) - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) - smc_llc_link_inactive(&lgr->lnk[i]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 69cc0d65b437..2f03131c85fd 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -660,22 +660,15 @@ void smc_llc_link_deleting(struct smc_link *link) smc_wr_wakeup_tx_wait(link); } -/* called in tasklet context */ -void smc_llc_link_inactive(struct smc_link *link) -{ - if (link->state == SMC_LNK_INACTIVE) - return; - link->state = SMC_LNK_INACTIVE; - cancel_delayed_work_sync(&link->llc_testlink_wrk); - smc_wr_wakeup_reg_wait(link); - smc_wr_wakeup_tx_wait(link); -} - /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { flush_workqueue(link->llc_wq); destroy_workqueue(link->llc_wq); + complete(&link->llc_testlink_resp); + cancel_delayed_work_sync(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } /* register a new rtoken at the remote peer */ diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 08171131110c..c2c9d48d079f 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -56,7 +56,6 @@ int smc_llc_send_delete_link(struct smc_link *link, int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link, int testlink_time); void smc_llc_link_deleting(struct smc_link *link); -void smc_llc_link_inactive(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); -- cgit v1.2.3-59-g8ed1b From 6c8968c421e0e6bea8a78ee4fdd043d850cd5b26 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:46 +0200 Subject: net/smc: use worker to process incoming llc messages Incoming llc messages are processed in irq tasklet context, and a worker is used to send outgoing messages. The worker is needed because getting a send buffer could result in a wait for a free buffer. To make sure all incoming llc messages are processed in a serialized way introduce an event queue and create a new queue entry for each message which is queued to this event queue. A new worker processes the event queue entries in order. And remove the use of a separate worker to send outgoing llc messages because the messages are processed in worker context already. With this event queue the serialized llc_wq work queue is obsolete, remove it. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 4 +- net/smc/smc_core.h | 7 ++- net/smc/smc_llc.c | 142 ++++++++++++++++++++++++++++++++--------------------- net/smc/smc_llc.h | 1 + 4 files changed, 96 insertions(+), 58 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 78ccfbf6e4af..a1463da14614 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -412,7 +412,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); - + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); @@ -613,6 +614,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i]); } + smc_llc_event_flush(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 2b1960c8c8ce..6548e9a06f73 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -120,7 +120,6 @@ struct smc_link { struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ - struct workqueue_struct *llc_wq; /* single thread work queue */ struct completion llc_confirm; /* wait for rx of conf link */ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ @@ -233,6 +232,12 @@ struct smc_link_group { DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; + struct list_head llc_event_q; + /* queue for llc events */ + spinlock_t llc_event_q_lock; + /* protects llc_event_q */ + struct work_struct llc_event_work; + /* llc event worker */ }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 2f03131c85fd..be74876a36ae 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -134,6 +134,12 @@ union smc_llc_msg { #define SMC_LLC_FLAG_RESP 0x80 +struct smc_llc_qentry { + struct list_head list; + struct smc_link *link; + union smc_llc_msg msg; +}; + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -356,46 +362,20 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } -struct smc_llc_send_work { - struct work_struct work; - struct smc_link *link; - int llclen; - union smc_llc_msg llcbuf; -}; - -/* worker that sends a prepared message */ -static void smc_llc_send_message_work(struct work_struct *work) +/* schedule an llc send on link, may wait for buffers */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *llcwrk = container_of(work, - struct smc_llc_send_work, work); struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(llcwrk->link)) - goto out; - rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - goto out; - memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen); - smc_wr_tx_send(llcwrk->link, pend); -out: - kfree(llcwrk); -} - -/* copy llcbuf and schedule an llc send on link */ -static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) -{ - struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); - - if (!wrk) - return -ENOMEM; - INIT_WORK(&wrk->work, smc_llc_send_message_work); - wrk->link = link; - wrk->llclen = llclen; - memcpy(&wrk->llcbuf, llcbuf, llclen); - queue_work(link->llc_wq, &wrk->work); - return 0; + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send(link, pend); } /********************************* receive ***********************************/ @@ -452,7 +432,7 @@ static void smc_llc_rx_add_link(struct smc_link *link, link->smcibdev->mac[link->ibport - 1], link->gid, SMC_LLC_RESP); } - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -474,7 +454,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, /* server requests to delete this link, send response */ smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); smc_lgr_terminate_sched(lgr); } } @@ -487,7 +467,7 @@ static void smc_llc_rx_test_link(struct smc_link *link, complete(&link->llc_testlink_resp); } else { llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -510,7 +490,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, llc->hd.flags |= SMC_LLC_FLAG_RESP; if (rc < 0) llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -522,7 +502,7 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, } else { /* ignore rtokens for other links, we have only one link */ llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -549,21 +529,30 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +/* flush the llc event queue */ +void smc_llc_event_flush(struct smc_link_group *lgr) { - struct smc_link *link = (struct smc_link *)wc->qp->qp_context; - union smc_llc_msg *llc = buf; + struct smc_llc_qentry *qentry, *q; + + spin_lock_bh(&lgr->llc_event_q_lock); + list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) { + list_del_init(&qentry->list); + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +static void smc_llc_event_handler(struct smc_llc_qentry *qentry) +{ + union smc_llc_msg *llc = &qentry->msg; + struct smc_link *link = qentry->link; - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ if (!smc_link_usable(link)) - return; /* link not active, drop msg */ + goto out; switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: @@ -588,6 +577,54 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) smc_llc_rx_delete_rkey(link, &llc->delete_rkey); break; } +out: + kfree(qentry); +} + +/* worker to process llc messages on the event queue */ +static void smc_llc_event_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_event_work); + struct smc_llc_qentry *qentry; + +again: + spin_lock_bh(&lgr->llc_event_q_lock); + if (!list_empty(&lgr->llc_event_q)) { + qentry = list_first_entry(&lgr->llc_event_q, + struct smc_llc_qentry, list); + list_del_init(&qentry->list); + spin_unlock_bh(&lgr->llc_event_q_lock); + smc_llc_event_handler(qentry); + goto again; + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry; + union smc_llc_msg *llc = buf; + unsigned long flags; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + if (!qentry) + return; + qentry->link = link; + INIT_LIST_HEAD(&qentry->list); + memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + spin_lock_irqsave(&lgr->llc_event_q_lock, flags); + list_add_tail(&qentry->list, &lgr->llc_event_q); + spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); + schedule_work(&link->lgr->llc_event_work); } /***************************** worker, utils *********************************/ @@ -626,12 +663,6 @@ out: int smc_llc_link_init(struct smc_link *link) { - struct smc_link_group *lgr = smc_get_lgr(link); - link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, - *((u32 *)lgr->id), - link->link_id); - if (!link->llc_wq) - return -ENOMEM; init_completion(&link->llc_confirm); init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); @@ -640,6 +671,7 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_delete_rkey); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); + INIT_WORK(&link->lgr->llc_event_work, smc_llc_event_work); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; } @@ -663,8 +695,6 @@ void smc_llc_link_deleting(struct smc_link *link) /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { - flush_workqueue(link->llc_wq); - destroy_workqueue(link->llc_wq); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index c2c9d48d079f..9de83495ad14 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -61,6 +61,7 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); +void smc_llc_event_flush(struct smc_link_group *lgr); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3-59-g8ed1b From ef79d439cd124d9fb7258bb35d44c71aec11b829 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:47 +0200 Subject: net/smc: process llc responses in tasklet context When llc responses are received then possible waiters for this response are to be notified. This can be done in tasklet context, without to use a work in the llc work queue. Move all code that handles llc responses into smc_llc_rx_response(). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 8 +- net/smc/smc_llc.c | 216 +++++++++++++++++++++++++++-------------------------- 2 files changed, 116 insertions(+), 108 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6548e9a06f73..d785656b3489 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -129,10 +129,10 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ - int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ - struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ - int llc_delete_rkey_rc; /* rc from del rkey msg */ + struct completion llc_confirm_rkey_resp; /* w4 rx of cnf rkey */ + int llc_confirm_rkey_resp_rc; /* rc from cnf rkey */ + struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ + int llc_delete_rkey_resp_rc; /* rc from del rkey */ struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index be74876a36ae..265889c8b03b 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -384,27 +384,17 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, struct smc_llc_msg_confirm_link *llc) { struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc; + int conf_rc = 0; /* RMBE eyecatchers are not supported */ - if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) - conf_rc = 0; - else + if (!(llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) conf_rc = ENOTSUPP; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = conf_rc; - complete(&link->llc_confirm_resp); - } - } else { - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); - } + if (lgr->role == SMC_CLNT && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_rc = conf_rc; + link->link_id = llc->link_num; + complete(&link->llc_confirm); } } @@ -413,27 +403,22 @@ static void smc_llc_rx_add_link(struct smc_link *link, { struct smc_link_group *lgr = smc_get_lgr(link); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); - } else { - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; - } + if (link->state == SMC_LNK_ACTIVATING) { + complete(&link->llc_add); + return; + } - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); + if (lgr->role == SMC_SERV) { + smc_llc_prep_add_link(llc, link, + link->smcibdev->mac[link->ibport - 1], + link->gid, SMC_LLC_REQ); - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - } - smc_llc_send_message(link, llc); + } else { + smc_llc_prep_add_link(llc, link, + link->smcibdev->mac[link->ibport - 1], + link->gid, SMC_LLC_RESP); } + smc_llc_send_message(link, llc); } static void smc_llc_rx_delete_link(struct smc_link *link, @@ -441,34 +426,24 @@ static void smc_llc_rx_delete_link(struct smc_link *link, { struct smc_link_group *lgr = smc_get_lgr(link); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_forget(lgr); + smc_llc_link_deleting(link); + if (lgr->role == SMC_SERV) { + /* client asks to delete this link, send request */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); } else { - smc_lgr_forget(lgr); - smc_llc_link_deleting(link); - if (lgr->role == SMC_SERV) { - /* client asks to delete this link, send request */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); - } else { - /* server requests to delete this link, send response */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); - } - smc_llc_send_message(link, llc); - smc_lgr_terminate_sched(lgr); + /* server requests to delete this link, send response */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } + smc_llc_send_message(link, llc); + smc_lgr_terminate_sched(lgr); } static void smc_llc_rx_test_link(struct smc_link *link, struct smc_llc_msg_test_link *llc) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVE) - complete(&link->llc_testlink_resp); - } else { - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); - } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } static void smc_llc_rx_confirm_rkey(struct smc_link *link, @@ -476,34 +451,24 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, { int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_confirm_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey); - } else { - rc = smc_rtoken_add(link, - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); + rc = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); - /* ignore rtokens for other links, we have only one link */ + /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc); - } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (rc < 0) + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + smc_llc_send_message(link, llc); } static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, struct smc_llc_msg_confirm_rkey_cont *llc) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ - } else { - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); - } + /* ignore rtokens for other links, we have only one link */ + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } static void smc_llc_rx_delete_rkey(struct smc_link *link, @@ -512,25 +477,19 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, u8 err_mask = 0; int i, max; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_delete_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey); - } else { - max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); - for (i = 0; i < max; i++) { - if (smc_rtoken_delete(link, llc->rkey[i])) - err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); - } - - if (err_mask) { - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - llc->err_mask = err_mask; - } + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + } - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; } + + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } /* flush the llc event queue */ @@ -601,6 +560,49 @@ again: spin_unlock_bh(&lgr->llc_event_q_lock); } +/* process llc responses in tasklet context */ +static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) +{ + int rc = 0; + + switch (llc->raw.hdr.common.type) { + case SMC_LLC_TEST_LINK: + if (link->state == SMC_LNK_ACTIVE) + complete(&link->llc_testlink_resp); + break; + case SMC_LLC_CONFIRM_LINK: + if (!(llc->raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + rc = ENOTSUPP; + if (link->lgr->role == SMC_SERV && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_resp_rc = rc; + complete(&link->llc_confirm_resp); + } + break; + case SMC_LLC_ADD_LINK: + if (link->state == SMC_LNK_ACTIVATING) + complete(&link->llc_add_resp); + break; + case SMC_LLC_DELETE_LINK: + if (link->lgr->role == SMC_SERV) + smc_lgr_schedule_free_work_fast(link->lgr); + break; + case SMC_LLC_CONFIRM_RKEY: + link->llc_confirm_rkey_resp_rc = llc->raw.hdr.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_confirm_rkey_resp); + break; + case SMC_LLC_CONFIRM_RKEY_CONT: + /* unused as long as we don't send this type of msg */ + break; + case SMC_LLC_DELETE_RKEY: + link->llc_delete_rkey_resp_rc = llc->raw.hdr.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_delete_rkey_resp); + break; + } +} + /* copy received msg and add it to the event queue */ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) { @@ -615,6 +617,12 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ + /* process responses immediately */ + if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + smc_llc_rx_response(link, llc); + return; + } + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); if (!qentry) return; @@ -667,8 +675,8 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); init_completion(&link->llc_add_resp); - init_completion(&link->llc_confirm_rkey); - init_completion(&link->llc_delete_rkey); + init_completion(&link->llc_confirm_rkey_resp); + init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_WORK(&link->lgr->llc_event_work, smc_llc_event_work); @@ -708,14 +716,14 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, int rc; /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey); + reinit_completion(&link->llc_confirm_rkey_resp); rc = smc_llc_send_confirm_rkey(link, rmb_desc); if (rc) return rc; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_rc) + rc = wait_for_completion_interruptible_timeout( + &link->llc_confirm_rkey_resp, SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_confirm_rkey_resp_rc) return -EFAULT; return 0; } @@ -729,14 +737,14 @@ int smc_llc_do_delete_rkey(struct smc_link *link, mutex_lock(&link->llc_delete_rkey_mutex); if (link->state != SMC_LNK_ACTIVE) goto out; - reinit_completion(&link->llc_delete_rkey); + reinit_completion(&link->llc_delete_rkey_resp); rc = smc_llc_send_delete_rkey(link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_rc) + rc = wait_for_completion_interruptible_timeout( + &link->llc_delete_rkey_resp, SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_delete_rkey_resp_rc) rc = -EFAULT; else rc = 0; -- cgit v1.2.3-59-g8ed1b From faca536008375bece23783e7382b5d0356c13ba5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:48 +0200 Subject: net/smc: use mutex instead of rwlock_t to protect buffers The locks for sndbufs and rmbs are never used from atomic context. Using a mutex for these locks will allow to nest locks with other mutexes. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 22 +++++++++++----------- net/smc/smc_core.h | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a1463da14614..8a43d2948493 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -385,8 +385,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->freefast = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; - rwlock_init(&lgr->sndbufs_lock); - rwlock_init(&lgr->rmbs_lock); + mutex_init(&lgr->sndbufs_lock); + mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); @@ -456,9 +456,9 @@ static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, } if (rmb_desc->is_reg_err) { /* buf registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); + mutex_lock(&lgr->rmbs_lock); list_del(&rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); + mutex_unlock(&lgr->rmbs_lock); smc_buf_free(lgr, true, rmb_desc); } else { @@ -1059,19 +1059,19 @@ int smc_uncompress_bufsize(u8 compressed) * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - rwlock_t *lock, + struct mutex *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - read_lock_bh(lock); + mutex_lock(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - read_unlock_bh(lock); + mutex_unlock(lock); return buf_slot; } } - read_unlock_bh(lock); + mutex_unlock(lock); return NULL; } @@ -1220,8 +1220,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + struct mutex *lock; /* lock buffer list */ int sk_buf_size; - rwlock_t *lock; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ @@ -1262,9 +1262,9 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) continue; buf_desc->used = 1; - write_lock_bh(lock); + mutex_lock(lock); list_add(&buf_desc->list, buf_list); - write_unlock_bh(lock); + mutex_unlock(lock); break; /* found */ } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index d785656b3489..379ced490c49 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -205,9 +205,9 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - rwlock_t sndbufs_lock; /* protects tx buffers */ + struct mutex sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - rwlock_t rmbs_lock; /* protects rx buffers */ + struct mutex rmbs_lock; /* protects rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ -- cgit v1.2.3-59-g8ed1b From 00a049cfde95931c6832edad19d9a4be441cacf5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:49 +0200 Subject: net/smc: move llc layer related init and clear into smc_llc.c Introduce smc_llc_lgr_init() and smc_llc_lgr_clear() to implement all llc layer specific initialization and cleanup in module smc_llc.c. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++---- net/smc/smc_core.c | 6 +++--- net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 26 +++++++++++++++++++++----- net/smc/smc_llc.h | 5 +++-- 5 files changed, 31 insertions(+), 14 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e39f6aedd3bd..e859e3f420d9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -381,7 +381,6 @@ static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; int rest; int rc; @@ -433,7 +432,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) if (rc < 0) return SMC_CLC_DECL_TIMEOUT_AL; - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); return 0; } @@ -1019,7 +1018,6 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; int rest; int rc; @@ -1065,7 +1063,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; } - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8a43d2948493..db49f8cd5c95 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -412,8 +412,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); - INIT_LIST_HEAD(&lgr->llc_event_q); - spin_lock_init(&lgr->llc_event_q_lock); + smc_llc_lgr_init(lgr, smc); + link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); @@ -614,7 +614,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i]); } - smc_llc_event_flush(lgr); + smc_llc_lgr_clear(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 379ced490c49..b5781511063d 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -238,6 +238,8 @@ struct smc_link_group { /* protects llc_event_q */ struct work_struct llc_event_work; /* llc event worker */ + int llc_testlink_time; + /* link keep alive time */ }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 265889c8b03b..e715dd6735ee 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -493,7 +493,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } /* flush the llc event queue */ -void smc_llc_event_flush(struct smc_link_group *lgr) +static void smc_llc_event_flush(struct smc_link_group *lgr) { struct smc_llc_qentry *qentry, *q; @@ -669,6 +669,23 @@ out: schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) +{ + struct net *net = sock_net(smc->clcsock->sk); + + INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); + lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; +} + +/* called after lgr was removed from lgr_list */ +void smc_llc_lgr_clear(struct smc_link_group *lgr) +{ + smc_llc_event_flush(lgr); + cancel_work_sync(&lgr->llc_event_work); +} + int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_confirm); @@ -679,16 +696,15 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); - INIT_WORK(&link->lgr->llc_event_work, smc_llc_event_work); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; } -void smc_llc_link_active(struct smc_link *link, int testlink_time) +void smc_llc_link_active(struct smc_link *link) { link->state = SMC_LNK_ACTIVE; - if (testlink_time) { - link->llc_testlink_time = testlink_time * HZ; + if (link->lgr->llc_testlink_time) { + link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; schedule_delayed_work(&link->llc_testlink_wrk, link->llc_testlink_time); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 9de83495ad14..66063f22166b 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -53,15 +53,16 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp); int smc_llc_send_delete_link(struct smc_link *link, enum smc_llc_reqresp reqresp, bool orderly); +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); +void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); -void smc_llc_link_active(struct smc_link *link, int testlink_time); +void smc_llc_link_active(struct smc_link *link); void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); -void smc_llc_event_flush(struct smc_link_group *lgr); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3-59-g8ed1b From e3450b79dfe47632ffa65042c6d5a6b48263da4e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Apr 2020 15:45:04 +0100 Subject: tools: bpftool: For "feature probe" define "full_mode" bool as global The "full_mode" variable used for switching between full or partial feature probing (i.e. with or without probing helpers that will log warnings in kernel logs) was piped from the main do_probe() function down to probe_helpers_for_progtype(), where it is needed. Define it as a global variable: the calls will be more readable, and if other similar flags were to be used in the future, we could use global variables as well instead of extending again the list of arguments with new flags. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429144506.8999-2-quentin@isovalent.com --- tools/bpf/bpftool/feature.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 88718ee6a438..59e4cb44efbc 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -35,6 +35,8 @@ static const char * const helper_name[] = { #undef BPF_HELPER_MAKE_ENTRY +static bool full_mode; + /* Miscellaneous utility functions */ static bool check_procfs(void) @@ -540,8 +542,7 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, static void probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, - const char *define_prefix, bool full_mode, - __u32 ifindex) + const char *define_prefix, __u32 ifindex) { const char *ptype_name = prog_type_name[prog_type]; char feat_name[128]; @@ -678,8 +679,7 @@ static void section_map_types(const char *define_prefix, __u32 ifindex) } static void -section_helpers(bool *supported_types, const char *define_prefix, - bool full_mode, __u32 ifindex) +section_helpers(bool *supported_types, const char *define_prefix, __u32 ifindex) { unsigned int i; @@ -704,8 +704,8 @@ section_helpers(bool *supported_types, const char *define_prefix, define_prefix, define_prefix, define_prefix, define_prefix); for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) - probe_helpers_for_progtype(i, supported_types[i], - define_prefix, full_mode, ifindex); + probe_helpers_for_progtype(i, supported_types[i], define_prefix, + ifindex); print_end_section(); } @@ -725,7 +725,6 @@ static int do_probe(int argc, char **argv) enum probe_component target = COMPONENT_UNSPEC; const char *define_prefix = NULL; bool supported_types[128] = {}; - bool full_mode = false; __u32 ifindex = 0; char *ifname; @@ -803,7 +802,7 @@ static int do_probe(int argc, char **argv) goto exit_close_json; section_program_types(supported_types, define_prefix, ifindex); section_map_types(define_prefix, ifindex); - section_helpers(supported_types, define_prefix, full_mode, ifindex); + section_helpers(supported_types, define_prefix, ifindex); section_misc(define_prefix, ifindex); exit_close_json: -- cgit v1.2.3-59-g8ed1b From cf9bf714523dbbc97953be6de6ca14d57d4f8a21 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Apr 2020 15:45:05 +0100 Subject: tools: bpftool: Allow unprivileged users to probe features There is demand for a way to identify what BPF helper functions are available to unprivileged users. To do so, allow unprivileged users to run "bpftool feature probe" to list BPF-related features. This will only show features accessible to those users, and may not reflect the full list of features available (to administrators) on the system. To avoid the case where bpftool is inadvertently run as non-root and would list only a subset of the features supported by the system when it would be expected to list all of them, running as unprivileged is gated behind the "unprivileged" keyword passed to the command line. When used by a privileged user, this keyword allows to drop the CAP_SYS_ADMIN and to list the features available to unprivileged users. Note that this addsd a dependency on libpcap for compiling bpftool. Note that there is no particular reason why the probes were restricted to root, other than the fact I did not need them for unprivileged and did not bother with the additional checks at the time probes were added. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429144506.8999-3-quentin@isovalent.com --- .../bpf/bpftool/Documentation/bpftool-feature.rst | 10 +- tools/bpf/bpftool/Makefile | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 2 +- tools/bpf/bpftool/feature.c | 102 ++++++++++++++++++--- 4 files changed, 100 insertions(+), 16 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index b04156cfd7a3..ca085944e4cf 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -19,7 +19,7 @@ SYNOPSIS FEATURE COMMANDS ================ -| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] | **bpftool** **feature help** | | *COMPONENT* := { **kernel** | **dev** *NAME* } @@ -49,6 +49,14 @@ DESCRIPTION Keyword **kernel** can be omitted. If no probe target is specified, probing the kernel is the default behaviour. + When the **unprivileged** keyword is used, bpftool will dump + only the features available to a user who does not have the + **CAP_SYS_ADMIN** capability set. The features available in + that case usually represent a small subset of the parameters + supported by the system. Unprivileged users MUST use the + **unprivileged** keyword: This is to avoid misdetection if + bpftool is inadvertently run as non-root, for example. + **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] Probe network device for supported eBPF features and dump results to the console. diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index f584d1fdfc64..89d7962a4a44 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -55,7 +55,7 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif -LIBS = $(LIBBPF) -lelf -lz +LIBS = $(LIBBPF) -lelf -lz -lcap INSTALL ?= install RM ?= rm -f diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index c033c3329f73..fc989ead7313 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1079,7 +1079,7 @@ _bpftool() COMPREPLY+=( $( compgen -W 'macros' -- "$cur" ) ) fi _bpftool_one_of_list 'kernel dev' - _bpftool_once_attr 'full' + _bpftool_once_attr 'full unprivileged' return 0 ;; *) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 59e4cb44efbc..952f4b1987c0 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -36,6 +37,7 @@ static const char * const helper_name[] = { #undef BPF_HELPER_MAKE_ENTRY static bool full_mode; +static bool run_as_unprivileged; /* Miscellaneous utility functions */ @@ -473,6 +475,11 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, } res = bpf_probe_prog_type(prog_type, ifindex); + /* Probe may succeed even if program load fails, for unprivileged users + * check that we did not fail because of insufficient permissions + */ + if (run_as_unprivileged && errno == EPERM) + res = false; supported_types[prog_type] |= res; @@ -501,6 +508,10 @@ probe_map_type(enum bpf_map_type map_type, const char *define_prefix, res = bpf_probe_map_type(map_type, ifindex); + /* Probe result depends on the success of map creation, no additional + * check required for unprivileged users + */ + maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; if (strlen(map_type_name[map_type]) > maxlen) { p_info("map type name too long"); @@ -520,12 +531,17 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, const char *define_prefix, unsigned int id, const char *ptype_name, __u32 ifindex) { - bool res; + bool res = false; - if (!supported_type) - res = false; - else + if (supported_type) { res = bpf_probe_helper(id, prog_type, ifindex); + /* Probe may succeed even if program load fails, for + * unprivileged users check that we did not fail because of + * insufficient permissions + */ + if (run_as_unprivileged && errno == EPERM) + res = false; + } if (json_output) { if (res) @@ -720,6 +736,65 @@ static void section_misc(const char *define_prefix, __u32 ifindex) print_end_section(); } +static int handle_perms(void) +{ + cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; + bool has_sys_admin_cap = false; + cap_flag_value_t val; + int res = -1; + cap_t caps; + + caps = cap_get_proc(); + if (!caps) { + p_err("failed to get capabilities for process: %s", + strerror(errno)); + return -1; + } + + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &val)) { + p_err("bug: failed to retrieve CAP_SYS_ADMIN status"); + goto exit_free; + } + if (val == CAP_SET) + has_sys_admin_cap = true; + + if (!run_as_unprivileged && !has_sys_admin_cap) { + p_err("full feature probing requires CAP_SYS_ADMIN, run as root or use 'unprivileged'"); + goto exit_free; + } + + if ((run_as_unprivileged && !has_sys_admin_cap) || + (!run_as_unprivileged && has_sys_admin_cap)) { + /* We are all good, exit now */ + res = 0; + goto exit_free; + } + + /* if (run_as_unprivileged && has_sys_admin_cap), drop CAP_SYS_ADMIN */ + + if (cap_set_flag(caps, CAP_EFFECTIVE, ARRAY_SIZE(cap_list), cap_list, + CAP_CLEAR)) { + p_err("bug: failed to clear CAP_SYS_ADMIN from capabilities"); + goto exit_free; + } + + if (cap_set_proc(caps)) { + p_err("failed to drop CAP_SYS_ADMIN: %s", strerror(errno)); + goto exit_free; + } + + res = 0; + +exit_free: + if (cap_free(caps) && !res) { + p_err("failed to clear storage object for capabilities: %s", + strerror(errno)); + res = -1; + } + + return res; +} + static int do_probe(int argc, char **argv) { enum probe_component target = COMPONENT_UNSPEC; @@ -728,14 +803,6 @@ static int do_probe(int argc, char **argv) __u32 ifindex = 0; char *ifname; - /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). - * Let's approximate, and restrict usage to root user only. - */ - if (geteuid()) { - p_err("please run this command as root user"); - return -1; - } - set_max_rlimit(); while (argc) { @@ -784,6 +851,9 @@ static int do_probe(int argc, char **argv) if (!REQ_ARGS(1)) return -1; define_prefix = GET_ARG(); + } else if (is_prefix(*argv, "unprivileged")) { + run_as_unprivileged = true; + NEXT_ARG(); } else { p_err("expected no more arguments, 'kernel', 'dev', 'macros' or 'prefix', got: '%s'?", *argv); @@ -791,6 +861,12 @@ static int do_probe(int argc, char **argv) } } + /* Full feature detection requires CAP_SYS_ADMIN privilege. + * Let's approximate, and warn if user is not root. + */ + if (handle_perms()) + return -1; + if (json_output) { define_prefix = NULL; jsonw_start_object(json_wtr); @@ -821,7 +897,7 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s probe [COMPONENT] [full] [macros [prefix PREFIX]]\n" + "Usage: %s %s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" " %s %s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" -- cgit v1.2.3-59-g8ed1b From 0b3b9ca3d154486baa08a41cbc62fde67ba8c6c3 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Apr 2020 15:45:06 +0100 Subject: tools: bpftool: Make libcap dependency optional The new libcap dependency is not used for an essential feature of bpftool, and we could imagine building the tool without checks on CAP_SYS_ADMIN by disabling probing features as an unprivileged users. Make it so, in order to avoid a hard dependency on libcap, and to ease packaging/embedding of bpftool. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429144506.8999-4-quentin@isovalent.com --- .../bpf/bpftool/Documentation/bpftool-feature.rst | 4 +++- tools/bpf/bpftool/Makefile | 13 +++++++---- tools/bpf/bpftool/feature.c | 26 ++++++++++++++++++++++ 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index ca085944e4cf..1fa755f55e0c 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -55,7 +55,9 @@ DESCRIPTION that case usually represent a small subset of the parameters supported by the system. Unprivileged users MUST use the **unprivileged** keyword: This is to avoid misdetection if - bpftool is inadvertently run as non-root, for example. + bpftool is inadvertently run as non-root, for example. This + keyword is unavailable if bpftool was compiled without + libcap. **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] Probe network device for supported eBPF features and dump diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 89d7962a4a44..2759f9cc3289 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -55,16 +55,15 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif -LIBS = $(LIBBPF) -lelf -lz -lcap - INSTALL ?= install RM ?= rm -f CLANG ?= clang FEATURE_USER = .bpftool -FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib \ +FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib libcap \ + clang-bpf-global-var +FEATURE_DISPLAY = libbfd disassembler-four-args zlib libcap \ clang-bpf-global-var -FEATURE_DISPLAY = libbfd disassembler-four-args zlib clang-bpf-global-var check_feat := 1 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall @@ -90,6 +89,12 @@ ifeq ($(feature-reallocarray), 0) CFLAGS += -DCOMPAT_NEED_REALLOCARRAY endif +LIBS = $(LIBBPF) -lelf -lz +ifeq ($(feature-libcap), 1) +CFLAGS += -DUSE_LIBCAP +LIBS += -lcap +endif + include $(wildcard $(OUTPUT)*.d) all: $(OUTPUT)bpftool diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 952f4b1987c0..f54347f55ee0 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -6,7 +6,9 @@ #include #include #include +#ifdef USE_LIBCAP #include +#endif #include #include @@ -37,7 +39,9 @@ static const char * const helper_name[] = { #undef BPF_HELPER_MAKE_ENTRY static bool full_mode; +#ifdef USE_LIBCAP static bool run_as_unprivileged; +#endif /* Miscellaneous utility functions */ @@ -475,11 +479,13 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, } res = bpf_probe_prog_type(prog_type, ifindex); +#ifdef USE_LIBCAP /* Probe may succeed even if program load fails, for unprivileged users * check that we did not fail because of insufficient permissions */ if (run_as_unprivileged && errno == EPERM) res = false; +#endif supported_types[prog_type] |= res; @@ -535,12 +541,14 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, if (supported_type) { res = bpf_probe_helper(id, prog_type, ifindex); +#ifdef USE_LIBCAP /* Probe may succeed even if program load fails, for * unprivileged users check that we did not fail because of * insufficient permissions */ if (run_as_unprivileged && errno == EPERM) res = false; +#endif } if (json_output) { @@ -738,6 +746,7 @@ static void section_misc(const char *define_prefix, __u32 ifindex) static int handle_perms(void) { +#ifdef USE_LIBCAP cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; bool has_sys_admin_cap = false; cap_flag_value_t val; @@ -793,6 +802,18 @@ exit_free: } return res; +#else + /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). + * We do not use libpcap so let's approximate, and restrict usage to + * root user only. + */ + if (geteuid()) { + p_err("full feature probing requires root privileges"); + return -1; + } + + return 0; +#endif /* USE_LIBCAP */ } static int do_probe(int argc, char **argv) @@ -852,8 +873,13 @@ static int do_probe(int argc, char **argv) return -1; define_prefix = GET_ARG(); } else if (is_prefix(*argv, "unprivileged")) { +#ifdef USE_LIBCAP run_as_unprivileged = true; NEXT_ARG(); +#else + p_err("unprivileged run not supported, recompile bpftool with libcap"); + return -1; +#endif } else { p_err("expected no more arguments, 'kernel', 'dev', 'macros' or 'prefix', got: '%s'?", *argv); -- cgit v1.2.3-59-g8ed1b From 64d85290d79c0677edb5a8ee2295b36c022fa5df Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:52 +0200 Subject: bpf: Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH White-list map lookup for SOCKMAP/SOCKHASH from BPF. Lookup returns a pointer to a full socket and acquires a reference if necessary. To support it we need to extend the verifier to know that: (1) register storing the lookup result holds a pointer to socket, if lookup was done on SOCKMAP/SOCKHASH, and that (2) map lookup on SOCKMAP/SOCKHASH is a reference acquiring operation, which needs a corresponding reference release with bpf_sk_release. On sock_map side, lookup handlers exposed via bpf_map_ops now bump sk_refcnt if socket is reference counted. In turn, bpf_sk_select_reuseport, the only in-kernel user of SOCKMAP/SOCKHASH ops->map_lookup_elem, was updated to release the reference. Sockets fetched from a map can be used in the same way as ones returned by BPF socket lookup helpers, such as bpf_sk_lookup_tcp. In particular, they can be used with bpf_sk_assign to direct packets toward a socket on TC ingress path. Suggested-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-2-jakub@cloudflare.com --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++---------- net/core/filter.c | 4 ++++ net/core/sock_map.c | 18 ++++++++++++++++-- 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2b337e32aa94..70ad009577f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -429,11 +429,30 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } -static bool is_acquire_function(enum bpf_func_id func_id) +static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp; + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_map_lookup_elem; +} + +static bool is_acquire_function(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC; + + if (func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp) + return true; + + if (func_id == BPF_FUNC_map_lookup_elem && + (map_type == BPF_MAP_TYPE_SOCKMAP || + map_type == BPF_MAP_TYPE_SOCKHASH)) + return true; + + return false; } static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -3934,7 +3953,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -3942,7 +3962,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4112,7 +4133,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) /* A reference acquiring function cannot acquire * another refcounted ptr. */ - if (is_acquire_function(func_id) && count) + if (may_be_acquire_function(func_id) && count) return false; /* We only support one arg being unreferenced at the moment, @@ -4623,7 +4644,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id)) { + } else if (is_acquire_function(func_id, meta.map_ptr)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) @@ -6532,12 +6553,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (is_null) { reg->type = SCALAR_VALUE; } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (reg->map_ptr->inner_map_meta) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else if (reg->map_ptr->map_type == - BPF_MAP_TYPE_XSKMAP) { + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; } else { reg->type = PTR_TO_MAP_VALUE; } diff --git a/net/core/filter.c b/net/core/filter.c index da3b7a72c37c..70b32723e6be 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8712,6 +8712,10 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. diff --git a/net/core/sock_map.c b/net/core/sock_map.c index b08dfae10f88..00a26cf2cfe9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -343,7 +343,14 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static void *sock_map_lookup(struct bpf_map *map, void *key) { - return __sock_map_lookup_elem(map, *(u32 *)key); + struct sock *sk; + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) @@ -1051,7 +1058,14 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) static void *sock_hash_lookup(struct bpf_map *map, void *key) { - return __sock_hash_lookup_elem(map, key); + struct sock *sk; + + sk = __sock_hash_lookup_elem(map, key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void sock_hash_release_progs(struct bpf_map *map) -- cgit v1.2.3-59-g8ed1b From 34a2cc6eee809f974111979f4c2b3c62aaaad457 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:53 +0200 Subject: selftests/bpf: Test that lookup on SOCKMAP/SOCKHASH is allowed Now that bpf_map_lookup_elem() is white-listed for SOCKMAP/SOCKHASH, replace the tests which check that verifier prevents lookup on these map types with ones that ensure that lookup operation is permitted, but only with a release of acquired socket reference. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-3-jakub@cloudflare.com --- .../selftests/bpf/verifier/prevent_map_lookup.c | 30 ---------- tools/testing/selftests/bpf/verifier/sock.c | 70 ++++++++++++++++++++++ 2 files changed, 70 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c index da7a4b37cb98..fc4e301260f6 100644 --- a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c +++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c @@ -1,33 +1,3 @@ -{ - "prevent map lookup in sockmap", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_sockmap = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 15 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_SOCK_OPS, -}, -{ - "prevent map lookup in sockhash", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_sockhash = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_SOCK_OPS, -}, { "prevent map lookup in stack trace", .insns = { diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index 9ed192e14f5f..f87ad69dbc62 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -516,3 +516,73 @@ .prog_type = BPF_PROG_TYPE_XDP, .result = ACCEPT, }, +{ + "bpf_map_lookup_elem(sockmap, &key)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = REJECT, + .errstr = "Unreleased reference id=2 alloc_insn=5", +}, +{ + "bpf_map_lookup_elem(sockhash, &key)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = REJECT, + .errstr = "Unreleased reference id=2 alloc_insn=5", +}, +{ + "bpf_map_lookup_elem(sockmap, &key); sk->type [fullsock field]; bpf_sk_release(sk)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = ACCEPT, +}, +{ + "bpf_map_lookup_elem(sockhash, &key); sk->type [fullsock field]; bpf_sk_release(sk)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = ACCEPT, +}, -- cgit v1.2.3-59-g8ed1b From 0b9ad56b1ea66382a3dcc8e3e7c54967bf8c6d94 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:54 +0200 Subject: selftests/bpf: Use SOCKMAP for server sockets in bpf_sk_assign test Update bpf_sk_assign test to fetch the server socket from SOCKMAP, now that map lookup from BPF in SOCKMAP is enabled. This way the test TC BPF program doesn't need to know what address server socket is bound to. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-4-jakub@cloudflare.com --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/prog_tests/sk_assign.c | 21 +++++- tools/testing/selftests/bpf/progs/test_sk_assign.c | 82 +++++++++------------- 3 files changed, 53 insertions(+), 52 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 10f12a5aac20..3d942be23d09 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -243,7 +243,7 @@ define GCC_BPF_BUILD_RULE $(BPF_GCC) $3 $4 -O2 -c $1 -o $2 endef -SKEL_BLACKLIST := btf__% test_pinning_invalid.c +SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index d572e1a2c297..47fa04adc147 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -20,6 +20,7 @@ #define CONNECT_PORT 4321 #define TEST_DADDR (0xC0A80203) #define NS_SELF "/proc/self/ns/net" +#define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" static const struct timeval timeo_sec = { .tv_sec = 3 }; static const size_t timeo_optlen = sizeof(timeo_sec); @@ -265,6 +266,7 @@ void test_sk_assign(void) TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), }; int server = -1; + int server_map; int self_net; self_net = open(NS_SELF, O_RDONLY); @@ -278,9 +280,17 @@ void test_sk_assign(void) goto cleanup; } + server_map = bpf_obj_get(SERVER_MAP_PATH); + if (CHECK_FAIL(server_map < 0)) { + perror("Unable to open " SERVER_MAP_PATH); + goto cleanup; + } + for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) { struct test_sk_cfg *test = &tests[i]; const struct sockaddr *addr; + const int zero = 0; + int err; if (!test__start_subtest(test->name)) continue; @@ -288,7 +298,13 @@ void test_sk_assign(void) addr = (const struct sockaddr *)test->addr; server = start_server(addr, test->len, test->type); if (server == -1) - goto cleanup; + goto close; + + err = bpf_map_update_elem(server_map, &zero, &server, BPF_ANY); + if (CHECK_FAIL(err)) { + perror("Unable to update server_map"); + goto close; + } /* connect to unbound ports */ prepare_addr(test->addr, test->family, CONNECT_PORT, @@ -302,7 +318,10 @@ void test_sk_assign(void) close: close(server); + close(server_map); cleanup: + if (CHECK_FAIL(unlink(SERVER_MAP_PATH))) + perror("Unable to unlink " SERVER_MAP_PATH); if (CHECK_FAIL(setns(self_net, CLONE_NEWNET))) perror("Failed to setns("NS_SELF")"); close(self_net); diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c index 8f530843b4da..1ecd987005d2 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_assign.c +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -16,6 +16,26 @@ #include #include +/* Pin map under /sys/fs/bpf/tc/globals/ */ +#define PIN_GLOBAL_NS 2 + +/* Must match struct bpf_elf_map layout from iproute2 */ +struct { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; +} server_map SEC("maps") = { + .type = BPF_MAP_TYPE_SOCKMAP, + .size_key = sizeof(int), + .size_value = sizeof(__u64), + .max_elem = 1, + .pinning = PIN_GLOBAL_NS, +}; + int _version SEC("version") = 1; char _license[] SEC("license") = "GPL"; @@ -72,7 +92,9 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; + const int zero = 0; size_t tuple_len; + __be16 dport; int ret; tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); @@ -83,32 +105,11 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) if (sk) goto assign; - if (ipv4) { - if (tuple->ipv4.dport != bpf_htons(4321)) - return TC_ACT_OK; - - ln.ipv4.daddr = bpf_htonl(0x7f000001); - ln.ipv4.dport = bpf_htons(1234); - - sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv4), - BPF_F_CURRENT_NETNS, 0); - } else { - if (tuple->ipv6.dport != bpf_htons(4321)) - return TC_ACT_OK; - - /* Upper parts of daddr are already zero. */ - ln.ipv6.daddr[3] = bpf_htonl(0x1); - ln.ipv6.dport = bpf_htons(1234); - - sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv6), - BPF_F_CURRENT_NETNS, 0); - } + dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; + if (dport != bpf_htons(4321)) + return TC_ACT_OK; - /* workaround: We can't do a single socket lookup here, because then - * the compiler will likely spill tuple_len to the stack. This makes it - * lose all bounds information in the verifier, which then rejects the - * call as unsafe. - */ + sk = bpf_map_lookup_elem(&server_map, &zero); if (!sk) return TC_ACT_SHOT; @@ -123,7 +124,9 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; + const int zero = 0; size_t tuple_len; + __be16 dport; int ret; tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); @@ -137,32 +140,11 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) bpf_sk_release(sk); } - if (ipv4) { - if (tuple->ipv4.dport != bpf_htons(4321)) - return TC_ACT_OK; + dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; + if (dport != bpf_htons(4321)) + return TC_ACT_OK; - ln.ipv4.daddr = bpf_htonl(0x7f000001); - ln.ipv4.dport = bpf_htons(1234); - - sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), - BPF_F_CURRENT_NETNS, 0); - } else { - if (tuple->ipv6.dport != bpf_htons(4321)) - return TC_ACT_OK; - - /* Upper parts of daddr are already zero. */ - ln.ipv6.daddr[3] = bpf_htonl(0x1); - ln.ipv6.dport = bpf_htons(1234); - - sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), - BPF_F_CURRENT_NETNS, 0); - } - - /* workaround: We can't do a single socket lookup here, because then - * the compiler will likely spill tuple_len to the stack. This makes it - * lose all bounds information in the verifier, which then rejects the - * call as unsafe. - */ + sk = bpf_map_lookup_elem(&server_map, &zero); if (!sk) return TC_ACT_SHOT; -- cgit v1.2.3-59-g8ed1b From 449e14bfdb83bf772200840a7ac4dcc1d7cacf54 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 29 Apr 2020 15:21:58 +0200 Subject: bpf: Fix unused variable warning Hiding the only using of bpf_link_type_strs[] in an #ifdef causes an unused-variable warning: kernel/bpf/syscall.c:2280:20: error: 'bpf_link_type_strs' defined but not used [-Werror=unused-variable] 2280 | static const char *bpf_link_type_strs[] = { Move the definition into the same #ifdef. Fixes: f2e10bff16a0 ("bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200429132217.1294289-1-arnd@arndb.de --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d23c04cbe14f..c75b2dd2459c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2271,6 +2271,7 @@ static int bpf_link_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) [_id] = #_name, @@ -2282,7 +2283,6 @@ static const char *bpf_link_type_strs[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -#ifdef CONFIG_PROC_FS static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; -- cgit v1.2.3-59-g8ed1b From 72f96347628e73dbb61b307f18dd19293cc6792a Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 29 Apr 2020 13:02:00 -0700 Subject: net: bcmgenet: set Rx mode before starting netif This commit explicitly calls the bcmgenet_set_rx_mode() function when the network interface is started. This function is normally called by ndo_set_rx_mode when the flags are changed, but apparently not when the driver is suspended and resumed. This change ensures that address filtering or promiscuous mode are properly restored by the driver after the MAC may have been reset. Fixes: b6e978e50444 ("net: bcmgenet: add suspend/resume callbacks") Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 351d0282f199..eb0dd4d4800c 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -65,6 +65,9 @@ #define GENET_RDMA_REG_OFF (priv->hw_params->rdma_offset + \ TOTAL_DESC * DMA_DESC_SIZE) +/* Forward declarations */ +static void bcmgenet_set_rx_mode(struct net_device *dev); + static inline void bcmgenet_writel(u32 value, void __iomem *offset) { /* MIPS chips strapped for BE will automagically configure the @@ -2793,6 +2796,7 @@ static void bcmgenet_netif_start(struct net_device *dev) struct bcmgenet_priv *priv = netdev_priv(dev); /* Start the network engine */ + bcmgenet_set_rx_mode(dev); bcmgenet_enable_rx_napi(priv); umac_enable_set(priv, CMD_TX_EN | CMD_RX_EN, true); -- cgit v1.2.3-59-g8ed1b From 6f7689057a0f10a6c967b9f2759d7a3dc948b930 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 29 Apr 2020 13:02:01 -0700 Subject: net: bcmgenet: Fix WoL with password after deep sleep Broadcom STB chips support a deep sleep mode where all register contents are lost. Because we were stashing the MagicPacket password into some of these registers a suspend into that deep sleep then a resumption would not lead to being able to wake-up from MagicPacket with password again. Fix this by keeping a software copy of the password and program it during suspend. Fixes: c51de7f3976b ("net: bcmgenet: add Wake-on-LAN support code") Suggested-by: Florian Fainelli Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.h | 2 ++ drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c | 39 ++++++++++------------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h index daf8fb2c39b6..c3bfe97f2e5c 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h @@ -14,6 +14,7 @@ #include #include #include +#include /* total number of Buffer Descriptors, same for Rx/Tx */ #define TOTAL_DESC 256 @@ -676,6 +677,7 @@ struct bcmgenet_priv { /* WOL */ struct clk *clk_wol; u32 wolopts; + u8 sopass[SOPASS_MAX]; struct bcmgenet_mib_counters mib; diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c index c9a43695b182..597c0498689a 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c @@ -41,18 +41,13 @@ void bcmgenet_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) { struct bcmgenet_priv *priv = netdev_priv(dev); - u32 reg; wol->supported = WAKE_MAGIC | WAKE_MAGICSECURE; wol->wolopts = priv->wolopts; memset(wol->sopass, 0, sizeof(wol->sopass)); - if (wol->wolopts & WAKE_MAGICSECURE) { - reg = bcmgenet_umac_readl(priv, UMAC_MPD_PW_MS); - put_unaligned_be16(reg, &wol->sopass[0]); - reg = bcmgenet_umac_readl(priv, UMAC_MPD_PW_LS); - put_unaligned_be32(reg, &wol->sopass[2]); - } + if (wol->wolopts & WAKE_MAGICSECURE) + memcpy(wol->sopass, priv->sopass, sizeof(priv->sopass)); } /* ethtool function - set WOL (Wake on LAN) settings. @@ -62,7 +57,6 @@ int bcmgenet_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) { struct bcmgenet_priv *priv = netdev_priv(dev); struct device *kdev = &priv->pdev->dev; - u32 reg; if (!device_can_wakeup(kdev)) return -ENOTSUPP; @@ -70,17 +64,8 @@ int bcmgenet_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) if (wol->wolopts & ~(WAKE_MAGIC | WAKE_MAGICSECURE)) return -EINVAL; - reg = bcmgenet_umac_readl(priv, UMAC_MPD_CTRL); - if (wol->wolopts & WAKE_MAGICSECURE) { - bcmgenet_umac_writel(priv, get_unaligned_be16(&wol->sopass[0]), - UMAC_MPD_PW_MS); - bcmgenet_umac_writel(priv, get_unaligned_be32(&wol->sopass[2]), - UMAC_MPD_PW_LS); - reg |= MPD_PW_EN; - } else { - reg &= ~MPD_PW_EN; - } - bcmgenet_umac_writel(priv, reg, UMAC_MPD_CTRL); + if (wol->wolopts & WAKE_MAGICSECURE) + memcpy(priv->sopass, wol->sopass, sizeof(priv->sopass)); /* Flag the device and relevant IRQ as wakeup capable */ if (wol->wolopts) { @@ -120,6 +105,14 @@ static int bcmgenet_poll_wol_status(struct bcmgenet_priv *priv) return retries; } +static void bcmgenet_set_mpd_password(struct bcmgenet_priv *priv) +{ + bcmgenet_umac_writel(priv, get_unaligned_be16(&priv->sopass[0]), + UMAC_MPD_PW_MS); + bcmgenet_umac_writel(priv, get_unaligned_be32(&priv->sopass[2]), + UMAC_MPD_PW_LS); +} + int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, enum bcmgenet_power_mode mode) { @@ -144,13 +137,17 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, reg = bcmgenet_umac_readl(priv, UMAC_MPD_CTRL); reg |= MPD_EN; + if (priv->wolopts & WAKE_MAGICSECURE) { + bcmgenet_set_mpd_password(priv); + reg |= MPD_PW_EN; + } bcmgenet_umac_writel(priv, reg, UMAC_MPD_CTRL); /* Do not leave UniMAC in MPD mode only */ retries = bcmgenet_poll_wol_status(priv); if (retries < 0) { reg = bcmgenet_umac_readl(priv, UMAC_MPD_CTRL); - reg &= ~MPD_EN; + reg &= ~(MPD_EN | MPD_PW_EN); bcmgenet_umac_writel(priv, reg, UMAC_MPD_CTRL); return retries; } @@ -189,7 +186,7 @@ void bcmgenet_wol_power_up_cfg(struct bcmgenet_priv *priv, reg = bcmgenet_umac_readl(priv, UMAC_MPD_CTRL); if (!(reg & MPD_EN)) return; /* already powered up so skip the rest */ - reg &= ~MPD_EN; + reg &= ~(MPD_EN | MPD_PW_EN); bcmgenet_umac_writel(priv, reg, UMAC_MPD_CTRL); /* Disable CRC Forward */ -- cgit v1.2.3-59-g8ed1b From 1a1d5106c1e37321f3fe394b786d1aece56d0df5 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 29 Apr 2020 13:02:02 -0700 Subject: net: bcmgenet: move clk_wol management to bcmgenet_wol The GENET_POWER_WOL_MAGIC power up and power down code configures the device for WoL when suspending and disables the WoL logic when resuming. It makes sense that this code should also manage the WoL clocking. This commit consolidates the logic and moves it earlier in the resume sequence. Since the clock is now only enabled if WoL is successfully entered the wol_active flag is introduced to track that state to keep the clock enables and disables balanced in case a suspend is aborted. The MPD_EN hardware bit can't be used because it can be cleared when the MAC is reset by a deep sleep. Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 19 +++++++------------ drivers/net/ethernet/broadcom/genet/bcmgenet.h | 3 ++- drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c | 14 +++++++++++--- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index eb0dd4d4800c..57b8608feae1 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2,7 +2,7 @@ /* * Broadcom GENET (Gigabit Ethernet) controller driver * - * Copyright (c) 2014-2019 Broadcom + * Copyright (c) 2014-2020 Broadcom */ #define pr_fmt(fmt) "bcmgenet: " fmt @@ -3619,6 +3619,10 @@ static int bcmgenet_resume(struct device *d) if (ret) return ret; + /* From WOL-enabled suspend, switch to regular clock */ + if (device_may_wakeup(d) && priv->wolopts) + bcmgenet_power_up(priv, GENET_POWER_WOL_MAGIC); + /* If this is an internal GPHY, power it back on now, before UniMAC is * brought out of reset as absolutely no UniMAC activity is allowed */ @@ -3629,10 +3633,6 @@ static int bcmgenet_resume(struct device *d) init_umac(priv); - /* From WOL-enabled suspend, switch to regular clock */ - if (priv->wolopts) - clk_disable_unprepare(priv->clk_wol); - phy_init_hw(dev->phydev); /* Speed settings must be restored */ @@ -3650,9 +3650,6 @@ static int bcmgenet_resume(struct device *d) bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT); } - if (priv->wolopts) - bcmgenet_power_up(priv, GENET_POWER_WOL_MAGIC); - /* Disable RX/TX DMA and flush TX queues */ dma_ctrl = bcmgenet_dma_disable(priv); @@ -3702,12 +3699,10 @@ static int bcmgenet_suspend(struct device *d) phy_suspend(dev->phydev); /* Prepare the device for Wake-on-LAN and switch to the slow clock */ - if (device_may_wakeup(d) && priv->wolopts) { + if (device_may_wakeup(d) && priv->wolopts) ret = bcmgenet_power_down(priv, GENET_POWER_WOL_MAGIC); - clk_prepare_enable(priv->clk_wol); - } else if (priv->internal_phy) { + else if (priv->internal_phy) ret = bcmgenet_power_down(priv, GENET_POWER_PASSIVE); - } /* Turn off the clocks */ clk_disable_unprepare(priv->clk); diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h index c3bfe97f2e5c..a858b7305832 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2014-2017 Broadcom + * Copyright (c) 2014-2020 Broadcom */ #ifndef __BCMGENET_H__ @@ -678,6 +678,7 @@ struct bcmgenet_priv { struct clk *clk_wol; u32 wolopts; u8 sopass[SOPASS_MAX]; + bool wol_active; struct bcmgenet_mib_counters mib; diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c index 597c0498689a..da45a4645b94 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c @@ -2,7 +2,7 @@ /* * Broadcom GENET (Gigabit Ethernet) Wake-on-LAN support * - * Copyright (c) 2014-2017 Broadcom + * Copyright (c) 2014-2020 Broadcom */ #define pr_fmt(fmt) "bcmgenet_wol: " fmt @@ -155,6 +155,9 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, netif_dbg(priv, wol, dev, "MPD WOL-ready status set after %d msec\n", retries); + clk_prepare_enable(priv->clk_wol); + priv->wol_active = 1; + /* Enable CRC forward */ reg = bcmgenet_umac_readl(priv, UMAC_CMD); priv->crc_fwd_en = 1; @@ -183,9 +186,14 @@ void bcmgenet_wol_power_up_cfg(struct bcmgenet_priv *priv, return; } + if (!priv->wol_active) + return; /* failed to suspend so skip the rest */ + + priv->wol_active = 0; + clk_disable_unprepare(priv->clk_wol); + + /* Disable Magic Packet Detection */ reg = bcmgenet_umac_readl(priv, UMAC_MPD_CTRL); - if (!(reg & MPD_EN)) - return; /* already powered up so skip the rest */ reg &= ~(MPD_EN | MPD_PW_EN); bcmgenet_umac_writel(priv, reg, UMAC_MPD_CTRL); -- cgit v1.2.3-59-g8ed1b From 14da1510fedc2d72ca81344a0f62939e0a1bc648 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 29 Apr 2020 13:02:03 -0700 Subject: Revert "net: bcmgenet: remove unused function in bcmgenet.c" This reverts commit e2072600a24161b7ddcfb26814f69f5fbc8ef85a. This commit restores the previous implementation of Hardware Filter Block functions to the file for use in subsequent commits. Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 122 +++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 57b8608feae1..b37ef05c5083 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2759,6 +2759,128 @@ static void bcmgenet_enable_dma(struct bcmgenet_priv *priv, u32 dma_ctrl) bcmgenet_tdma_writel(priv, reg, DMA_CTRL); } +static bool bcmgenet_hfb_is_filter_enabled(struct bcmgenet_priv *priv, + u32 f_index) +{ + u32 offset; + u32 reg; + + offset = HFB_FLT_ENABLE_V3PLUS + (f_index < 32) * sizeof(u32); + reg = bcmgenet_hfb_reg_readl(priv, offset); + return !!(reg & (1 << (f_index % 32))); +} + +static void bcmgenet_hfb_enable_filter(struct bcmgenet_priv *priv, u32 f_index) +{ + u32 offset; + u32 reg; + + offset = HFB_FLT_ENABLE_V3PLUS + (f_index < 32) * sizeof(u32); + reg = bcmgenet_hfb_reg_readl(priv, offset); + reg |= (1 << (f_index % 32)); + bcmgenet_hfb_reg_writel(priv, reg, offset); +} + +static void bcmgenet_hfb_set_filter_rx_queue_mapping(struct bcmgenet_priv *priv, + u32 f_index, u32 rx_queue) +{ + u32 offset; + u32 reg; + + offset = f_index / 8; + reg = bcmgenet_rdma_readl(priv, DMA_INDEX2RING_0 + offset); + reg &= ~(0xF << (4 * (f_index % 8))); + reg |= ((rx_queue & 0xF) << (4 * (f_index % 8))); + bcmgenet_rdma_writel(priv, reg, DMA_INDEX2RING_0 + offset); +} + +static void bcmgenet_hfb_set_filter_length(struct bcmgenet_priv *priv, + u32 f_index, u32 f_length) +{ + u32 offset; + u32 reg; + + offset = HFB_FLT_LEN_V3PLUS + + ((priv->hw_params->hfb_filter_cnt - 1 - f_index) / 4) * + sizeof(u32); + reg = bcmgenet_hfb_reg_readl(priv, offset); + reg &= ~(0xFF << (8 * (f_index % 4))); + reg |= ((f_length & 0xFF) << (8 * (f_index % 4))); + bcmgenet_hfb_reg_writel(priv, reg, offset); +} + +static int bcmgenet_hfb_find_unused_filter(struct bcmgenet_priv *priv) +{ + u32 f_index; + + for (f_index = 0; f_index < priv->hw_params->hfb_filter_cnt; f_index++) + if (!bcmgenet_hfb_is_filter_enabled(priv, f_index)) + return f_index; + + return -ENOMEM; +} + +/* bcmgenet_hfb_add_filter + * + * Add new filter to Hardware Filter Block to match and direct Rx traffic to + * desired Rx queue. + * + * f_data is an array of unsigned 32-bit integers where each 32-bit integer + * provides filter data for 2 bytes (4 nibbles) of Rx frame: + * + * bits 31:20 - unused + * bit 19 - nibble 0 match enable + * bit 18 - nibble 1 match enable + * bit 17 - nibble 2 match enable + * bit 16 - nibble 3 match enable + * bits 15:12 - nibble 0 data + * bits 11:8 - nibble 1 data + * bits 7:4 - nibble 2 data + * bits 3:0 - nibble 3 data + * + * Example: + * In order to match: + * - Ethernet frame type = 0x0800 (IP) + * - IP version field = 4 + * - IP protocol field = 0x11 (UDP) + * + * The following filter is needed: + * u32 hfb_filter_ipv4_udp[] = { + * Rx frame offset 0x00: 0x00000000, 0x00000000, 0x00000000, 0x00000000, + * Rx frame offset 0x08: 0x00000000, 0x00000000, 0x000F0800, 0x00084000, + * Rx frame offset 0x10: 0x00000000, 0x00000000, 0x00000000, 0x00030011, + * }; + * + * To add the filter to HFB and direct the traffic to Rx queue 0, call: + * bcmgenet_hfb_add_filter(priv, hfb_filter_ipv4_udp, + * ARRAY_SIZE(hfb_filter_ipv4_udp), 0); + */ +int bcmgenet_hfb_add_filter(struct bcmgenet_priv *priv, u32 *f_data, + u32 f_length, u32 rx_queue) +{ + int f_index; + u32 i; + + f_index = bcmgenet_hfb_find_unused_filter(priv); + if (f_index < 0) + return -ENOMEM; + + if (f_length > priv->hw_params->hfb_filter_size) + return -EINVAL; + + for (i = 0; i < f_length; i++) + bcmgenet_hfb_writel(priv, f_data[i], + (f_index * priv->hw_params->hfb_filter_size + i) * + sizeof(u32)); + + bcmgenet_hfb_set_filter_length(priv, f_index, 2 * f_length); + bcmgenet_hfb_set_filter_rx_queue_mapping(priv, f_index, rx_queue); + bcmgenet_hfb_enable_filter(priv, f_index); + bcmgenet_hfb_reg_writel(priv, 0x1, HFB_CTRL); + + return 0; +} + /* bcmgenet_hfb_clear * * Clear Hardware Filter Block and disable all filtering. -- cgit v1.2.3-59-g8ed1b From 854295d03ca04461f275d723289a5886e6827498 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 29 Apr 2020 13:02:04 -0700 Subject: net: bcmgenet: code movement The Hardware Filter Block code will be used by ethtool functions when defining flow types so this commit moves the functions in the file to prevent the need for prototype declarations. This is broken out to facilitate review. Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 308 ++++++++++++------------- 1 file changed, 154 insertions(+), 154 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index b37ef05c5083..ad41944d2cc0 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -459,6 +459,160 @@ static inline void bcmgenet_rdma_ring_writel(struct bcmgenet_priv *priv, genet_dma_ring_regs[r]); } +static bool bcmgenet_hfb_is_filter_enabled(struct bcmgenet_priv *priv, + u32 f_index) +{ + u32 offset; + u32 reg; + + offset = HFB_FLT_ENABLE_V3PLUS + (f_index < 32) * sizeof(u32); + reg = bcmgenet_hfb_reg_readl(priv, offset); + return !!(reg & (1 << (f_index % 32))); +} + +static void bcmgenet_hfb_enable_filter(struct bcmgenet_priv *priv, u32 f_index) +{ + u32 offset; + u32 reg; + + offset = HFB_FLT_ENABLE_V3PLUS + (f_index < 32) * sizeof(u32); + reg = bcmgenet_hfb_reg_readl(priv, offset); + reg |= (1 << (f_index % 32)); + bcmgenet_hfb_reg_writel(priv, reg, offset); +} + +static void bcmgenet_hfb_set_filter_rx_queue_mapping(struct bcmgenet_priv *priv, + u32 f_index, u32 rx_queue) +{ + u32 offset; + u32 reg; + + offset = f_index / 8; + reg = bcmgenet_rdma_readl(priv, DMA_INDEX2RING_0 + offset); + reg &= ~(0xF << (4 * (f_index % 8))); + reg |= ((rx_queue & 0xF) << (4 * (f_index % 8))); + bcmgenet_rdma_writel(priv, reg, DMA_INDEX2RING_0 + offset); +} + +static void bcmgenet_hfb_set_filter_length(struct bcmgenet_priv *priv, + u32 f_index, u32 f_length) +{ + u32 offset; + u32 reg; + + offset = HFB_FLT_LEN_V3PLUS + + ((priv->hw_params->hfb_filter_cnt - 1 - f_index) / 4) * + sizeof(u32); + reg = bcmgenet_hfb_reg_readl(priv, offset); + reg &= ~(0xFF << (8 * (f_index % 4))); + reg |= ((f_length & 0xFF) << (8 * (f_index % 4))); + bcmgenet_hfb_reg_writel(priv, reg, offset); +} + +static int bcmgenet_hfb_find_unused_filter(struct bcmgenet_priv *priv) +{ + u32 f_index; + + for (f_index = 0; f_index < priv->hw_params->hfb_filter_cnt; f_index++) + if (!bcmgenet_hfb_is_filter_enabled(priv, f_index)) + return f_index; + + return -ENOMEM; +} + +/* bcmgenet_hfb_add_filter + * + * Add new filter to Hardware Filter Block to match and direct Rx traffic to + * desired Rx queue. + * + * f_data is an array of unsigned 32-bit integers where each 32-bit integer + * provides filter data for 2 bytes (4 nibbles) of Rx frame: + * + * bits 31:20 - unused + * bit 19 - nibble 0 match enable + * bit 18 - nibble 1 match enable + * bit 17 - nibble 2 match enable + * bit 16 - nibble 3 match enable + * bits 15:12 - nibble 0 data + * bits 11:8 - nibble 1 data + * bits 7:4 - nibble 2 data + * bits 3:0 - nibble 3 data + * + * Example: + * In order to match: + * - Ethernet frame type = 0x0800 (IP) + * - IP version field = 4 + * - IP protocol field = 0x11 (UDP) + * + * The following filter is needed: + * u32 hfb_filter_ipv4_udp[] = { + * Rx frame offset 0x00: 0x00000000, 0x00000000, 0x00000000, 0x00000000, + * Rx frame offset 0x08: 0x00000000, 0x00000000, 0x000F0800, 0x00084000, + * Rx frame offset 0x10: 0x00000000, 0x00000000, 0x00000000, 0x00030011, + * }; + * + * To add the filter to HFB and direct the traffic to Rx queue 0, call: + * bcmgenet_hfb_add_filter(priv, hfb_filter_ipv4_udp, + * ARRAY_SIZE(hfb_filter_ipv4_udp), 0); + */ +int bcmgenet_hfb_add_filter(struct bcmgenet_priv *priv, u32 *f_data, + u32 f_length, u32 rx_queue) +{ + int f_index; + u32 i; + + f_index = bcmgenet_hfb_find_unused_filter(priv); + if (f_index < 0) + return -ENOMEM; + + if (f_length > priv->hw_params->hfb_filter_size) + return -EINVAL; + + for (i = 0; i < f_length; i++) + bcmgenet_hfb_writel(priv, f_data[i], + (f_index * priv->hw_params->hfb_filter_size + i) * + sizeof(u32)); + + bcmgenet_hfb_set_filter_length(priv, f_index, 2 * f_length); + bcmgenet_hfb_set_filter_rx_queue_mapping(priv, f_index, rx_queue); + bcmgenet_hfb_enable_filter(priv, f_index); + bcmgenet_hfb_reg_writel(priv, 0x1, HFB_CTRL); + + return 0; +} + +/* bcmgenet_hfb_clear + * + * Clear Hardware Filter Block and disable all filtering. + */ +static void bcmgenet_hfb_clear(struct bcmgenet_priv *priv) +{ + u32 i; + + bcmgenet_hfb_reg_writel(priv, 0x0, HFB_CTRL); + bcmgenet_hfb_reg_writel(priv, 0x0, HFB_FLT_ENABLE_V3PLUS); + bcmgenet_hfb_reg_writel(priv, 0x0, HFB_FLT_ENABLE_V3PLUS + 4); + + for (i = DMA_INDEX2RING_0; i <= DMA_INDEX2RING_7; i++) + bcmgenet_rdma_writel(priv, 0x0, i); + + for (i = 0; i < (priv->hw_params->hfb_filter_cnt / 4); i++) + bcmgenet_hfb_reg_writel(priv, 0x0, + HFB_FLT_LEN_V3PLUS + i * sizeof(u32)); + + for (i = 0; i < priv->hw_params->hfb_filter_cnt * + priv->hw_params->hfb_filter_size; i++) + bcmgenet_hfb_writel(priv, 0x0, i * sizeof(u32)); +} + +static void bcmgenet_hfb_init(struct bcmgenet_priv *priv) +{ + if (GENET_IS_V1(priv) || GENET_IS_V2(priv)) + return; + + bcmgenet_hfb_clear(priv); +} + static int bcmgenet_begin(struct net_device *dev) { struct bcmgenet_priv *priv = netdev_priv(dev); @@ -2759,160 +2913,6 @@ static void bcmgenet_enable_dma(struct bcmgenet_priv *priv, u32 dma_ctrl) bcmgenet_tdma_writel(priv, reg, DMA_CTRL); } -static bool bcmgenet_hfb_is_filter_enabled(struct bcmgenet_priv *priv, - u32 f_index) -{ - u32 offset; - u32 reg; - - offset = HFB_FLT_ENABLE_V3PLUS + (f_index < 32) * sizeof(u32); - reg = bcmgenet_hfb_reg_readl(priv, offset); - return !!(reg & (1 << (f_index % 32))); -} - -static void bcmgenet_hfb_enable_filter(struct bcmgenet_priv *priv, u32 f_index) -{ - u32 offset; - u32 reg; - - offset = HFB_FLT_ENABLE_V3PLUS + (f_index < 32) * sizeof(u32); - reg = bcmgenet_hfb_reg_readl(priv, offset); - reg |= (1 << (f_index % 32)); - bcmgenet_hfb_reg_writel(priv, reg, offset); -} - -static void bcmgenet_hfb_set_filter_rx_queue_mapping(struct bcmgenet_priv *priv, - u32 f_index, u32 rx_queue) -{ - u32 offset; - u32 reg; - - offset = f_index / 8; - reg = bcmgenet_rdma_readl(priv, DMA_INDEX2RING_0 + offset); - reg &= ~(0xF << (4 * (f_index % 8))); - reg |= ((rx_queue & 0xF) << (4 * (f_index % 8))); - bcmgenet_rdma_writel(priv, reg, DMA_INDEX2RING_0 + offset); -} - -static void bcmgenet_hfb_set_filter_length(struct bcmgenet_priv *priv, - u32 f_index, u32 f_length) -{ - u32 offset; - u32 reg; - - offset = HFB_FLT_LEN_V3PLUS + - ((priv->hw_params->hfb_filter_cnt - 1 - f_index) / 4) * - sizeof(u32); - reg = bcmgenet_hfb_reg_readl(priv, offset); - reg &= ~(0xFF << (8 * (f_index % 4))); - reg |= ((f_length & 0xFF) << (8 * (f_index % 4))); - bcmgenet_hfb_reg_writel(priv, reg, offset); -} - -static int bcmgenet_hfb_find_unused_filter(struct bcmgenet_priv *priv) -{ - u32 f_index; - - for (f_index = 0; f_index < priv->hw_params->hfb_filter_cnt; f_index++) - if (!bcmgenet_hfb_is_filter_enabled(priv, f_index)) - return f_index; - - return -ENOMEM; -} - -/* bcmgenet_hfb_add_filter - * - * Add new filter to Hardware Filter Block to match and direct Rx traffic to - * desired Rx queue. - * - * f_data is an array of unsigned 32-bit integers where each 32-bit integer - * provides filter data for 2 bytes (4 nibbles) of Rx frame: - * - * bits 31:20 - unused - * bit 19 - nibble 0 match enable - * bit 18 - nibble 1 match enable - * bit 17 - nibble 2 match enable - * bit 16 - nibble 3 match enable - * bits 15:12 - nibble 0 data - * bits 11:8 - nibble 1 data - * bits 7:4 - nibble 2 data - * bits 3:0 - nibble 3 data - * - * Example: - * In order to match: - * - Ethernet frame type = 0x0800 (IP) - * - IP version field = 4 - * - IP protocol field = 0x11 (UDP) - * - * The following filter is needed: - * u32 hfb_filter_ipv4_udp[] = { - * Rx frame offset 0x00: 0x00000000, 0x00000000, 0x00000000, 0x00000000, - * Rx frame offset 0x08: 0x00000000, 0x00000000, 0x000F0800, 0x00084000, - * Rx frame offset 0x10: 0x00000000, 0x00000000, 0x00000000, 0x00030011, - * }; - * - * To add the filter to HFB and direct the traffic to Rx queue 0, call: - * bcmgenet_hfb_add_filter(priv, hfb_filter_ipv4_udp, - * ARRAY_SIZE(hfb_filter_ipv4_udp), 0); - */ -int bcmgenet_hfb_add_filter(struct bcmgenet_priv *priv, u32 *f_data, - u32 f_length, u32 rx_queue) -{ - int f_index; - u32 i; - - f_index = bcmgenet_hfb_find_unused_filter(priv); - if (f_index < 0) - return -ENOMEM; - - if (f_length > priv->hw_params->hfb_filter_size) - return -EINVAL; - - for (i = 0; i < f_length; i++) - bcmgenet_hfb_writel(priv, f_data[i], - (f_index * priv->hw_params->hfb_filter_size + i) * - sizeof(u32)); - - bcmgenet_hfb_set_filter_length(priv, f_index, 2 * f_length); - bcmgenet_hfb_set_filter_rx_queue_mapping(priv, f_index, rx_queue); - bcmgenet_hfb_enable_filter(priv, f_index); - bcmgenet_hfb_reg_writel(priv, 0x1, HFB_CTRL); - - return 0; -} - -/* bcmgenet_hfb_clear - * - * Clear Hardware Filter Block and disable all filtering. - */ -static void bcmgenet_hfb_clear(struct bcmgenet_priv *priv) -{ - u32 i; - - bcmgenet_hfb_reg_writel(priv, 0x0, HFB_CTRL); - bcmgenet_hfb_reg_writel(priv, 0x0, HFB_FLT_ENABLE_V3PLUS); - bcmgenet_hfb_reg_writel(priv, 0x0, HFB_FLT_ENABLE_V3PLUS + 4); - - for (i = DMA_INDEX2RING_0; i <= DMA_INDEX2RING_7; i++) - bcmgenet_rdma_writel(priv, 0x0, i); - - for (i = 0; i < (priv->hw_params->hfb_filter_cnt / 4); i++) - bcmgenet_hfb_reg_writel(priv, 0x0, - HFB_FLT_LEN_V3PLUS + i * sizeof(u32)); - - for (i = 0; i < priv->hw_params->hfb_filter_cnt * - priv->hw_params->hfb_filter_size; i++) - bcmgenet_hfb_writel(priv, 0x0, i * sizeof(u32)); -} - -static void bcmgenet_hfb_init(struct bcmgenet_priv *priv) -{ - if (GENET_IS_V1(priv) || GENET_IS_V2(priv)) - return; - - bcmgenet_hfb_clear(priv); -} - static void bcmgenet_netif_start(struct net_device *dev) { struct bcmgenet_priv *priv = netdev_priv(dev); -- cgit v1.2.3-59-g8ed1b From 3e370952287c55e5fd240cb8bb41ef8acff8829d Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 29 Apr 2020 13:02:05 -0700 Subject: net: bcmgenet: add support for ethtool rxnfc flows This commit enables driver support for ethtool commands of this form: ethtool -N|-U|--config-nfc|--config-ntuple devname flow-type ether|ip4 [src xx:yy:zz:aa:bb:cc [m xx:yy:zz:aa:bb:cc]] [dst xx:yy:zz:aa:bb:cc [m xx:yy:zz:aa:bb:cc]] [proto N [m N]] [src-ip x.x.x.x [m x.x.x.x]] [dst-ip x.x.x.x [m x.x.x.x]] [tos N [m N]] [l4proto N [m N]] [src-port N [m N]] [dst-port N [m N]] [spi N [m N]] [l4data N [m N]] [vlan-etype N [m N]] [vlan N [m N]] [dst-mac xx:yy:zz:aa:bb:cc [m xx:yy:zz:aa:bb:cc]] [action 0] [loc N] | delete N Since there is only one Rx Ring in this implementation action 0 behaves no differently from not specifying a rule. The rules can be seen with ethtool commands of this form: ethtool -n|-u|--show-nfc|--show-ntuple devname [rule N] Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 483 ++++++++++++++++++++++++- drivers/net/ethernet/broadcom/genet/bcmgenet.h | 16 + 2 files changed, 488 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index ad41944d2cc0..5ef1ea7e5312 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -479,6 +479,30 @@ static void bcmgenet_hfb_enable_filter(struct bcmgenet_priv *priv, u32 f_index) reg = bcmgenet_hfb_reg_readl(priv, offset); reg |= (1 << (f_index % 32)); bcmgenet_hfb_reg_writel(priv, reg, offset); + reg = bcmgenet_hfb_reg_readl(priv, HFB_CTRL); + reg |= RBUF_HFB_EN; + bcmgenet_hfb_reg_writel(priv, reg, HFB_CTRL); +} + +static void bcmgenet_hfb_disable_filter(struct bcmgenet_priv *priv, u32 f_index) +{ + u32 offset, reg, reg1; + + offset = HFB_FLT_ENABLE_V3PLUS; + reg = bcmgenet_hfb_reg_readl(priv, offset); + reg1 = bcmgenet_hfb_reg_readl(priv, offset + sizeof(u32)); + if (f_index < 32) { + reg1 &= ~(1 << (f_index % 32)); + bcmgenet_hfb_reg_writel(priv, reg1, offset + sizeof(u32)); + } else { + reg &= ~(1 << (f_index % 32)); + bcmgenet_hfb_reg_writel(priv, reg, offset); + } + if (!reg && !reg1) { + reg = bcmgenet_hfb_reg_readl(priv, HFB_CTRL); + reg &= ~RBUF_HFB_EN; + bcmgenet_hfb_reg_writel(priv, reg, HFB_CTRL); + } } static void bcmgenet_hfb_set_filter_rx_queue_mapping(struct bcmgenet_priv *priv, @@ -513,13 +537,213 @@ static int bcmgenet_hfb_find_unused_filter(struct bcmgenet_priv *priv) { u32 f_index; - for (f_index = 0; f_index < priv->hw_params->hfb_filter_cnt; f_index++) + /* First MAX_NUM_OF_FS_RULES are reserved for Rx NFC filters */ + for (f_index = MAX_NUM_OF_FS_RULES; + f_index < priv->hw_params->hfb_filter_cnt; f_index++) if (!bcmgenet_hfb_is_filter_enabled(priv, f_index)) return f_index; return -ENOMEM; } +static int bcmgenet_hfb_validate_mask(void *mask, size_t size) +{ + while (size) { + switch (*(unsigned char *)mask++) { + case 0x00: + case 0x0f: + case 0xf0: + case 0xff: + size--; + continue; + default: + return -EINVAL; + } + } + + return 0; +} + +#define VALIDATE_MASK(x) \ + bcmgenet_hfb_validate_mask(&(x), sizeof(x)) + +static int bcmgenet_hfb_insert_data(u32 *f, int offset, + void *val, void *mask, size_t size) +{ + int index; + u32 tmp; + + index = offset / 2; + tmp = f[index]; + + while (size--) { + if (offset++ & 1) { + tmp &= ~0x300FF; + tmp |= (*(unsigned char *)val++); + switch ((*(unsigned char *)mask++)) { + case 0xFF: + tmp |= 0x30000; + break; + case 0xF0: + tmp |= 0x20000; + break; + case 0x0F: + tmp |= 0x10000; + break; + } + f[index++] = tmp; + if (size) + tmp = f[index]; + } else { + tmp &= ~0xCFF00; + tmp |= (*(unsigned char *)val++) << 8; + switch ((*(unsigned char *)mask++)) { + case 0xFF: + tmp |= 0xC0000; + break; + case 0xF0: + tmp |= 0x80000; + break; + case 0x0F: + tmp |= 0x40000; + break; + } + if (!size) + f[index] = tmp; + } + } + + return 0; +} + +static void bcmgenet_hfb_set_filter(struct bcmgenet_priv *priv, u32 *f_data, + u32 f_length, u32 rx_queue, int f_index) +{ + u32 base = f_index * priv->hw_params->hfb_filter_size; + int i; + + for (i = 0; i < f_length; i++) + bcmgenet_hfb_writel(priv, f_data[i], (base + i) * sizeof(u32)); + + bcmgenet_hfb_set_filter_length(priv, f_index, 2 * f_length); + bcmgenet_hfb_set_filter_rx_queue_mapping(priv, f_index, rx_queue); +} + +static int bcmgenet_hfb_create_rxnfc_filter(struct bcmgenet_priv *priv, + struct bcmgenet_rxnfc_rule *rule) +{ + struct ethtool_rx_flow_spec *fs = &rule->fs; + int err = 0, offset = 0, f_length = 0; + u16 val_16, mask_16; + u8 val_8, mask_8; + size_t size; + u32 *f_data; + + f_data = kcalloc(priv->hw_params->hfb_filter_size, sizeof(u32), + GFP_KERNEL); + if (!f_data) + return -ENOMEM; + + if (fs->flow_type & FLOW_MAC_EXT) { + bcmgenet_hfb_insert_data(f_data, 0, + &fs->h_ext.h_dest, &fs->m_ext.h_dest, + sizeof(fs->h_ext.h_dest)); + } + + if (fs->flow_type & FLOW_EXT) { + if (fs->m_ext.vlan_etype || + fs->m_ext.vlan_tci) { + bcmgenet_hfb_insert_data(f_data, 12, + &fs->h_ext.vlan_etype, + &fs->m_ext.vlan_etype, + sizeof(fs->h_ext.vlan_etype)); + bcmgenet_hfb_insert_data(f_data, 14, + &fs->h_ext.vlan_tci, + &fs->m_ext.vlan_tci, + sizeof(fs->h_ext.vlan_tci)); + offset += VLAN_HLEN; + f_length += DIV_ROUND_UP(VLAN_HLEN, 2); + } + } + + switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + case ETHER_FLOW: + f_length += DIV_ROUND_UP(ETH_HLEN, 2); + bcmgenet_hfb_insert_data(f_data, 0, + &fs->h_u.ether_spec.h_dest, + &fs->m_u.ether_spec.h_dest, + sizeof(fs->h_u.ether_spec.h_dest)); + bcmgenet_hfb_insert_data(f_data, ETH_ALEN, + &fs->h_u.ether_spec.h_source, + &fs->m_u.ether_spec.h_source, + sizeof(fs->h_u.ether_spec.h_source)); + bcmgenet_hfb_insert_data(f_data, (2 * ETH_ALEN) + offset, + &fs->h_u.ether_spec.h_proto, + &fs->m_u.ether_spec.h_proto, + sizeof(fs->h_u.ether_spec.h_proto)); + break; + case IP_USER_FLOW: + f_length += DIV_ROUND_UP(ETH_HLEN + 20, 2); + /* Specify IP Ether Type */ + val_16 = htons(ETH_P_IP); + mask_16 = 0xFFFF; + bcmgenet_hfb_insert_data(f_data, (2 * ETH_ALEN) + offset, + &val_16, &mask_16, sizeof(val_16)); + bcmgenet_hfb_insert_data(f_data, 15 + offset, + &fs->h_u.usr_ip4_spec.tos, + &fs->m_u.usr_ip4_spec.tos, + sizeof(fs->h_u.usr_ip4_spec.tos)); + bcmgenet_hfb_insert_data(f_data, 23 + offset, + &fs->h_u.usr_ip4_spec.proto, + &fs->m_u.usr_ip4_spec.proto, + sizeof(fs->h_u.usr_ip4_spec.proto)); + bcmgenet_hfb_insert_data(f_data, 26 + offset, + &fs->h_u.usr_ip4_spec.ip4src, + &fs->m_u.usr_ip4_spec.ip4src, + sizeof(fs->h_u.usr_ip4_spec.ip4src)); + bcmgenet_hfb_insert_data(f_data, 30 + offset, + &fs->h_u.usr_ip4_spec.ip4dst, + &fs->m_u.usr_ip4_spec.ip4dst, + sizeof(fs->h_u.usr_ip4_spec.ip4dst)); + if (!fs->m_u.usr_ip4_spec.l4_4_bytes) + break; + + /* Only supports 20 byte IPv4 header */ + val_8 = 0x45; + mask_8 = 0xFF; + bcmgenet_hfb_insert_data(f_data, ETH_HLEN + offset, + &val_8, &mask_8, + sizeof(val_8)); + size = sizeof(fs->h_u.usr_ip4_spec.l4_4_bytes); + bcmgenet_hfb_insert_data(f_data, + ETH_HLEN + 20 + offset, + &fs->h_u.usr_ip4_spec.l4_4_bytes, + &fs->m_u.usr_ip4_spec.l4_4_bytes, + size); + f_length += DIV_ROUND_UP(size, 2); + break; + } + + if (!fs->ring_cookie) { + /* Ring 0 flows can be handled by the default Descriptor Ring + * We'll map them to ring 0, but don't enable the filter + */ + bcmgenet_hfb_set_filter(priv, f_data, f_length, 0, + fs->location); + rule->state = BCMGENET_RXNFC_STATE_DISABLED; + } else { + /* Other Rx rings are direct mapped here */ + bcmgenet_hfb_set_filter(priv, f_data, f_length, + fs->ring_cookie, fs->location); + bcmgenet_hfb_enable_filter(priv, fs->location); + rule->state = BCMGENET_RXNFC_STATE_ENABLED; + } + + kfree(f_data); + + return err; +} + /* bcmgenet_hfb_add_filter * * Add new filter to Hardware Filter Block to match and direct Rx traffic to @@ -559,7 +783,6 @@ int bcmgenet_hfb_add_filter(struct bcmgenet_priv *priv, u32 *f_data, u32 f_length, u32 rx_queue) { int f_index; - u32 i; f_index = bcmgenet_hfb_find_unused_filter(priv); if (f_index < 0) @@ -568,15 +791,8 @@ int bcmgenet_hfb_add_filter(struct bcmgenet_priv *priv, u32 *f_data, if (f_length > priv->hw_params->hfb_filter_size) return -EINVAL; - for (i = 0; i < f_length; i++) - bcmgenet_hfb_writel(priv, f_data[i], - (f_index * priv->hw_params->hfb_filter_size + i) * - sizeof(u32)); - - bcmgenet_hfb_set_filter_length(priv, f_index, 2 * f_length); - bcmgenet_hfb_set_filter_rx_queue_mapping(priv, f_index, rx_queue); + bcmgenet_hfb_set_filter(priv, f_data, f_length, rx_queue, f_index); bcmgenet_hfb_enable_filter(priv, f_index); - bcmgenet_hfb_reg_writel(priv, 0x1, HFB_CTRL); return 0; } @@ -607,9 +823,17 @@ static void bcmgenet_hfb_clear(struct bcmgenet_priv *priv) static void bcmgenet_hfb_init(struct bcmgenet_priv *priv) { + int i; + if (GENET_IS_V1(priv) || GENET_IS_V2(priv)) return; + INIT_LIST_HEAD(&priv->rxnfc_list); + for (i = 0; i < MAX_NUM_OF_FS_RULES; i++) { + INIT_LIST_HEAD(&priv->rxnfc_rules[i].list); + priv->rxnfc_rules[i].state = BCMGENET_RXNFC_STATE_UNUSED; + } + bcmgenet_hfb_clear(priv); } @@ -1197,6 +1421,228 @@ static int bcmgenet_set_eee(struct net_device *dev, struct ethtool_eee *e) return phy_ethtool_set_eee(dev->phydev, e); } +static int bcmgenet_validate_flow(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + struct ethtool_usrip4_spec *l4_mask; + struct ethhdr *eth_mask; + + if (cmd->fs.location >= MAX_NUM_OF_FS_RULES) { + netdev_err(dev, "rxnfc: Invalid location (%d)\n", + cmd->fs.location); + return -EINVAL; + } + + switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + case IP_USER_FLOW: + l4_mask = &cmd->fs.m_u.usr_ip4_spec; + /* don't allow mask which isn't valid */ + if (VALIDATE_MASK(l4_mask->ip4src) || + VALIDATE_MASK(l4_mask->ip4dst) || + VALIDATE_MASK(l4_mask->l4_4_bytes) || + VALIDATE_MASK(l4_mask->proto) || + VALIDATE_MASK(l4_mask->ip_ver) || + VALIDATE_MASK(l4_mask->tos)) { + netdev_err(dev, "rxnfc: Unsupported mask\n"); + return -EINVAL; + } + break; + case ETHER_FLOW: + eth_mask = &cmd->fs.m_u.ether_spec; + /* don't allow mask which isn't valid */ + if (VALIDATE_MASK(eth_mask->h_source) || + VALIDATE_MASK(eth_mask->h_source) || + VALIDATE_MASK(eth_mask->h_proto)) { + netdev_err(dev, "rxnfc: Unsupported mask\n"); + return -EINVAL; + } + break; + default: + netdev_err(dev, "rxnfc: Unsupported flow type (0x%x)\n", + cmd->fs.flow_type); + return -EINVAL; + } + + if ((cmd->fs.flow_type & FLOW_EXT)) { + /* don't allow mask which isn't valid */ + if (VALIDATE_MASK(cmd->fs.m_ext.vlan_etype) || + VALIDATE_MASK(cmd->fs.m_ext.vlan_tci)) { + netdev_err(dev, "rxnfc: Unsupported mask\n"); + return -EINVAL; + } + if (cmd->fs.m_ext.data[0] || cmd->fs.m_ext.data[1]) { + netdev_err(dev, "rxnfc: user-def not supported\n"); + return -EINVAL; + } + } + + if ((cmd->fs.flow_type & FLOW_MAC_EXT)) { + /* don't allow mask which isn't valid */ + if (VALIDATE_MASK(cmd->fs.m_ext.h_dest)) { + netdev_err(dev, "rxnfc: Unsupported mask\n"); + return -EINVAL; + } + } + + return 0; +} + +static int bcmgenet_insert_flow(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + struct bcmgenet_priv *priv = netdev_priv(dev); + struct bcmgenet_rxnfc_rule *loc_rule; + int err; + + if (priv->hw_params->hfb_filter_size < 128) { + netdev_err(dev, "rxnfc: Not supported by this device\n"); + return -EINVAL; + } + + if (cmd->fs.ring_cookie > priv->hw_params->rx_queues) { + netdev_err(dev, "rxnfc: Unsupported action (%llu)\n", + cmd->fs.ring_cookie); + return -EINVAL; + } + + err = bcmgenet_validate_flow(dev, cmd); + if (err) + return err; + + loc_rule = &priv->rxnfc_rules[cmd->fs.location]; + if (loc_rule->state == BCMGENET_RXNFC_STATE_ENABLED) + bcmgenet_hfb_disable_filter(priv, cmd->fs.location); + if (loc_rule->state != BCMGENET_RXNFC_STATE_UNUSED) + list_del(&loc_rule->list); + loc_rule->state = BCMGENET_RXNFC_STATE_UNUSED; + memcpy(&loc_rule->fs, &cmd->fs, + sizeof(struct ethtool_rx_flow_spec)); + + err = bcmgenet_hfb_create_rxnfc_filter(priv, loc_rule); + if (err) { + netdev_err(dev, "rxnfc: Could not install rule (%d)\n", + err); + return err; + } + + list_add_tail(&loc_rule->list, &priv->rxnfc_list); + + return 0; +} + +static int bcmgenet_delete_flow(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + struct bcmgenet_priv *priv = netdev_priv(dev); + struct bcmgenet_rxnfc_rule *rule; + int err = 0; + + if (cmd->fs.location >= MAX_NUM_OF_FS_RULES) + return -EINVAL; + + rule = &priv->rxnfc_rules[cmd->fs.location]; + if (rule->state == BCMGENET_RXNFC_STATE_UNUSED) { + err = -ENOENT; + goto out; + } + + if (rule->state == BCMGENET_RXNFC_STATE_ENABLED) + bcmgenet_hfb_disable_filter(priv, cmd->fs.location); + if (rule->state != BCMGENET_RXNFC_STATE_UNUSED) + list_del(&rule->list); + rule->state = BCMGENET_RXNFC_STATE_UNUSED; + memset(&rule->fs, 0, sizeof(struct ethtool_rx_flow_spec)); + +out: + return err; +} + +static int bcmgenet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +{ + struct bcmgenet_priv *priv = netdev_priv(dev); + int err = 0; + + switch (cmd->cmd) { + case ETHTOOL_SRXCLSRLINS: + err = bcmgenet_insert_flow(dev, cmd); + break; + case ETHTOOL_SRXCLSRLDEL: + err = bcmgenet_delete_flow(dev, cmd); + break; + default: + netdev_warn(priv->dev, "Unsupported ethtool command. (%d)\n", + cmd->cmd); + return -EINVAL; + } + + return err; +} + +static int bcmgenet_get_flow(struct net_device *dev, struct ethtool_rxnfc *cmd, + int loc) +{ + struct bcmgenet_priv *priv = netdev_priv(dev); + struct bcmgenet_rxnfc_rule *rule; + int err = 0; + + if (loc < 0 || loc >= MAX_NUM_OF_FS_RULES) + return -EINVAL; + + rule = &priv->rxnfc_rules[loc]; + if (rule->state == BCMGENET_RXNFC_STATE_UNUSED) + err = -ENOENT; + else + memcpy(&cmd->fs, &rule->fs, + sizeof(struct ethtool_rx_flow_spec)); + + return err; +} + +static int bcmgenet_get_num_flows(struct bcmgenet_priv *priv) +{ + struct list_head *pos; + int res = 0; + + list_for_each(pos, &priv->rxnfc_list) + res++; + + return res; +} + +static int bcmgenet_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct bcmgenet_priv *priv = netdev_priv(dev); + struct bcmgenet_rxnfc_rule *rule; + int err = 0; + int i = 0; + + switch (cmd->cmd) { + case ETHTOOL_GRXRINGS: + cmd->data = priv->hw_params->rx_queues ?: 1; + break; + case ETHTOOL_GRXCLSRLCNT: + cmd->rule_cnt = bcmgenet_get_num_flows(priv); + cmd->data = MAX_NUM_OF_FS_RULES; + break; + case ETHTOOL_GRXCLSRULE: + err = bcmgenet_get_flow(dev, cmd, cmd->fs.location); + break; + case ETHTOOL_GRXCLSRLALL: + list_for_each_entry(rule, &priv->rxnfc_list, list) + if (i < cmd->rule_cnt) + rule_locs[i++] = rule->fs.location; + cmd->rule_cnt = i; + cmd->data = MAX_NUM_OF_FS_RULES; + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + /* standard ethtool support functions. */ static const struct ethtool_ops bcmgenet_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS | @@ -1221,6 +1667,8 @@ static const struct ethtool_ops bcmgenet_ethtool_ops = { .get_link_ksettings = bcmgenet_get_link_ksettings, .set_link_ksettings = bcmgenet_set_link_ksettings, .get_ts_info = ethtool_op_get_ts_info, + .get_rxnfc = bcmgenet_get_rxnfc, + .set_rxnfc = bcmgenet_set_rxnfc, }; /* Power down the unimac, based on mode. */ @@ -3730,8 +4178,8 @@ static int bcmgenet_resume(struct device *d) struct net_device *dev = dev_get_drvdata(d); struct bcmgenet_priv *priv = netdev_priv(dev); unsigned long dma_ctrl; + u32 offset, reg; int ret; - u32 reg; if (!netif_running(dev)) return 0; @@ -3766,6 +4214,11 @@ static int bcmgenet_resume(struct device *d) bcmgenet_set_hw_addr(priv, dev->dev_addr); + offset = HFB_FLT_ENABLE_V3PLUS; + bcmgenet_hfb_reg_writel(priv, priv->hfb_en[1], offset); + bcmgenet_hfb_reg_writel(priv, priv->hfb_en[2], offset + sizeof(u32)); + bcmgenet_hfb_reg_writel(priv, priv->hfb_en[0], HFB_CTRL); + if (priv->internal_phy) { reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT); reg |= EXT_ENERGY_DET_MASK; @@ -3809,6 +4262,7 @@ static int bcmgenet_suspend(struct device *d) struct net_device *dev = dev_get_drvdata(d); struct bcmgenet_priv *priv = netdev_priv(dev); int ret = 0; + u32 offset; if (!netif_running(dev)) return 0; @@ -3820,6 +4274,13 @@ static int bcmgenet_suspend(struct device *d) if (!device_may_wakeup(d)) phy_suspend(dev->phydev); + /* Preserve filter state and disable filtering */ + priv->hfb_en[0] = bcmgenet_hfb_reg_readl(priv, HFB_CTRL); + offset = HFB_FLT_ENABLE_V3PLUS; + priv->hfb_en[1] = bcmgenet_hfb_reg_readl(priv, offset); + priv->hfb_en[2] = bcmgenet_hfb_reg_readl(priv, offset + sizeof(u32)); + bcmgenet_hfb_reg_writel(priv, 0, HFB_CTRL); + /* Prepare the device for Wake-on-LAN and switch to the slow clock */ if (device_may_wakeup(d) && priv->wolopts) ret = bcmgenet_power_down(priv, GENET_POWER_WOL_MAGIC); diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h index a858b7305832..031d91f45067 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h @@ -32,6 +32,7 @@ #define DMA_MAX_BURST_LENGTH 0x10 /* misc. configuration */ +#define MAX_NUM_OF_FS_RULES 16 #define CLEAR_ALL_HFB 0xFF #define DMA_FC_THRESH_HI (TOTAL_DESC >> 4) #define DMA_FC_THRESH_LO 5 @@ -609,6 +610,18 @@ struct bcmgenet_rx_ring { struct bcmgenet_priv *priv; }; +enum bcmgenet_rxnfc_state { + BCMGENET_RXNFC_STATE_UNUSED = 0, + BCMGENET_RXNFC_STATE_DISABLED, + BCMGENET_RXNFC_STATE_ENABLED +}; + +struct bcmgenet_rxnfc_rule { + struct list_head list; + struct ethtool_rx_flow_spec fs; + enum bcmgenet_rxnfc_state state; +}; + /* device context */ struct bcmgenet_priv { void __iomem *base; @@ -627,6 +640,8 @@ struct bcmgenet_priv { struct enet_cb *rx_cbs; unsigned int num_rx_bds; unsigned int rx_buf_len; + struct bcmgenet_rxnfc_rule rxnfc_rules[MAX_NUM_OF_FS_RULES]; + struct list_head rxnfc_list; struct bcmgenet_rx_ring rx_rings[DESC_INDEX + 1]; @@ -679,6 +694,7 @@ struct bcmgenet_priv { u32 wolopts; u8 sopass[SOPASS_MAX]; bool wol_active; + u32 hfb_en[3]; struct bcmgenet_mib_counters mib; -- cgit v1.2.3-59-g8ed1b From f50932cca632fb87ab4de678ecc7c3b41116140b Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 29 Apr 2020 13:02:06 -0700 Subject: net: bcmgenet: add WAKE_FILTER support This commit enables support for the WAKE_FILTER method of Wake on LAN for the GENET driver. The method can be enabled by adding 'f' to the interface 'wol' setting specified by ethtool. Rx network flow rules can be specified using ethtool. Rules that define a flow-type with the RX_CLS_FLOW_WAKE action (i.e. -2) can wake the system from the 'standby' power state when the WAKE_FILTER WoL method is enabled. Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 5 ++- drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c | 43 +++++++++++++++++----- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 5ef1ea7e5312..ad614d7201bd 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -724,7 +724,7 @@ static int bcmgenet_hfb_create_rxnfc_filter(struct bcmgenet_priv *priv, break; } - if (!fs->ring_cookie) { + if (!fs->ring_cookie || fs->ring_cookie == RX_CLS_FLOW_WAKE) { /* Ring 0 flows can be handled by the default Descriptor Ring * We'll map them to ring 0, but don't enable the filter */ @@ -1499,7 +1499,8 @@ static int bcmgenet_insert_flow(struct net_device *dev, return -EINVAL; } - if (cmd->fs.ring_cookie > priv->hw_params->rx_queues) { + if (cmd->fs.ring_cookie > priv->hw_params->rx_queues && + cmd->fs.ring_cookie != RX_CLS_FLOW_WAKE) { netdev_err(dev, "rxnfc: Unsupported action (%llu)\n", cmd->fs.ring_cookie); return -EINVAL; diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c index da45a4645b94..4b9d65f392c2 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c @@ -42,7 +42,7 @@ void bcmgenet_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) { struct bcmgenet_priv *priv = netdev_priv(dev); - wol->supported = WAKE_MAGIC | WAKE_MAGICSECURE; + wol->supported = WAKE_MAGIC | WAKE_MAGICSECURE | WAKE_FILTER; wol->wolopts = priv->wolopts; memset(wol->sopass, 0, sizeof(wol->sopass)); @@ -61,7 +61,7 @@ int bcmgenet_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) if (!device_can_wakeup(kdev)) return -ENOTSUPP; - if (wol->wolopts & ~(WAKE_MAGIC | WAKE_MAGICSECURE)) + if (wol->wolopts & ~(WAKE_MAGIC | WAKE_MAGICSECURE | WAKE_FILTER)) return -EINVAL; if (wol->wolopts & WAKE_MAGICSECURE) @@ -117,8 +117,9 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, enum bcmgenet_power_mode mode) { struct net_device *dev = priv->dev; + struct bcmgenet_rxnfc_rule *rule; + u32 reg, hfb_ctrl_reg, hfb_enable = 0; int retries = 0; - u32 reg; if (mode != GENET_POWER_WOL_MAGIC) { netif_err(priv, wol, dev, "unsupported mode: %d\n", mode); @@ -135,13 +136,24 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, bcmgenet_umac_writel(priv, reg, UMAC_CMD); mdelay(10); - reg = bcmgenet_umac_readl(priv, UMAC_MPD_CTRL); - reg |= MPD_EN; - if (priv->wolopts & WAKE_MAGICSECURE) { - bcmgenet_set_mpd_password(priv); - reg |= MPD_PW_EN; + if (priv->wolopts & (WAKE_MAGIC | WAKE_MAGICSECURE)) { + reg = bcmgenet_umac_readl(priv, UMAC_MPD_CTRL); + reg |= MPD_EN; + if (priv->wolopts & WAKE_MAGICSECURE) { + bcmgenet_set_mpd_password(priv); + reg |= MPD_PW_EN; + } + bcmgenet_umac_writel(priv, reg, UMAC_MPD_CTRL); + } + + hfb_ctrl_reg = bcmgenet_hfb_reg_readl(priv, HFB_CTRL); + if (priv->wolopts & WAKE_FILTER) { + list_for_each_entry(rule, &priv->rxnfc_list, list) + if (rule->fs.ring_cookie == RX_CLS_FLOW_WAKE) + hfb_enable |= (1 << rule->fs.location); + reg = (hfb_ctrl_reg & ~RBUF_HFB_EN) | RBUF_ACPI_EN; + bcmgenet_hfb_reg_writel(priv, reg, HFB_CTRL); } - bcmgenet_umac_writel(priv, reg, UMAC_MPD_CTRL); /* Do not leave UniMAC in MPD mode only */ retries = bcmgenet_poll_wol_status(priv); @@ -149,6 +161,7 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, reg = bcmgenet_umac_readl(priv, UMAC_MPD_CTRL); reg &= ~(MPD_EN | MPD_PW_EN); bcmgenet_umac_writel(priv, reg, UMAC_MPD_CTRL); + bcmgenet_hfb_reg_writel(priv, hfb_ctrl_reg, HFB_CTRL); return retries; } @@ -158,6 +171,13 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, clk_prepare_enable(priv->clk_wol); priv->wol_active = 1; + if (hfb_enable) { + bcmgenet_hfb_reg_writel(priv, hfb_enable, + HFB_FLT_ENABLE_V3PLUS + 4); + hfb_ctrl_reg = RBUF_HFB_EN | RBUF_ACPI_EN; + bcmgenet_hfb_reg_writel(priv, hfb_ctrl_reg, HFB_CTRL); + } + /* Enable CRC forward */ reg = bcmgenet_umac_readl(priv, UMAC_CMD); priv->crc_fwd_en = 1; @@ -197,6 +217,11 @@ void bcmgenet_wol_power_up_cfg(struct bcmgenet_priv *priv, reg &= ~(MPD_EN | MPD_PW_EN); bcmgenet_umac_writel(priv, reg, UMAC_MPD_CTRL); + /* Disable WAKE_FILTER Detection */ + reg = bcmgenet_hfb_reg_readl(priv, HFB_CTRL); + reg &= ~(RBUF_HFB_EN | RBUF_ACPI_EN); + bcmgenet_hfb_reg_writel(priv, reg, HFB_CTRL); + /* Disable CRC Forward */ reg = bcmgenet_umac_readl(priv, UMAC_CMD); reg &= ~CMD_CRC_FWD; -- cgit v1.2.3-59-g8ed1b From 91f658587a962378a410cc7dc90e122a4ccd7cf3 Mon Sep 17 00:00:00 2001 From: Luke Nelson Date: Wed, 29 Apr 2020 17:51:27 -0700 Subject: bpf, riscv: Fix stack layout of JITed code on RV32 This patch fixes issues with stackframe unwinding and alignment in the current stack layout for BPF programs on RV32. In the current layout, RV32 fp points to the JIT scratch registers, rather than to the callee-saved registers. This breaks stackframe unwinding, which expects fp to point just above the saved ra and fp registers. This patch fixes the issue by moving the callee-saved registers to be stored on the top of the stack, pointed to by fp. This satisfies the assumptions of stackframe unwinding. This patch also fixes an issue with the old layout that the stack was not aligned to 16 bytes. Stacktrace from JITed code using the old stack layout: [ 12.196249 ] [] walk_stackframe+0x0/0x96 Stacktrace using the new stack layout: [ 13.062888 ] [] walk_stackframe+0x0/0x96 [ 13.063028 ] [] show_stack+0x28/0x32 [ 13.063253 ] [] bpf_prog_82b916b2dfa00464+0x80/0x908 [ 13.063417 ] [] bpf_test_run+0x124/0x39a [ 13.063553 ] [] bpf_prog_test_run_skb+0x234/0x448 [ 13.063704 ] [] __do_sys_bpf+0x766/0x13b4 [ 13.063840 ] [] sys_bpf+0xc/0x14 [ 13.063961 ] [] ret_from_syscall+0x0/0x2 The new code is also simpler to understand and includes an ASCII diagram of the stack layout. Tested on riscv32 QEMU virt machine. Signed-off-by: Luke Nelson Signed-off-by: Daniel Borkmann Acked-by: Xi Wang Link: https://lore.kernel.org/bpf/20200430005127.2205-1-luke.r.nels@gmail.com --- arch/riscv/net/bpf_jit_comp32.c | 98 +++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index 11083d4d5f2d..b198eaa74456 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -13,8 +13,35 @@ #include #include "bpf_jit.h" +/* + * Stack layout during BPF program execution: + * + * high + * RV32 fp => +----------+ + * | saved ra | + * | saved fp | RV32 callee-saved registers + * | ... | + * +----------+ <= (fp - 4 * NR_SAVED_REGISTERS) + * | hi(R6) | + * | lo(R6) | + * | hi(R7) | JIT scratch space for BPF registers + * | lo(R7) | + * | ... | + * BPF_REG_FP => +----------+ <= (fp - 4 * NR_SAVED_REGISTERS + * | | - 4 * BPF_JIT_SCRATCH_REGS) + * | | + * | ... | BPF program stack + * | | + * RV32 sp => +----------+ + * | | + * | ... | Function call stack + * | | + * +----------+ + * low + */ + enum { - /* Stack layout - these are offsets from (top of stack - 4). */ + /* Stack layout - these are offsets from top of JIT scratch space. */ BPF_R6_HI, BPF_R6_LO, BPF_R7_HI, @@ -29,7 +56,11 @@ enum { BPF_JIT_SCRATCH_REGS, }; -#define STACK_OFFSET(k) (-4 - ((k) * 4)) +/* Number of callee-saved registers stored to stack: ra, fp, s1--s7. */ +#define NR_SAVED_REGISTERS 9 + +/* Offset from fp for BPF registers stored on stack. */ +#define STACK_OFFSET(k) (-4 - (4 * NR_SAVED_REGISTERS) - (4 * (k))) #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) @@ -111,11 +142,9 @@ static void emit_imm64(const s8 *rd, s32 imm_hi, s32 imm_lo, static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) { - int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 4; + int stack_adjust = ctx->stack_size; const s8 *r0 = bpf2rv32[BPF_REG_0]; - store_offset -= 4 * BPF_JIT_SCRATCH_REGS; - /* Set return value if not tail call. */ if (!is_tail_call) { emit(rv_addi(RV_REG_A0, lo(r0), 0), ctx); @@ -123,15 +152,15 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) } /* Restore callee-saved registers. */ - emit(rv_lw(RV_REG_RA, store_offset - 0, RV_REG_SP), ctx); - emit(rv_lw(RV_REG_FP, store_offset - 4, RV_REG_SP), ctx); - emit(rv_lw(RV_REG_S1, store_offset - 8, RV_REG_SP), ctx); - emit(rv_lw(RV_REG_S2, store_offset - 12, RV_REG_SP), ctx); - emit(rv_lw(RV_REG_S3, store_offset - 16, RV_REG_SP), ctx); - emit(rv_lw(RV_REG_S4, store_offset - 20, RV_REG_SP), ctx); - emit(rv_lw(RV_REG_S5, store_offset - 24, RV_REG_SP), ctx); - emit(rv_lw(RV_REG_S6, store_offset - 28, RV_REG_SP), ctx); - emit(rv_lw(RV_REG_S7, store_offset - 32, RV_REG_SP), ctx); + emit(rv_lw(RV_REG_RA, stack_adjust - 4, RV_REG_SP), ctx); + emit(rv_lw(RV_REG_FP, stack_adjust - 8, RV_REG_SP), ctx); + emit(rv_lw(RV_REG_S1, stack_adjust - 12, RV_REG_SP), ctx); + emit(rv_lw(RV_REG_S2, stack_adjust - 16, RV_REG_SP), ctx); + emit(rv_lw(RV_REG_S3, stack_adjust - 20, RV_REG_SP), ctx); + emit(rv_lw(RV_REG_S4, stack_adjust - 24, RV_REG_SP), ctx); + emit(rv_lw(RV_REG_S5, stack_adjust - 28, RV_REG_SP), ctx); + emit(rv_lw(RV_REG_S6, stack_adjust - 32, RV_REG_SP), ctx); + emit(rv_lw(RV_REG_S7, stack_adjust - 36, RV_REG_SP), ctx); emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx); @@ -1260,17 +1289,20 @@ notsupported: void bpf_jit_build_prologue(struct rv_jit_context *ctx) { - /* Make space to save 9 registers: ra, fp, s1--s7. */ - int stack_adjust = 9 * sizeof(u32), store_offset, bpf_stack_adjust; const s8 *fp = bpf2rv32[BPF_REG_FP]; const s8 *r1 = bpf2rv32[BPF_REG_1]; - - bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); + int stack_adjust = 0; + int bpf_stack_adjust = + round_up(ctx->prog->aux->stack_depth, STACK_ALIGN); + + /* Make space for callee-saved registers. */ + stack_adjust += NR_SAVED_REGISTERS * sizeof(u32); + /* Make space for BPF registers on stack. */ + stack_adjust += BPF_JIT_SCRATCH_REGS * sizeof(u32); + /* Make space for BPF stack. */ stack_adjust += bpf_stack_adjust; - - store_offset = stack_adjust - 4; - - stack_adjust += 4 * BPF_JIT_SCRATCH_REGS; + /* Round up for stack alignment. */ + stack_adjust = round_up(stack_adjust, STACK_ALIGN); /* * The first instruction sets the tail-call-counter (TCC) register. @@ -1281,24 +1313,24 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx) emit(rv_addi(RV_REG_SP, RV_REG_SP, -stack_adjust), ctx); /* Save callee-save registers. */ - emit(rv_sw(RV_REG_SP, store_offset - 0, RV_REG_RA), ctx); - emit(rv_sw(RV_REG_SP, store_offset - 4, RV_REG_FP), ctx); - emit(rv_sw(RV_REG_SP, store_offset - 8, RV_REG_S1), ctx); - emit(rv_sw(RV_REG_SP, store_offset - 12, RV_REG_S2), ctx); - emit(rv_sw(RV_REG_SP, store_offset - 16, RV_REG_S3), ctx); - emit(rv_sw(RV_REG_SP, store_offset - 20, RV_REG_S4), ctx); - emit(rv_sw(RV_REG_SP, store_offset - 24, RV_REG_S5), ctx); - emit(rv_sw(RV_REG_SP, store_offset - 28, RV_REG_S6), ctx); - emit(rv_sw(RV_REG_SP, store_offset - 32, RV_REG_S7), ctx); + emit(rv_sw(RV_REG_SP, stack_adjust - 4, RV_REG_RA), ctx); + emit(rv_sw(RV_REG_SP, stack_adjust - 8, RV_REG_FP), ctx); + emit(rv_sw(RV_REG_SP, stack_adjust - 12, RV_REG_S1), ctx); + emit(rv_sw(RV_REG_SP, stack_adjust - 16, RV_REG_S2), ctx); + emit(rv_sw(RV_REG_SP, stack_adjust - 20, RV_REG_S3), ctx); + emit(rv_sw(RV_REG_SP, stack_adjust - 24, RV_REG_S4), ctx); + emit(rv_sw(RV_REG_SP, stack_adjust - 28, RV_REG_S5), ctx); + emit(rv_sw(RV_REG_SP, stack_adjust - 32, RV_REG_S6), ctx); + emit(rv_sw(RV_REG_SP, stack_adjust - 36, RV_REG_S7), ctx); /* Set fp: used as the base address for stacked BPF registers. */ emit(rv_addi(RV_REG_FP, RV_REG_SP, stack_adjust), ctx); - /* Set up BPF stack pointer. */ + /* Set up BPF frame pointer. */ emit(rv_addi(lo(fp), RV_REG_SP, bpf_stack_adjust), ctx); emit(rv_addi(hi(fp), RV_REG_ZERO, 0), ctx); - /* Set up context pointer. */ + /* Set up BPF context pointer. */ emit(rv_addi(lo(r1), RV_REG_A0, 0), ctx); emit(rv_addi(hi(r1), RV_REG_ZERO, 0), ctx); -- cgit v1.2.3-59-g8ed1b From 063e688133914505ddb396cc33231f22f12e0685 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 29 Apr 2020 19:14:36 -0700 Subject: libbpf: Fix false uninitialized variable warning Some versions of GCC falsely detect that vi might not be initialized. That's not true, but let's silence it with NULL initialization. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430021436.1522502-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d86ff8214b96..977add1b73e2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5003,8 +5003,8 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data *data) { int i, j, nrels, new_sz, ptr_sz = sizeof(void *); + const struct btf_var_secinfo *vi = NULL; const struct btf_type *sec, *var, *def; - const struct btf_var_secinfo *vi; const struct btf_member *member; struct bpf_map *map, *targ_map; const char *name, *mname; -- cgit v1.2.3-59-g8ed1b From c321022244708aec4675de4f032ef1ba9ff0c640 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 30 Apr 2020 12:47:38 +0200 Subject: selftests/bpf: Test allowed maps for bpf_sk_select_reuseport Check that verifier allows passing a map of type: BPF_MAP_TYPE_REUSEPORT_SOCKARRARY, or BPF_MAP_TYPE_SOCKMAP, or BPF_MAP_TYPE_SOCKHASH ... to bpf_sk_select_reuseport helper. Suggested-by: John Fastabend Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430104738.494180-1-jakub@cloudflare.com --- tools/testing/selftests/bpf/test_verifier.c | 12 +++++++- tools/testing/selftests/bpf/verifier/sock.c | 45 +++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index ad6939c67c5e..21a1ce219c1c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -50,7 +50,7 @@ #define MAX_INSNS BPF_MAXINSNS #define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 -#define MAX_NR_MAPS 19 +#define MAX_NR_MAPS 20 #define MAX_TEST_RUNS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 @@ -86,6 +86,7 @@ struct bpf_test { int fixup_map_array_small[MAX_FIXUPS]; int fixup_sk_storage_map[MAX_FIXUPS]; int fixup_map_event_output[MAX_FIXUPS]; + int fixup_map_reuseport_array[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; uint32_t insn_processed; @@ -637,6 +638,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, int *fixup_map_array_small = test->fixup_map_array_small; int *fixup_sk_storage_map = test->fixup_sk_storage_map; int *fixup_map_event_output = test->fixup_map_event_output; + int *fixup_map_reuseport_array = test->fixup_map_reuseport_array; if (test->fill_helper) { test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); @@ -806,6 +808,14 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, fixup_map_event_output++; } while (*fixup_map_event_output); } + if (*fixup_map_reuseport_array) { + map_fds[19] = __create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + sizeof(u32), sizeof(u64), 1, 0); + do { + prog[*fixup_map_reuseport_array].imm = map_fds[19]; + fixup_map_reuseport_array++; + } while (*fixup_map_reuseport_array); + } } static int set_admin(bool admin) diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index f87ad69dbc62..0bc51ad9e0fb 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -586,3 +586,48 @@ .prog_type = BPF_PROG_TYPE_SK_SKB, .result = ACCEPT, }, +{ + "bpf_sk_select_reuseport(ctx, reuseport_array, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_reuseport_array = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, sockmap, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, sockhash, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, -- cgit v1.2.3-59-g8ed1b From 72d3fef16158b9c1852855a3846757ec165c16e1 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Wed, 22 Apr 2020 09:19:08 +0300 Subject: net/mlx5: IPsec, Fix coverity issue The cited commit introduced the following coverity issue at functions mlx5_fpga_is_ipsec_device() and mlx5_fpga_ipsec_release_sa_ctx(): - bit_and_with_zero: accel_xfrm->attrs.action & MLX5_ACCEL_ESP_ACTION_DECRYPT is always 0. As MLX5_ACCEL_ESP_ACTION_DECRYPT is not a bitwise flag and was wrongly used with bitwise operation, the above expression is always zero value as MLX5_ACCEL_ESP_ACTION_DECRYPT is zero. Fix by using "==" comparison operator instead. Fixes: 7dfee4b1d79e ("net/mlx5: IPsec, Refactor SA handle creation and destruction") Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c index 0604216eb94f..b463787d6ca1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -708,7 +708,7 @@ void *mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev, goto exists; } - if (accel_xfrm->attrs.action & MLX5_ACCEL_ESP_ACTION_DECRYPT) { + if (accel_xfrm->attrs.action == MLX5_ACCEL_ESP_ACTION_DECRYPT) { err = ida_simple_get(&fipsec->halloc, 1, 0, GFP_KERNEL); if (err < 0) { context = ERR_PTR(err); @@ -759,7 +759,7 @@ delete_hash: rhash_sa)); unlock_hash: mutex_unlock(&fipsec->sa_hash_lock); - if (accel_xfrm->attrs.action & MLX5_ACCEL_ESP_ACTION_DECRYPT) + if (accel_xfrm->attrs.action == MLX5_ACCEL_ESP_ACTION_DECRYPT) ida_simple_remove(&fipsec->halloc, sa_ctx->sa_handle); exists: mutex_unlock(&fpga_xfrm->lock); -- cgit v1.2.3-59-g8ed1b From 9c8e7434e0349b26df82ed25522e812e4feeb873 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 3 Apr 2020 05:49:02 -0500 Subject: net/mlx5e: Use helper API to get devlink port index for all port flavours Use existing helper API to get unique devlink port index for all devlink port flavours. Reviewed-by: Jiri Pirko Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 55457f268495..2de54d865dc8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -2056,26 +2056,22 @@ static int register_devlink_port(struct mlx5_core_dev *dev, return 0; mlx5e_rep_get_port_parent_id(rpriv->netdev, &ppid); + dl_port_index = vport_to_devlink_port_index(dev, rep->vport); pfnum = PCI_FUNC(dev->pdev->devfn); - if (rep->vport == MLX5_VPORT_UPLINK) { + if (rep->vport == MLX5_VPORT_UPLINK) devlink_port_attrs_set(&rpriv->dl_port, DEVLINK_PORT_FLAVOUR_PHYSICAL, pfnum, false, 0, &ppid.id[0], ppid.id_len); - dl_port_index = vport_to_devlink_port_index(dev, rep->vport); - } else if (rep->vport == MLX5_VPORT_PF) { + else if (rep->vport == MLX5_VPORT_PF) devlink_port_attrs_pci_pf_set(&rpriv->dl_port, &ppid.id[0], ppid.id_len, pfnum); - dl_port_index = rep->vport; - } else if (mlx5_eswitch_is_vf_vport(dev->priv.eswitch, - rpriv->rep->vport)) { + else if (mlx5_eswitch_is_vf_vport(dev->priv.eswitch, rpriv->rep->vport)) devlink_port_attrs_pci_vf_set(&rpriv->dl_port, &ppid.id[0], ppid.id_len, pfnum, rep->vport - 1); - dl_port_index = vport_to_devlink_port_index(dev, rep->vport); - } return devlink_port_register(devlink, &rpriv->dl_port, dl_port_index); } -- cgit v1.2.3-59-g8ed1b From e59b254cbecc10088b691a2abdaeb6ded872b7a1 Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Fri, 24 Apr 2020 16:43:57 +0800 Subject: net/mlx5e: Remove unneeded semicolon Fixes coccicheck warning: drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c:690:2-3: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: Zheng Bin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 1079558d292a..77b3f372e831 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -687,7 +687,7 @@ mlx5_tc_ct_block_flow_offload(enum tc_setup_type type, void *type_data, return mlx5_tc_ct_block_flow_offload_stats(ft, f); default: break; - }; + } return -EOPNOTSUPP; } -- cgit v1.2.3-59-g8ed1b From 70a5698a5683cd504b03c6030ee622b1bec3f702 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Sun, 26 Apr 2020 09:52:02 +0300 Subject: net/mlx5e: CT: Avoid false warning about rule may be used uninitialized Avoid gcc warning by preset rule to invalid ptr. Fixes: 4c3844d9e97e ("net/mlx5e: CT: Introduce connection tracking") Signed-off-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 77b3f372e831..44f806e79e8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -1131,7 +1131,7 @@ mlx5_tc_ct_flow_offload(struct mlx5e_priv *priv, { bool clear_action = attr->ct_attr.ct_action & TCA_CT_ACT_CLEAR; struct mlx5_tc_ct_priv *ct_priv = mlx5_tc_ct_get_ct_priv(priv); - struct mlx5_flow_handle *rule; + struct mlx5_flow_handle *rule = ERR_PTR(-EINVAL); int err; if (!ct_priv) -- cgit v1.2.3-59-g8ed1b From d2658b4a1d06e8458f2c88ee600afa1b1acd2627 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 14 Apr 2020 11:30:39 +0300 Subject: net/mlx5: CT: Remove unused variables Signed-off-by: Paul Blakey Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 44f806e79e8d..5568ded97e0b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -73,9 +73,7 @@ struct mlx5_ct_ft { struct mlx5_ct_entry { u16 zone; struct rhash_head node; - struct flow_rule *flow_rule; struct mlx5_fc *counter; - unsigned long lastuse; unsigned long cookie; unsigned long restore_cookie; struct mlx5_ct_zone_rule zone_rules[2]; @@ -603,7 +601,6 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, return -ENOMEM; entry->zone = ft->zone; - entry->flow_rule = flow_rule; entry->cookie = flow->cookie; entry->restore_cookie = meta_action->ct_metadata.cookie; -- cgit v1.2.3-59-g8ed1b From 51dde00b8fb3cf07e577be6aa4d98ee1f34b84be Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 15 Apr 2020 19:34:03 +0300 Subject: net/mlx5: Remove unused field in EQ The size field in EQ is not in use. Remove it. Signed-off-by: Tariq Toukan Reviewed-by: Tal Gilboa Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h index 4be4d2d36218..4aaca7400fb2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h @@ -27,7 +27,6 @@ struct mlx5_eq { __be32 __iomem *doorbell; u32 cons_index; struct mlx5_frag_buf buf; - int size; unsigned int vecidx; unsigned int irqn; u8 eqn; -- cgit v1.2.3-59-g8ed1b From c655c1f46957eb4f30221c52580e38f85058e167 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 30 Mar 2020 14:27:08 +0300 Subject: net/mlx5: Add helper function to release fw page Factor out the fwp address release page to an helper function, will be used in the downstream patch. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/pagealloc.c | 30 ++++++++++++---------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 3d6f617abb7d..c39907c641a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -181,25 +181,17 @@ static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr) #define MLX5_U64_4K_PAGE_MASK ((~(u64)0U) << PAGE_SHIFT) -static void free_4k(struct mlx5_core_dev *dev, u64 addr) +static void free_fwp(struct mlx5_core_dev *dev, struct fw_page *fwp) { - struct fw_page *fwp; - int n; + int n = (fwp->addr & ~MLX5_U64_4K_PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT; - fwp = find_fw_page(dev, addr & MLX5_U64_4K_PAGE_MASK); - if (!fwp) { - mlx5_core_warn(dev, "page not found\n"); - return; - } - - n = (addr & ~MLX5_U64_4K_PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT; fwp->free_count++; set_bit(n, &fwp->bitmask); if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) { rb_erase(&fwp->rb_node, &dev->priv.page_root); if (fwp->free_count != 1) list_del(&fwp->list); - dma_unmap_page(dev->device, addr & MLX5_U64_4K_PAGE_MASK, + dma_unmap_page(dev->device, fwp->addr & MLX5_U64_4K_PAGE_MASK, PAGE_SIZE, DMA_BIDIRECTIONAL); __free_page(fwp->page); kfree(fwp); @@ -208,6 +200,18 @@ static void free_4k(struct mlx5_core_dev *dev, u64 addr) } } +static void free_addr(struct mlx5_core_dev *dev, u64 addr) +{ + struct fw_page *fwp; + + fwp = find_fw_page(dev, addr & MLX5_U64_4K_PAGE_MASK); + if (!fwp) { + mlx5_core_warn(dev, "page not found\n"); + return; + } + free_fwp(dev, fwp); +} + static int alloc_system_page(struct mlx5_core_dev *dev, u16 func_id) { struct device *device = dev->device; @@ -329,7 +333,7 @@ retry: out_4k: for (i--; i >= 0; i--) - free_4k(dev, MLX5_GET64(manage_pages_in, in, pas[i])); + free_addr(dev, MLX5_GET64(manage_pages_in, in, pas[i])); out_free: kvfree(in); if (notify_fail) @@ -408,7 +412,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, } for (i = 0; i < num_claimed; i++) - free_4k(dev, MLX5_GET64(manage_pages_out, out, pas[i])); + free_addr(dev, MLX5_GET64(manage_pages_out, out, pas[i])); if (nclaimed) *nclaimed = num_claimed; -- cgit v1.2.3-59-g8ed1b From c7636942d278db7f502c626a47d2ce1111602716 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Sun, 19 Apr 2020 10:20:40 +0300 Subject: net/mlx5: Rate limit page not found error messages Thousands of pages are released with free_addr() function. In case of buggy sync between FW and driver on released address, the log will be flooded with error messages. Use mlx5_core_warn_rl() to limit it. Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index c39907c641a0..c790d6e3d204 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -206,7 +206,7 @@ static void free_addr(struct mlx5_core_dev *dev, u64 addr) fwp = find_fw_page(dev, addr & MLX5_U64_4K_PAGE_MASK); if (!fwp) { - mlx5_core_warn(dev, "page not found\n"); + mlx5_core_warn_rl(dev, "page not found\n"); return; } free_fwp(dev, fwp); -- cgit v1.2.3-59-g8ed1b From c6168161f693e6d26cdcce891f99399f1432ac80 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 1 Apr 2020 10:30:32 +0300 Subject: net/mlx5: Add support for release all pages event If FW sets release_all_pages bit in MLX5_EVENT_TYPE_PAGE_REQUEST, driver shall release all pages of a given function id, with no further pages reclaim negotiation with FW nor MANAGE_PAGES commands from driver towards FW. Upon receiving this bit as part of pages reclaim event, driver will initiate release all flow, in which it will iterate and release all function's pages. As part of driver <-> FW capabilities handshake, FW will report release_all_pages max HCA cap bit, and driver will set the release_all_pages bit in HCA cap. NIC: ConnectX-4 Lx CPU: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz Test case: Simulataniously FLR 4 VFs, and measure FW release pages by driver. Before: 3.18 Sec After: 0.31 Sec Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 ++ .../net/ethernet/mellanox/mlx5/core/pagealloc.c | 41 ++++++++++++++++++++-- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fbbf51026b52..742ba012c234 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -549,6 +549,9 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) num_vhca_ports, MLX5_CAP_GEN_MAX(dev, num_vhca_ports)); + if (MLX5_CAP_GEN_MAX(dev, release_all_pages)) + MLX5_SET(cmd_hca_cap, set_hca_cap, release_all_pages, 1); + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index c790d6e3d204..8ce78f42dfc0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -50,6 +50,7 @@ struct mlx5_pages_req { u8 ec_function; s32 npages; struct work_struct work; + u8 release_all; }; struct fw_page { @@ -341,6 +342,33 @@ out_free: return err; } +static void release_all_pages(struct mlx5_core_dev *dev, u32 func_id, + bool ec_function) +{ + struct rb_node *p; + int npages = 0; + + p = rb_first(&dev->priv.page_root); + while (p) { + struct fw_page *fwp = rb_entry(p, struct fw_page, rb_node); + + p = rb_next(p); + if (fwp->func_id != func_id) + continue; + free_fwp(dev, fwp); + npages++; + } + + dev->priv.fw_pages -= npages; + if (func_id) + dev->priv.vfs_pages -= npages; + else if (mlx5_core_is_ecpf(dev) && !ec_function) + dev->priv.peer_pf_pages -= npages; + + mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x\n", + npages, ec_function, func_id); +} + static int reclaim_pages_cmd(struct mlx5_core_dev *dev, u32 *in, int in_size, u32 *out, int out_size) { @@ -434,7 +462,9 @@ static void pages_work_handler(struct work_struct *work) struct mlx5_core_dev *dev = req->dev; int err = 0; - if (req->npages < 0) + if (req->release_all) + release_all_pages(dev, req->func_id, req->ec_function); + else if (req->npages < 0) err = reclaim_pages(dev, req->func_id, -1 * req->npages, NULL, req->ec_function); else if (req->npages > 0) @@ -449,6 +479,7 @@ static void pages_work_handler(struct work_struct *work) enum { EC_FUNCTION_MASK = 0x8000, + RELEASE_ALL_PAGES_MASK = 0x4000, }; static int req_pages_handler(struct notifier_block *nb, @@ -459,6 +490,7 @@ static int req_pages_handler(struct notifier_block *nb, struct mlx5_priv *priv; struct mlx5_eqe *eqe; bool ec_function; + bool release_all; u16 func_id; s32 npages; @@ -469,8 +501,10 @@ static int req_pages_handler(struct notifier_block *nb, func_id = be16_to_cpu(eqe->data.req_pages.func_id); npages = be32_to_cpu(eqe->data.req_pages.num_pages); ec_function = be16_to_cpu(eqe->data.req_pages.ec_function) & EC_FUNCTION_MASK; - mlx5_core_dbg(dev, "page request for func 0x%x, npages %d\n", - func_id, npages); + release_all = be16_to_cpu(eqe->data.req_pages.ec_function) & + RELEASE_ALL_PAGES_MASK; + mlx5_core_dbg(dev, "page request for func 0x%x, npages %d, release_all %d\n", + func_id, npages, release_all); req = kzalloc(sizeof(*req), GFP_ATOMIC); if (!req) { mlx5_core_warn(dev, "failed to allocate pages request\n"); @@ -481,6 +515,7 @@ static int req_pages_handler(struct notifier_block *nb, req->func_id = func_id; req->npages = npages; req->ec_function = ec_function; + req->release_all = release_all; INIT_WORK(&req->work, pages_work_handler); queue_work(dev->priv.pg_wq, &req->work); return NOTIFY_OK; -- cgit v1.2.3-59-g8ed1b From e658664c77c11c5ba77dfb231138505ecac71c80 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 8 Aug 2019 12:32:17 +0300 Subject: net/mlx5e: Use proper name field for the UMR key Even though some of the WQE control segment's field share the same memory bits (a union of fields), prefer having the right field name for every different usage. Signed-off-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 10c933e5da9a..bf3fdbea1074 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -233,7 +233,7 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, cseg->qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) | ds_cnt); cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; - cseg->imm = rq->mkey_be; + cseg->umr_mkey = rq->mkey_be; ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE; ucseg->xlt_octowords = -- cgit v1.2.3-59-g8ed1b From f1b95753eeedc00f1223e8033d96dae9f996ca6d Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 9 Feb 2020 17:06:49 +0200 Subject: net/mlx5e: TX, Generalise code and usage of error CQE dump Error CQE was dumped only for TXQ SQs. Generalise the function, and add usage for error completions on ICO SQs and XDP SQs. Signed-off-by: Tariq Toukan Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 16 ++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 13 ++++++++----- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 18 +----------------- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 9f6967d76053..c0249fc77eaa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -189,6 +189,22 @@ static inline void mlx5e_rqwq_reset(struct mlx5e_rq *rq) } } +static inline void mlx5e_dump_error_cqe(struct mlx5e_cq *cq, u32 sqn, + struct mlx5_err_cqe *err_cqe) +{ + struct mlx5_cqwq *wq = &cq->wq; + u32 ci; + + ci = mlx5_cqwq_ctr2ix(wq, wq->cc - 1); + + netdev_err(cq->channel->netdev, + "Error cqe on cqn 0x%x, ci 0x%x, sqn 0x%x, opcode 0x%x, syndrome 0x%x, vendor syndrome 0x%x\n", + cq->mcq.cqn, ci, sqn, + get_cqe_opcode((struct mlx5_cqe64 *)err_cqe), + err_cqe->syndrome, err_cqe->vendor_err_synd); + mlx5_dump_err_cqe(cq->mdev, err_cqe); +} + /* SW parser related functions */ struct mlx5e_swp_spec { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index f049e0ac308a..f9dad2639061 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -415,11 +415,6 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) wqe_counter = be16_to_cpu(cqe->wqe_counter); - if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) - netdev_WARN_ONCE(sq->channel->netdev, - "Bad OP in XDPSQ CQE: 0x%x\n", - get_cqe_opcode(cqe)); - do { struct mlx5e_xdp_wqe_info *wi; u16 ci; @@ -432,6 +427,14 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, true); } while (!last_wqe); + + if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { + netdev_WARN_ONCE(sq->channel->netdev, + "Bad OP in XDPSQ CQE: 0x%x\n", + get_cqe_opcode(cqe)); + mlx5e_dump_error_cqe(&sq->cq, sq->sqn, + (struct mlx5_err_cqe *)cqe); + } } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); if (xsk_frames) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index e2beb89c1832..4db1c92f0019 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -631,6 +631,8 @@ int mlx5e_poll_ico_cq(struct mlx5e_cq *cq) netdev_WARN_ONCE(cq->channel->netdev, "Bad OP in ICOSQ CQE: 0x%x\n", get_cqe_opcode(cqe)); + mlx5e_dump_error_cqe(&sq->cq, sq->sqn, + (struct mlx5_err_cqe *)cqe); if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) queue_work(cq->channel->priv->wq, &sq->recover_work); break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index fd6b2a1898c5..1679557f34c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -399,22 +399,6 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) return mlx5e_sq_xmit(sq, skb, wqe, pi, netdev_xmit_more()); } -static void mlx5e_dump_error_cqe(struct mlx5e_txqsq *sq, - struct mlx5_err_cqe *err_cqe) -{ - struct mlx5_cqwq *wq = &sq->cq.wq; - u32 ci; - - ci = mlx5_cqwq_ctr2ix(wq, wq->cc - 1); - - netdev_err(sq->channel->netdev, - "Error cqe on cqn 0x%x, ci 0x%x, sqn 0x%x, opcode 0x%x, syndrome 0x%x, vendor syndrome 0x%x\n", - sq->cq.mcq.cqn, ci, sq->sqn, - get_cqe_opcode((struct mlx5_cqe64 *)err_cqe), - err_cqe->syndrome, err_cqe->vendor_err_synd); - mlx5_dump_err_cqe(sq->cq.mdev, err_cqe); -} - bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) { struct mlx5e_sq_stats *stats; @@ -501,7 +485,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) if (unlikely(get_cqe_opcode(cqe) == MLX5_CQE_REQ_ERR)) { if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) { - mlx5e_dump_error_cqe(sq, + mlx5e_dump_error_cqe(&sq->cq, sq->sqn, (struct mlx5_err_cqe *)cqe); mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs); queue_work(cq->channel->priv->wq, -- cgit v1.2.3-59-g8ed1b From e2e11dbf36936d9cfea99c5b7386acea67b21634 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 9 Feb 2020 17:13:23 +0200 Subject: net/mlx5e: XDP, Print the offending TX descriptor on error completion Upon an error completion on an XDP SQ, print the offending WQE to ease the debug process. Signed-off-by: Tariq Toukan Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index f9dad2639061..6f32a697a4bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -408,7 +408,8 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) i = 0; do { - u16 wqe_counter; + struct mlx5e_xdp_wqe_info *wi; + u16 wqe_counter, ci; bool last_wqe; mlx5_cqwq_pop(&cq->wq); @@ -416,9 +417,6 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) wqe_counter = be16_to_cpu(cqe->wqe_counter); do { - struct mlx5e_xdp_wqe_info *wi; - u16 ci; - last_wqe = (sqcc == wqe_counter); ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); wi = &sq->db.wqe_info[ci]; @@ -434,6 +432,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) get_cqe_opcode(cqe)); mlx5e_dump_error_cqe(&sq->cq, sq->sqn, (struct mlx5_err_cqe *)cqe); + mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs); } } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); -- cgit v1.2.3-59-g8ed1b From fed0c6cfcd58f29ff60f47559b88a6289b6b680a Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Fri, 15 Nov 2019 13:48:38 +0200 Subject: net/mlx5e: Fetch WQE: reuse code and enforce typing There are multiple functions mlx5{e,i}_*_fetch_wqe that contain the same code, that is repeated, because they operate on different SQ struct types. mlx5e_sq_fetch_wqe also returns void *, instead of the concrete WQE type. This commit generalizes the fetch WQE operation by putting this code into a single function. To simplify calls of the generic function in concrete use cases, macros are provided that substitute the right WQE size and cast the return type. Before this patch, fetch_wqe used to calculate pi itself, but the value was often known to the caller. This calculation is moved outside to eliminate this unnecessary step and prepare for the fill_frag_edge refactoring in the next patch. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 12 ++++++------ drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 13 ------------- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h | 8 ++++++++ .../net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 15 ++++++++++----- .../net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 10 ++++++---- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 11 ++--------- 8 files changed, 38 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index c0249fc77eaa..8682d9148ab9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -33,19 +33,19 @@ mlx5e_wqc_has_room_for(struct mlx5_wq_cyc *wq, u16 cc, u16 pc, u16 n) return (mlx5_wq_cyc_ctr2ix(wq, cc - pc) >= n) || (cc == pc); } -static inline void * -mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq, size_t size, u16 *pi) +static inline void *mlx5e_fetch_wqe(struct mlx5_wq_cyc *wq, u16 pi, size_t wqe_size) { - struct mlx5_wq_cyc *wq = &sq->wq; void *wqe; - *pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - wqe = mlx5_wq_cyc_get_wqe(wq, *pi); - memset(wqe, 0, size); + wqe = mlx5_wq_cyc_get_wqe(wq, pi); + memset(wqe, 0, wqe_size); return wqe; } +#define MLX5E_TX_FETCH_WQE(sq, pi) \ + ((struct mlx5e_tx_wqe *)mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_tx_wqe))) + static inline struct mlx5e_tx_wqe * mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 6f32a697a4bf..cf089520c031 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -188,10 +188,12 @@ static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq) pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); - if (unlikely(contig_wqebbs < MLX5_SEND_WQE_MAX_WQEBBS)) + if (unlikely(contig_wqebbs < MLX5_SEND_WQE_MAX_WQEBBS)) { mlx5e_fill_xdpsq_frag_edge(sq, wq, pi, contig_wqebbs); + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + } - session->wqe = mlx5e_xdpsq_fetch_wqe(sq, &pi); + session->wqe = MLX5E_TX_FETCH_WQE(sq, pi); prefetchw(session->wqe->data); session->ds_count = MLX5E_XDP_TX_EMPTY_DS_COUNT; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index d7587f40ecae..4fd0ff47bdc3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -186,19 +186,6 @@ mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, session->ds_count++; } -static inline struct mlx5e_tx_wqe * -mlx5e_xdpsq_fetch_wqe(struct mlx5e_xdpsq *sq, u16 *pi) -{ - struct mlx5_wq_cyc *wq = &sq->wq; - struct mlx5e_tx_wqe *wqe; - - *pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - wqe = mlx5_wq_cyc_get_wqe(wq, *pi); - memset(wqe, 0, sizeof(*wqe)); - - return wqe; -} - static inline void mlx5e_xdpi_fifo_push(struct mlx5e_xdp_info_fifo *fifo, struct mlx5e_xdp_info *xi) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h index 63116be6b1d6..9daaec244385 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h @@ -27,6 +27,14 @@ struct mlx5e_dump_wqe { struct mlx5_wqe_data_seg data; }; +#define MLX5E_TLS_FETCH_UMR_WQE(sq, pi) \ + ((struct mlx5e_umr_wqe *)mlx5e_fetch_wqe(&(sq)->wq, pi, MLX5E_KTLS_STATIC_UMR_WQE_SZ)) +#define MLX5E_TLS_FETCH_PROGRESS_WQE(sq, pi) \ + ((struct mlx5e_tx_wqe *)mlx5e_fetch_wqe(&(sq)->wq, pi, MLX5E_KTLS_PROGRESS_WQE_SZ)) +#define MLX5E_TLS_FETCH_DUMP_WQE(sq, pi) \ + ((struct mlx5e_dump_wqe *)mlx5e_fetch_wqe(&(sq)->wq, pi, \ + sizeof(struct mlx5e_dump_wqe))) + #define MLX5E_KTLS_DUMP_WQEBBS \ (DIV_ROUND_UP(sizeof(struct mlx5e_dump_wqe), MLX5_SEND_WQE_BB)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 52a56622034a..717d36b45aa9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -137,7 +137,8 @@ post_static_params(struct mlx5e_txqsq *sq, struct mlx5e_umr_wqe *umr_wqe; u16 pi; - umr_wqe = mlx5e_sq_fetch_wqe(sq, MLX5E_KTLS_STATIC_UMR_WQE_SZ, &pi); + pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + umr_wqe = MLX5E_TLS_FETCH_UMR_WQE(sq, pi); build_static_params(umr_wqe, sq->pc, sq->sqn, priv_tx, fence); tx_fill_wi(sq, pi, MLX5E_KTLS_STATIC_WQEBBS, 0, NULL); sq->pc += MLX5E_KTLS_STATIC_WQEBBS; @@ -151,7 +152,8 @@ post_progress_params(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe *wqe; u16 pi; - wqe = mlx5e_sq_fetch_wqe(sq, MLX5E_KTLS_PROGRESS_WQE_SZ, &pi); + pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + wqe = MLX5E_TLS_FETCH_PROGRESS_WQE(sq, pi); build_progress_params(wqe, sq->pc, sq->sqn, priv_tx, fence); tx_fill_wi(sq, pi, MLX5E_KTLS_PROGRESS_WQEBBS, 0, NULL); sq->pc += MLX5E_KTLS_PROGRESS_WQEBBS; @@ -278,7 +280,8 @@ tx_post_resync_dump(struct mlx5e_txqsq *sq, skb_frag_t *frag, u32 tisn, bool fir int fsz; u16 pi; - wqe = mlx5e_sq_fetch_wqe(sq, sizeof(*wqe), &pi); + pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + wqe = MLX5E_TLS_FETCH_DUMP_WQE(sq, pi); ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; @@ -449,7 +452,8 @@ struct sk_buff *mlx5e_ktls_handle_tx_skb(struct net_device *netdev, if (unlikely(mlx5e_ktls_tx_offload_test_and_clear_pending(priv_tx))) { mlx5e_ktls_tx_post_param_wqes(sq, priv_tx, false, false); - *wqe = mlx5e_sq_fetch_wqe(sq, sizeof(**wqe), pi); + *pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + *wqe = MLX5E_TX_FETCH_WQE(sq, *pi); stats->tls_ctx++; } @@ -460,7 +464,8 @@ struct sk_buff *mlx5e_ktls_handle_tx_skb(struct net_device *netdev, switch (ret) { case MLX5E_KTLS_SYNC_DONE: - *wqe = mlx5e_sq_fetch_wqe(sq, sizeof(**wqe), pi); + *pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + *wqe = MLX5E_TX_FETCH_WQE(sq, *pi); break; case MLX5E_KTLS_SYNC_SKIP_NO_DATA: if (likely(!skb->decrypted)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c index ef1ed15a53b4..1d7ddeb7a46b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c @@ -248,7 +248,8 @@ mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context_tx *context, mlx5e_tls_complete_sync_skb(skb, nskb, tcp_seq, headln, cpu_to_be64(info.rcd_sn)); mlx5e_sq_xmit(sq, nskb, *wqe, *pi, true); - *wqe = mlx5e_sq_fetch_wqe(sq, sizeof(**wqe), pi); + *pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + *wqe = MLX5E_TX_FETCH_WQE(sq, *pi); return skb; err_out: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 1679557f34c0..ec1429596cb7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -324,7 +324,8 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_ctrl_seg cur_ctrl = wqe->ctrl; #endif mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room); - wqe = mlx5e_sq_fetch_wqe(sq, sizeof(*wqe), &pi); + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + wqe = MLX5E_TX_FETCH_WQE(sq, pi); #ifdef CONFIG_MLX5_EN_IPSEC wqe->eth = cur_eth; #endif @@ -389,7 +390,8 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) u16 pi; sq = priv->txq2sq[skb_get_queue_mapping(skb)]; - wqe = mlx5e_sq_fetch_wqe(sq, sizeof(*wqe), &pi); + pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + wqe = MLX5E_TX_FETCH_WQE(sq, pi); /* might send skbs and update wqe and pi */ skb = mlx5e_accel_handle_tx(skb, sq, dev, &wqe, &pi); @@ -622,10 +624,10 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); if (unlikely(contig_wqebbs_room < num_wqebbs)) { mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room); - pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); } - mlx5i_sq_fetch_wqe(sq, &wqe, pi); + wqe = MLX5I_SQ_FETCH_WQE(sq, pi); /* fill wqe */ wi = &sq->db.wqe_info[pi]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h index 3483ba642cfe..7844ab5d0ce7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h @@ -110,15 +110,8 @@ struct mlx5i_tx_wqe { struct mlx5_wqe_data_seg data[]; }; -static inline void mlx5i_sq_fetch_wqe(struct mlx5e_txqsq *sq, - struct mlx5i_tx_wqe **wqe, - u16 pi) -{ - struct mlx5_wq_cyc *wq = &sq->wq; - - *wqe = mlx5_wq_cyc_get_wqe(wq, pi); - memset(*wqe, 0, sizeof(**wqe)); -} +#define MLX5I_SQ_FETCH_WQE(sq, pi) \ + ((struct mlx5i_tx_wqe *)mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5i_tx_wqe))) netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_av *av, u32 dqpn, u32 dqkey, -- cgit v1.2.3-59-g8ed1b From 7d42c8e9ab50091e4dda29066975fbb7aa0f1585 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 16 Apr 2020 11:32:42 +0300 Subject: net/mlx5e: Rename ICOSQ WQE info struct and field Structs mlx5e_txqsq and mlx5e_xdpsq contain wqe_info arrays to store supplementary information corresponding to WQEs in the queue. Struct mlx5e_icosq also has such an array, but it's called differently - ico_wqe. This patch renames it to unify with the other SQs. In addition, rename the struct to emphasize its specific usage. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++----- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 12 ++++++------ drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e8508c74eaa8..0864b76ca2c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -370,7 +370,7 @@ enum { MLX5E_SQ_STATE_PENDING_XSK_TX, }; -struct mlx5e_sq_wqe_info { +struct mlx5e_icosq_wqe_info { u8 opcode; u8 num_wqebbs; @@ -552,7 +552,7 @@ struct mlx5e_icosq { /* write@xmit, read@completion */ struct { - struct mlx5e_sq_wqe_info *ico_wqe; + struct mlx5e_icosq_wqe_info *wqe_info; } db; /* read only */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bf3fdbea1074..048a4f8601a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1027,17 +1027,17 @@ static void mlx5e_free_xdpsq(struct mlx5e_xdpsq *sq) static void mlx5e_free_icosq_db(struct mlx5e_icosq *sq) { - kvfree(sq->db.ico_wqe); + kvfree(sq->db.wqe_info); } static int mlx5e_alloc_icosq_db(struct mlx5e_icosq *sq, int numa) { int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + size_t size; - sq->db.ico_wqe = kvzalloc_node(array_size(wq_sz, - sizeof(*sq->db.ico_wqe)), - GFP_KERNEL, numa); - if (!sq->db.ico_wqe) + size = array_size(wq_sz, sizeof(*sq->db.wqe_info)); + sq->db.wqe_info = kvzalloc_node(size, GFP_KERNEL, numa); + if (!sq->db.wqe_info) return -ENOMEM; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 4db1c92f0019..9f33a0e7dd9a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -472,7 +472,7 @@ static inline void mlx5e_fill_icosq_frag_edge(struct mlx5e_icosq *sq, struct mlx5_wq_cyc *wq, u16 pi, u16 nnops) { - struct mlx5e_sq_wqe_info *edge_wi, *wi = &sq->db.ico_wqe[pi]; + struct mlx5e_icosq_wqe_info *edge_wi, *wi = &sq->db.wqe_info[pi]; edge_wi = wi + nnops; @@ -527,9 +527,9 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) MLX5_OPCODE_UMR); umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset); - sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR; - sq->db.ico_wqe[pi].num_wqebbs = MLX5E_UMR_WQEBBS; - sq->db.ico_wqe[pi].umr.rq = rq; + sq->db.wqe_info[pi].opcode = MLX5_OPCODE_UMR; + sq->db.wqe_info[pi].num_wqebbs = MLX5E_UMR_WQEBBS; + sq->db.wqe_info[pi].umr.rq = rq; sq->pc += MLX5E_UMR_WQEBBS; sq->doorbell_cseg = &umr_wqe->ctrl; @@ -618,13 +618,13 @@ int mlx5e_poll_ico_cq(struct mlx5e_cq *cq) wqe_counter = be16_to_cpu(cqe->wqe_counter); do { - struct mlx5e_sq_wqe_info *wi; + struct mlx5e_icosq_wqe_info *wi; u16 ci; last_wqe = (sqcc == wqe_counter); ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); - wi = &sq->db.ico_wqe[ci]; + wi = &sq->db.wqe_info[ci]; sqcc += wi->num_wqebbs; if (last_wqe && unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c index acb20215a33b..869fd58a6775 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c @@ -78,8 +78,8 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq) struct mlx5e_tx_wqe *nopwqe; u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP; - sq->db.ico_wqe[pi].num_wqebbs = 1; + sq->db.wqe_info[pi].opcode = MLX5_OPCODE_NOP; + sq->db.wqe_info[pi].num_wqebbs = 1; nopwqe = mlx5e_post_nop(wq, sq->sqn, &sq->pc); mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl); } -- cgit v1.2.3-59-g8ed1b From ec9cdca0663a543ede2072ff091beec1787e3374 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 16 Apr 2020 11:29:49 +0300 Subject: net/mlx5e: Unify reserving space for WQEs In our fast-path design, a WQE (Work Queue Element) must not cross the page boundary. To enforce that, for WQEs consisting of more than one BB (Basic Block), the driver checks the available contiguous space in the WQ in advance, and if it's not enough, it pads it with NOPs. This patch modifies the code that calculates the position of next WQE, considering the padding, and prepares the WQE. This code is common for all SQ types. In this patch it's reorganized in a way that makes the usage pattern unified for all SQ types, and makes the implementations self-contained and look almost the same, preparing the repeating code to further attempts to deduplicate it. One place is left as is: mlx5e_sq_xmit and mlx5e_fill_sq_frag_edge call inside, because it is special in a way that it may also copy WQE's cseg and eseg when reserving space. This will be eliminated in one of the following patches, and this place will be converted to the new approach, too. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 56 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 31 ++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 17 ------- .../ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 16 +------ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 26 +--------- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 11 +---- 6 files changed, 88 insertions(+), 69 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 8682d9148ab9..89fe65593c16 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -81,6 +81,62 @@ mlx5e_post_nop_fence(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc) return wqe; } +static inline u16 mlx5e_txqsq_get_next_pi(struct mlx5e_txqsq *sq, u16 size) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi, contig_wqebbs; + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + if (unlikely(contig_wqebbs < size)) { + struct mlx5e_tx_wqe_info *wi, *edge_wi; + + wi = &sq->db.wqe_info[pi]; + edge_wi = wi + contig_wqebbs; + + /* Fill SQ frag edge with NOPs to avoid WQE wrapping two pages. */ + for (; wi < edge_wi; wi++) { + *wi = (struct mlx5e_tx_wqe_info) { + .num_wqebbs = 1, + }; + mlx5e_post_nop(wq, sq->sqn, &sq->pc); + } + sq->stats->nop += contig_wqebbs; + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + } + + return pi; +} + +static inline u16 mlx5e_icosq_get_next_pi(struct mlx5e_icosq *sq, u16 size) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi, contig_wqebbs; + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + if (unlikely(contig_wqebbs < size)) { + struct mlx5e_icosq_wqe_info *wi, *edge_wi; + + wi = &sq->db.wqe_info[pi]; + edge_wi = wi + contig_wqebbs; + + /* Fill SQ frag edge with NOPs to avoid WQE wrapping two pages. */ + for (; wi < edge_wi; wi++) { + *wi = (struct mlx5e_icosq_wqe_info) { + .opcode = MLX5_OPCODE_NOP, + .num_wqebbs = 1, + }; + mlx5e_post_nop(wq, sq->sqn, &sq->pc); + } + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + } + + return pi; +} + static inline void mlx5e_fill_sq_frag_edge(struct mlx5e_txqsq *sq, struct mlx5_wq_cyc *wq, u16 pi, u16 nnops) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index cf089520c031..c4a7fb4ecd14 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -178,21 +178,42 @@ xdp_abort: } } -static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq) +static u16 mlx5e_xdpsq_get_next_pi(struct mlx5e_xdpsq *sq, u16 size) { - struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; - struct mlx5e_xdpsq_stats *stats = sq->stats; struct mlx5_wq_cyc *wq = &sq->wq; u16 pi, contig_wqebbs; pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + if (unlikely(contig_wqebbs < size)) { + struct mlx5e_xdp_wqe_info *wi, *edge_wi; + + wi = &sq->db.wqe_info[pi]; + edge_wi = wi + contig_wqebbs; + + /* Fill SQ frag edge with NOPs to avoid WQE wrapping two pages. */ + for (; wi < edge_wi; wi++) { + *wi = (struct mlx5e_xdp_wqe_info) { + .num_wqebbs = 1, + .num_pkts = 0, + }; + mlx5e_post_nop(wq, sq->sqn, &sq->pc); + } + sq->stats->nops += contig_wqebbs; - if (unlikely(contig_wqebbs < MLX5_SEND_WQE_MAX_WQEBBS)) { - mlx5e_fill_xdpsq_frag_edge(sq, wq, pi, contig_wqebbs); pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); } + return pi; +} + +static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq) +{ + struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; + struct mlx5e_xdpsq_stats *stats = sq->stats; + u16 pi; + + pi = mlx5e_xdpsq_get_next_pi(sq, MLX5_SEND_WQE_MAX_WQEBBS); session->wqe = MLX5E_TX_FETCH_WQE(sq, pi); prefetchw(session->wqe->data); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index 4fd0ff47bdc3..ed6f045febeb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -137,23 +137,6 @@ mlx5e_xdp_no_room_for_inline_pkt(struct mlx5e_xdp_mpwqe *session) session->ds_count + MLX5E_XDP_INLINE_WQE_MAX_DS_CNT > MLX5E_XDP_MPW_MAX_NUM_DS; } -static inline void -mlx5e_fill_xdpsq_frag_edge(struct mlx5e_xdpsq *sq, struct mlx5_wq_cyc *wq, - u16 pi, u16 nnops) -{ - struct mlx5e_xdp_wqe_info *edge_wi, *wi = &sq->db.wqe_info[pi]; - - edge_wi = wi + nnops; - /* fill sq frag edge with nops to avoid wqe wrapping two pages */ - for (; wi < edge_wi; wi++) { - wi->num_wqebbs = 1; - wi->num_pkts = 0; - mlx5e_post_nop(wq, sq->sqn, &sq->pc); - } - - sq->stats->nops += nnops; -} - static inline void mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_xmit_data *xdptxd, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 717d36b45aa9..ba973937f0b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -165,14 +165,8 @@ mlx5e_ktls_tx_post_param_wqes(struct mlx5e_txqsq *sq, bool skip_static_post, bool fence_first_post) { bool progress_fence = skip_static_post || !fence_first_post; - struct mlx5_wq_cyc *wq = &sq->wq; - u16 contig_wqebbs_room, pi; - pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); - if (unlikely(contig_wqebbs_room < - MLX5E_KTLS_STATIC_WQEBBS + MLX5E_KTLS_PROGRESS_WQEBBS)) - mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room); + mlx5e_txqsq_get_next_pi(sq, MLX5E_KTLS_STATIC_WQEBBS + MLX5E_KTLS_PROGRESS_WQEBBS); if (!skip_static_post) post_static_params(sq, priv_tx, fence_first_post); @@ -346,10 +340,8 @@ mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx, u32 seq) { struct mlx5e_sq_stats *stats = sq->stats; - struct mlx5_wq_cyc *wq = &sq->wq; enum mlx5e_ktls_sync_retval ret; struct tx_sync_info info = {}; - u16 contig_wqebbs_room, pi; u8 num_wqebbs; int i = 0; @@ -380,11 +372,7 @@ mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx, } num_wqebbs = mlx5e_ktls_dumps_num_wqebbs(sq, info.nr_frags, info.sync_len); - pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); - - if (unlikely(contig_wqebbs_room < num_wqebbs)) - mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room); + mlx5e_txqsq_get_next_pi(sq, num_wqebbs); for (; i < info.nr_frags; i++) { unsigned int orig_fsz, frag_offset = 0, n = 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 9f33a0e7dd9a..d9a5a669b84d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -468,22 +468,6 @@ static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq, u8 n) mlx5_wq_ll_update_db_record(wq); } -static inline void mlx5e_fill_icosq_frag_edge(struct mlx5e_icosq *sq, - struct mlx5_wq_cyc *wq, - u16 pi, u16 nnops) -{ - struct mlx5e_icosq_wqe_info *edge_wi, *wi = &sq->db.wqe_info[pi]; - - edge_wi = wi + nnops; - - /* fill sq frag edge with nops to avoid wqe wrapping two pages */ - for (; wi < edge_wi; wi++) { - wi->opcode = MLX5_OPCODE_NOP; - wi->num_wqebbs = 1; - mlx5e_post_nop(wq, sq->sqn, &sq->pc); - } -} - static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) { struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix]; @@ -492,7 +476,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) struct mlx5_wq_cyc *wq = &sq->wq; struct mlx5e_umr_wqe *umr_wqe; u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1); - u16 pi, contig_wqebbs_room; + u16 pi; int err; int i; @@ -502,13 +486,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) goto err; } - pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); - if (unlikely(contig_wqebbs_room < MLX5E_UMR_WQEBBS)) { - mlx5e_fill_icosq_frag_edge(sq, wq, pi, contig_wqebbs_room); - pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - } - + pi = mlx5e_icosq_get_next_pi(sq, MLX5E_UMR_WQEBBS); umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); memcpy(umr_wqe, &rq->mpwqe.umr_wqe, offsetof(struct mlx5e_umr_wqe, inline_mtts)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index ec1429596cb7..583e1b201b75 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -572,7 +572,6 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_av *av, u32 dqpn, u32 dqkey, bool xmit_more) { - struct mlx5_wq_cyc *wq = &sq->wq; struct mlx5i_tx_wqe *wqe; struct mlx5_wqe_datagram_seg *datagram; @@ -582,9 +581,9 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5e_tx_wqe_info *wi; struct mlx5e_sq_stats *stats = sq->stats; - u16 headlen, ihs, pi, contig_wqebbs_room; u16 ds_cnt, ds_cnt_inl = 0; u8 num_wqebbs, opcode; + u16 headlen, ihs, pi; u32 num_bytes; int num_dma; __be16 mss; @@ -620,13 +619,7 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, } num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); - pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); - if (unlikely(contig_wqebbs_room < num_wqebbs)) { - mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room); - pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); - } - + pi = mlx5e_txqsq_get_next_pi(sq, num_wqebbs); wqe = MLX5I_SQ_FETCH_WQE(sq, pi); /* fill wqe */ -- cgit v1.2.3-59-g8ed1b From ddc2118ef064ca130f06716829e11e02b3915fd0 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 30 Apr 2020 09:32:45 +0800 Subject: hinic: make a bunch of functions static These fucntions is used only in hinic_sriov.c, so make them static to fix sparse warnings. Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_sriov.c | 91 +++++++++++++------------ 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c index b24788e9733c..0d44af9dce2a 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c @@ -23,8 +23,8 @@ MODULE_PARM_DESC(set_vf_link_state, "Set vf link state, 0 represents link auto, #define HINIC_VLAN_PRIORITY_SHIFT 13 #define HINIC_ADD_VLAN_IN_MAC 0x8000 -int hinic_set_mac(struct hinic_hwdev *hwdev, const u8 *mac_addr, u16 vlan_id, - u16 func_id) +static int hinic_set_mac(struct hinic_hwdev *hwdev, const u8 *mac_addr, + u16 vlan_id, u16 func_id) { struct hinic_port_mac_cmd mac_info = {0}; u16 out_size = sizeof(mac_info); @@ -84,7 +84,7 @@ void hinic_notify_all_vfs_link_changed(struct hinic_hwdev *hwdev, } } -u16 hinic_vf_info_vlanprio(struct hinic_hwdev *hwdev, int vf_id) +static u16 hinic_vf_info_vlanprio(struct hinic_hwdev *hwdev, int vf_id) { struct hinic_func_to_io *nic_io = &hwdev->func_to_io; u16 pf_vlan, vlanprio; @@ -97,8 +97,8 @@ u16 hinic_vf_info_vlanprio(struct hinic_hwdev *hwdev, int vf_id) return vlanprio; } -int hinic_set_vf_vlan(struct hinic_hwdev *hwdev, bool add, u16 vid, - u8 qos, int vf_id) +static int hinic_set_vf_vlan(struct hinic_hwdev *hwdev, bool add, u16 vid, + u8 qos, int vf_id) { struct hinic_vf_vlan_config vf_vlan = {0}; u16 out_size = sizeof(vf_vlan); @@ -163,9 +163,9 @@ static int hinic_init_vf_config(struct hinic_hwdev *hwdev, u16 vf_id) return 0; } -int hinic_register_vf_msg_handler(void *hwdev, u16 vf_id, - void *buf_in, u16 in_size, - void *buf_out, u16 *out_size) +static int hinic_register_vf_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) { struct hinic_register_vf *register_info = buf_out; struct hinic_hwdev *hw_dev = hwdev; @@ -192,9 +192,9 @@ int hinic_register_vf_msg_handler(void *hwdev, u16 vf_id, return 0; } -int hinic_unregister_vf_msg_handler(void *hwdev, u16 vf_id, - void *buf_in, u16 in_size, - void *buf_out, u16 *out_size) +static int hinic_unregister_vf_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) { struct hinic_hwdev *hw_dev = hwdev; struct hinic_func_to_io *nic_io; @@ -209,9 +209,9 @@ int hinic_unregister_vf_msg_handler(void *hwdev, u16 vf_id, return 0; } -int hinic_change_vf_mtu_msg_handler(void *hwdev, u16 vf_id, - void *buf_in, u16 in_size, - void *buf_out, u16 *out_size) +static int hinic_change_vf_mtu_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) { struct hinic_hwdev *hw_dev = hwdev; int err; @@ -227,9 +227,9 @@ int hinic_change_vf_mtu_msg_handler(void *hwdev, u16 vf_id, return 0; } -int hinic_get_vf_mac_msg_handler(void *hwdev, u16 vf_id, - void *buf_in, u16 in_size, - void *buf_out, u16 *out_size) +static int hinic_get_vf_mac_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) { struct hinic_port_mac_cmd *mac_info = buf_out; struct hinic_hwdev *dev = hwdev; @@ -246,9 +246,9 @@ int hinic_get_vf_mac_msg_handler(void *hwdev, u16 vf_id, return 0; } -int hinic_set_vf_mac_msg_handler(void *hwdev, u16 vf_id, - void *buf_in, u16 in_size, - void *buf_out, u16 *out_size) +static int hinic_set_vf_mac_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) { struct hinic_port_mac_cmd *mac_out = buf_out; struct hinic_port_mac_cmd *mac_in = buf_in; @@ -280,9 +280,9 @@ int hinic_set_vf_mac_msg_handler(void *hwdev, u16 vf_id, return err; } -int hinic_del_vf_mac_msg_handler(void *hwdev, u16 vf_id, - void *buf_in, u16 in_size, - void *buf_out, u16 *out_size) +static int hinic_del_vf_mac_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) { struct hinic_port_mac_cmd *mac_out = buf_out; struct hinic_port_mac_cmd *mac_in = buf_in; @@ -312,9 +312,9 @@ int hinic_del_vf_mac_msg_handler(void *hwdev, u16 vf_id, return err; } -int hinic_get_vf_link_status_msg_handler(void *hwdev, u16 vf_id, - void *buf_in, u16 in_size, - void *buf_out, u16 *out_size) +static int hinic_get_vf_link_status_msg_handler(void *hwdev, u16 vf_id, + void *buf_in, u16 in_size, + void *buf_out, u16 *out_size) { struct hinic_port_link_cmd *get_link = buf_out; struct hinic_hwdev *hw_dev = hwdev; @@ -339,7 +339,7 @@ int hinic_get_vf_link_status_msg_handler(void *hwdev, u16 vf_id, return 0; } -struct vf_cmd_msg_handle nic_vf_cmd_msg_handler[] = { +static struct vf_cmd_msg_handle nic_vf_cmd_msg_handler[] = { {HINIC_PORT_CMD_VF_REGISTER, hinic_register_vf_msg_handler}, {HINIC_PORT_CMD_VF_UNREGISTER, hinic_unregister_vf_msg_handler}, {HINIC_PORT_CMD_CHANGE_MTU, hinic_change_vf_mtu_msg_handler}, @@ -351,6 +351,7 @@ struct vf_cmd_msg_handle nic_vf_cmd_msg_handler[] = { #define CHECK_IPSU_15BIT 0X8000 +static struct hinic_sriov_info *hinic_get_sriov_info_by_pcidev(struct pci_dev *pdev) { struct net_device *netdev = pci_get_drvdata(pdev); @@ -372,8 +373,8 @@ static int hinic_check_mac_info(u8 status, u16 vlan_id) #define HINIC_VLAN_ID_MASK 0x7FFF -int hinic_update_mac(struct hinic_hwdev *hwdev, u8 *old_mac, u8 *new_mac, - u16 vlan_id, u16 func_id) +static int hinic_update_mac(struct hinic_hwdev *hwdev, u8 *old_mac, + u8 *new_mac, u16 vlan_id, u16 func_id) { struct hinic_port_mac_update mac_info = {0}; u16 out_size = sizeof(mac_info); @@ -416,8 +417,8 @@ int hinic_update_mac(struct hinic_hwdev *hwdev, u8 *old_mac, u8 *new_mac, return 0; } -void hinic_get_vf_config(struct hinic_hwdev *hwdev, u16 vf_id, - struct ifla_vf_info *ivi) +static void hinic_get_vf_config(struct hinic_hwdev *hwdev, u16 vf_id, + struct ifla_vf_info *ivi) { struct vf_data_storage *vfinfo; @@ -455,7 +456,8 @@ int hinic_ndo_get_vf_config(struct net_device *netdev, return 0; } -int hinic_set_vf_mac(struct hinic_hwdev *hwdev, int vf, unsigned char *mac_addr) +static int hinic_set_vf_mac(struct hinic_hwdev *hwdev, int vf, + unsigned char *mac_addr) { struct hinic_func_to_io *nic_io = &hwdev->func_to_io; struct vf_data_storage *vf_info; @@ -504,7 +506,8 @@ int hinic_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) return 0; } -int hinic_add_vf_vlan(struct hinic_hwdev *hwdev, int vf_id, u16 vlan, u8 qos) +static int hinic_add_vf_vlan(struct hinic_hwdev *hwdev, int vf_id, + u16 vlan, u8 qos) { struct hinic_func_to_io *nic_io = &hwdev->func_to_io; int err; @@ -521,7 +524,7 @@ int hinic_add_vf_vlan(struct hinic_hwdev *hwdev, int vf_id, u16 vlan, u8 qos) return 0; } -int hinic_kill_vf_vlan(struct hinic_hwdev *hwdev, int vf_id) +static int hinic_kill_vf_vlan(struct hinic_hwdev *hwdev, int vf_id) { struct hinic_func_to_io *nic_io = &hwdev->func_to_io; int err; @@ -543,8 +546,8 @@ int hinic_kill_vf_vlan(struct hinic_hwdev *hwdev, int vf_id) return 0; } -int hinic_update_mac_vlan(struct hinic_dev *nic_dev, u16 old_vlan, u16 new_vlan, - int vf_id) +static int hinic_update_mac_vlan(struct hinic_dev *nic_dev, u16 old_vlan, + u16 new_vlan, int vf_id) { struct vf_data_storage *vf_info; u16 vlan_id; @@ -651,7 +654,8 @@ int hinic_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, return set_hw_vf_vlan(nic_dev, cur_vlanprio, vf, vlan, qos); } -int hinic_set_vf_trust(struct hinic_hwdev *hwdev, u16 vf_id, bool trust) +static int hinic_set_vf_trust(struct hinic_hwdev *hwdev, u16 vf_id, + bool trust) { struct vf_data_storage *vf_infos; struct hinic_func_to_io *nic_io; @@ -697,8 +701,8 @@ int hinic_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting) } /* pf receive message from vf */ -int nic_pf_mbox_handler(void *hwdev, u16 vf_id, u8 cmd, void *buf_in, - u16 in_size, void *buf_out, u16 *out_size) +static int nic_pf_mbox_handler(void *hwdev, u16 vf_id, u8 cmd, void *buf_in, + u16 in_size, void *buf_out, u16 *out_size) { struct vf_cmd_msg_handle *vf_msg_handle; struct hinic_hwdev *dev = hwdev; @@ -786,7 +790,7 @@ static int hinic_init_vf_infos(struct hinic_func_to_io *nic_io, u16 vf_id) return 0; } -void hinic_clear_vf_infos(struct hinic_dev *nic_dev, u16 vf_id) +static void hinic_clear_vf_infos(struct hinic_dev *nic_dev, u16 vf_id) { struct vf_data_storage *vf_infos; u16 func_id; @@ -807,8 +811,8 @@ void hinic_clear_vf_infos(struct hinic_dev *nic_dev, u16 vf_id) hinic_init_vf_infos(&nic_dev->hwdev->func_to_io, HW_VF_ID_TO_OS(vf_id)); } -int hinic_deinit_vf_hw(struct hinic_sriov_info *sriov_info, u16 start_vf_id, - u16 end_vf_id) +static int hinic_deinit_vf_hw(struct hinic_sriov_info *sriov_info, + u16 start_vf_id, u16 end_vf_id) { struct hinic_dev *nic_dev; u16 func_idx, idx; @@ -908,7 +912,8 @@ void hinic_vf_func_free(struct hinic_hwdev *hwdev) } } -int hinic_init_vf_hw(struct hinic_hwdev *hwdev, u16 start_vf_id, u16 end_vf_id) +static int hinic_init_vf_hw(struct hinic_hwdev *hwdev, u16 start_vf_id, + u16 end_vf_id) { u16 i, func_idx; int err; -- cgit v1.2.3-59-g8ed1b From 40cf7fbe8a43b1d8c3c0139423abbeb9625a909a Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Thu, 30 Apr 2020 09:51:31 +0800 Subject: hinic: Use ARRAY_SIZE for nic_vf_cmd_msg_handler fix coccinelle warning, use ARRAY_SIZE drivers/net/ethernet/huawei/hinic/hinic_sriov.c:713:43-44: WARNING: Use ARRAY_SIZE v1-->v2: remove cmd_number v2-->v3: preserve the reverse christmas tree ordering of local variables Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_sriov.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c index 0d44af9dce2a..fd4aaf43874a 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c @@ -708,17 +708,15 @@ static int nic_pf_mbox_handler(void *hwdev, u16 vf_id, u8 cmd, void *buf_in, struct hinic_hwdev *dev = hwdev; struct hinic_func_to_io *nic_io; struct hinic_pfhwdev *pfhwdev; - u32 i, cmd_number; int err = 0; + u32 i; if (!hwdev) return -EFAULT; - cmd_number = sizeof(nic_vf_cmd_msg_handler) / - sizeof(struct vf_cmd_msg_handle); pfhwdev = container_of(dev, struct hinic_pfhwdev, hwdev); nic_io = &dev->func_to_io; - for (i = 0; i < cmd_number; i++) { + for (i = 0; i < ARRAY_SIZE(nic_vf_cmd_msg_handler); i++) { vf_msg_handle = &nic_vf_cmd_msg_handler[i]; if (cmd == vf_msg_handle->cmd && vf_msg_handle->cmd_msg_handler) { @@ -729,7 +727,7 @@ static int nic_pf_mbox_handler(void *hwdev, u16 vf_id, u8 cmd, void *buf_in, break; } } - if (i == cmd_number) + if (i == ARRAY_SIZE(nic_vf_cmd_msg_handler)) err = hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt, HINIC_MOD_L2NIC, cmd, buf_in, in_size, buf_out, out_size, HINIC_MGMT_MSG_SYNC); -- cgit v1.2.3-59-g8ed1b From 466ed24fb22342f3ae1c10758a6a0c6a8c081b2d Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Thu, 30 Apr 2020 11:07:05 +0200 Subject: net: phy: mdio: add IPQ4019 MDIO driver This patch adds the driver for the MDIO interface inside of Qualcomm IPQ40xx series SoC-s. Signed-off-by: Christian Lamparter Signed-off-by: Robert Marko Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Cc: Luka Perkov Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 7 ++ drivers/net/phy/Makefile | 1 + drivers/net/phy/mdio-ipq4019.c | 160 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+) create mode 100644 drivers/net/phy/mdio-ipq4019.c diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index bacfee41b564..2a32f26ead0b 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -157,6 +157,13 @@ config MDIO_I2C This is library mode. +config MDIO_IPQ4019 + tristate "Qualcomm IPQ4019 MDIO interface support" + depends on HAS_IOMEM && OF_MDIO + help + This driver supports the MDIO interface found in Qualcomm + IPQ40xx series Soc-s. + config MDIO_IPQ8064 tristate "Qualcomm IPQ8064 MDIO interface support" depends on HAS_IOMEM && OF_MDIO diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index cd345b75d127..dc9e53b511d6 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_MDIO_CAVIUM) += mdio-cavium.o obj-$(CONFIG_MDIO_GPIO) += mdio-gpio.o obj-$(CONFIG_MDIO_HISI_FEMAC) += mdio-hisi-femac.o obj-$(CONFIG_MDIO_I2C) += mdio-i2c.o +obj-$(CONFIG_MDIO_IPQ4019) += mdio-ipq4019.o obj-$(CONFIG_MDIO_IPQ8064) += mdio-ipq8064.o obj-$(CONFIG_MDIO_MOXART) += mdio-moxart.o obj-$(CONFIG_MDIO_MSCC_MIIM) += mdio-mscc-miim.o diff --git a/drivers/net/phy/mdio-ipq4019.c b/drivers/net/phy/mdio-ipq4019.c new file mode 100644 index 000000000000..1ce81ff2f41d --- /dev/null +++ b/drivers/net/phy/mdio-ipq4019.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2015, The Linux Foundation. All rights reserved. */ +/* Copyright (c) 2020 Sartura Ltd. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MDIO_ADDR_REG 0x44 +#define MDIO_DATA_WRITE_REG 0x48 +#define MDIO_DATA_READ_REG 0x4c +#define MDIO_CMD_REG 0x50 +#define MDIO_CMD_ACCESS_BUSY BIT(16) +#define MDIO_CMD_ACCESS_START BIT(8) +#define MDIO_CMD_ACCESS_CODE_READ 0 +#define MDIO_CMD_ACCESS_CODE_WRITE 1 + +#define ipq4019_MDIO_TIMEOUT 10000 +#define ipq4019_MDIO_SLEEP 10 + +struct ipq4019_mdio_data { + void __iomem *membase; +}; + +static int ipq4019_mdio_wait_busy(struct mii_bus *bus) +{ + struct ipq4019_mdio_data *priv = bus->priv; + unsigned int busy; + + return readl_poll_timeout(priv->membase + MDIO_CMD_REG, busy, + (busy & MDIO_CMD_ACCESS_BUSY) == 0, + ipq4019_MDIO_SLEEP, ipq4019_MDIO_TIMEOUT); +} + +static int ipq4019_mdio_read(struct mii_bus *bus, int mii_id, int regnum) +{ + struct ipq4019_mdio_data *priv = bus->priv; + unsigned int cmd; + + /* Reject clause 45 */ + if (regnum & MII_ADDR_C45) + return -EOPNOTSUPP; + + if (ipq4019_mdio_wait_busy(bus)) + return -ETIMEDOUT; + + /* issue the phy address and reg */ + writel((mii_id << 8) | regnum, priv->membase + MDIO_ADDR_REG); + + cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_READ; + + /* issue read command */ + writel(cmd, priv->membase + MDIO_CMD_REG); + + /* Wait read complete */ + if (ipq4019_mdio_wait_busy(bus)) + return -ETIMEDOUT; + + /* Read and return data */ + return readl(priv->membase + MDIO_DATA_READ_REG); +} + +static int ipq4019_mdio_write(struct mii_bus *bus, int mii_id, int regnum, + u16 value) +{ + struct ipq4019_mdio_data *priv = bus->priv; + unsigned int cmd; + + /* Reject clause 45 */ + if (regnum & MII_ADDR_C45) + return -EOPNOTSUPP; + + if (ipq4019_mdio_wait_busy(bus)) + return -ETIMEDOUT; + + /* issue the phy address and reg */ + writel((mii_id << 8) | regnum, priv->membase + MDIO_ADDR_REG); + + /* issue write data */ + writel(value, priv->membase + MDIO_DATA_WRITE_REG); + + cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_WRITE; + /* issue write command */ + writel(cmd, priv->membase + MDIO_CMD_REG); + + /* Wait write complete */ + if (ipq4019_mdio_wait_busy(bus)) + return -ETIMEDOUT; + + return 0; +} + +static int ipq4019_mdio_probe(struct platform_device *pdev) +{ + struct ipq4019_mdio_data *priv; + struct mii_bus *bus; + int ret; + + bus = devm_mdiobus_alloc_size(&pdev->dev, sizeof(*priv)); + if (!bus) + return -ENOMEM; + + priv = bus->priv; + + priv->membase = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->membase)) + return PTR_ERR(priv->membase); + + bus->name = "ipq4019_mdio"; + bus->read = ipq4019_mdio_read; + bus->write = ipq4019_mdio_write; + bus->parent = &pdev->dev; + snprintf(bus->id, MII_BUS_ID_SIZE, "%s%d", pdev->name, pdev->id); + + ret = of_mdiobus_register(bus, pdev->dev.of_node); + if (ret) { + dev_err(&pdev->dev, "Cannot register MDIO bus!\n"); + return ret; + } + + platform_set_drvdata(pdev, bus); + + return 0; +} + +static int ipq4019_mdio_remove(struct platform_device *pdev) +{ + struct mii_bus *bus = platform_get_drvdata(pdev); + + mdiobus_unregister(bus); + + return 0; +} + +static const struct of_device_id ipq4019_mdio_dt_ids[] = { + { .compatible = "qcom,ipq4019-mdio" }, + { } +}; +MODULE_DEVICE_TABLE(of, ipq4019_mdio_dt_ids); + +static struct platform_driver ipq4019_mdio_driver = { + .probe = ipq4019_mdio_probe, + .remove = ipq4019_mdio_remove, + .driver = { + .name = "ipq4019-mdio", + .of_match_table = ipq4019_mdio_dt_ids, + }, +}; + +module_platform_driver(ipq4019_mdio_driver); + +MODULE_DESCRIPTION("ipq4019 MDIO interface driver"); +MODULE_AUTHOR("Qualcomm Atheros"); +MODULE_LICENSE("Dual BSD/GPL"); -- cgit v1.2.3-59-g8ed1b From 4972ecee06612e167523f528317920fbeba5f12d Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Thu, 30 Apr 2020 11:07:06 +0200 Subject: dt-bindings: add Qualcomm IPQ4019 MDIO bindings This patch adds the binding document for the IPQ40xx MDIO driver. Signed-off-by: Robert Marko Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Cc: Luka Perkov Signed-off-by: David S. Miller --- .../devicetree/bindings/net/qcom,ipq4019-mdio.yaml | 61 ++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/qcom,ipq4019-mdio.yaml diff --git a/Documentation/devicetree/bindings/net/qcom,ipq4019-mdio.yaml b/Documentation/devicetree/bindings/net/qcom,ipq4019-mdio.yaml new file mode 100644 index 000000000000..13555a89975f --- /dev/null +++ b/Documentation/devicetree/bindings/net/qcom,ipq4019-mdio.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/qcom,ipq4019-mdio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm IPQ40xx MDIO Controller Device Tree Bindings + +maintainers: + - Robert Marko + +allOf: + - $ref: "mdio.yaml#" + +properties: + compatible: + const: qcom,ipq4019-mdio + + "#address-cells": + const: 1 + + "#size-cells": + const: 0 + + reg: + maxItems: 1 + +required: + - compatible + - reg + - "#address-cells" + - "#size-cells" + +examples: + - | + mdio@90000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "qcom,ipq4019-mdio"; + reg = <0x90000 0x64>; + + ethphy0: ethernet-phy@0 { + reg = <0>; + }; + + ethphy1: ethernet-phy@1 { + reg = <1>; + }; + + ethphy2: ethernet-phy@2 { + reg = <2>; + }; + + ethphy3: ethernet-phy@3 { + reg = <3>; + }; + + ethphy4: ethernet-phy@4 { + reg = <4>; + }; + }; -- cgit v1.2.3-59-g8ed1b From 9c8c0f70ec6fdac2398632c723c48277be09b7c0 Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Thu, 30 Apr 2020 11:07:07 +0200 Subject: ARM: dts: qcom: ipq4019: add MDIO node This patch adds the necessary MDIO interface node to the Qualcomm IPQ4019 DTSI. Built-in QCA8337N switch is managed using it, and since we have a driver for it lets add it. Signed-off-by: Christian Lamparter Signed-off-by: Robert Marko Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Cc: Luka Perkov Signed-off-by: David S. Miller --- arch/arm/boot/dts/qcom-ipq4019.dtsi | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/arm/boot/dts/qcom-ipq4019.dtsi b/arch/arm/boot/dts/qcom-ipq4019.dtsi index bfa9ce4c6e69..b9839f86e703 100644 --- a/arch/arm/boot/dts/qcom-ipq4019.dtsi +++ b/arch/arm/boot/dts/qcom-ipq4019.dtsi @@ -576,5 +576,33 @@ "legacy"; status = "disabled"; }; + + mdio: mdio@90000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "qcom,ipq4019-mdio"; + reg = <0x90000 0x64>; + status = "disabled"; + + ethphy0: ethernet-phy@0 { + reg = <0>; + }; + + ethphy1: ethernet-phy@1 { + reg = <1>; + }; + + ethphy2: ethernet-phy@2 { + reg = <2>; + }; + + ethphy3: ethernet-phy@3 { + reg = <3>; + }; + + ethphy4: ethernet-phy@4 { + reg = <4>; + }; + }; }; }; -- cgit v1.2.3-59-g8ed1b From 99b2292ba21b47bbd97d33ab20892347ad2ac351 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Thu, 30 Apr 2020 18:16:16 +0800 Subject: net: caif: Fix use correct return type for ndo_start_xmit() The method ndo_start_xmit() returns a value of type netdev_tx_t. Fix the ndo function to use the correct type. Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index a56628962852..79b6a04d8eb6 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -211,7 +211,8 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, } } -static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb, + struct net_device *dev) { struct chnl_net *priv; struct cfpkt *pkt = NULL; -- cgit v1.2.3-59-g8ed1b From 51070a3627a943b13c7e0a32bbfa96cb8dca4493 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Thu, 30 Apr 2020 19:26:40 +0800 Subject: rionet: Fix use correct return type for ndo_start_xmit() The method ndo_start_xmit() returns a value of type netdev_tx_t. Fix the ndo function to use the correct type. Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- drivers/net/rionet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 8eeb38c6199e..2056d6ad04b5 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -166,7 +166,8 @@ static int rionet_queue_tx_msg(struct sk_buff *skb, struct net_device *ndev, return 0; } -static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) +static netdev_tx_t rionet_start_xmit(struct sk_buff *skb, + struct net_device *ndev) { int i; struct rionet_private *rnet = netdev_priv(ndev); -- cgit v1.2.3-59-g8ed1b From 1569a3c44303834680e93df2ecf60f356be9b322 Mon Sep 17 00:00:00 2001 From: Tang Bin Date: Thu, 30 Apr 2020 20:15:31 +0800 Subject: net/faraday: Fix unnecessary check in ftmac100_probe() The function ftmac100_probe() is only called with an openfirmware platform device. Therefore there is no need to check that the passed in device is NULL. Signed-off-by: Zhang Shengju Signed-off-by: Tang Bin Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftmac100.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/faraday/ftmac100.c b/drivers/net/ethernet/faraday/ftmac100.c index 32cf54f0e35b..473b337b2e3b 100644 --- a/drivers/net/ethernet/faraday/ftmac100.c +++ b/drivers/net/ethernet/faraday/ftmac100.c @@ -1057,9 +1057,6 @@ static int ftmac100_probe(struct platform_device *pdev) struct ftmac100 *priv; int err; - if (!pdev) - return -ENODEV; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) return -ENXIO; -- cgit v1.2.3-59-g8ed1b From 555da9af827d95134656fa459c8f3ece04dd867a Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:38 +0200 Subject: net/smc: add event-based llc_flow framework The new framework allows to start specific types of LLC control flows, protects active flows and makes it possible to wait for flows to finish before starting a new flow. This mechanism is used for the LLC control layer to model flows like 'add link' or 'delete link' which need to send/receive several LLC messages and are not allowed to get interrupted by the wrong type of messages. 'Add link' or 'Delete link' messages arriving in the middle of a flow are delayed and processed when the current flow finished. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 + net/smc/smc_core.h | 24 ++++++++ net/smc/smc_llc.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_llc.h | 8 +++ 4 files changed, 199 insertions(+) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index db49f8cd5c95..4867ddcfe0c6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -263,6 +263,7 @@ static void smc_lgr_free_work(struct work_struct *work) if (smc_link_usable(lnk)) lnk->state = SMC_LNK_INACTIVE; } + wake_up_interruptible_all(&lgr->llc_waiter); } smc_lgr_free(lgr); } @@ -696,6 +697,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) if (smc_link_usable(lnk)) lnk->state = SMC_LNK_INACTIVE; } + wake_up_interruptible_all(&lgr->llc_waiter); } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index b5781511063d..70399217ad6f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -197,6 +197,20 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smcd_dev; +enum smc_llc_flowtype { + SMC_LLC_FLOW_NONE = 0, + SMC_LLC_FLOW_ADD_LINK = 2, + SMC_LLC_FLOW_DEL_LINK = 4, + SMC_LLC_FLOW_RKEY = 6, +}; + +struct smc_llc_qentry; + +struct smc_llc_flow { + enum smc_llc_flowtype type; + struct smc_llc_qentry *qentry; +}; + struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ @@ -238,6 +252,16 @@ struct smc_link_group { /* protects llc_event_q */ struct work_struct llc_event_work; /* llc event worker */ + wait_queue_head_t llc_waiter; + /* w4 next llc event */ + struct smc_llc_flow llc_flow_lcl; + /* llc local control field */ + struct smc_llc_flow llc_flow_rmt; + /* llc remote control field */ + struct smc_llc_qentry *delayed_event; + /* arrived when flow active */ + spinlock_t llc_flow_lock; + /* protects llc flow */ int llc_testlink_time; /* link keep alive time */ }; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e715dd6735ee..647cf1a2dfa5 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -140,6 +140,154 @@ struct smc_llc_qentry { union smc_llc_msg msg; }; +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry = flow->qentry; + + flow->qentry = NULL; + return qentry; +} + +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry; + + if (flow->qentry) { + qentry = flow->qentry; + flow->qentry = NULL; + kfree(qentry); + } +} + +static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + flow->qentry = qentry; +} + +/* try to start a new llc flow, initiated by an incoming llc msg */ +static bool smc_llc_flow_start(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + struct smc_link_group *lgr = qentry->link->lgr; + + spin_lock_bh(&lgr->llc_flow_lock); + if (flow->type) { + /* a flow is already active */ + if ((qentry->msg.raw.hdr.common.type == SMC_LLC_ADD_LINK || + qentry->msg.raw.hdr.common.type == SMC_LLC_DELETE_LINK) && + !lgr->delayed_event) { + lgr->delayed_event = qentry; + } else { + /* forget this llc request */ + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_flow_lock); + return false; + } + switch (qentry->msg.raw.hdr.common.type) { + case SMC_LLC_ADD_LINK: + flow->type = SMC_LLC_FLOW_ADD_LINK; + break; + case SMC_LLC_DELETE_LINK: + flow->type = SMC_LLC_FLOW_DEL_LINK; + break; + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + flow->type = SMC_LLC_FLOW_RKEY; + break; + default: + flow->type = SMC_LLC_FLOW_NONE; + } + if (qentry == lgr->delayed_event) + lgr->delayed_event = NULL; + spin_unlock_bh(&lgr->llc_flow_lock); + smc_llc_flow_qentry_set(flow, qentry); + return true; +} + +/* start a new local llc flow, wait till current flow finished */ +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type) +{ + enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE; + int rc; + + /* all flows except confirm_rkey and delete_rkey are exclusive, + * confirm/delete rkey flows can run concurrently (local and remote) + */ + if (type == SMC_LLC_FLOW_RKEY) + allowed_remote = SMC_LLC_FLOW_RKEY; +again: + if (list_empty(&lgr->list)) + return -ENODEV; + spin_lock_bh(&lgr->llc_flow_lock); + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)) { + lgr->llc_flow_lcl.type = type; + spin_unlock_bh(&lgr->llc_flow_lock); + return 0; + } + spin_unlock_bh(&lgr->llc_flow_lock); + rc = wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)), + SMC_LLC_WAIT_TIME); + if (!rc) + return -ETIMEDOUT; + goto again; +} + +/* finish the current llc flow */ +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow) +{ + spin_lock_bh(&lgr->llc_flow_lock); + memset(flow, 0, sizeof(*flow)); + flow->type = SMC_LLC_FLOW_NONE; + spin_unlock_bh(&lgr->llc_flow_lock); + if (!list_empty(&lgr->list) && lgr->delayed_event && + flow == &lgr->llc_flow_lcl) + schedule_work(&lgr->llc_event_work); + else + wake_up_interruptible(&lgr->llc_waiter); +} + +/* lnk is optional and used for early wakeup when link goes down, useful in + * cases where we wait for a response on the link after we sent a request + */ +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg) +{ + struct smc_llc_flow *flow = &lgr->llc_flow_lcl; + + wait_event_interruptible_timeout(lgr->llc_waiter, + (flow->qentry || + (lnk && !smc_link_usable(lnk)) || + list_empty(&lgr->list)), + time_out); + if (!flow->qentry || + (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) { + smc_llc_flow_qentry_del(flow); + goto out; + } + if (exp_msg && flow->qentry->msg.raw.hdr.common.type != exp_msg) { + if (exp_msg == SMC_LLC_ADD_LINK && + flow->qentry->msg.raw.hdr.common.type == + SMC_LLC_DELETE_LINK) { + /* flow_start will delay the unexpected msg */ + smc_llc_flow_start(&lgr->llc_flow_lcl, + smc_llc_flow_qentry_clr(flow)); + return NULL; + } + smc_llc_flow_qentry_del(flow); + } +out: + return flow->qentry; +} + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -547,6 +695,16 @@ static void smc_llc_event_work(struct work_struct *work) llc_event_work); struct smc_llc_qentry *qentry; + if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { + if (smc_link_usable(lgr->delayed_event->link)) { + smc_llc_event_handler(lgr->delayed_event); + } else { + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + kfree(qentry); + } + } + again: spin_lock_bh(&lgr->llc_event_q_lock); if (!list_empty(&lgr->llc_event_q)) { @@ -676,6 +834,8 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); INIT_LIST_HEAD(&lgr->llc_event_q); spin_lock_init(&lgr->llc_event_q_lock); + spin_lock_init(&lgr->llc_flow_lock); + init_waitqueue_head(&lgr->llc_waiter); lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; } @@ -683,7 +843,12 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) void smc_llc_lgr_clear(struct smc_link_group *lgr) { smc_llc_event_flush(lgr); + wake_up_interruptible_all(&lgr->llc_waiter); cancel_work_sync(&lgr->llc_event_work); + if (lgr->delayed_event) { + kfree(lgr->delayed_event); + lgr->delayed_event = NULL; + } } int smc_llc_link_init(struct smc_link *link) diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 66063f22166b..49e99ff00ee7 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -63,6 +63,14 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type); +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg); +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3-59-g8ed1b From a6688d919b220bd714948e03bb3caa8a66895005 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:39 +0200 Subject: net/smc: enqueue all received LLC messages Introduce smc_llc_enqueue() to enqueue LLC messages, and adapt smc_llc_rx_handler() to enqueue all received LLC messages. smc_llc_enqueue() also makes it possible to enqueue LLC messages from local code. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 647cf1a2dfa5..a146b3b43580 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -719,11 +719,14 @@ again: } /* process llc responses in tasklet context */ -static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) +static void smc_llc_rx_response(struct smc_link *link, + struct smc_llc_qentry *qentry) { + u8 llc_type = qentry->msg.raw.hdr.common.type; + union smc_llc_msg *llc = &qentry->msg; int rc = 0; - switch (llc->raw.hdr.common.type) { + switch (llc_type) { case SMC_LLC_TEST_LINK: if (link->state == SMC_LNK_ACTIVE) complete(&link->llc_testlink_resp); @@ -759,40 +762,49 @@ static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) complete(&link->llc_delete_rkey_resp); break; } + kfree(qentry); } -/* copy received msg and add it to the event queue */ -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) { - struct smc_link *link = (struct smc_link *)wc->qp->qp_context; struct smc_link_group *lgr = link->lgr; struct smc_llc_qentry *qentry; - union smc_llc_msg *llc = buf; unsigned long flags; - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ - - /* process responses immediately */ - if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { - smc_llc_rx_response(link, llc); - return; - } - qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); if (!qentry) return; qentry->link = link; INIT_LIST_HEAD(&qentry->list); memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + + /* process responses immediately */ + if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + smc_llc_rx_response(link, qentry); + return; + } + + /* add requests to event queue */ spin_lock_irqsave(&lgr->llc_event_q_lock, flags); list_add_tail(&qentry->list, &lgr->llc_event_q); spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); schedule_work(&link->lgr->llc_event_work); } +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + union smc_llc_msg *llc = buf; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + + smc_llc_enqueue(link, llc); +} + /***************************** worker, utils *********************************/ static void smc_llc_testlink_work(struct work_struct *work) -- cgit v1.2.3-59-g8ed1b From 81e6e5e70df46bb5b205e53f2b7885e49a9d4974 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:40 +0200 Subject: net/smc: introduce link group type Add a type field to the link group which reflects the current link group redundancy state. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 70399217ad6f..51366a9f4980 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -197,6 +197,14 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smcd_dev; +enum smc_lgr_type { /* redundancy state of lgr */ + SMC_LGR_NONE, /* no active links, lgr to be deleted */ + SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ + SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ + SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ + SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ +}; + enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, @@ -246,6 +254,8 @@ struct smc_link_group { DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; + enum smc_lgr_type type; + /* redundancy state */ struct list_head llc_event_q; /* queue for llc events */ spinlock_t llc_event_q_lock; -- cgit v1.2.3-59-g8ed1b From 92334cfcb3a2a102dc1b23513bbe2fca4347e2d6 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:41 +0200 Subject: net/smc: add logic to evaluate CONFIRM_LINK messages to LLC layer Introduce smc_llc_eval_conf_link() to evaluate the CONFIRM_LINK message contents. This implements this logic at the LLC layer. The function will be used by af_smc.c to process the received LLC layer messages. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 11 +++++++++++ net/smc/smc_llc.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a146b3b43580..9248b90fe37e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -946,6 +946,17 @@ out: return rc; } +/* evaluate confirm link request or response */ +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type) +{ + if (type == SMC_LLC_REQ) /* SMC server assigns link_id */ + qentry->link->link_id = qentry->msg.confirm_link.link_num; + if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + return -ENOTSUPP; + return 0; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 49e99ff00ee7..637acf91ffb7 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -66,6 +66,8 @@ int smc_llc_do_delete_rkey(struct smc_link *link, int smc_llc_flow_initiate(struct smc_link_group *lgr, enum smc_llc_flowtype type); void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type); struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, struct smc_link *lnk, int time_out, u8 exp_msg); -- cgit v1.2.3-59-g8ed1b From 4667bb4aaabf87d6b97be1b4671b9db340a58cdc Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:42 +0200 Subject: net/smc: adapt SMC server code to use the LLC flow Change the code that processes the SMC server part of connection establishment to use the LLC flow framework (CONFIRM_LINK response messages). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 39 +++++++++++++++------------------------ net/smc/smc_core.h | 3 --- net/smc/smc_llc.c | 20 +++++--------------- 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e859e3f420d9..ab3aef1ddfa4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1019,9 +1019,11 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - int rest; + struct smc_llc_qentry *qentry; int rc; + link->lgr->type = SMC_LGR_SINGLE; + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; @@ -1031,40 +1033,27 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm_resp, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_resp_rc) + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; - /* send ADD LINK request to client over the RoCE fabric */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - /* receive ADD LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; - } + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; smc_llc_link_active(link); + /* initial contact - try to establish second link */ + /* tbd: call smc_llc_srv_add_link(link); */ return 0; } @@ -1240,7 +1229,9 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, goto decline; } /* QP confirmation over RoCE fabric */ + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_serv_conf_first_link(new_smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) goto decline; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 51366a9f4980..01a9cb885ef2 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -121,11 +121,8 @@ struct smc_link { enum smc_link_state state; /* state of link */ struct completion llc_confirm; /* wait for rx of conf link */ - struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ - int llc_confirm_resp_rc; /* rc from conf_resp msg */ struct completion llc_add; /* wait for rx of add link */ - struct completion llc_add_resp; /* wait for rx of add link rsp*/ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9248b90fe37e..5381b16fd482 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -724,26 +724,18 @@ static void smc_llc_rx_response(struct smc_link *link, { u8 llc_type = qentry->msg.raw.hdr.common.type; union smc_llc_msg *llc = &qentry->msg; - int rc = 0; switch (llc_type) { case SMC_LLC_TEST_LINK: if (link->state == SMC_LNK_ACTIVE) complete(&link->llc_testlink_resp); break; - case SMC_LLC_CONFIRM_LINK: - if (!(llc->raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) - rc = ENOTSUPP; - if (link->lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = rc; - complete(&link->llc_confirm_resp); - } - break; case SMC_LLC_ADD_LINK: - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); - break; + case SMC_LLC_CONFIRM_LINK: + /* assign responses to the local flow, we requested them */ + smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&link->lgr->llc_waiter); + return; case SMC_LLC_DELETE_LINK: if (link->lgr->role == SMC_SERV) smc_lgr_schedule_free_work_fast(link->lgr); @@ -866,9 +858,7 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_confirm); - init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); - init_completion(&link->llc_add_resp); init_completion(&link->llc_confirm_rkey_resp); init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); -- cgit v1.2.3-59-g8ed1b From 0fb0b02bd6fd26cba38002be4a6bbcae2228fd44 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:43 +0200 Subject: net/smc: adapt SMC client code to use the LLC flow Change the code that processes the SMC client part of connection establishment to use the LLC flow framework (CONFIRM_LINK request messages). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 69 +++++++++++++++++++++++++++++++++------------------ net/smc/smc_clc.h | 1 + net/smc/smc_core.h | 3 --- net/smc/smc_llc.c | 72 +++++++++++++++++++----------------------------------- 4 files changed, 71 insertions(+), 74 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ab3aef1ddfa4..bd9662d06896 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -382,22 +382,24 @@ static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - int rest; + struct smc_llc_qentry *qentry; int rc; + link->lgr->type = SMC_LGR_SINGLE; + /* receive CONFIRM LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_rc) + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); @@ -409,31 +411,30 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; + /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; - /* receive ADD LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { + smc_llc_link_active(link); + + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; } - - /* send add link reject message, only one link supported for now */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - smc_llc_link_active(link); - + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + /* tbd: call smc_llc_cli_add_link(link, qentry); */ return 0; } @@ -613,8 +614,8 @@ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { + int i, reason_code = 0; struct smc_link *link; - int reason_code = 0; ini->is_smcd = false; ini->ib_lcl = &aclc->lcl; @@ -627,10 +628,28 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = smc->conn.lnk; smc_conn_save_peer_info(smc, aclc); + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + link = smc->conn.lnk; + } else { + /* set link that was assigned by server */ + link = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *l = &smc->conn.lgr->lnk[i]; + + if (l->peer_qpn == ntoh24(aclc->qpn)) { + link = l; + break; + } + } + if (!link) + return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK, + ini->cln_first_contact); + smc->conn.lnk = link; + } + /* create send buffer and rmb */ if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, @@ -666,7 +685,9 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_clnt_conf_first_link(smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) return smc_connect_abort(smc, reason_code, ini->cln_first_contact); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 4f2e150a2be1..465876701b75 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -45,6 +45,7 @@ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ #define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ +#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 01a9cb885ef2..31237c4c0d93 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -120,9 +120,6 @@ struct smc_link { struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ - struct completion llc_confirm; /* wait for rx of conf link */ - int llc_confirm_rc; /* rc from confirm link msg */ - struct completion llc_add; /* wait for rx of add link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5381b16fd482..644e9ab0dec5 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -528,47 +528,6 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) /********************************* receive ***********************************/ -static void smc_llc_rx_confirm_link(struct smc_link *link, - struct smc_llc_msg_confirm_link *llc) -{ - struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc = 0; - - /* RMBE eyecatchers are not supported */ - if (!(llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) - conf_rc = ENOTSUPP; - - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); - } -} - -static void smc_llc_rx_add_link(struct smc_link *link, - struct smc_llc_msg_add_link *llc) -{ - struct smc_link_group *lgr = smc_get_lgr(link); - - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; - } - - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - } - smc_llc_send_message(link, llc); -} - static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_llc_msg_del_link *llc) { @@ -657,6 +616,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) { union smc_llc_msg *llc = &qentry->msg; struct smc_link *link = qentry->link; + struct smc_link_group *lgr = link->lgr; if (!smc_link_usable(link)) goto out; @@ -665,11 +625,31 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) case SMC_LLC_TEST_LINK: smc_llc_rx_test_link(link, &llc->test_link); break; - case SMC_LLC_CONFIRM_LINK: - smc_llc_rx_confirm_link(link, &llc->confirm_link); - break; case SMC_LLC_ADD_LINK: - smc_llc_rx_add_link(link, &llc->add_link); + if (list_empty(&lgr->list)) + goto out; /* lgr is terminating */ + if (lgr->role == SMC_CLNT) { + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + /* tbd: schedule_work(&lgr->llc_add_link_work); */ + } + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + /* tbd: schedule_work(&lgr->llc_add_link_work); */ + } + return; + case SMC_LLC_CONFIRM_LINK: + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&lgr->llc_waiter); + return; + } break; case SMC_LLC_DELETE_LINK: smc_llc_rx_delete_link(link, &llc->delete_link); @@ -857,8 +837,6 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { - init_completion(&link->llc_confirm); - init_completion(&link->llc_add); init_completion(&link->llc_confirm_rkey_resp); init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); -- cgit v1.2.3-59-g8ed1b From 3d88a21b0cb6a2661a567e57a431e5aa12ecb203 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:44 +0200 Subject: net/smc: multiple link support and LLC flow for smc_llc_do_confirm_rkey Adapt smc_llc_do_confirm_rkey() to use the LLC flow and support the rkeys of multiple links when the CONFIRM_RKEY LLC message is build. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 2 -- net/smc/smc_llc.c | 65 +++++++++++++++++++++++++++++++++++------------------- net/smc/smc_llc.h | 2 +- 3 files changed, 43 insertions(+), 26 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 31237c4c0d93..4e0dfb1d5804 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -123,8 +123,6 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey_resp; /* w4 rx of cnf rkey */ - int llc_confirm_rkey_resp_rc; /* rc from cnf rkey */ struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ int llc_delete_rkey_resp_rc; /* rc from del rkey */ struct mutex llc_delete_rkey_mutex; /* serialize usage */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 644e9ab0dec5..5db11f54b4cd 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -369,27 +369,44 @@ int smc_llc_send_confirm_link(struct smc_link *link, } /* send LLC confirm rkey request */ -static int smc_llc_send_confirm_rkey(struct smc_link *link, +static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { struct smc_llc_msg_confirm_rkey *rkeyllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; - int rc; + struct smc_link *link; + int i, rc, rtok_ix; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) return rc; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + + rtok_ix = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + link = &send_link->lgr->lnk[i]; + if (link->state == SMC_LNK_ACTIVE && link != send_link) { + rkeyllc->rtoken[rtok_ix].link_id = link->link_id; + rkeyllc->rtoken[rtok_ix].rmb_key = + htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( + (u64)sg_dma_address( + rmb_desc->sgt[link->link_idx].sgl)); + rtok_ix++; + } + } + /* rkey of send_link is in rtoken[0] */ + rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[link->link_idx].sgl)); + (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ - rc = smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(send_link, pend); return rc; } @@ -712,6 +729,7 @@ static void smc_llc_rx_response(struct smc_link *link, break; case SMC_LLC_ADD_LINK: case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_CONFIRM_RKEY: /* assign responses to the local flow, we requested them */ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); wake_up_interruptible(&link->lgr->llc_waiter); @@ -720,11 +738,6 @@ static void smc_llc_rx_response(struct smc_link *link, if (link->lgr->role == SMC_SERV) smc_lgr_schedule_free_work_fast(link->lgr); break; - case SMC_LLC_CONFIRM_RKEY: - link->llc_confirm_rkey_resp_rc = llc->raw.hdr.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey_resp); - break; case SMC_LLC_CONFIRM_RKEY_CONT: /* unused as long as we don't send this type of msg */ break; @@ -837,7 +850,6 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { - init_completion(&link->llc_confirm_rkey_resp); init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); @@ -870,23 +882,30 @@ void smc_llc_link_clear(struct smc_link *link) smc_wr_wakeup_tx_wait(link); } -/* register a new rtoken at the remote peer */ -int smc_llc_do_confirm_rkey(struct smc_link *link, +/* register a new rtoken at the remote peer (for all links) */ +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { - int rc; + struct smc_link_group *lgr = send_link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; - /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey_resp); - rc = smc_llc_send_confirm_rkey(link, rmb_desc); + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) return rc; + rc = smc_llc_send_confirm_rkey(send_link, rmb_desc); + if (rc) + goto out; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout( - &link->llc_confirm_rkey_resp, SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_resp_rc) - return -EFAULT; - return 0; + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) + rc = -EFAULT; +out: + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + return rc; } /* unregister an rtoken at the remote peer */ diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 637acf91ffb7..d82d8346b61e 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -59,7 +59,7 @@ int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link); void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); -int smc_llc_do_confirm_rkey(struct smc_link *link, +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); -- cgit v1.2.3-59-g8ed1b From 6d74c3a8a3e7a488a7d9d8c4a59091ccae72fc4c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:45 +0200 Subject: net/smc: multiple link support and LLC flow for smc_llc_do_delete_rkey Adapt smc_llc_do_delete_rkey() to use the LLC flow and support multiple links when deleting the rkeys for rmb buffers at the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 10 ++++------ net/smc/smc_core.h | 3 --- net/smc/smc_llc.c | 39 +++++++++++++++++++-------------------- net/smc/smc_llc.h | 2 +- 4 files changed, 24 insertions(+), 30 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4867ddcfe0c6..f71a366ed6ac 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -446,13 +446,11 @@ out: } static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, - struct smc_link *lnk) + struct smc_link_group *lgr) { - struct smc_link_group *lgr = lnk->lgr; - if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ - smc_llc_do_delete_rkey(lnk, rmb_desc); + smc_llc_do_delete_rkey(lgr, rmb_desc); rmb_desc->is_conf_rkey = false; } if (rmb_desc->is_reg_err) { @@ -475,7 +473,7 @@ static void smc_buf_unuse(struct smc_connection *conn, if (conn->rmb_desc && lgr->is_smcd) conn->rmb_desc->used = 0; else if (conn->rmb_desc) - smcr_buf_unuse(conn->rmb_desc, conn->lnk); + smcr_buf_unuse(conn->rmb_desc, lgr); } /* remove a finished connection from its link group */ @@ -1169,7 +1167,6 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, if (!smc_link_usable(lnk)) continue; if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { - smcr_buf_unuse(buf_desc, lnk); rc = -ENOMEM; goto out; } @@ -1275,6 +1272,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + smcr_buf_unuse(buf_desc, lgr); return -ENOMEM; } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 4e0dfb1d5804..364a54e28d61 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -123,9 +123,6 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ - int llc_delete_rkey_resp_rc; /* rc from del rkey */ - struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5db11f54b4cd..f9ec270818fa 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -720,7 +720,6 @@ static void smc_llc_rx_response(struct smc_link *link, struct smc_llc_qentry *qentry) { u8 llc_type = qentry->msg.raw.hdr.common.type; - union smc_llc_msg *llc = &qentry->msg; switch (llc_type) { case SMC_LLC_TEST_LINK: @@ -730,6 +729,7 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_ADD_LINK: case SMC_LLC_CONFIRM_LINK: case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: /* assign responses to the local flow, we requested them */ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); wake_up_interruptible(&link->lgr->llc_waiter); @@ -741,11 +741,6 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* unused as long as we don't send this type of msg */ break; - case SMC_LLC_DELETE_RKEY: - link->llc_delete_rkey_resp_rc = llc->raw.hdr.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey_resp); - break; } kfree(qentry); } @@ -850,8 +845,6 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { - init_completion(&link->llc_delete_rkey_resp); - mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; @@ -909,27 +902,33 @@ out: } /* unregister an rtoken at the remote peer */ -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc) { + struct smc_llc_qentry *qentry = NULL; + struct smc_link *send_link; int rc = 0; - mutex_lock(&link->llc_delete_rkey_mutex); - if (link->state != SMC_LNK_ACTIVE) - goto out; - reinit_completion(&link->llc_delete_rkey_resp); - rc = smc_llc_send_delete_rkey(link, rmb_desc); + send_link = smc_llc_usable_link(lgr); + if (!send_link) + return -ENOLINK; + + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + /* protected by llc_flow control */ + rc = smc_llc_send_delete_rkey(send_link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout( - &link->llc_delete_rkey_resp, SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_resp_rc) + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) rc = -EFAULT; - else - rc = 0; out: - mutex_unlock(&link->llc_delete_rkey_mutex); + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index d82d8346b61e..e9f23affece6 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -61,7 +61,7 @@ void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc); int smc_llc_flow_initiate(struct smc_link_group *lgr, enum smc_llc_flowtype type); -- cgit v1.2.3-59-g8ed1b From 56e8091c7a098ef2257f85f16665d79cf3049da9 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:46 +0200 Subject: net/smc: move the TEST_LINK response processing into event handler Get rid of the extra function and move the two-liner for the TEST_LINK response processing into the event handler function. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index f9ec270818fa..4945abbad111 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -563,13 +563,6 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_lgr_terminate_sched(lgr); } -static void smc_llc_rx_test_link(struct smc_link *link, - struct smc_llc_msg_test_link *llc) -{ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); -} - static void smc_llc_rx_confirm_rkey(struct smc_link *link, struct smc_llc_msg_confirm_rkey *llc) { @@ -640,7 +633,8 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: - smc_llc_rx_test_link(link, &llc->test_link); + llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); break; case SMC_LLC_ADD_LINK: if (list_empty(&lgr->list)) -- cgit v1.2.3-59-g8ed1b From ba21abd22f9ffa023921923a6c01d28b59731ff8 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:47 +0200 Subject: net/smc: new smc_rtoken_set functions for multiple link support Introduce smc_rtoken_set() to set the rtoken for a new link to an existing rmb whose rtoken is given, and smc_rtoken_set2() to set an rtoken for a new link whose link_id is given. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_core.h | 4 ++++ 2 files changed, 51 insertions(+) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f71a366ed6ac..096dce92ee2b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1368,6 +1368,53 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) return -ENOSPC; } +static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, + u32 rkey) +{ + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (test_bit(i, lgr->rtokens_used_mask) && + lgr->rtokens[i][lnk_idx].rkey == rkey) + return i; + } + return -ENOENT; +} + +/* set rtoken for a new link to an existing rmb */ +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) +{ + int rtok_idx; + + rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); + if (rtok_idx == -ENOENT) + return; + lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); + lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); +} + +/* set rtoken for a new link whose link_id is given */ +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey) +{ + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); + bool found = false; + int link_idx; + + for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { + if (lgr->lnk[link_idx].link_id == link_id) { + found = true; + break; + } + } + if (!found) + return; + lgr->rtokens[rtok_idx][link_idx].rkey = rkey; + lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; +} + /* add a new rtoken from peer */ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 364a54e28d61..0be26386057f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -352,6 +352,10 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); -- cgit v1.2.3-59-g8ed1b From 3bc67e098c3e215f6e09ba3c0e1f569e7ae020d0 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:48 +0200 Subject: net/smc: adapt SMC remote CONFIRM_RKEY processing to use the LLC flow Use the LLC flow framework for the processing of CONFIRM_RKEY messages that were received from the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 56 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4945abbad111..b7b5cc01b78e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -105,6 +105,7 @@ struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ }; #define SMC_LLC_DEL_RKEY_MAX 8 +#define SMC_LLC_FLAG_RKEY_RETRY 0x10 #define SMC_LLC_FLAG_RKEY_NEG 0x20 struct smc_llc_msg_delete_rkey { /* type 0x09 */ @@ -563,21 +564,41 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_lgr_terminate_sched(lgr); } -static void smc_llc_rx_confirm_rkey(struct smc_link *link, - struct smc_llc_msg_confirm_rkey *llc) +/* process a confirm_rkey request from peer, remote flow */ +static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr) { - int rc; - - rc = smc_rtoken_add(link, - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); - - /* ignore rtokens for other links, we have only one link */ - + struct smc_llc_msg_confirm_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + int num_entries; + int rk_idx; + int i; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.confirm_rkey; + link = qentry->link; + + num_entries = llc->rtoken[0].num_rkeys; + /* first rkey entry is for receiving link */ + rk_idx = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); + if (rk_idx < 0) + goto out_err; + + for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++) + smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id, + llc->rtoken[i].rmb_vaddr, + llc->rtoken[i].rmb_key); + /* max links is 3 so there is no need to support conf_rkey_cont msgs */ + goto out; +out_err: + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY; +out: llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc); + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, @@ -666,8 +687,13 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_rx_delete_link(link, &llc->delete_link); break; case SMC_LLC_CONFIRM_RKEY: - smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey); - break; + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_conf_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; case SMC_LLC_CONFIRM_RKEY_CONT: smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); break; -- cgit v1.2.3-59-g8ed1b From 218b24fe381238941a06496eaf221a22c5935267 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:49 +0200 Subject: net/smc: adapt SMC remote DELETE_RKEY processing to use the LLC flow Use the LLC flow framework for the processing of DELETE_RKEY messages that were received from the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index b7b5cc01b78e..e458207bde9e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -601,31 +601,37 @@ out: smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } -static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, - struct smc_llc_msg_confirm_rkey_cont *llc) -{ - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); -} - -static void smc_llc_rx_delete_rkey(struct smc_link *link, - struct smc_llc_msg_delete_rkey *llc) +/* process a delete_rkey request from peer, remote flow */ +static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) { + struct smc_llc_msg_delete_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; u8 err_mask = 0; int i, max; + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.delete_rkey; + link = qentry->link; + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { if (smc_rtoken_delete(link, llc->rkey[i])) err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); } - if (err_mask) { llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; llc->err_mask = err_mask; } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} +static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, + struct smc_llc_msg_confirm_rkey_cont *llc) +{ + /* ignore rtokens for other links, we have only one link */ llc->hd.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, llc); } @@ -698,8 +704,13 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); break; case SMC_LLC_DELETE_RKEY: - smc_llc_rx_delete_rkey(link, &llc->delete_rkey); - break; + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_delete_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; } out: kfree(qentry); -- cgit v1.2.3-59-g8ed1b From 42d18acce9e29b61f5dbfc5118d7c72093e703a1 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:50 +0200 Subject: net/smc: remove handling of CONFIRM_RKEY_CONTINUE The new SMC-R multiple link support will support a maximum of 3 links, and one CONFIRM_RKEY LLC message can transport 3 rkeys of an rmb buffer. There is no need for the LLC message type CONFIRM_RKEY_CONTINUE which is needed when more than 3 rkeys per rmb buffer needs to be exchanged. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e458207bde9e..92c9a8a8aaf9 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -98,12 +98,6 @@ struct smc_llc_msg_confirm_rkey { /* type 0x06 */ u8 reserved; }; -struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ - struct smc_llc_hdr hd; - u8 num_rkeys; - struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; -}; - #define SMC_LLC_DEL_RKEY_MAX 8 #define SMC_LLC_FLAG_RKEY_RETRY 0x10 #define SMC_LLC_FLAG_RKEY_NEG 0x20 @@ -123,7 +117,6 @@ union smc_llc_msg { struct smc_llc_msg_del_link delete_link; struct smc_llc_msg_confirm_rkey confirm_rkey; - struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont; struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; @@ -628,14 +621,6 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } -static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, - struct smc_llc_msg_confirm_rkey_cont *llc) -{ - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); -} - /* flush the llc event queue */ static void smc_llc_event_flush(struct smc_link_group *lgr) { @@ -701,7 +686,9 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) } return; case SMC_LLC_CONFIRM_RKEY_CONT: - smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); + /* not used because max links is 3, and 3 rkeys fit into + * one CONFIRM_RKEY message + */ break; case SMC_LLC_DELETE_RKEY: /* new request from remote, assign to remote flow */ @@ -770,7 +757,7 @@ static void smc_llc_rx_response(struct smc_link *link, smc_lgr_schedule_free_work_fast(link->lgr); break; case SMC_LLC_CONFIRM_RKEY_CONT: - /* unused as long as we don't send this type of msg */ + /* not used because max links is 3 */ break; } kfree(qentry); -- cgit v1.2.3-59-g8ed1b From 41a211d862242439c9cdb2481946bb0928760541 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:51 +0200 Subject: net/smc: remove obsolete link state DELETING The connection layer in af_smc.c is now using the new LLC flow framework, which made the link state DELETING obsolete. Remove the state and the respective helpers. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 4 +--- net/smc/smc_core.h | 1 - net/smc/smc_llc.c | 7 ------- net/smc/smc_llc.h | 1 - 4 files changed, 1 insertion(+), 12 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 096dce92ee2b..3539ceef9a97 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -200,7 +200,6 @@ static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { - smc_llc_link_deleting(lnk); return 0; } return -ENOTCONN; @@ -767,8 +766,7 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) continue; /* tbd - terminate only when no more links are active */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_usable(&lgr->lnk[i]) || - lgr->lnk[i].state == SMC_LNK_DELETING) + if (!smc_link_usable(&lgr->lnk[i])) continue; if (lgr->lnk[i].smcibdev == smcibdev && lgr->lnk[i].ibport == ibport) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 0be26386057f..f12474cc666c 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -36,7 +36,6 @@ enum smc_link_state { /* possible states of a link */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ - SMC_LNK_DELETING, /* link is being deleted */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 92c9a8a8aaf9..327cf30b98cc 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -545,7 +545,6 @@ static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_link_group *lgr = smc_get_lgr(link); smc_lgr_forget(lgr); - smc_llc_link_deleting(link); if (lgr->role == SMC_SERV) { /* client asks to delete this link, send request */ smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); @@ -878,12 +877,6 @@ void smc_llc_link_active(struct smc_link *link) } } -void smc_llc_link_deleting(struct smc_link *link) -{ - link->state = SMC_LNK_DELETING; - smc_wr_wakeup_tx_wait(link); -} - /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index e9f23affece6..48029a5e14c3 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -57,7 +57,6 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link); -void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); -- cgit v1.2.3-59-g8ed1b From 801eb0501824da196c7b1c18c453528457308c5a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Apr 2020 18:02:48 +0300 Subject: stmmac: intel: Fix kernel crash due to wrong error path Unfortunately sometimes ->probe() may fail. The commit b9663b7ca6ff ("net: stmmac: Enable SERDES power up/down sequence") messed up with error handling and thus: [ 12.811311] ------------[ cut here ]------------ [ 12.811993] kernel BUG at net/core/dev.c:9937! Fix this by properly crafted error path. Fixes: b9663b7ca6ff ("net: stmmac: Enable SERDES power up/down sequence") Cc: Voon Weifeng Cc: Ong Boon Leong Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 565da6498c84..ff22f274aa43 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4991,7 +4991,7 @@ int stmmac_dvr_probe(struct device *device, priv->plat->bsp_priv); if (ret < 0) - return ret; + goto error_serdes_powerup; } #ifdef CONFIG_DEBUG_FS @@ -5000,6 +5000,8 @@ int stmmac_dvr_probe(struct device *device, return ret; +error_serdes_powerup: + unregister_netdev(ndev); error_netdev_register: phylink_destroy(priv->phylink); error_phy_setup: -- cgit v1.2.3-59-g8ed1b From 09f012e64e4b8126ed6f02d0a85a57c3a0465cf9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Apr 2020 18:02:49 +0300 Subject: stmmac: intel: Fix clock handling on error and remove paths clk_prepare_enable() might fail, we have to check its returned value. Besides that we have to call clk_disable_unprepare() on the error and remove paths. Do above in the dwmac-intel driver. While at it, remove leftover in stmmac_pci and remove unneeded condition for NULL-aware clk_unregister_fixed_rate() call. Fixes: 58da0cfa6cf1 ("net: stmmac: create dwmac-intel.c to contain all Intel platform") Cc: Voon Weifeng Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 20 ++++++++++++++++---- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 5 ----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 2e4aaedb93f5..d163c4b43da0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -252,6 +252,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat) static int intel_mgbe_common_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { + int ret; int i; plat->clk_csr = 5; @@ -324,7 +325,12 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, dev_warn(&pdev->dev, "Fail to register stmmac-clk\n"); plat->stmmac_clk = NULL; } - clk_prepare_enable(plat->stmmac_clk); + + ret = clk_prepare_enable(plat->stmmac_clk); + if (ret) { + clk_unregister_fixed_rate(plat->stmmac_clk); + return ret; + } /* Set default value for multicast hash bins */ plat->multicast_filter_bins = HASH_TABLE_SIZE; @@ -657,7 +663,13 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, res.wol_irq = pdev->irq; res.irq = pdev->irq; - return stmmac_dvr_probe(&pdev->dev, plat, &res); + ret = stmmac_dvr_probe(&pdev->dev, plat, &res); + if (ret) { + clk_disable_unprepare(plat->stmmac_clk); + clk_unregister_fixed_rate(plat->stmmac_clk); + } + + return ret; } /** @@ -675,8 +687,8 @@ static void intel_eth_pci_remove(struct pci_dev *pdev) stmmac_dvr_remove(&pdev->dev); - if (priv->plat->stmmac_clk) - clk_unregister_fixed_rate(priv->plat->stmmac_clk); + clk_disable_unprepare(priv->plat->stmmac_clk); + clk_unregister_fixed_rate(priv->plat->stmmac_clk); for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 3fb21f7ac9fb..272cb47af9f2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -217,15 +217,10 @@ static int stmmac_pci_probe(struct pci_dev *pdev, */ static void stmmac_pci_remove(struct pci_dev *pdev) { - struct net_device *ndev = dev_get_drvdata(&pdev->dev); - struct stmmac_priv *priv = netdev_priv(ndev); int i; stmmac_dvr_remove(&pdev->dev); - if (priv->plat->stmmac_clk) - clk_unregister_fixed_rate(priv->plat->stmmac_clk); - for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; -- cgit v1.2.3-59-g8ed1b From e578f043ffcf8b4f0999ab9e9194f156e90c7fd3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Apr 2020 18:02:50 +0300 Subject: stmmac: intel: Remove unnecessary loop for PCI BARs Copy'n'paste without thinking is not a good idea and in this case it brought unnecessary loop over PCI BAR resources which was needed to workaround one of STMicro RVP boards. Remove unnecessary loops from Intel driver. Fixes: 58da0cfa6cf1 ("net: stmmac: create dwmac-intel.c to contain all Intel platform") Cc: Voon Weifeng Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index d163c4b43da0..e9f948559499 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -606,7 +606,6 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, struct intel_priv_data *intel_priv; struct plat_stmmacenet_data *plat; struct stmmac_resources res; - int i; int ret; intel_priv = devm_kzalloc(&pdev->dev, sizeof(*intel_priv), @@ -637,15 +636,9 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, return ret; } - /* Get the base address of device */ - for (i = 0; i < PCI_STD_NUM_BARS; i++) { - if (pci_resource_len(pdev, i) == 0) - continue; - ret = pcim_iomap_regions(pdev, BIT(i), pci_name(pdev)); - if (ret) - return ret; - break; - } + ret = pcim_iomap_regions(pdev, BIT(0), pci_name(pdev)); + if (ret) + return ret; pci_set_master(pdev); @@ -659,7 +652,7 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, pci_enable_msi(pdev); memset(&res, 0, sizeof(res)); - res.addr = pcim_iomap_table(pdev)[i]; + res.addr = pcim_iomap_table(pdev)[0]; res.wol_irq = pdev->irq; res.irq = pdev->irq; @@ -683,19 +676,13 @@ static void intel_eth_pci_remove(struct pci_dev *pdev) { struct net_device *ndev = dev_get_drvdata(&pdev->dev); struct stmmac_priv *priv = netdev_priv(ndev); - int i; stmmac_dvr_remove(&pdev->dev); clk_disable_unprepare(priv->plat->stmmac_clk); clk_unregister_fixed_rate(priv->plat->stmmac_clk); - for (i = 0; i < PCI_STD_NUM_BARS; i++) { - if (pci_resource_len(pdev, i) == 0) - continue; - pcim_iounmap_regions(pdev, BIT(i)); - break; - } + pcim_iounmap_regions(pdev, BIT(0)); pci_disable_device(pdev); } -- cgit v1.2.3-59-g8ed1b From 52c1f794845411c39ee6deedb443894ca141b4a7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Apr 2020 18:02:51 +0300 Subject: stmmac: intel: Convert to use pci_alloc_irq_vectors() API pci_enable_msi() is deprecated API, thus, switch to modern pci_alloc_irq_vectors(). Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index e9f948559499..bb8bf31c1259 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -649,15 +649,18 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, if (ret) return ret; - pci_enable_msi(pdev); + ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (ret < 0) + return ret; memset(&res, 0, sizeof(res)); res.addr = pcim_iomap_table(pdev)[0]; - res.wol_irq = pdev->irq; - res.irq = pdev->irq; + res.wol_irq = pci_irq_vector(pdev, 0); + res.irq = pci_irq_vector(pdev, 0); ret = stmmac_dvr_probe(&pdev->dev, plat, &res); if (ret) { + pci_free_irq_vectors(pdev); clk_disable_unprepare(plat->stmmac_clk); clk_unregister_fixed_rate(plat->stmmac_clk); } @@ -679,6 +682,8 @@ static void intel_eth_pci_remove(struct pci_dev *pdev) stmmac_dvr_remove(&pdev->dev); + pci_free_irq_vectors(pdev); + clk_disable_unprepare(priv->plat->stmmac_clk); clk_unregister_fixed_rate(priv->plat->stmmac_clk); -- cgit v1.2.3-59-g8ed1b From d5383b0376643245d82230d8a974edd193ec900c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Apr 2020 18:02:52 +0300 Subject: stmmac: intel: Eliminate useless conditions and variables There are useless conditions like func() { ... int ret; ... ret = foo(); if (ret) return ret; return 0; } which may be replaced with direct return statement, what we have done here. Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index bb8bf31c1259..b0d735e4c13c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -347,16 +347,11 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, static int ehl_common_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { - int ret; - plat->rx_queues_to_use = 8; plat->tx_queues_to_use = 8; plat->clk_ptp_rate = 200000000; - ret = intel_mgbe_common_data(pdev, plat); - if (ret) - return ret; - return 0; + return intel_mgbe_common_data(pdev, plat); } static int ehl_sgmii_data(struct pci_dev *pdev, @@ -457,16 +452,11 @@ static struct stmmac_pci_info ehl_pse1_sgmii1g_pci_info = { static int tgl_common_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { - int ret; - plat->rx_queues_to_use = 6; plat->tx_queues_to_use = 4; plat->clk_ptp_rate = 200000000; - ret = intel_mgbe_common_data(pdev, plat); - if (ret) - return ret; - return 0; + return intel_mgbe_common_data(pdev, plat); } static int tgl_sgmii_data(struct pci_dev *pdev, -- cgit v1.2.3-59-g8ed1b From ccacb703b0f8141757fba2931a2a7202b54194d0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Apr 2020 18:02:53 +0300 Subject: stmmac: intel: Fix indentation to put on one line affected code There is no competition to get more LOCs into the kernel, and driver can look better and have improved readability without those additional line breaks. While at it, shorten info structures that they are all PCI, at the end it's a PCI driver for Intel hardware. Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 92 ++++++++--------------- 1 file changed, 32 insertions(+), 60 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index b0d735e4c13c..2ac9dfb3462c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -83,13 +83,9 @@ static int intel_serdes_powerup(struct net_device *ndev, void *priv_data) serdes_phy_addr = intel_priv->mdio_adhoc_addr; /* assert clk_req */ - data = mdiobus_read(priv->mii, serdes_phy_addr, - SERDES_GCR0); - + data = mdiobus_read(priv->mii, serdes_phy_addr, SERDES_GCR0); data |= SERDES_PLL_CLK; - - mdiobus_write(priv->mii, serdes_phy_addr, - SERDES_GCR0, data); + mdiobus_write(priv->mii, serdes_phy_addr, SERDES_GCR0, data); /* check for clk_ack assertion */ data = serdes_status_poll(priv, serdes_phy_addr, @@ -103,13 +99,9 @@ static int intel_serdes_powerup(struct net_device *ndev, void *priv_data) } /* assert lane reset */ - data = mdiobus_read(priv->mii, serdes_phy_addr, - SERDES_GCR0); - + data = mdiobus_read(priv->mii, serdes_phy_addr, SERDES_GCR0); data |= SERDES_RST; - - mdiobus_write(priv->mii, serdes_phy_addr, - SERDES_GCR0, data); + mdiobus_write(priv->mii, serdes_phy_addr, SERDES_GCR0, data); /* check for assert lane reset reflection */ data = serdes_status_poll(priv, serdes_phy_addr, @@ -123,14 +115,12 @@ static int intel_serdes_powerup(struct net_device *ndev, void *priv_data) } /* move power state to P0 */ - data = mdiobus_read(priv->mii, serdes_phy_addr, - SERDES_GCR0); + data = mdiobus_read(priv->mii, serdes_phy_addr, SERDES_GCR0); data &= ~SERDES_PWR_ST_MASK; data |= SERDES_PWR_ST_P0 << SERDES_PWR_ST_SHIFT; - mdiobus_write(priv->mii, serdes_phy_addr, - SERDES_GCR0, data); + mdiobus_write(priv->mii, serdes_phy_addr, SERDES_GCR0, data); /* Check for P0 state */ data = serdes_status_poll(priv, serdes_phy_addr, @@ -159,14 +149,12 @@ static void intel_serdes_powerdown(struct net_device *ndev, void *intel_data) serdes_phy_addr = intel_priv->mdio_adhoc_addr; /* move power state to P3 */ - data = mdiobus_read(priv->mii, serdes_phy_addr, - SERDES_GCR0); + data = mdiobus_read(priv->mii, serdes_phy_addr, SERDES_GCR0); data &= ~SERDES_PWR_ST_MASK; data |= SERDES_PWR_ST_P3 << SERDES_PWR_ST_SHIFT; - mdiobus_write(priv->mii, serdes_phy_addr, - SERDES_GCR0, data); + mdiobus_write(priv->mii, serdes_phy_addr, SERDES_GCR0, data); /* Check for P3 state */ data = serdes_status_poll(priv, serdes_phy_addr, @@ -180,13 +168,9 @@ static void intel_serdes_powerdown(struct net_device *ndev, void *intel_data) } /* de-assert clk_req */ - data = mdiobus_read(priv->mii, serdes_phy_addr, - SERDES_GCR0); - + data = mdiobus_read(priv->mii, serdes_phy_addr, SERDES_GCR0); data &= ~SERDES_PLL_CLK; - - mdiobus_write(priv->mii, serdes_phy_addr, - SERDES_GCR0, data); + mdiobus_write(priv->mii, serdes_phy_addr, SERDES_GCR0, data); /* check for clk_ack de-assert */ data = serdes_status_poll(priv, serdes_phy_addr, @@ -200,13 +184,9 @@ static void intel_serdes_powerdown(struct net_device *ndev, void *intel_data) } /* de-assert lane reset */ - data = mdiobus_read(priv->mii, serdes_phy_addr, - SERDES_GCR0); - + data = mdiobus_read(priv->mii, serdes_phy_addr, SERDES_GCR0); data &= ~SERDES_RST; - - mdiobus_write(priv->mii, serdes_phy_addr, - SERDES_GCR0, data); + mdiobus_write(priv->mii, serdes_phy_addr, SERDES_GCR0, data); /* check for de-assert lane reset reflection */ data = serdes_status_poll(priv, serdes_phy_addr, @@ -367,7 +347,7 @@ static int ehl_sgmii_data(struct pci_dev *pdev, return ehl_common_data(pdev, plat); } -static struct stmmac_pci_info ehl_sgmii1g_pci_info = { +static struct stmmac_pci_info ehl_sgmii1g_info = { .setup = ehl_sgmii_data, }; @@ -381,7 +361,7 @@ static int ehl_rgmii_data(struct pci_dev *pdev, return ehl_common_data(pdev, plat); } -static struct stmmac_pci_info ehl_rgmii1g_pci_info = { +static struct stmmac_pci_info ehl_rgmii1g_info = { .setup = ehl_rgmii_data, }; @@ -400,7 +380,7 @@ static int ehl_pse0_rgmii1g_data(struct pci_dev *pdev, return ehl_pse0_common_data(pdev, plat); } -static struct stmmac_pci_info ehl_pse0_rgmii1g_pci_info = { +static struct stmmac_pci_info ehl_pse0_rgmii1g_info = { .setup = ehl_pse0_rgmii1g_data, }; @@ -413,7 +393,7 @@ static int ehl_pse0_sgmii1g_data(struct pci_dev *pdev, return ehl_pse0_common_data(pdev, plat); } -static struct stmmac_pci_info ehl_pse0_sgmii1g_pci_info = { +static struct stmmac_pci_info ehl_pse0_sgmii1g_info = { .setup = ehl_pse0_sgmii1g_data, }; @@ -432,7 +412,7 @@ static int ehl_pse1_rgmii1g_data(struct pci_dev *pdev, return ehl_pse1_common_data(pdev, plat); } -static struct stmmac_pci_info ehl_pse1_rgmii1g_pci_info = { +static struct stmmac_pci_info ehl_pse1_rgmii1g_info = { .setup = ehl_pse1_rgmii1g_data, }; @@ -445,7 +425,7 @@ static int ehl_pse1_sgmii1g_data(struct pci_dev *pdev, return ehl_pse1_common_data(pdev, plat); } -static struct stmmac_pci_info ehl_pse1_sgmii1g_pci_info = { +static struct stmmac_pci_info ehl_pse1_sgmii1g_info = { .setup = ehl_pse1_sgmii1g_data, }; @@ -470,7 +450,7 @@ static int tgl_sgmii_data(struct pci_dev *pdev, return tgl_common_data(pdev, plat); } -static struct stmmac_pci_info tgl_sgmii1g_pci_info = { +static struct stmmac_pci_info tgl_sgmii1g_info = { .setup = tgl_sgmii_data, }; @@ -573,7 +553,7 @@ static int quark_default_data(struct pci_dev *pdev, return 0; } -static const struct stmmac_pci_info quark_pci_info = { +static const struct stmmac_pci_info quark_info = { .setup = quark_default_data, }; @@ -598,8 +578,7 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, struct stmmac_resources res; int ret; - intel_priv = devm_kzalloc(&pdev->dev, sizeof(*intel_priv), - GFP_KERNEL); + intel_priv = devm_kzalloc(&pdev->dev, sizeof(*intel_priv), GFP_KERNEL); if (!intel_priv) return -ENOMEM; @@ -736,26 +715,19 @@ static SIMPLE_DEV_PM_OPS(intel_eth_pm_ops, intel_eth_pci_suspend, #define PCI_DEVICE_ID_INTEL_TGL_SGMII1G_ID 0xa0ac static const struct pci_device_id intel_eth_pci_id_table[] = { - { PCI_DEVICE_DATA(INTEL, QUARK_ID, &quark_pci_info) }, - { PCI_DEVICE_DATA(INTEL, EHL_RGMII1G_ID, &ehl_rgmii1g_pci_info) }, - { PCI_DEVICE_DATA(INTEL, EHL_SGMII1G_ID, &ehl_sgmii1g_pci_info) }, - { PCI_DEVICE_DATA(INTEL, EHL_SGMII2G5_ID, &ehl_sgmii1g_pci_info) }, - { PCI_DEVICE_DATA(INTEL, EHL_PSE0_RGMII1G_ID, - &ehl_pse0_rgmii1g_pci_info) }, - { PCI_DEVICE_DATA(INTEL, EHL_PSE0_SGMII1G_ID, - &ehl_pse0_sgmii1g_pci_info) }, - { PCI_DEVICE_DATA(INTEL, EHL_PSE0_SGMII2G5_ID, - &ehl_pse0_sgmii1g_pci_info) }, - { PCI_DEVICE_DATA(INTEL, EHL_PSE1_RGMII1G_ID, - &ehl_pse1_rgmii1g_pci_info) }, - { PCI_DEVICE_DATA(INTEL, EHL_PSE1_SGMII1G_ID, - &ehl_pse1_sgmii1g_pci_info) }, - { PCI_DEVICE_DATA(INTEL, EHL_PSE1_SGMII2G5_ID, - &ehl_pse1_sgmii1g_pci_info) }, - { PCI_DEVICE_DATA(INTEL, TGL_SGMII1G_ID, &tgl_sgmii1g_pci_info) }, + { PCI_DEVICE_DATA(INTEL, QUARK_ID, &quark_info) }, + { PCI_DEVICE_DATA(INTEL, EHL_RGMII1G_ID, &ehl_rgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, EHL_SGMII1G_ID, &ehl_sgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, EHL_SGMII2G5_ID, &ehl_sgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, EHL_PSE0_RGMII1G_ID, &ehl_pse0_rgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, EHL_PSE0_SGMII1G_ID, &ehl_pse0_sgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, EHL_PSE0_SGMII2G5_ID, &ehl_pse0_sgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, EHL_PSE1_RGMII1G_ID, &ehl_pse1_rgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, EHL_PSE1_SGMII1G_ID, &ehl_pse1_sgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, EHL_PSE1_SGMII2G5_ID, &ehl_pse1_sgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, TGL_SGMII1G_ID, &tgl_sgmii1g_info) }, {} }; - MODULE_DEVICE_TABLE(pci, intel_eth_pci_id_table); static struct pci_driver intel_eth_pci_driver = { -- cgit v1.2.3-59-g8ed1b From 29e0c2f39f983f3b83f6004f9fd05f8f4ce225c6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Apr 2020 18:02:54 +0300 Subject: stmmac: intel: Place object in the Makefile according to the order Follow the order for the platform drivers, i.e. generic object are going first. Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile index 5a6f265bc540..f9d024d6b69b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Makefile +++ b/drivers/net/ethernet/stmicro/stmmac/Makefile @@ -30,6 +30,6 @@ obj-$(CONFIG_DWMAC_GENERIC) += dwmac-generic.o stmmac-platform-objs:= stmmac_platform.o dwmac-altr-socfpga-objs := altr_tse_pcs.o dwmac-socfpga.o -obj-$(CONFIG_DWMAC_INTEL) += dwmac-intel.o -obj-$(CONFIG_STMMAC_PCI) += stmmac-pci.o +obj-$(CONFIG_STMMAC_PCI) += stmmac-pci.o +obj-$(CONFIG_DWMAC_INTEL) += dwmac-intel.o stmmac-pci-objs:= stmmac_pci.o -- cgit v1.2.3-59-g8ed1b From 6e3a401fc8af01828bcdc92d713195d318b36e7e Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 30 Apr 2020 18:51:14 +0300 Subject: inet_diag: add cgroup id attribute This patch adds cgroup v2 ID to common inet diag message attributes. Cgroup v2 ID is kernfs ID (ino or ino+gen). This attribute allows filter inet diag output by cgroup ID obtained by name_to_handle_at() syscall. When net_cls or net_prio cgroup is activated this ID is equal to 1 (root cgroup ID) for newly created sockets. Some notes about this ID: 1) gets initialized in socket() syscall 2) incoming socket gets ID from listening socket (not during accept() syscall) 3) not changed when process get moved to another cgroup 4) can point to deleted cgroup (refcounting) v2: - use CONFIG_SOCK_CGROUP_DATA instead if CONFIG_CGROUPS v3: - fix attr size by using nla_total_size_64bit() (Eric Dumazet) - more detailed commit message (Konstantin Khlebnikov) Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Acked-By: Tejun Heo Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 6 +++++- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 7 +++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index ce9ed1c0602f..0ef2d800fda7 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -71,7 +71,11 @@ static inline size_t inet_diag_msg_attrs_size(void) + nla_total_size(1) /* INET_DIAG_SKV6ONLY */ #endif + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(4); /* INET_DIAG_CLASS_ID */ + + nla_total_size(4) /* INET_DIAG_CLASS_ID */ +#ifdef CONFIG_SOCK_CGROUP_DATA + + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ +#endif + ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 57cc429a9177..c9b1e551792c 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -157,6 +157,7 @@ enum { INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, + INET_DIAG_CGROUP_ID, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5d50aad3cdbf..9c4c315cbc10 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -162,6 +162,13 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, goto errout; } +#ifdef CONFIG_SOCK_CGROUP_DATA + if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID, + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)), + INET_DIAG_PAD)) + goto errout; +#endif + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); -- cgit v1.2.3-59-g8ed1b From b1f3e43dbfacfcd95296b0f80f84b186add9ef54 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 30 Apr 2020 18:51:15 +0300 Subject: inet_diag: add support for cgroup filter This patch adds ability to filter sockets based on cgroup v2 ID. Such filter is helpful in ss utility for filtering sockets by cgroup pathname. Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index c9b1e551792c..e6f183ee8417 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -96,6 +96,7 @@ enum { INET_DIAG_BC_MARK_COND, INET_DIAG_BC_S_EQ, INET_DIAG_BC_D_EQ, + INET_DIAG_BC_CGROUP_COND, /* u64 cgroup v2 ID */ }; struct inet_diag_hostcond { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 9c4c315cbc10..0034092358c3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -43,6 +43,9 @@ struct inet_diag_entry { u16 userlocks; u32 ifindex; u32 mark; +#ifdef CONFIG_SOCK_CGROUP_DATA + u64 cgroup_id; +#endif }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -682,6 +685,16 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: { + u64 cgroup_id; + + cgroup_id = get_unaligned((const u64 *)(op + 1)); + if (cgroup_id != entry->cgroup_id) + yes = 0; + break; + } +#endif } if (yes) { @@ -732,6 +745,9 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else entry.mark = 0; +#ifdef CONFIG_SOCK_CGROUP_DATA + entry.cgroup_id = cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)); +#endif return inet_diag_bc_run(bc, &entry); } @@ -821,6 +837,15 @@ static bool valid_markcond(const struct inet_diag_bc_op *op, int len, return len >= *min_len; } +#ifdef CONFIG_SOCK_CGROUP_DATA +static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + *min_len += sizeof(u64); + return len >= *min_len; +} +#endif + static int inet_diag_bc_audit(const struct nlattr *attr, const struct sk_buff *skb) { @@ -863,6 +888,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr, if (!valid_markcond(bc, len, &min_len)) return -EINVAL; break; +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: + if (!valid_cgroupcond(bc, len, &min_len)) + return -EINVAL; + break; +#endif case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: -- cgit v1.2.3-59-g8ed1b From 10ebb22137acd97083cb52d8664cdff269e8a629 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:03:56 +0200 Subject: docs: networking: convert l2tp.txt to ReST - add SPDX header; - add a document title; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/l2tp.rst | 358 +++++++++++++++++++++++++++++++++++++ Documentation/networking/l2tp.txt | 345 ----------------------------------- 3 files changed, 359 insertions(+), 345 deletions(-) create mode 100644 Documentation/networking/l2tp.rst delete mode 100644 Documentation/networking/l2tp.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e1ff08b94d90..0c5d7a037983 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -74,6 +74,7 @@ Contents: ipvlan ipvs-sysctl kcm + l2tp .. only:: subproject and html diff --git a/Documentation/networking/l2tp.rst b/Documentation/networking/l2tp.rst new file mode 100644 index 000000000000..a48238a2ec09 --- /dev/null +++ b/Documentation/networking/l2tp.rst @@ -0,0 +1,358 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +L2TP +==== + +This document describes how to use the kernel's L2TP drivers to +provide L2TP functionality. L2TP is a protocol that tunnels one or +more sessions over an IP tunnel. It is commonly used for VPNs +(L2TP/IPSec) and by ISPs to tunnel subscriber PPP sessions over an IP +network infrastructure. With L2TPv3, it is also useful as a Layer-2 +tunneling infrastructure. + +Features +======== + +L2TPv2 (PPP over L2TP (UDP tunnels)). +L2TPv3 ethernet pseudowires. +L2TPv3 PPP pseudowires. +L2TPv3 IP encapsulation. +Netlink sockets for L2TPv3 configuration management. + +History +======= + +The original pppol2tp driver was introduced in 2.6.23 and provided +L2TPv2 functionality (rfc2661). L2TPv2 is used to tunnel one or more PPP +sessions over a UDP tunnel. + +L2TPv3 (rfc3931) changes the protocol to allow different frame types +to be passed over an L2TP tunnel by moving the PPP-specific parts of +the protocol out of the core L2TP packet headers. Each frame type is +known as a pseudowire type. Ethernet, PPP, HDLC, Frame Relay and ATM +pseudowires for L2TP are defined in separate RFC standards. Another +change for L2TPv3 is that it can be carried directly over IP with no +UDP header (UDP is optional). It is also possible to create static +unmanaged L2TPv3 tunnels manually without a control protocol +(userspace daemon) to manage them. + +To support L2TPv3, the original pppol2tp driver was split up to +separate the L2TP and PPP functionality. Existing L2TPv2 userspace +apps should be unaffected as the original pppol2tp sockets API is +retained. L2TPv3, however, uses netlink to manage L2TPv3 tunnels and +sessions. + +Design +====== + +The L2TP protocol separates control and data frames. The L2TP kernel +drivers handle only L2TP data frames; control frames are always +handled by userspace. L2TP control frames carry messages between L2TP +clients/servers and are used to setup / teardown tunnels and +sessions. An L2TP client or server is implemented in userspace. + +Each L2TP tunnel is implemented using a UDP or L2TPIP socket; L2TPIP +provides L2TPv3 IP encapsulation (no UDP) and is implemented using a +new l2tpip socket family. The tunnel socket is typically created by +userspace, though for unmanaged L2TPv3 tunnels, the socket can also be +created by the kernel. Each L2TP session (pseudowire) gets a network +interface instance. In the case of PPP, these interfaces are created +indirectly by pppd using a pppol2tp socket. In the case of ethernet, +the netdevice is created upon a netlink request to create an L2TPv3 +ethernet pseudowire. + +For PPP, the PPPoL2TP driver, net/l2tp/l2tp_ppp.c, provides a +mechanism by which PPP frames carried through an L2TP session are +passed through the kernel's PPP subsystem. The standard PPP daemon, +pppd, handles all PPP interaction with the peer. PPP network +interfaces are created for each local PPP endpoint. The kernel's PPP +subsystem arranges for PPP control frames to be delivered to pppd, +while data frames are forwarded as usual. + +For ethernet, the L2TPETH driver, net/l2tp/l2tp_eth.c, implements a +netdevice driver, managing virtual ethernet devices, one per +pseudowire. These interfaces can be managed using standard Linux tools +such as "ip" and "ifconfig". If only IP frames are passed over the +tunnel, the interface can be given an IP addresses of itself and its +peer. If non-IP frames are to be passed over the tunnel, the interface +can be added to a bridge using brctl. All L2TP datapath protocol +functions are handled by the L2TP core driver. + +Each tunnel and session within a tunnel is assigned a unique tunnel_id +and session_id. These ids are carried in the L2TP header of every +control and data packet. (Actually, in L2TPv3, the tunnel_id isn't +present in data frames - it is inferred from the IP connection on +which the packet was received.) The L2TP driver uses the ids to lookup +internal tunnel and/or session contexts to determine how to handle the +packet. Zero tunnel / session ids are treated specially - zero ids are +never assigned to tunnels or sessions in the network. In the driver, +the tunnel context keeps a reference to the tunnel UDP or L2TPIP +socket. The session context holds data that lets the driver interface +to the kernel's network frame type subsystems, i.e. PPP, ethernet. + +Userspace Programming +===================== + +For L2TPv2, there are a number of requirements on the userspace L2TP +daemon in order to use the pppol2tp driver. + +1. Use a UDP socket per tunnel. + +2. Create a single PPPoL2TP socket per tunnel bound to a special null + session id. This is used only for communicating with the driver but + must remain open while the tunnel is active. Opening this tunnel + management socket causes the driver to mark the tunnel socket as an + L2TP UDP encapsulation socket and flags it for use by the + referenced tunnel id. This hooks up the UDP receive path via + udp_encap_rcv() in net/ipv4/udp.c. PPP data frames are never passed + in this special PPPoX socket. + +3. Create a PPPoL2TP socket per L2TP session. This is typically done + by starting pppd with the pppol2tp plugin and appropriate + arguments. A PPPoL2TP tunnel management socket (Step 2) must be + created before the first PPPoL2TP session socket is created. + +When creating PPPoL2TP sockets, the application provides information +to the driver about the socket in a socket connect() call. Source and +destination tunnel and session ids are provided, as well as the file +descriptor of a UDP socket. See struct pppol2tp_addr in +include/linux/if_pppol2tp.h. Note that zero tunnel / session ids are +treated specially. When creating the per-tunnel PPPoL2TP management +socket in Step 2 above, zero source and destination session ids are +specified, which tells the driver to prepare the supplied UDP file +descriptor for use as an L2TP tunnel socket. + +Userspace may control behavior of the tunnel or session using +setsockopt and ioctl on the PPPoX socket. The following socket +options are supported:- + +========= =========================================================== +DEBUG bitmask of debug message categories. See below. +SENDSEQ - 0 => don't send packets with sequence numbers + - 1 => send packets with sequence numbers +RECVSEQ - 0 => receive packet sequence numbers are optional + - 1 => drop receive packets without sequence numbers +LNSMODE - 0 => act as LAC. + - 1 => act as LNS. +REORDERTO reorder timeout (in millisecs). If 0, don't try to reorder. +========= =========================================================== + +Only the DEBUG option is supported by the special tunnel management +PPPoX socket. + +In addition to the standard PPP ioctls, a PPPIOCGL2TPSTATS is provided +to retrieve tunnel and session statistics from the kernel using the +PPPoX socket of the appropriate tunnel or session. + +For L2TPv3, userspace must use the netlink API defined in +include/linux/l2tp.h to manage tunnel and session contexts. The +general procedure to create a new L2TP tunnel with one session is:- + +1. Open a GENL socket using L2TP_GENL_NAME for configuring the kernel + using netlink. + +2. Create a UDP or L2TPIP socket for the tunnel. + +3. Create a new L2TP tunnel using a L2TP_CMD_TUNNEL_CREATE + request. Set attributes according to desired tunnel parameters, + referencing the UDP or L2TPIP socket created in the previous step. + +4. Create a new L2TP session in the tunnel using a + L2TP_CMD_SESSION_CREATE request. + +The tunnel and all of its sessions are closed when the tunnel socket +is closed. The netlink API may also be used to delete sessions and +tunnels. Configuration and status info may be set or read using netlink. + +The L2TP driver also supports static (unmanaged) L2TPv3 tunnels. These +are where there is no L2TP control message exchange with the peer to +setup the tunnel; the tunnel is configured manually at each end of the +tunnel. There is no need for an L2TP userspace application in this +case -- the tunnel socket is created by the kernel and configured +using parameters sent in the L2TP_CMD_TUNNEL_CREATE netlink +request. The "ip" utility of iproute2 has commands for managing static +L2TPv3 tunnels; do "ip l2tp help" for more information. + +Debugging +========= + +The driver supports a flexible debug scheme where kernel trace +messages may be optionally enabled per tunnel and per session. Care is +needed when debugging a live system since the messages are not +rate-limited and a busy system could be swamped. Userspace uses +setsockopt on the PPPoX socket to set a debug mask. + +The following debug mask bits are available: + +================ ============================== +L2TP_MSG_DEBUG verbose debug (if compiled in) +L2TP_MSG_CONTROL userspace - kernel interface +L2TP_MSG_SEQ sequence numbers handling +L2TP_MSG_DATA data packets +================ ============================== + +If enabled, files under a l2tp debugfs directory can be used to dump +kernel state about L2TP tunnels and sessions. To access it, the +debugfs filesystem must first be mounted:: + + # mount -t debugfs debugfs /debug + +Files under the l2tp directory can then be accessed:: + + # cat /debug/l2tp/tunnels + +The debugfs files should not be used by applications to obtain L2TP +state information because the file format is subject to change. It is +implemented to provide extra debug information to help diagnose +problems.) Users should use the netlink API. + +/proc/net/pppol2tp is also provided for backwards compatibility with +the original pppol2tp driver. It lists information about L2TPv2 +tunnels and sessions only. Its use is discouraged. + +Unmanaged L2TPv3 Tunnels +======================== + +Some commercial L2TP products support unmanaged L2TPv3 ethernet +tunnels, where there is no L2TP control protocol; tunnels are +configured at each side manually. New commands are available in +iproute2's ip utility to support this. + +To create an L2TPv3 ethernet pseudowire between local host 192.168.1.1 +and peer 192.168.1.2, using IP addresses 10.5.1.1 and 10.5.1.2 for the +tunnel endpoints:: + + # ip l2tp add tunnel tunnel_id 1 peer_tunnel_id 1 udp_sport 5000 \ + udp_dport 5000 encap udp local 192.168.1.1 remote 192.168.1.2 + # ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1 + # ip -s -d show dev l2tpeth0 + # ip addr add 10.5.1.2/32 peer 10.5.1.1/32 dev l2tpeth0 + # ip li set dev l2tpeth0 up + +Choose IP addresses to be the address of a local IP interface and that +of the remote system. The IP addresses of the l2tpeth0 interface can be +anything suitable. + +Repeat the above at the peer, with ports, tunnel/session ids and IP +addresses reversed. The tunnel and session IDs can be any non-zero +32-bit number, but the values must be reversed at the peer. + +======================== =================== +Host 1 Host2 +======================== =================== +udp_sport=5000 udp_sport=5001 +udp_dport=5001 udp_dport=5000 +tunnel_id=42 tunnel_id=45 +peer_tunnel_id=45 peer_tunnel_id=42 +session_id=128 session_id=5196755 +peer_session_id=5196755 peer_session_id=128 +======================== =================== + +When done at both ends of the tunnel, it should be possible to send +data over the network. e.g.:: + + # ping 10.5.1.1 + + +Sample Userspace Code +===================== + +1. Create tunnel management PPPoX socket:: + + kernel_fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP); + if (kernel_fd >= 0) { + struct sockaddr_pppol2tp sax; + struct sockaddr_in const *peer_addr; + + peer_addr = l2tp_tunnel_get_peer_addr(tunnel); + memset(&sax, 0, sizeof(sax)); + sax.sa_family = AF_PPPOX; + sax.sa_protocol = PX_PROTO_OL2TP; + sax.pppol2tp.fd = udp_fd; /* fd of tunnel UDP socket */ + sax.pppol2tp.addr.sin_addr.s_addr = peer_addr->sin_addr.s_addr; + sax.pppol2tp.addr.sin_port = peer_addr->sin_port; + sax.pppol2tp.addr.sin_family = AF_INET; + sax.pppol2tp.s_tunnel = tunnel_id; + sax.pppol2tp.s_session = 0; /* special case: mgmt socket */ + sax.pppol2tp.d_tunnel = 0; + sax.pppol2tp.d_session = 0; /* special case: mgmt socket */ + + if(connect(kernel_fd, (struct sockaddr *)&sax, sizeof(sax) ) < 0 ) { + perror("connect failed"); + result = -errno; + goto err; + } + } + +2. Create session PPPoX data socket:: + + struct sockaddr_pppol2tp sax; + int fd; + + /* Note, the target socket must be bound already, else it will not be ready */ + sax.sa_family = AF_PPPOX; + sax.sa_protocol = PX_PROTO_OL2TP; + sax.pppol2tp.fd = tunnel_fd; + sax.pppol2tp.addr.sin_addr.s_addr = addr->sin_addr.s_addr; + sax.pppol2tp.addr.sin_port = addr->sin_port; + sax.pppol2tp.addr.sin_family = AF_INET; + sax.pppol2tp.s_tunnel = tunnel_id; + sax.pppol2tp.s_session = session_id; + sax.pppol2tp.d_tunnel = peer_tunnel_id; + sax.pppol2tp.d_session = peer_session_id; + + /* session_fd is the fd of the session's PPPoL2TP socket. + * tunnel_fd is the fd of the tunnel UDP socket. + */ + fd = connect(session_fd, (struct sockaddr *)&sax, sizeof(sax)); + if (fd < 0 ) { + return -errno; + } + return 0; + +Internal Implementation +======================= + +The driver keeps a struct l2tp_tunnel context per L2TP tunnel and a +struct l2tp_session context for each session. The l2tp_tunnel is +always associated with a UDP or L2TP/IP socket and keeps a list of +sessions in the tunnel. The l2tp_session context keeps kernel state +about the session. It has private data which is used for data specific +to the session type. With L2TPv2, the session always carried PPP +traffic. With L2TPv3, the session can also carry ethernet frames +(ethernet pseudowire) or other data types such as ATM, HDLC or Frame +Relay. + +When a tunnel is first opened, the reference count on the socket is +increased using sock_hold(). This ensures that the kernel socket +cannot be removed while L2TP's data structures reference it. + +Some L2TP sessions also have a socket (PPP pseudowires) while others +do not (ethernet pseudowires). We can't use the socket reference count +as the reference count for session contexts. The L2TP implementation +therefore has its own internal reference counts on the session +contexts. + +To Do +===== + +Add L2TP tunnel switching support. This would route tunneled traffic +from one L2TP tunnel into another. Specified in +http://tools.ietf.org/html/draft-ietf-l2tpext-tunnel-switching-08 + +Add L2TPv3 VLAN pseudowire support. + +Add L2TPv3 IP pseudowire support. + +Add L2TPv3 ATM pseudowire support. + +Miscellaneous +============= + +The L2TP drivers were developed as part of the OpenL2TP project by +Katalix Systems Ltd. OpenL2TP is a full-featured L2TP client / server, +designed from the ground up to have the L2TP datapath in the +kernel. The project also implemented the pppol2tp plugin for pppd +which allows pppd to use the kernel driver. Details can be found at +http://www.openl2tp.org. diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt deleted file mode 100644 index 9bc271cdc9a8..000000000000 --- a/Documentation/networking/l2tp.txt +++ /dev/null @@ -1,345 +0,0 @@ -This document describes how to use the kernel's L2TP drivers to -provide L2TP functionality. L2TP is a protocol that tunnels one or -more sessions over an IP tunnel. It is commonly used for VPNs -(L2TP/IPSec) and by ISPs to tunnel subscriber PPP sessions over an IP -network infrastructure. With L2TPv3, it is also useful as a Layer-2 -tunneling infrastructure. - -Features -======== - -L2TPv2 (PPP over L2TP (UDP tunnels)). -L2TPv3 ethernet pseudowires. -L2TPv3 PPP pseudowires. -L2TPv3 IP encapsulation. -Netlink sockets for L2TPv3 configuration management. - -History -======= - -The original pppol2tp driver was introduced in 2.6.23 and provided -L2TPv2 functionality (rfc2661). L2TPv2 is used to tunnel one or more PPP -sessions over a UDP tunnel. - -L2TPv3 (rfc3931) changes the protocol to allow different frame types -to be passed over an L2TP tunnel by moving the PPP-specific parts of -the protocol out of the core L2TP packet headers. Each frame type is -known as a pseudowire type. Ethernet, PPP, HDLC, Frame Relay and ATM -pseudowires for L2TP are defined in separate RFC standards. Another -change for L2TPv3 is that it can be carried directly over IP with no -UDP header (UDP is optional). It is also possible to create static -unmanaged L2TPv3 tunnels manually without a control protocol -(userspace daemon) to manage them. - -To support L2TPv3, the original pppol2tp driver was split up to -separate the L2TP and PPP functionality. Existing L2TPv2 userspace -apps should be unaffected as the original pppol2tp sockets API is -retained. L2TPv3, however, uses netlink to manage L2TPv3 tunnels and -sessions. - -Design -====== - -The L2TP protocol separates control and data frames. The L2TP kernel -drivers handle only L2TP data frames; control frames are always -handled by userspace. L2TP control frames carry messages between L2TP -clients/servers and are used to setup / teardown tunnels and -sessions. An L2TP client or server is implemented in userspace. - -Each L2TP tunnel is implemented using a UDP or L2TPIP socket; L2TPIP -provides L2TPv3 IP encapsulation (no UDP) and is implemented using a -new l2tpip socket family. The tunnel socket is typically created by -userspace, though for unmanaged L2TPv3 tunnels, the socket can also be -created by the kernel. Each L2TP session (pseudowire) gets a network -interface instance. In the case of PPP, these interfaces are created -indirectly by pppd using a pppol2tp socket. In the case of ethernet, -the netdevice is created upon a netlink request to create an L2TPv3 -ethernet pseudowire. - -For PPP, the PPPoL2TP driver, net/l2tp/l2tp_ppp.c, provides a -mechanism by which PPP frames carried through an L2TP session are -passed through the kernel's PPP subsystem. The standard PPP daemon, -pppd, handles all PPP interaction with the peer. PPP network -interfaces are created for each local PPP endpoint. The kernel's PPP -subsystem arranges for PPP control frames to be delivered to pppd, -while data frames are forwarded as usual. - -For ethernet, the L2TPETH driver, net/l2tp/l2tp_eth.c, implements a -netdevice driver, managing virtual ethernet devices, one per -pseudowire. These interfaces can be managed using standard Linux tools -such as "ip" and "ifconfig". If only IP frames are passed over the -tunnel, the interface can be given an IP addresses of itself and its -peer. If non-IP frames are to be passed over the tunnel, the interface -can be added to a bridge using brctl. All L2TP datapath protocol -functions are handled by the L2TP core driver. - -Each tunnel and session within a tunnel is assigned a unique tunnel_id -and session_id. These ids are carried in the L2TP header of every -control and data packet. (Actually, in L2TPv3, the tunnel_id isn't -present in data frames - it is inferred from the IP connection on -which the packet was received.) The L2TP driver uses the ids to lookup -internal tunnel and/or session contexts to determine how to handle the -packet. Zero tunnel / session ids are treated specially - zero ids are -never assigned to tunnels or sessions in the network. In the driver, -the tunnel context keeps a reference to the tunnel UDP or L2TPIP -socket. The session context holds data that lets the driver interface -to the kernel's network frame type subsystems, i.e. PPP, ethernet. - -Userspace Programming -===================== - -For L2TPv2, there are a number of requirements on the userspace L2TP -daemon in order to use the pppol2tp driver. - -1. Use a UDP socket per tunnel. - -2. Create a single PPPoL2TP socket per tunnel bound to a special null - session id. This is used only for communicating with the driver but - must remain open while the tunnel is active. Opening this tunnel - management socket causes the driver to mark the tunnel socket as an - L2TP UDP encapsulation socket and flags it for use by the - referenced tunnel id. This hooks up the UDP receive path via - udp_encap_rcv() in net/ipv4/udp.c. PPP data frames are never passed - in this special PPPoX socket. - -3. Create a PPPoL2TP socket per L2TP session. This is typically done - by starting pppd with the pppol2tp plugin and appropriate - arguments. A PPPoL2TP tunnel management socket (Step 2) must be - created before the first PPPoL2TP session socket is created. - -When creating PPPoL2TP sockets, the application provides information -to the driver about the socket in a socket connect() call. Source and -destination tunnel and session ids are provided, as well as the file -descriptor of a UDP socket. See struct pppol2tp_addr in -include/linux/if_pppol2tp.h. Note that zero tunnel / session ids are -treated specially. When creating the per-tunnel PPPoL2TP management -socket in Step 2 above, zero source and destination session ids are -specified, which tells the driver to prepare the supplied UDP file -descriptor for use as an L2TP tunnel socket. - -Userspace may control behavior of the tunnel or session using -setsockopt and ioctl on the PPPoX socket. The following socket -options are supported:- - -DEBUG - bitmask of debug message categories. See below. -SENDSEQ - 0 => don't send packets with sequence numbers - 1 => send packets with sequence numbers -RECVSEQ - 0 => receive packet sequence numbers are optional - 1 => drop receive packets without sequence numbers -LNSMODE - 0 => act as LAC. - 1 => act as LNS. -REORDERTO - reorder timeout (in millisecs). If 0, don't try to reorder. - -Only the DEBUG option is supported by the special tunnel management -PPPoX socket. - -In addition to the standard PPP ioctls, a PPPIOCGL2TPSTATS is provided -to retrieve tunnel and session statistics from the kernel using the -PPPoX socket of the appropriate tunnel or session. - -For L2TPv3, userspace must use the netlink API defined in -include/linux/l2tp.h to manage tunnel and session contexts. The -general procedure to create a new L2TP tunnel with one session is:- - -1. Open a GENL socket using L2TP_GENL_NAME for configuring the kernel - using netlink. - -2. Create a UDP or L2TPIP socket for the tunnel. - -3. Create a new L2TP tunnel using a L2TP_CMD_TUNNEL_CREATE - request. Set attributes according to desired tunnel parameters, - referencing the UDP or L2TPIP socket created in the previous step. - -4. Create a new L2TP session in the tunnel using a - L2TP_CMD_SESSION_CREATE request. - -The tunnel and all of its sessions are closed when the tunnel socket -is closed. The netlink API may also be used to delete sessions and -tunnels. Configuration and status info may be set or read using netlink. - -The L2TP driver also supports static (unmanaged) L2TPv3 tunnels. These -are where there is no L2TP control message exchange with the peer to -setup the tunnel; the tunnel is configured manually at each end of the -tunnel. There is no need for an L2TP userspace application in this -case -- the tunnel socket is created by the kernel and configured -using parameters sent in the L2TP_CMD_TUNNEL_CREATE netlink -request. The "ip" utility of iproute2 has commands for managing static -L2TPv3 tunnels; do "ip l2tp help" for more information. - -Debugging -========= - -The driver supports a flexible debug scheme where kernel trace -messages may be optionally enabled per tunnel and per session. Care is -needed when debugging a live system since the messages are not -rate-limited and a busy system could be swamped. Userspace uses -setsockopt on the PPPoX socket to set a debug mask. - -The following debug mask bits are available: - -L2TP_MSG_DEBUG verbose debug (if compiled in) -L2TP_MSG_CONTROL userspace - kernel interface -L2TP_MSG_SEQ sequence numbers handling -L2TP_MSG_DATA data packets - -If enabled, files under a l2tp debugfs directory can be used to dump -kernel state about L2TP tunnels and sessions. To access it, the -debugfs filesystem must first be mounted. - -# mount -t debugfs debugfs /debug - -Files under the l2tp directory can then be accessed. - -# cat /debug/l2tp/tunnels - -The debugfs files should not be used by applications to obtain L2TP -state information because the file format is subject to change. It is -implemented to provide extra debug information to help diagnose -problems.) Users should use the netlink API. - -/proc/net/pppol2tp is also provided for backwards compatibility with -the original pppol2tp driver. It lists information about L2TPv2 -tunnels and sessions only. Its use is discouraged. - -Unmanaged L2TPv3 Tunnels -======================== - -Some commercial L2TP products support unmanaged L2TPv3 ethernet -tunnels, where there is no L2TP control protocol; tunnels are -configured at each side manually. New commands are available in -iproute2's ip utility to support this. - -To create an L2TPv3 ethernet pseudowire between local host 192.168.1.1 -and peer 192.168.1.2, using IP addresses 10.5.1.1 and 10.5.1.2 for the -tunnel endpoints:- - -# ip l2tp add tunnel tunnel_id 1 peer_tunnel_id 1 udp_sport 5000 \ - udp_dport 5000 encap udp local 192.168.1.1 remote 192.168.1.2 -# ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1 -# ip -s -d show dev l2tpeth0 -# ip addr add 10.5.1.2/32 peer 10.5.1.1/32 dev l2tpeth0 -# ip li set dev l2tpeth0 up - -Choose IP addresses to be the address of a local IP interface and that -of the remote system. The IP addresses of the l2tpeth0 interface can be -anything suitable. - -Repeat the above at the peer, with ports, tunnel/session ids and IP -addresses reversed. The tunnel and session IDs can be any non-zero -32-bit number, but the values must be reversed at the peer. - -Host 1 Host2 -udp_sport=5000 udp_sport=5001 -udp_dport=5001 udp_dport=5000 -tunnel_id=42 tunnel_id=45 -peer_tunnel_id=45 peer_tunnel_id=42 -session_id=128 session_id=5196755 -peer_session_id=5196755 peer_session_id=128 - -When done at both ends of the tunnel, it should be possible to send -data over the network. e.g. - -# ping 10.5.1.1 - - -Sample Userspace Code -===================== - -1. Create tunnel management PPPoX socket - - kernel_fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP); - if (kernel_fd >= 0) { - struct sockaddr_pppol2tp sax; - struct sockaddr_in const *peer_addr; - - peer_addr = l2tp_tunnel_get_peer_addr(tunnel); - memset(&sax, 0, sizeof(sax)); - sax.sa_family = AF_PPPOX; - sax.sa_protocol = PX_PROTO_OL2TP; - sax.pppol2tp.fd = udp_fd; /* fd of tunnel UDP socket */ - sax.pppol2tp.addr.sin_addr.s_addr = peer_addr->sin_addr.s_addr; - sax.pppol2tp.addr.sin_port = peer_addr->sin_port; - sax.pppol2tp.addr.sin_family = AF_INET; - sax.pppol2tp.s_tunnel = tunnel_id; - sax.pppol2tp.s_session = 0; /* special case: mgmt socket */ - sax.pppol2tp.d_tunnel = 0; - sax.pppol2tp.d_session = 0; /* special case: mgmt socket */ - - if(connect(kernel_fd, (struct sockaddr *)&sax, sizeof(sax) ) < 0 ) { - perror("connect failed"); - result = -errno; - goto err; - } - } - -2. Create session PPPoX data socket - - struct sockaddr_pppol2tp sax; - int fd; - - /* Note, the target socket must be bound already, else it will not be ready */ - sax.sa_family = AF_PPPOX; - sax.sa_protocol = PX_PROTO_OL2TP; - sax.pppol2tp.fd = tunnel_fd; - sax.pppol2tp.addr.sin_addr.s_addr = addr->sin_addr.s_addr; - sax.pppol2tp.addr.sin_port = addr->sin_port; - sax.pppol2tp.addr.sin_family = AF_INET; - sax.pppol2tp.s_tunnel = tunnel_id; - sax.pppol2tp.s_session = session_id; - sax.pppol2tp.d_tunnel = peer_tunnel_id; - sax.pppol2tp.d_session = peer_session_id; - - /* session_fd is the fd of the session's PPPoL2TP socket. - * tunnel_fd is the fd of the tunnel UDP socket. - */ - fd = connect(session_fd, (struct sockaddr *)&sax, sizeof(sax)); - if (fd < 0 ) { - return -errno; - } - return 0; - -Internal Implementation -======================= - -The driver keeps a struct l2tp_tunnel context per L2TP tunnel and a -struct l2tp_session context for each session. The l2tp_tunnel is -always associated with a UDP or L2TP/IP socket and keeps a list of -sessions in the tunnel. The l2tp_session context keeps kernel state -about the session. It has private data which is used for data specific -to the session type. With L2TPv2, the session always carried PPP -traffic. With L2TPv3, the session can also carry ethernet frames -(ethernet pseudowire) or other data types such as ATM, HDLC or Frame -Relay. - -When a tunnel is first opened, the reference count on the socket is -increased using sock_hold(). This ensures that the kernel socket -cannot be removed while L2TP's data structures reference it. - -Some L2TP sessions also have a socket (PPP pseudowires) while others -do not (ethernet pseudowires). We can't use the socket reference count -as the reference count for session contexts. The L2TP implementation -therefore has its own internal reference counts on the session -contexts. - -To Do -===== - -Add L2TP tunnel switching support. This would route tunneled traffic -from one L2TP tunnel into another. Specified in -http://tools.ietf.org/html/draft-ietf-l2tpext-tunnel-switching-08 - -Add L2TPv3 VLAN pseudowire support. - -Add L2TPv3 IP pseudowire support. - -Add L2TPv3 ATM pseudowire support. - -Miscellaneous -============= - -The L2TP drivers were developed as part of the OpenL2TP project by -Katalix Systems Ltd. OpenL2TP is a full-featured L2TP client / server, -designed from the ground up to have the L2TP datapath in the -kernel. The project also implemented the pppol2tp plugin for pppd -which allows pppd to use the kernel driver. Details can be found at -http://www.openl2tp.org. -- cgit v1.2.3-59-g8ed1b From 40e79150c1686263e6a031d7702aec63aff31332 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:03:57 +0200 Subject: docs: networking: convert lapb-module.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/lapb-module.rst | 305 +++++++++++++++++++++++++++++++ Documentation/networking/lapb-module.txt | 263 -------------------------- MAINTAINERS | 2 +- net/lapb/Kconfig | 2 +- 5 files changed, 308 insertions(+), 265 deletions(-) create mode 100644 Documentation/networking/lapb-module.rst delete mode 100644 Documentation/networking/lapb-module.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 0c5d7a037983..acd2567cf0d4 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -75,6 +75,7 @@ Contents: ipvs-sysctl kcm l2tp + lapb-module .. only:: subproject and html diff --git a/Documentation/networking/lapb-module.rst b/Documentation/networking/lapb-module.rst new file mode 100644 index 000000000000..ff586bc9f005 --- /dev/null +++ b/Documentation/networking/lapb-module.rst @@ -0,0 +1,305 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================== +The Linux LAPB Module Interface +=============================== + +Version 1.3 + +Jonathan Naylor 29.12.96 + +Changed (Henner Eisen, 2000-10-29): int return value for data_indication() + +The LAPB module will be a separately compiled module for use by any parts of +the Linux operating system that require a LAPB service. This document +defines the interfaces to, and the services provided by this module. The +term module in this context does not imply that the LAPB module is a +separately loadable module, although it may be. The term module is used in +its more standard meaning. + +The interface to the LAPB module consists of functions to the module, +callbacks from the module to indicate important state changes, and +structures for getting and setting information about the module. + +Structures +---------- + +Probably the most important structure is the skbuff structure for holding +received and transmitted data, however it is beyond the scope of this +document. + +The two LAPB specific structures are the LAPB initialisation structure and +the LAPB parameter structure. These will be defined in a standard header +file, . The header file is internal to the LAPB +module and is not for use. + +LAPB Initialisation Structure +----------------------------- + +This structure is used only once, in the call to lapb_register (see below). +It contains information about the device driver that requires the services +of the LAPB module:: + + struct lapb_register_struct { + void (*connect_confirmation)(int token, int reason); + void (*connect_indication)(int token, int reason); + void (*disconnect_confirmation)(int token, int reason); + void (*disconnect_indication)(int token, int reason); + int (*data_indication)(int token, struct sk_buff *skb); + void (*data_transmit)(int token, struct sk_buff *skb); + }; + +Each member of this structure corresponds to a function in the device driver +that is called when a particular event in the LAPB module occurs. These will +be described in detail below. If a callback is not required (!!) then a NULL +may be substituted. + + +LAPB Parameter Structure +------------------------ + +This structure is used with the lapb_getparms and lapb_setparms functions +(see below). They are used to allow the device driver to get and set the +operational parameters of the LAPB implementation for a given connection:: + + struct lapb_parms_struct { + unsigned int t1; + unsigned int t1timer; + unsigned int t2; + unsigned int t2timer; + unsigned int n2; + unsigned int n2count; + unsigned int window; + unsigned int state; + unsigned int mode; + }; + +T1 and T2 are protocol timing parameters and are given in units of 100ms. N2 +is the maximum number of tries on the link before it is declared a failure. +The window size is the maximum number of outstanding data packets allowed to +be unacknowledged by the remote end, the value of the window is between 1 +and 7 for a standard LAPB link, and between 1 and 127 for an extended LAPB +link. + +The mode variable is a bit field used for setting (at present) three values. +The bit fields have the following meanings: + +====== ================================================= +Bit Meaning +====== ================================================= +0 LAPB operation (0=LAPB_STANDARD 1=LAPB_EXTENDED). +1 [SM]LP operation (0=LAPB_SLP 1=LAPB=MLP). +2 DTE/DCE operation (0=LAPB_DTE 1=LAPB_DCE) +3-31 Reserved, must be 0. +====== ================================================= + +Extended LAPB operation indicates the use of extended sequence numbers and +consequently larger window sizes, the default is standard LAPB operation. +MLP operation is the same as SLP operation except that the addresses used by +LAPB are different to indicate the mode of operation, the default is Single +Link Procedure. The difference between DCE and DTE operation is (i) the +addresses used for commands and responses, and (ii) when the DCE is not +connected, it sends DM without polls set, every T1. The upper case constant +names will be defined in the public LAPB header file. + + +Functions +--------- + +The LAPB module provides a number of function entry points. + +:: + + int lapb_register(void *token, struct lapb_register_struct); + +This must be called before the LAPB module may be used. If the call is +successful then LAPB_OK is returned. The token must be a unique identifier +generated by the device driver to allow for the unique identification of the +instance of the LAPB link. It is returned by the LAPB module in all of the +callbacks, and is used by the device driver in all calls to the LAPB module. +For multiple LAPB links in a single device driver, multiple calls to +lapb_register must be made. The format of the lapb_register_struct is given +above. The return values are: + +============= ============================= +LAPB_OK LAPB registered successfully. +LAPB_BADTOKEN Token is already registered. +LAPB_NOMEM Out of memory +============= ============================= + +:: + + int lapb_unregister(void *token); + +This releases all the resources associated with a LAPB link. Any current +LAPB link will be abandoned without further messages being passed. After +this call, the value of token is no longer valid for any calls to the LAPB +function. The valid return values are: + +============= =============================== +LAPB_OK LAPB unregistered successfully. +LAPB_BADTOKEN Invalid/unknown LAPB token. +============= =============================== + +:: + + int lapb_getparms(void *token, struct lapb_parms_struct *parms); + +This allows the device driver to get the values of the current LAPB +variables, the lapb_parms_struct is described above. The valid return values +are: + +============= ============================= +LAPB_OK LAPB getparms was successful. +LAPB_BADTOKEN Invalid/unknown LAPB token. +============= ============================= + +:: + + int lapb_setparms(void *token, struct lapb_parms_struct *parms); + +This allows the device driver to set the values of the current LAPB +variables, the lapb_parms_struct is described above. The values of t1timer, +t2timer and n2count are ignored, likewise changing the mode bits when +connected will be ignored. An error implies that none of the values have +been changed. The valid return values are: + +============= ================================================= +LAPB_OK LAPB getparms was successful. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_INVALUE One of the values was out of its allowable range. +============= ================================================= + +:: + + int lapb_connect_request(void *token); + +Initiate a connect using the current parameter settings. The valid return +values are: + +============== ================================= +LAPB_OK LAPB is starting to connect. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_CONNECTED LAPB module is already connected. +============== ================================= + +:: + + int lapb_disconnect_request(void *token); + +Initiate a disconnect. The valid return values are: + +================= =============================== +LAPB_OK LAPB is starting to disconnect. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_NOTCONNECTED LAPB module is not connected. +================= =============================== + +:: + + int lapb_data_request(void *token, struct sk_buff *skb); + +Queue data with the LAPB module for transmitting over the link. If the call +is successful then the skbuff is owned by the LAPB module and may not be +used by the device driver again. The valid return values are: + +================= ============================= +LAPB_OK LAPB has accepted the data. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_NOTCONNECTED LAPB module is not connected. +================= ============================= + +:: + + int lapb_data_received(void *token, struct sk_buff *skb); + +Queue data with the LAPB module which has been received from the device. It +is expected that the data passed to the LAPB module has skb->data pointing +to the beginning of the LAPB data. If the call is successful then the skbuff +is owned by the LAPB module and may not be used by the device driver again. +The valid return values are: + +============= =========================== +LAPB_OK LAPB has accepted the data. +LAPB_BADTOKEN Invalid/unknown LAPB token. +============= =========================== + +Callbacks +--------- + +These callbacks are functions provided by the device driver for the LAPB +module to call when an event occurs. They are registered with the LAPB +module with lapb_register (see above) in the structure lapb_register_struct +(see above). + +:: + + void (*connect_confirmation)(void *token, int reason); + +This is called by the LAPB module when a connection is established after +being requested by a call to lapb_connect_request (see above). The reason is +always LAPB_OK. + +:: + + void (*connect_indication)(void *token, int reason); + +This is called by the LAPB module when the link is established by the remote +system. The value of reason is always LAPB_OK. + +:: + + void (*disconnect_confirmation)(void *token, int reason); + +This is called by the LAPB module when an event occurs after the device +driver has called lapb_disconnect_request (see above). The reason indicates +what has happened. In all cases the LAPB link can be regarded as being +terminated. The values for reason are: + +================= ==================================================== +LAPB_OK The LAPB link was terminated normally. +LAPB_NOTCONNECTED The remote system was not connected. +LAPB_TIMEDOUT No response was received in N2 tries from the remote + system. +================= ==================================================== + +:: + + void (*disconnect_indication)(void *token, int reason); + +This is called by the LAPB module when the link is terminated by the remote +system or another event has occurred to terminate the link. This may be +returned in response to a lapb_connect_request (see above) if the remote +system refused the request. The values for reason are: + +================= ==================================================== +LAPB_OK The LAPB link was terminated normally by the remote + system. +LAPB_REFUSED The remote system refused the connect request. +LAPB_NOTCONNECTED The remote system was not connected. +LAPB_TIMEDOUT No response was received in N2 tries from the remote + system. +================= ==================================================== + +:: + + int (*data_indication)(void *token, struct sk_buff *skb); + +This is called by the LAPB module when data has been received from the +remote system that should be passed onto the next layer in the protocol +stack. The skbuff becomes the property of the device driver and the LAPB +module will not perform any more actions on it. The skb->data pointer will +be pointing to the first byte of data after the LAPB header. + +This method should return NET_RX_DROP (as defined in the header +file include/linux/netdevice.h) if and only if the frame was dropped +before it could be delivered to the upper layer. + +:: + + void (*data_transmit)(void *token, struct sk_buff *skb); + +This is called by the LAPB module when data is to be transmitted to the +remote system by the device driver. The skbuff becomes the property of the +device driver and the LAPB module will not perform any more actions on it. +The skb->data pointer will be pointing to the first byte of the LAPB header. diff --git a/Documentation/networking/lapb-module.txt b/Documentation/networking/lapb-module.txt deleted file mode 100644 index d4fc8f221559..000000000000 --- a/Documentation/networking/lapb-module.txt +++ /dev/null @@ -1,263 +0,0 @@ - The Linux LAPB Module Interface 1.3 - - Jonathan Naylor 29.12.96 - -Changed (Henner Eisen, 2000-10-29): int return value for data_indication() - -The LAPB module will be a separately compiled module for use by any parts of -the Linux operating system that require a LAPB service. This document -defines the interfaces to, and the services provided by this module. The -term module in this context does not imply that the LAPB module is a -separately loadable module, although it may be. The term module is used in -its more standard meaning. - -The interface to the LAPB module consists of functions to the module, -callbacks from the module to indicate important state changes, and -structures for getting and setting information about the module. - -Structures ----------- - -Probably the most important structure is the skbuff structure for holding -received and transmitted data, however it is beyond the scope of this -document. - -The two LAPB specific structures are the LAPB initialisation structure and -the LAPB parameter structure. These will be defined in a standard header -file, . The header file is internal to the LAPB -module and is not for use. - -LAPB Initialisation Structure ------------------------------ - -This structure is used only once, in the call to lapb_register (see below). -It contains information about the device driver that requires the services -of the LAPB module. - -struct lapb_register_struct { - void (*connect_confirmation)(int token, int reason); - void (*connect_indication)(int token, int reason); - void (*disconnect_confirmation)(int token, int reason); - void (*disconnect_indication)(int token, int reason); - int (*data_indication)(int token, struct sk_buff *skb); - void (*data_transmit)(int token, struct sk_buff *skb); -}; - -Each member of this structure corresponds to a function in the device driver -that is called when a particular event in the LAPB module occurs. These will -be described in detail below. If a callback is not required (!!) then a NULL -may be substituted. - - -LAPB Parameter Structure ------------------------- - -This structure is used with the lapb_getparms and lapb_setparms functions -(see below). They are used to allow the device driver to get and set the -operational parameters of the LAPB implementation for a given connection. - -struct lapb_parms_struct { - unsigned int t1; - unsigned int t1timer; - unsigned int t2; - unsigned int t2timer; - unsigned int n2; - unsigned int n2count; - unsigned int window; - unsigned int state; - unsigned int mode; -}; - -T1 and T2 are protocol timing parameters and are given in units of 100ms. N2 -is the maximum number of tries on the link before it is declared a failure. -The window size is the maximum number of outstanding data packets allowed to -be unacknowledged by the remote end, the value of the window is between 1 -and 7 for a standard LAPB link, and between 1 and 127 for an extended LAPB -link. - -The mode variable is a bit field used for setting (at present) three values. -The bit fields have the following meanings: - -Bit Meaning -0 LAPB operation (0=LAPB_STANDARD 1=LAPB_EXTENDED). -1 [SM]LP operation (0=LAPB_SLP 1=LAPB=MLP). -2 DTE/DCE operation (0=LAPB_DTE 1=LAPB_DCE) -3-31 Reserved, must be 0. - -Extended LAPB operation indicates the use of extended sequence numbers and -consequently larger window sizes, the default is standard LAPB operation. -MLP operation is the same as SLP operation except that the addresses used by -LAPB are different to indicate the mode of operation, the default is Single -Link Procedure. The difference between DCE and DTE operation is (i) the -addresses used for commands and responses, and (ii) when the DCE is not -connected, it sends DM without polls set, every T1. The upper case constant -names will be defined in the public LAPB header file. - - -Functions ---------- - -The LAPB module provides a number of function entry points. - - -int lapb_register(void *token, struct lapb_register_struct); - -This must be called before the LAPB module may be used. If the call is -successful then LAPB_OK is returned. The token must be a unique identifier -generated by the device driver to allow for the unique identification of the -instance of the LAPB link. It is returned by the LAPB module in all of the -callbacks, and is used by the device driver in all calls to the LAPB module. -For multiple LAPB links in a single device driver, multiple calls to -lapb_register must be made. The format of the lapb_register_struct is given -above. The return values are: - -LAPB_OK LAPB registered successfully. -LAPB_BADTOKEN Token is already registered. -LAPB_NOMEM Out of memory - - -int lapb_unregister(void *token); - -This releases all the resources associated with a LAPB link. Any current -LAPB link will be abandoned without further messages being passed. After -this call, the value of token is no longer valid for any calls to the LAPB -function. The valid return values are: - -LAPB_OK LAPB unregistered successfully. -LAPB_BADTOKEN Invalid/unknown LAPB token. - - -int lapb_getparms(void *token, struct lapb_parms_struct *parms); - -This allows the device driver to get the values of the current LAPB -variables, the lapb_parms_struct is described above. The valid return values -are: - -LAPB_OK LAPB getparms was successful. -LAPB_BADTOKEN Invalid/unknown LAPB token. - - -int lapb_setparms(void *token, struct lapb_parms_struct *parms); - -This allows the device driver to set the values of the current LAPB -variables, the lapb_parms_struct is described above. The values of t1timer, -t2timer and n2count are ignored, likewise changing the mode bits when -connected will be ignored. An error implies that none of the values have -been changed. The valid return values are: - -LAPB_OK LAPB getparms was successful. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_INVALUE One of the values was out of its allowable range. - - -int lapb_connect_request(void *token); - -Initiate a connect using the current parameter settings. The valid return -values are: - -LAPB_OK LAPB is starting to connect. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_CONNECTED LAPB module is already connected. - - -int lapb_disconnect_request(void *token); - -Initiate a disconnect. The valid return values are: - -LAPB_OK LAPB is starting to disconnect. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_NOTCONNECTED LAPB module is not connected. - - -int lapb_data_request(void *token, struct sk_buff *skb); - -Queue data with the LAPB module for transmitting over the link. If the call -is successful then the skbuff is owned by the LAPB module and may not be -used by the device driver again. The valid return values are: - -LAPB_OK LAPB has accepted the data. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_NOTCONNECTED LAPB module is not connected. - - -int lapb_data_received(void *token, struct sk_buff *skb); - -Queue data with the LAPB module which has been received from the device. It -is expected that the data passed to the LAPB module has skb->data pointing -to the beginning of the LAPB data. If the call is successful then the skbuff -is owned by the LAPB module and may not be used by the device driver again. -The valid return values are: - -LAPB_OK LAPB has accepted the data. -LAPB_BADTOKEN Invalid/unknown LAPB token. - - -Callbacks ---------- - -These callbacks are functions provided by the device driver for the LAPB -module to call when an event occurs. They are registered with the LAPB -module with lapb_register (see above) in the structure lapb_register_struct -(see above). - - -void (*connect_confirmation)(void *token, int reason); - -This is called by the LAPB module when a connection is established after -being requested by a call to lapb_connect_request (see above). The reason is -always LAPB_OK. - - -void (*connect_indication)(void *token, int reason); - -This is called by the LAPB module when the link is established by the remote -system. The value of reason is always LAPB_OK. - - -void (*disconnect_confirmation)(void *token, int reason); - -This is called by the LAPB module when an event occurs after the device -driver has called lapb_disconnect_request (see above). The reason indicates -what has happened. In all cases the LAPB link can be regarded as being -terminated. The values for reason are: - -LAPB_OK The LAPB link was terminated normally. -LAPB_NOTCONNECTED The remote system was not connected. -LAPB_TIMEDOUT No response was received in N2 tries from the remote - system. - - -void (*disconnect_indication)(void *token, int reason); - -This is called by the LAPB module when the link is terminated by the remote -system or another event has occurred to terminate the link. This may be -returned in response to a lapb_connect_request (see above) if the remote -system refused the request. The values for reason are: - -LAPB_OK The LAPB link was terminated normally by the remote - system. -LAPB_REFUSED The remote system refused the connect request. -LAPB_NOTCONNECTED The remote system was not connected. -LAPB_TIMEDOUT No response was received in N2 tries from the remote - system. - - -int (*data_indication)(void *token, struct sk_buff *skb); - -This is called by the LAPB module when data has been received from the -remote system that should be passed onto the next layer in the protocol -stack. The skbuff becomes the property of the device driver and the LAPB -module will not perform any more actions on it. The skb->data pointer will -be pointing to the first byte of data after the LAPB header. - -This method should return NET_RX_DROP (as defined in the header -file include/linux/netdevice.h) if and only if the frame was dropped -before it could be delivered to the upper layer. - - -void (*data_transmit)(void *token, struct sk_buff *skb); - -This is called by the LAPB module when data is to be transmitted to the -remote system by the device driver. The skbuff becomes the property of the -device driver and the LAPB module will not perform any more actions on it. -The skb->data pointer will be pointing to the first byte of the LAPB header. diff --git a/MAINTAINERS b/MAINTAINERS index 3a5f52a3c055..956999d2d979 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9515,7 +9515,7 @@ F: drivers/soc/lantiq LAPB module L: linux-x25@vger.kernel.org S: Orphan -F: Documentation/networking/lapb-module.txt +F: Documentation/networking/lapb-module.rst F: include/*/lapb.h F: net/lapb/ diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig index 6acfc999c952..5b50e8d64f26 100644 --- a/net/lapb/Kconfig +++ b/net/lapb/Kconfig @@ -15,7 +15,7 @@ config LAPB currently supports LAPB only over Ethernet connections. If you want to use LAPB connections over Ethernet, say Y here and to "LAPB over Ethernet driver" below. Read - for technical + for technical details. To compile this driver as a module, choose M here: the -- cgit v1.2.3-59-g8ed1b From a6b93e6555a6ecd0d08b0383ea4d93d09a168187 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:03:58 +0200 Subject: docs: networking: convert ltpc.txt to ReST - add SPDX header; - add a document title; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/ltpc.rst | 144 +++++++++++++++++++++++++++++++++++++ Documentation/networking/ltpc.txt | 131 --------------------------------- drivers/net/appletalk/Kconfig | 2 +- 4 files changed, 146 insertions(+), 132 deletions(-) create mode 100644 Documentation/networking/ltpc.rst delete mode 100644 Documentation/networking/ltpc.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index acd2567cf0d4..b3608b177a8b 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -76,6 +76,7 @@ Contents: kcm l2tp lapb-module + ltpc .. only:: subproject and html diff --git a/Documentation/networking/ltpc.rst b/Documentation/networking/ltpc.rst new file mode 100644 index 000000000000..0ad197fd17ce --- /dev/null +++ b/Documentation/networking/ltpc.rst @@ -0,0 +1,144 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========== +LTPC Driver +=========== + +This is the ALPHA version of the ltpc driver. + +In order to use it, you will need at least version 1.3.3 of the +netatalk package, and the Apple or Farallon LocalTalk PC card. +There are a number of different LocalTalk cards for the PC; this +driver applies only to the one with the 65c02 processor chip on it. + +To include it in the kernel, select the CONFIG_LTPC switch in the +configuration dialog. You can also compile it as a module. + +While the driver will attempt to autoprobe the I/O port address, IRQ +line, and DMA channel of the card, this does not always work. For +this reason, you should be prepared to supply these parameters +yourself. (see "Card Configuration" below for how to determine or +change the settings on your card) + +When the driver is compiled into the kernel, you can add a line such +as the following to your /etc/lilo.conf:: + + append="ltpc=0x240,9,1" + +where the parameters (in order) are the port address, IRQ, and DMA +channel. The second and third values can be omitted, in which case +the driver will try to determine them itself. + +If you load the driver as a module, you can pass the parameters "io=", +"irq=", and "dma=" on the command line with insmod or modprobe, or add +them as options in a configuration file in /etc/modprobe.d/ directory:: + + alias lt0 ltpc # autoload the module when the interface is configured + options ltpc io=0x240 irq=9 dma=1 + +Before starting up the netatalk demons (perhaps in rc.local), you +need to add a line such as:: + + /sbin/ifconfig lt0 127.0.0.42 + +The address is unimportant - however, the card needs to be configured +with ifconfig so that Netatalk can find it. + +The appropriate netatalk configuration depends on whether you are +attached to a network that includes AppleTalk routers or not. If, +like me, you are simply connecting to your home Macintoshes and +printers, you need to set up netatalk to "seed". The way I do this +is to have the lines:: + + dummy -seed -phase 2 -net 2000 -addr 2000.26 -zone "1033" + lt0 -seed -phase 1 -net 1033 -addr 1033.27 -zone "1033" + +in my atalkd.conf. What is going on here is that I need to fool +netatalk into thinking that there are two AppleTalk interfaces +present; otherwise, it refuses to seed. This is a hack, and a more +permanent solution would be to alter the netatalk code. Also, make +sure you have the correct name for the dummy interface - If it's +compiled as a module, you will need to refer to it as "dummy0" or some +such. + +If you are attached to an extended AppleTalk network, with routers on +it, then you don't need to fool around with this -- the appropriate +line in atalkd.conf is:: + + lt0 -phase 1 + + +Card Configuration +================== + +The interrupts and so forth are configured via the dipswitch on the +board. Set the switches so as not to conflict with other hardware. + + Interrupts -- set at most one. If none are set, the driver uses + polled mode. Because the card was developed in the XT era, the + original documentation refers to IRQ2. Since you'll be running + this on an AT (or later) class machine, that really means IRQ9. + + === =========================================================== + SW1 IRQ 4 + SW2 IRQ 3 + SW3 IRQ 9 (2 in original card documentation only applies to XT) + === =========================================================== + + + DMA -- choose DMA 1 or 3, and set both corresponding switches. + + === ===== + SW4 DMA 3 + SW5 DMA 1 + SW6 DMA 3 + SW7 DMA 1 + === ===== + + + I/O address -- choose one. + + === ========= + SW8 220 / 240 + === ========= + + +IP +== + +Yes, it is possible to do IP over LocalTalk. However, you can't just +treat the LocalTalk device like an ordinary Ethernet device, even if +that's what it looks like to Netatalk. + +Instead, you follow the same procedure as for doing IP in EtherTalk. +See Documentation/networking/ipddp.rst for more information about the +kernel driver and userspace tools needed. + + +Bugs +==== + +IRQ autoprobing often doesn't work on a cold boot. To get around +this, either compile the driver as a module, or pass the parameters +for the card to the kernel as described above. + +Also, as usual, autoprobing is not recommended when you use the driver +as a module. (though it usually works at boot time, at least) + +Polled mode is *really* slow sometimes, but this seems to depend on +the configuration of the network. + +It may theoretically be possible to use two LTPC cards in the same +machine, but this is unsupported, so if you really want to do this, +you'll probably have to hack the initialization code a bit. + + +Thanks +====== + +Thanks to Alan Cox for helpful discussions early on in this +work, and to Denis Hainsworth for doing the bleeding-edge testing. + +Bradford Johnson + +Updated 11/09/1998 by David Huggins-Daines diff --git a/Documentation/networking/ltpc.txt b/Documentation/networking/ltpc.txt deleted file mode 100644 index a005a73b76d0..000000000000 --- a/Documentation/networking/ltpc.txt +++ /dev/null @@ -1,131 +0,0 @@ -This is the ALPHA version of the ltpc driver. - -In order to use it, you will need at least version 1.3.3 of the -netatalk package, and the Apple or Farallon LocalTalk PC card. -There are a number of different LocalTalk cards for the PC; this -driver applies only to the one with the 65c02 processor chip on it. - -To include it in the kernel, select the CONFIG_LTPC switch in the -configuration dialog. You can also compile it as a module. - -While the driver will attempt to autoprobe the I/O port address, IRQ -line, and DMA channel of the card, this does not always work. For -this reason, you should be prepared to supply these parameters -yourself. (see "Card Configuration" below for how to determine or -change the settings on your card) - -When the driver is compiled into the kernel, you can add a line such -as the following to your /etc/lilo.conf: - - append="ltpc=0x240,9,1" - -where the parameters (in order) are the port address, IRQ, and DMA -channel. The second and third values can be omitted, in which case -the driver will try to determine them itself. - -If you load the driver as a module, you can pass the parameters "io=", -"irq=", and "dma=" on the command line with insmod or modprobe, or add -them as options in a configuration file in /etc/modprobe.d/ directory: - - alias lt0 ltpc # autoload the module when the interface is configured - options ltpc io=0x240 irq=9 dma=1 - -Before starting up the netatalk demons (perhaps in rc.local), you -need to add a line such as: - - /sbin/ifconfig lt0 127.0.0.42 - -The address is unimportant - however, the card needs to be configured -with ifconfig so that Netatalk can find it. - -The appropriate netatalk configuration depends on whether you are -attached to a network that includes AppleTalk routers or not. If, -like me, you are simply connecting to your home Macintoshes and -printers, you need to set up netatalk to "seed". The way I do this -is to have the lines - - dummy -seed -phase 2 -net 2000 -addr 2000.26 -zone "1033" - lt0 -seed -phase 1 -net 1033 -addr 1033.27 -zone "1033" - -in my atalkd.conf. What is going on here is that I need to fool -netatalk into thinking that there are two AppleTalk interfaces -present; otherwise, it refuses to seed. This is a hack, and a more -permanent solution would be to alter the netatalk code. Also, make -sure you have the correct name for the dummy interface - If it's -compiled as a module, you will need to refer to it as "dummy0" or some -such. - -If you are attached to an extended AppleTalk network, with routers on -it, then you don't need to fool around with this -- the appropriate -line in atalkd.conf is - - lt0 -phase 1 - --------------------------------------- - -Card Configuration: - -The interrupts and so forth are configured via the dipswitch on the -board. Set the switches so as not to conflict with other hardware. - - Interrupts -- set at most one. If none are set, the driver uses - polled mode. Because the card was developed in the XT era, the - original documentation refers to IRQ2. Since you'll be running - this on an AT (or later) class machine, that really means IRQ9. - - SW1 IRQ 4 - SW2 IRQ 3 - SW3 IRQ 9 (2 in original card documentation only applies to XT) - - - DMA -- choose DMA 1 or 3, and set both corresponding switches. - - SW4 DMA 3 - SW5 DMA 1 - SW6 DMA 3 - SW7 DMA 1 - - - I/O address -- choose one. - - SW8 220 / 240 - --------------------------------------- - -IP: - -Yes, it is possible to do IP over LocalTalk. However, you can't just -treat the LocalTalk device like an ordinary Ethernet device, even if -that's what it looks like to Netatalk. - -Instead, you follow the same procedure as for doing IP in EtherTalk. -See Documentation/networking/ipddp.rst for more information about the -kernel driver and userspace tools needed. - --------------------------------------- - -BUGS: - -IRQ autoprobing often doesn't work on a cold boot. To get around -this, either compile the driver as a module, or pass the parameters -for the card to the kernel as described above. - -Also, as usual, autoprobing is not recommended when you use the driver -as a module. (though it usually works at boot time, at least) - -Polled mode is *really* slow sometimes, but this seems to depend on -the configuration of the network. - -It may theoretically be possible to use two LTPC cards in the same -machine, but this is unsupported, so if you really want to do this, -you'll probably have to hack the initialization code a bit. - -______________________________________ - -THANKS: - Thanks to Alan Cox for helpful discussions early on in this -work, and to Denis Hainsworth for doing the bleeding-edge testing. - --- Bradford Johnson - --- Updated 11/09/1998 by David Huggins-Daines diff --git a/drivers/net/appletalk/Kconfig b/drivers/net/appletalk/Kconfig index ccde6479050c..10589a82263b 100644 --- a/drivers/net/appletalk/Kconfig +++ b/drivers/net/appletalk/Kconfig @@ -48,7 +48,7 @@ config LTPC If you are in doubt, this card is the one with the 65C02 chip on it. You also need version 1.3.3 or later of the netatalk package. This driver is experimental, which means that it may not work. - See the file . + See the file . config COPS tristate "COPS LocalTalk PC support" -- cgit v1.2.3-59-g8ed1b From 429ff87bcac75b929d9ffec8d4d24be2616f8052 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:03:59 +0200 Subject: docs: networking: convert mac80211-injection.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/mac80211-injection.rst | 106 ++++++++++++++++++++++++ Documentation/networking/mac80211-injection.txt | 97 ---------------------- MAINTAINERS | 2 +- net/mac80211/tx.c | 2 +- 5 files changed, 109 insertions(+), 99 deletions(-) create mode 100644 Documentation/networking/mac80211-injection.rst delete mode 100644 Documentation/networking/mac80211-injection.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b3608b177a8b..81c1834bfb57 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -77,6 +77,7 @@ Contents: l2tp lapb-module ltpc + mac80211-injection .. only:: subproject and html diff --git a/Documentation/networking/mac80211-injection.rst b/Documentation/networking/mac80211-injection.rst new file mode 100644 index 000000000000..75d4edcae852 --- /dev/null +++ b/Documentation/networking/mac80211-injection.rst @@ -0,0 +1,106 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +How to use packet injection with mac80211 +========================================= + +mac80211 now allows arbitrary packets to be injected down any Monitor Mode +interface from userland. The packet you inject needs to be composed in the +following format:: + + [ radiotap header ] + [ ieee80211 header ] + [ payload ] + +The radiotap format is discussed in +./Documentation/networking/radiotap-headers.txt. + +Despite many radiotap parameters being currently defined, most only make sense +to appear on received packets. The following information is parsed from the +radiotap headers and used to control injection: + + * IEEE80211_RADIOTAP_FLAGS + + ========================= =========================================== + IEEE80211_RADIOTAP_F_FCS FCS will be removed and recalculated + IEEE80211_RADIOTAP_F_WEP frame will be encrypted if key available + IEEE80211_RADIOTAP_F_FRAG frame will be fragmented if longer than the + current fragmentation threshold. + ========================= =========================================== + + * IEEE80211_RADIOTAP_TX_FLAGS + + ============================= ======================================== + IEEE80211_RADIOTAP_F_TX_NOACK frame should be sent without waiting for + an ACK even if it is a unicast frame + ============================= ======================================== + + * IEEE80211_RADIOTAP_RATE + + legacy rate for the transmission (only for devices without own rate control) + + * IEEE80211_RADIOTAP_MCS + + HT rate for the transmission (only for devices without own rate control). + Also some flags are parsed + + ============================ ======================== + IEEE80211_RADIOTAP_MCS_SGI use short guard interval + IEEE80211_RADIOTAP_MCS_BW_40 send in HT40 mode + ============================ ======================== + + * IEEE80211_RADIOTAP_DATA_RETRIES + + number of retries when either IEEE80211_RADIOTAP_RATE or + IEEE80211_RADIOTAP_MCS was used + + * IEEE80211_RADIOTAP_VHT + + VHT mcs and number of streams used in the transmission (only for devices + without own rate control). Also other fields are parsed + + flags field + IEEE80211_RADIOTAP_VHT_FLAG_SGI: use short guard interval + + bandwidth field + * 1: send using 40MHz channel width + * 4: send using 80MHz channel width + * 11: send using 160MHz channel width + +The injection code can also skip all other currently defined radiotap fields +facilitating replay of captured radiotap headers directly. + +Here is an example valid radiotap header defining some parameters:: + + 0x00, 0x00, // <-- radiotap version + 0x0b, 0x00, // <- radiotap header length + 0x04, 0x0c, 0x00, 0x00, // <-- bitmap + 0x6c, // <-- rate + 0x0c, //<-- tx power + 0x01 //<-- antenna + +The ieee80211 header follows immediately afterwards, looking for example like +this:: + + 0x08, 0x01, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x10, 0x86 + +Then lastly there is the payload. + +After composing the packet contents, it is sent by send()-ing it to a logical +mac80211 interface that is in Monitor mode. Libpcap can also be used, +(which is easier than doing the work to bind the socket to the right +interface), along the following lines::: + + ppcap = pcap_open_live(szInterfaceName, 800, 1, 20, szErrbuf); + ... + r = pcap_inject(ppcap, u8aSendBuffer, nLength); + +You can also find a link to a complete inject application here: + +http://wireless.kernel.org/en/users/Documentation/packetspammer + +Andy Green diff --git a/Documentation/networking/mac80211-injection.txt b/Documentation/networking/mac80211-injection.txt deleted file mode 100644 index d58d78df9ca2..000000000000 --- a/Documentation/networking/mac80211-injection.txt +++ /dev/null @@ -1,97 +0,0 @@ -How to use packet injection with mac80211 -========================================= - -mac80211 now allows arbitrary packets to be injected down any Monitor Mode -interface from userland. The packet you inject needs to be composed in the -following format: - - [ radiotap header ] - [ ieee80211 header ] - [ payload ] - -The radiotap format is discussed in -./Documentation/networking/radiotap-headers.txt. - -Despite many radiotap parameters being currently defined, most only make sense -to appear on received packets. The following information is parsed from the -radiotap headers and used to control injection: - - * IEEE80211_RADIOTAP_FLAGS - - IEEE80211_RADIOTAP_F_FCS: FCS will be removed and recalculated - IEEE80211_RADIOTAP_F_WEP: frame will be encrypted if key available - IEEE80211_RADIOTAP_F_FRAG: frame will be fragmented if longer than the - current fragmentation threshold. - - * IEEE80211_RADIOTAP_TX_FLAGS - - IEEE80211_RADIOTAP_F_TX_NOACK: frame should be sent without waiting for - an ACK even if it is a unicast frame - - * IEEE80211_RADIOTAP_RATE - - legacy rate for the transmission (only for devices without own rate control) - - * IEEE80211_RADIOTAP_MCS - - HT rate for the transmission (only for devices without own rate control). - Also some flags are parsed - - IEEE80211_RADIOTAP_MCS_SGI: use short guard interval - IEEE80211_RADIOTAP_MCS_BW_40: send in HT40 mode - - * IEEE80211_RADIOTAP_DATA_RETRIES - - number of retries when either IEEE80211_RADIOTAP_RATE or - IEEE80211_RADIOTAP_MCS was used - - * IEEE80211_RADIOTAP_VHT - - VHT mcs and number of streams used in the transmission (only for devices - without own rate control). Also other fields are parsed - - flags field - IEEE80211_RADIOTAP_VHT_FLAG_SGI: use short guard interval - - bandwidth field - 1: send using 40MHz channel width - 4: send using 80MHz channel width - 11: send using 160MHz channel width - -The injection code can also skip all other currently defined radiotap fields -facilitating replay of captured radiotap headers directly. - -Here is an example valid radiotap header defining some parameters - - 0x00, 0x00, // <-- radiotap version - 0x0b, 0x00, // <- radiotap header length - 0x04, 0x0c, 0x00, 0x00, // <-- bitmap - 0x6c, // <-- rate - 0x0c, //<-- tx power - 0x01 //<-- antenna - -The ieee80211 header follows immediately afterwards, looking for example like -this: - - 0x08, 0x01, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, - 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, - 0x10, 0x86 - -Then lastly there is the payload. - -After composing the packet contents, it is sent by send()-ing it to a logical -mac80211 interface that is in Monitor mode. Libpcap can also be used, -(which is easier than doing the work to bind the socket to the right -interface), along the following lines: - - ppcap = pcap_open_live(szInterfaceName, 800, 1, 20, szErrbuf); -... - r = pcap_inject(ppcap, u8aSendBuffer, nLength); - -You can also find a link to a complete inject application here: - -http://wireless.kernel.org/en/users/Documentation/packetspammer - -Andy Green diff --git a/MAINTAINERS b/MAINTAINERS index 956999d2d979..33bfc9e4aead 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10079,7 +10079,7 @@ S: Maintained W: https://wireless.wiki.kernel.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git -F: Documentation/networking/mac80211-injection.txt +F: Documentation/networking/mac80211-injection.rst F: Documentation/networking/mac80211_hwsim/mac80211_hwsim.rst F: drivers/net/wireless/mac80211_hwsim.[ch] F: include/net/mac80211.h diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 82846aca86d9..9849c14694db 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2144,7 +2144,7 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, /* * Please update the file - * Documentation/networking/mac80211-injection.txt + * Documentation/networking/mac80211-injection.rst * when parsing new fields here. */ -- cgit v1.2.3-59-g8ed1b From e14fd64dcda5668c5dd5e59421fc5c61ce6d5951 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:00 +0200 Subject: docs: networking: convert mpls-sysctl.txt to ReST - add SPDX header; - add a document title; - mark lists as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/mpls-sysctl.rst | 57 ++++++++++++++++++++++++++++++++ Documentation/networking/mpls-sysctl.txt | 48 --------------------------- 3 files changed, 58 insertions(+), 48 deletions(-) create mode 100644 Documentation/networking/mpls-sysctl.rst delete mode 100644 Documentation/networking/mpls-sysctl.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 81c1834bfb57..a751cda83c3d 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -78,6 +78,7 @@ Contents: lapb-module ltpc mac80211-injection + mpls-sysctl .. only:: subproject and html diff --git a/Documentation/networking/mpls-sysctl.rst b/Documentation/networking/mpls-sysctl.rst new file mode 100644 index 000000000000..0a2ac88404d7 --- /dev/null +++ b/Documentation/networking/mpls-sysctl.rst @@ -0,0 +1,57 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +MPLS Sysfs variables +==================== + +/proc/sys/net/mpls/* Variables: +=============================== + +platform_labels - INTEGER + Number of entries in the platform label table. It is not + possible to configure forwarding for label values equal to or + greater than the number of platform labels. + + A dense utilization of the entries in the platform label table + is possible and expected as the platform labels are locally + allocated. + + If the number of platform label table entries is set to 0 no + label will be recognized by the kernel and mpls forwarding + will be disabled. + + Reducing this value will remove all label routing entries that + no longer fit in the table. + + Possible values: 0 - 1048575 + + Default: 0 + +ip_ttl_propagate - BOOL + Control whether TTL is propagated from the IPv4/IPv6 header to + the MPLS header on imposing labels and propagated from the + MPLS header to the IPv4/IPv6 header on popping the last label. + + If disabled, the MPLS transport network will appear as a + single hop to transit traffic. + + * 0 - disabled / RFC 3443 [Short] Pipe Model + * 1 - enabled / RFC 3443 Uniform Model (default) + +default_ttl - INTEGER + Default TTL value to use for MPLS packets where it cannot be + propagated from an IP header, either because one isn't present + or ip_ttl_propagate has been disabled. + + Possible values: 1 - 255 + + Default: 255 + +conf//input - BOOL + Control whether packets can be input on this interface. + + If disabled, packets will be discarded without further + processing. + + * 0 - disabled (default) + * not 0 - enabled diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt deleted file mode 100644 index 025cc9b96992..000000000000 --- a/Documentation/networking/mpls-sysctl.txt +++ /dev/null @@ -1,48 +0,0 @@ -/proc/sys/net/mpls/* Variables: - -platform_labels - INTEGER - Number of entries in the platform label table. It is not - possible to configure forwarding for label values equal to or - greater than the number of platform labels. - - A dense utilization of the entries in the platform label table - is possible and expected as the platform labels are locally - allocated. - - If the number of platform label table entries is set to 0 no - label will be recognized by the kernel and mpls forwarding - will be disabled. - - Reducing this value will remove all label routing entries that - no longer fit in the table. - - Possible values: 0 - 1048575 - Default: 0 - -ip_ttl_propagate - BOOL - Control whether TTL is propagated from the IPv4/IPv6 header to - the MPLS header on imposing labels and propagated from the - MPLS header to the IPv4/IPv6 header on popping the last label. - - If disabled, the MPLS transport network will appear as a - single hop to transit traffic. - - 0 - disabled / RFC 3443 [Short] Pipe Model - 1 - enabled / RFC 3443 Uniform Model (default) - -default_ttl - INTEGER - Default TTL value to use for MPLS packets where it cannot be - propagated from an IP header, either because one isn't present - or ip_ttl_propagate has been disabled. - - Possible values: 1 - 255 - Default: 255 - -conf//input - BOOL - Control whether packets can be input on this interface. - - If disabled, packets will be discarded without further - processing. - - 0 - disabled (default) - not 0 - enabled -- cgit v1.2.3-59-g8ed1b From e98aa68223e48164c559803d25484533dfd495ba Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:01 +0200 Subject: docs: networking: convert multiqueue.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - use :field: markup; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/bonding.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/multiqueue.rst | 78 ++++++++++++++++++++++++++++++++ Documentation/networking/multiqueue.txt | 79 --------------------------------- 4 files changed, 80 insertions(+), 80 deletions(-) create mode 100644 Documentation/networking/multiqueue.rst delete mode 100644 Documentation/networking/multiqueue.txt diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index dd49f95d28d3..24168b0d16bd 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -1639,7 +1639,7 @@ can safely be sent over either interface. Such configurations may be achieved using the traffic control utilities inherent in linux. By default the bonding driver is multiqueue aware and 16 queues are created -when the driver initializes (see Documentation/networking/multiqueue.txt +when the driver initializes (see Documentation/networking/multiqueue.rst for details). If more or less queues are desired the module parameter tx_queues can be used to change this value. There is no sysfs parameter available as the allocation is done at module init time. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index a751cda83c3d..492658bf7c0d 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -79,6 +79,7 @@ Contents: ltpc mac80211-injection mpls-sysctl + multiqueue .. only:: subproject and html diff --git a/Documentation/networking/multiqueue.rst b/Documentation/networking/multiqueue.rst new file mode 100644 index 000000000000..0a576166e9dd --- /dev/null +++ b/Documentation/networking/multiqueue.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================================== +HOWTO for multiqueue network device support +=========================================== + +Section 1: Base driver requirements for implementing multiqueue support +======================================================================= + +Intro: Kernel support for multiqueue devices +--------------------------------------------------------- + +Kernel support for multiqueue devices is always present. + +Base drivers are required to use the new alloc_etherdev_mq() or +alloc_netdev_mq() functions to allocate the subqueues for the device. The +underlying kernel API will take care of the allocation and deallocation of +the subqueue memory, as well as netdev configuration of where the queues +exist in memory. + +The base driver will also need to manage the queues as it does the global +netdev->queue_lock today. Therefore base drivers should use the +netif_{start|stop|wake}_subqueue() functions to manage each queue while the +device is still operational. netdev->queue_lock is still used when the device +comes online or when it's completely shut down (unregister_netdev(), etc.). + + +Section 2: Qdisc support for multiqueue devices +=============================================== + +Currently two qdiscs are optimized for multiqueue devices. The first is the +default pfifo_fast qdisc. This qdisc supports one qdisc per hardware queue. +A new round-robin qdisc, sch_multiq also supports multiple hardware queues. The +qdisc is responsible for classifying the skb's and then directing the skb's to +bands and queues based on the value in skb->queue_mapping. Use this field in +the base driver to determine which queue to send the skb to. + +sch_multiq has been added for hardware that wishes to avoid head-of-line +blocking. It will cycle though the bands and verify that the hardware queue +associated with the band is not stopped prior to dequeuing a packet. + +On qdisc load, the number of bands is based on the number of queues on the +hardware. Once the association is made, any skb with skb->queue_mapping set, +will be queued to the band associated with the hardware queue. + + +Section 3: Brief howto using MULTIQ for multiqueue devices +========================================================== + +The userspace command 'tc,' part of the iproute2 package, is used to configure +qdiscs. To add the MULTIQ qdisc to your network device, assuming the device +is called eth0, run the following command:: + + # tc qdisc add dev eth0 root handle 1: multiq + +The qdisc will allocate the number of bands to equal the number of queues that +the device reports, and bring the qdisc online. Assuming eth0 has 4 Tx +queues, the band mapping would look like:: + + band 0 => queue 0 + band 1 => queue 1 + band 2 => queue 2 + band 3 => queue 3 + +Traffic will begin flowing through each queue based on either the simple_tx_hash +function or based on netdev->select_queue() if you have it defined. + +The behavior of tc filters remains the same. However a new tc action, +skbedit, has been added. Assuming you wanted to route all traffic to a +specific host, for example 192.168.0.3, through a specific queue you could use +this action and establish a filter such as:: + + tc filter add dev eth0 parent 1: protocol ip prio 1 u32 \ + match ip dst 192.168.0.3 \ + action skbedit queue_mapping 3 + +:Author: Alexander Duyck +:Original Author: Peter P. Waskiewicz Jr. diff --git a/Documentation/networking/multiqueue.txt b/Documentation/networking/multiqueue.txt deleted file mode 100644 index 4caa0e314cc2..000000000000 --- a/Documentation/networking/multiqueue.txt +++ /dev/null @@ -1,79 +0,0 @@ - - HOWTO for multiqueue network device support - =========================================== - -Section 1: Base driver requirements for implementing multiqueue support - -Intro: Kernel support for multiqueue devices ---------------------------------------------------------- - -Kernel support for multiqueue devices is always present. - -Section 1: Base driver requirements for implementing multiqueue support ------------------------------------------------------------------------ - -Base drivers are required to use the new alloc_etherdev_mq() or -alloc_netdev_mq() functions to allocate the subqueues for the device. The -underlying kernel API will take care of the allocation and deallocation of -the subqueue memory, as well as netdev configuration of where the queues -exist in memory. - -The base driver will also need to manage the queues as it does the global -netdev->queue_lock today. Therefore base drivers should use the -netif_{start|stop|wake}_subqueue() functions to manage each queue while the -device is still operational. netdev->queue_lock is still used when the device -comes online or when it's completely shut down (unregister_netdev(), etc.). - - -Section 2: Qdisc support for multiqueue devices - ------------------------------------------------ - -Currently two qdiscs are optimized for multiqueue devices. The first is the -default pfifo_fast qdisc. This qdisc supports one qdisc per hardware queue. -A new round-robin qdisc, sch_multiq also supports multiple hardware queues. The -qdisc is responsible for classifying the skb's and then directing the skb's to -bands and queues based on the value in skb->queue_mapping. Use this field in -the base driver to determine which queue to send the skb to. - -sch_multiq has been added for hardware that wishes to avoid head-of-line -blocking. It will cycle though the bands and verify that the hardware queue -associated with the band is not stopped prior to dequeuing a packet. - -On qdisc load, the number of bands is based on the number of queues on the -hardware. Once the association is made, any skb with skb->queue_mapping set, -will be queued to the band associated with the hardware queue. - - -Section 3: Brief howto using MULTIQ for multiqueue devices ---------------------------------------------------------------- - -The userspace command 'tc,' part of the iproute2 package, is used to configure -qdiscs. To add the MULTIQ qdisc to your network device, assuming the device -is called eth0, run the following command: - -# tc qdisc add dev eth0 root handle 1: multiq - -The qdisc will allocate the number of bands to equal the number of queues that -the device reports, and bring the qdisc online. Assuming eth0 has 4 Tx -queues, the band mapping would look like: - -band 0 => queue 0 -band 1 => queue 1 -band 2 => queue 2 -band 3 => queue 3 - -Traffic will begin flowing through each queue based on either the simple_tx_hash -function or based on netdev->select_queue() if you have it defined. - -The behavior of tc filters remains the same. However a new tc action, -skbedit, has been added. Assuming you wanted to route all traffic to a -specific host, for example 192.168.0.3, through a specific queue you could use -this action and establish a filter such as: - -tc filter add dev eth0 parent 1: protocol ip prio 1 u32 \ - match ip dst 192.168.0.3 \ - action skbedit queue_mapping 3 - -Author: Alexander Duyck -Original Author: Peter P. Waskiewicz Jr. -- cgit v1.2.3-59-g8ed1b From d9d6ef25ecab3e10966981bc58d014576da74272 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:02 +0200 Subject: docs: networking: convert netconsole.txt to ReST - add SPDX header; - add a document title; - mark code blocks and literals as such; - mark tables as such; - add notes markups; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/serial-console.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/netconsole.rst | 239 ++++++++++++++++++++++++ Documentation/networking/netconsole.txt | 210 --------------------- drivers/net/Kconfig | 4 +- drivers/net/ethernet/toshiba/ps3_gelic_net.c | 2 +- drivers/net/ethernet/toshiba/spider_net.c | 2 +- 8 files changed, 246 insertions(+), 216 deletions(-) create mode 100644 Documentation/networking/netconsole.rst delete mode 100644 Documentation/networking/netconsole.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e43f2e1f2958..398c34804bb8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -638,7 +638,7 @@ See Documentation/admin-guide/serial-console.rst for more information. See - Documentation/networking/netconsole.txt for an + Documentation/networking/netconsole.rst for an alternative. uart[8250],io,[,options] diff --git a/Documentation/admin-guide/serial-console.rst b/Documentation/admin-guide/serial-console.rst index a8d1e36b627a..58b32832e50a 100644 --- a/Documentation/admin-guide/serial-console.rst +++ b/Documentation/admin-guide/serial-console.rst @@ -54,7 +54,7 @@ You will need to create a new device to use ``/dev/console``. The official ``/dev/console`` is now character device 5,1. (You can also use a network device as a console. See -``Documentation/networking/netconsole.txt`` for information on that.) +``Documentation/networking/netconsole.rst`` for information on that.) Here's an example that will use ``/dev/ttyS1`` (COM2) as the console. Replace the sample values as needed. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 492658bf7c0d..e58f872d401d 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -80,6 +80,7 @@ Contents: mac80211-injection mpls-sysctl multiqueue + netconsole .. only:: subproject and html diff --git a/Documentation/networking/netconsole.rst b/Documentation/networking/netconsole.rst new file mode 100644 index 000000000000..1f5c4a04027c --- /dev/null +++ b/Documentation/networking/netconsole.rst @@ -0,0 +1,239 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +Netconsole +========== + + +started by Ingo Molnar , 2001.09.17 + +2.6 port and netpoll api by Matt Mackall , Sep 9 2003 + +IPv6 support by Cong Wang , Jan 1 2013 + +Extended console support by Tejun Heo , May 1 2015 + +Please send bug reports to Matt Mackall +Satyam Sharma , and Cong Wang + +Introduction: +============= + +This module logs kernel printk messages over UDP allowing debugging of +problem where disk logging fails and serial consoles are impractical. + +It can be used either built-in or as a module. As a built-in, +netconsole initializes immediately after NIC cards and will bring up +the specified interface as soon as possible. While this doesn't allow +capture of early kernel panics, it does capture most of the boot +process. + +Sender and receiver configuration: +================================== + +It takes a string configuration parameter "netconsole" in the +following format:: + + netconsole=[+][src-port]@[src-ip]/[],[tgt-port]@/[tgt-macaddr] + + where + + if present, enable extended console support + src-port source for UDP packets (defaults to 6665) + src-ip source IP to use (interface address) + dev network interface (eth0) + tgt-port port for logging agent (6666) + tgt-ip IP address for logging agent + tgt-macaddr ethernet MAC address for logging agent (broadcast) + +Examples:: + + linux netconsole=4444@10.0.0.1/eth1,9353@10.0.0.2/12:34:56:78:9a:bc + +or:: + + insmod netconsole netconsole=@/,@10.0.0.2/ + +or using IPv6:: + + insmod netconsole netconsole=@/,@fd00:1:2:3::1/ + +It also supports logging to multiple remote agents by specifying +parameters for the multiple agents separated by semicolons and the +complete string enclosed in "quotes", thusly:: + + modprobe netconsole netconsole="@/,@10.0.0.2/;@/eth1,6892@10.0.0.3/" + +Built-in netconsole starts immediately after the TCP stack is +initialized and attempts to bring up the supplied dev at the supplied +address. + +The remote host has several options to receive the kernel messages, +for example: + +1) syslogd + +2) netcat + + On distributions using a BSD-based netcat version (e.g. Fedora, + openSUSE and Ubuntu) the listening port must be specified without + the -p switch:: + + nc -u -l -p ' / 'nc -u -l + + or:: + + netcat -u -l -p ' / 'netcat -u -l + +3) socat + +:: + + socat udp-recv: - + +Dynamic reconfiguration: +======================== + +Dynamic reconfigurability is a useful addition to netconsole that enables +remote logging targets to be dynamically added, removed, or have their +parameters reconfigured at runtime from a configfs-based userspace interface. +[ Note that the parameters of netconsole targets that were specified/created +from the boot/module option are not exposed via this interface, and hence +cannot be modified dynamically. ] + +To include this feature, select CONFIG_NETCONSOLE_DYNAMIC when building the +netconsole module (or kernel, if netconsole is built-in). + +Some examples follow (where configfs is mounted at the /sys/kernel/config +mountpoint). + +To add a remote logging target (target names can be arbitrary):: + + cd /sys/kernel/config/netconsole/ + mkdir target1 + +Note that newly created targets have default parameter values (as mentioned +above) and are disabled by default -- they must first be enabled by writing +"1" to the "enabled" attribute (usually after setting parameters accordingly) +as described below. + +To remove a target:: + + rmdir /sys/kernel/config/netconsole/othertarget/ + +The interface exposes these parameters of a netconsole target to userspace: + + ============== ================================= ============ + enabled Is this target currently enabled? (read-write) + extended Extended mode enabled (read-write) + dev_name Local network interface name (read-write) + local_port Source UDP port to use (read-write) + remote_port Remote agent's UDP port (read-write) + local_ip Source IP address to use (read-write) + remote_ip Remote agent's IP address (read-write) + local_mac Local interface's MAC address (read-only) + remote_mac Remote agent's MAC address (read-write) + ============== ================================= ============ + +The "enabled" attribute is also used to control whether the parameters of +a target can be updated or not -- you can modify the parameters of only +disabled targets (i.e. if "enabled" is 0). + +To update a target's parameters:: + + cat enabled # check if enabled is 1 + echo 0 > enabled # disable the target (if required) + echo eth2 > dev_name # set local interface + echo 10.0.0.4 > remote_ip # update some parameter + echo cb:a9:87:65:43:21 > remote_mac # update more parameters + echo 1 > enabled # enable target again + +You can also update the local interface dynamically. This is especially +useful if you want to use interfaces that have newly come up (and may not +have existed when netconsole was loaded / initialized). + +Extended console: +================= + +If '+' is prefixed to the configuration line or "extended" config file +is set to 1, extended console support is enabled. An example boot +param follows:: + + linux netconsole=+4444@10.0.0.1/eth1,9353@10.0.0.2/12:34:56:78:9a:bc + +Log messages are transmitted with extended metadata header in the +following format which is the same as /dev/kmsg:: + + ,,,; + +Non printable characters in are escaped using "\xff" +notation. If the message contains optional dictionary, verbatim +newline is used as the delimeter. + +If a message doesn't fit in certain number of bytes (currently 1000), +the message is split into multiple fragments by netconsole. These +fragments are transmitted with "ncfrag" header field added:: + + ncfrag=/ + +For example, assuming a lot smaller chunk size, a message "the first +chunk, the 2nd chunk." may be split as follows:: + + 6,416,1758426,-,ncfrag=0/31;the first chunk, + 6,416,1758426,-,ncfrag=16/31; the 2nd chunk. + +Miscellaneous notes: +==================== + +.. Warning:: + + the default target ethernet setting uses the broadcast + ethernet address to send packets, which can cause increased load on + other systems on the same ethernet segment. + +.. Tip:: + + some LAN switches may be configured to suppress ethernet broadcasts + so it is advised to explicitly specify the remote agents' MAC addresses + from the config parameters passed to netconsole. + +.. Tip:: + + to find out the MAC address of, say, 10.0.0.2, you may try using:: + + ping -c 1 10.0.0.2 ; /sbin/arp -n | grep 10.0.0.2 + +.. Tip:: + + in case the remote logging agent is on a separate LAN subnet than + the sender, it is suggested to try specifying the MAC address of the + default gateway (you may use /sbin/route -n to find it out) as the + remote MAC address instead. + +.. note:: + + the network device (eth1 in the above case) can run any kind + of other network traffic, netconsole is not intrusive. Netconsole + might cause slight delays in other traffic if the volume of kernel + messages is high, but should have no other impact. + +.. note:: + + if you find that the remote logging agent is not receiving or + printing all messages from the sender, it is likely that you have set + the "console_loglevel" parameter (on the sender) to only send high + priority messages to the console. You can change this at runtime using:: + + dmesg -n 8 + + or by specifying "debug" on the kernel command line at boot, to send + all kernel messages to the console. A specific value for this parameter + can also be set using the "loglevel" kernel boot option. See the + dmesg(8) man page and Documentation/admin-guide/kernel-parameters.rst + for details. + +Netconsole was designed to be as instantaneous as possible, to +enable the logging of even the most critical kernel bugs. It works +from IRQ contexts as well, and does not enable interrupts while +sending packets. Due to these unique needs, configuration cannot +be more automatic, and some fundamental limitations will remain: +only IP networks, UDP packets and ethernet devices are supported. diff --git a/Documentation/networking/netconsole.txt b/Documentation/networking/netconsole.txt deleted file mode 100644 index 296ea00fd3eb..000000000000 --- a/Documentation/networking/netconsole.txt +++ /dev/null @@ -1,210 +0,0 @@ - -started by Ingo Molnar , 2001.09.17 -2.6 port and netpoll api by Matt Mackall , Sep 9 2003 -IPv6 support by Cong Wang , Jan 1 2013 -Extended console support by Tejun Heo , May 1 2015 - -Please send bug reports to Matt Mackall -Satyam Sharma , and Cong Wang - -Introduction: -============= - -This module logs kernel printk messages over UDP allowing debugging of -problem where disk logging fails and serial consoles are impractical. - -It can be used either built-in or as a module. As a built-in, -netconsole initializes immediately after NIC cards and will bring up -the specified interface as soon as possible. While this doesn't allow -capture of early kernel panics, it does capture most of the boot -process. - -Sender and receiver configuration: -================================== - -It takes a string configuration parameter "netconsole" in the -following format: - - netconsole=[+][src-port]@[src-ip]/[],[tgt-port]@/[tgt-macaddr] - - where - + if present, enable extended console support - src-port source for UDP packets (defaults to 6665) - src-ip source IP to use (interface address) - dev network interface (eth0) - tgt-port port for logging agent (6666) - tgt-ip IP address for logging agent - tgt-macaddr ethernet MAC address for logging agent (broadcast) - -Examples: - - linux netconsole=4444@10.0.0.1/eth1,9353@10.0.0.2/12:34:56:78:9a:bc - - or - - insmod netconsole netconsole=@/,@10.0.0.2/ - - or using IPv6 - - insmod netconsole netconsole=@/,@fd00:1:2:3::1/ - -It also supports logging to multiple remote agents by specifying -parameters for the multiple agents separated by semicolons and the -complete string enclosed in "quotes", thusly: - - modprobe netconsole netconsole="@/,@10.0.0.2/;@/eth1,6892@10.0.0.3/" - -Built-in netconsole starts immediately after the TCP stack is -initialized and attempts to bring up the supplied dev at the supplied -address. - -The remote host has several options to receive the kernel messages, -for example: - -1) syslogd - -2) netcat - - On distributions using a BSD-based netcat version (e.g. Fedora, - openSUSE and Ubuntu) the listening port must be specified without - the -p switch: - - 'nc -u -l -p ' / 'nc -u -l ' or - 'netcat -u -l -p ' / 'netcat -u -l ' - -3) socat - - 'socat udp-recv: -' - -Dynamic reconfiguration: -======================== - -Dynamic reconfigurability is a useful addition to netconsole that enables -remote logging targets to be dynamically added, removed, or have their -parameters reconfigured at runtime from a configfs-based userspace interface. -[ Note that the parameters of netconsole targets that were specified/created -from the boot/module option are not exposed via this interface, and hence -cannot be modified dynamically. ] - -To include this feature, select CONFIG_NETCONSOLE_DYNAMIC when building the -netconsole module (or kernel, if netconsole is built-in). - -Some examples follow (where configfs is mounted at the /sys/kernel/config -mountpoint). - -To add a remote logging target (target names can be arbitrary): - - cd /sys/kernel/config/netconsole/ - mkdir target1 - -Note that newly created targets have default parameter values (as mentioned -above) and are disabled by default -- they must first be enabled by writing -"1" to the "enabled" attribute (usually after setting parameters accordingly) -as described below. - -To remove a target: - - rmdir /sys/kernel/config/netconsole/othertarget/ - -The interface exposes these parameters of a netconsole target to userspace: - - enabled Is this target currently enabled? (read-write) - extended Extended mode enabled (read-write) - dev_name Local network interface name (read-write) - local_port Source UDP port to use (read-write) - remote_port Remote agent's UDP port (read-write) - local_ip Source IP address to use (read-write) - remote_ip Remote agent's IP address (read-write) - local_mac Local interface's MAC address (read-only) - remote_mac Remote agent's MAC address (read-write) - -The "enabled" attribute is also used to control whether the parameters of -a target can be updated or not -- you can modify the parameters of only -disabled targets (i.e. if "enabled" is 0). - -To update a target's parameters: - - cat enabled # check if enabled is 1 - echo 0 > enabled # disable the target (if required) - echo eth2 > dev_name # set local interface - echo 10.0.0.4 > remote_ip # update some parameter - echo cb:a9:87:65:43:21 > remote_mac # update more parameters - echo 1 > enabled # enable target again - -You can also update the local interface dynamically. This is especially -useful if you want to use interfaces that have newly come up (and may not -have existed when netconsole was loaded / initialized). - -Extended console: -================= - -If '+' is prefixed to the configuration line or "extended" config file -is set to 1, extended console support is enabled. An example boot -param follows. - - linux netconsole=+4444@10.0.0.1/eth1,9353@10.0.0.2/12:34:56:78:9a:bc - -Log messages are transmitted with extended metadata header in the -following format which is the same as /dev/kmsg. - - ,,,; - -Non printable characters in are escaped using "\xff" -notation. If the message contains optional dictionary, verbatim -newline is used as the delimeter. - -If a message doesn't fit in certain number of bytes (currently 1000), -the message is split into multiple fragments by netconsole. These -fragments are transmitted with "ncfrag" header field added. - - ncfrag=/ - -For example, assuming a lot smaller chunk size, a message "the first -chunk, the 2nd chunk." may be split as follows. - - 6,416,1758426,-,ncfrag=0/31;the first chunk, - 6,416,1758426,-,ncfrag=16/31; the 2nd chunk. - -Miscellaneous notes: -==================== - -WARNING: the default target ethernet setting uses the broadcast -ethernet address to send packets, which can cause increased load on -other systems on the same ethernet segment. - -TIP: some LAN switches may be configured to suppress ethernet broadcasts -so it is advised to explicitly specify the remote agents' MAC addresses -from the config parameters passed to netconsole. - -TIP: to find out the MAC address of, say, 10.0.0.2, you may try using: - - ping -c 1 10.0.0.2 ; /sbin/arp -n | grep 10.0.0.2 - -TIP: in case the remote logging agent is on a separate LAN subnet than -the sender, it is suggested to try specifying the MAC address of the -default gateway (you may use /sbin/route -n to find it out) as the -remote MAC address instead. - -NOTE: the network device (eth1 in the above case) can run any kind -of other network traffic, netconsole is not intrusive. Netconsole -might cause slight delays in other traffic if the volume of kernel -messages is high, but should have no other impact. - -NOTE: if you find that the remote logging agent is not receiving or -printing all messages from the sender, it is likely that you have set -the "console_loglevel" parameter (on the sender) to only send high -priority messages to the console. You can change this at runtime using: - - dmesg -n 8 - -or by specifying "debug" on the kernel command line at boot, to send -all kernel messages to the console. A specific value for this parameter -can also be set using the "loglevel" kernel boot option. See the -dmesg(8) man page and Documentation/admin-guide/kernel-parameters.rst for details. - -Netconsole was designed to be as instantaneous as possible, to -enable the logging of even the most critical kernel bugs. It works -from IRQ contexts as well, and does not enable interrupts while -sending packets. Due to these unique needs, configuration cannot -be more automatic, and some fundamental limitations will remain: -only IP networks, UDP packets and ethernet devices are supported. diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index c822f4a6d166..ad64be98330f 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -302,7 +302,7 @@ config NETCONSOLE tristate "Network console logging support" ---help--- If you want to log kernel messages over the network, enable this. - See for details. + See for details. config NETCONSOLE_DYNAMIC bool "Dynamic reconfiguration of logging targets" @@ -312,7 +312,7 @@ config NETCONSOLE_DYNAMIC This option enables the ability to dynamically reconfigure target parameters (interface, IP addresses, port numbers, MAC addresses) at runtime through a userspace interface exported using configfs. - See for details. + See for details. config NETPOLL def_bool NETCONSOLE diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c b/drivers/net/ethernet/toshiba/ps3_gelic_net.c index 070dd6fa9401..310e6839c6e5 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c @@ -1150,7 +1150,7 @@ static irqreturn_t gelic_card_interrupt(int irq, void *ptr) * gelic_net_poll_controller - artificial interrupt for netconsole etc. * @netdev: interface device structure * - * see Documentation/networking/netconsole.txt + * see Documentation/networking/netconsole.rst */ void gelic_net_poll_controller(struct net_device *netdev) { diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c index 6576271642c1..3902b3aeb0c2 100644 --- a/drivers/net/ethernet/toshiba/spider_net.c +++ b/drivers/net/ethernet/toshiba/spider_net.c @@ -1615,7 +1615,7 @@ spider_net_interrupt(int irq, void *ptr) * spider_net_poll_controller - artificial interrupt for netconsole etc. * @netdev: interface device structure * - * see Documentation/networking/netconsole.txt + * see Documentation/networking/netconsole.rst */ static void spider_net_poll_controller(struct net_device *netdev) -- cgit v1.2.3-59-g8ed1b From ea5bacaa2cec6967ed337f4d0ad6034123ca737b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:03 +0200 Subject: docs: networking: convert netdev-features.txt to ReST Not much to be done here: - add SPDX header; - adjust titles and chapters, adding proper markups; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/checksum-offloads.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/netdev-features.rst | 184 +++++++++++++++++++++++++ Documentation/networking/netdev-features.txt | 181 ------------------------ include/linux/netdev_features.h | 2 +- 5 files changed, 187 insertions(+), 183 deletions(-) create mode 100644 Documentation/networking/netdev-features.rst delete mode 100644 Documentation/networking/netdev-features.txt diff --git a/Documentation/networking/checksum-offloads.rst b/Documentation/networking/checksum-offloads.rst index 905c8a84b103..69b23cf6879e 100644 --- a/Documentation/networking/checksum-offloads.rst +++ b/Documentation/networking/checksum-offloads.rst @@ -59,7 +59,7 @@ recomputed for each resulting segment. See the skbuff.h comment (section 'E') for more details. A driver declares its offload capabilities in netdev->hw_features; see -Documentation/networking/netdev-features.txt for more. Note that a device +Documentation/networking/netdev-features.rst for more. Note that a device which only advertises NETIF_F_IP[V6]_CSUM must still obey the csum_start and csum_offset given in the SKB; if it tries to deduce these itself in hardware (as some NICs do) the driver should check that the values in the SKB match diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e58f872d401d..4c6aa3db97d4 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -81,6 +81,7 @@ Contents: mpls-sysctl multiqueue netconsole + netdev-features .. only:: subproject and html diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst new file mode 100644 index 000000000000..a2d7d7160e39 --- /dev/null +++ b/Documentation/networking/netdev-features.rst @@ -0,0 +1,184 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================================== +Netdev features mess and how to get out from it alive +===================================================== + +Author: + Michał Mirosław + + + +Part I: Feature sets +==================== + +Long gone are the days when a network card would just take and give packets +verbatim. Today's devices add multiple features and bugs (read: offloads) +that relieve an OS of various tasks like generating and checking checksums, +splitting packets, classifying them. Those capabilities and their state +are commonly referred to as netdev features in Linux kernel world. + +There are currently three sets of features relevant to the driver, and +one used internally by network core: + + 1. netdev->hw_features set contains features whose state may possibly + be changed (enabled or disabled) for a particular device by user's + request. This set should be initialized in ndo_init callback and not + changed later. + + 2. netdev->features set contains features which are currently enabled + for a device. This should be changed only by network core or in + error paths of ndo_set_features callback. + + 3. netdev->vlan_features set contains features whose state is inherited + by child VLAN devices (limits netdev->features set). This is currently + used for all VLAN devices whether tags are stripped or inserted in + hardware or software. + + 4. netdev->wanted_features set contains feature set requested by user. + This set is filtered by ndo_fix_features callback whenever it or + some device-specific conditions change. This set is internal to + networking core and should not be referenced in drivers. + + + +Part II: Controlling enabled features +===================================== + +When current feature set (netdev->features) is to be changed, new set +is calculated and filtered by calling ndo_fix_features callback +and netdev_fix_features(). If the resulting set differs from current +set, it is passed to ndo_set_features callback and (if the callback +returns success) replaces value stored in netdev->features. +NETDEV_FEAT_CHANGE notification is issued after that whenever current +set might have changed. + +The following events trigger recalculation: + 1. device's registration, after ndo_init returned success + 2. user requested changes in features state + 3. netdev_update_features() is called + +ndo_*_features callbacks are called with rtnl_lock held. Missing callbacks +are treated as always returning success. + +A driver that wants to trigger recalculation must do so by calling +netdev_update_features() while holding rtnl_lock. This should not be done +from ndo_*_features callbacks. netdev->features should not be modified by +driver except by means of ndo_fix_features callback. + + + +Part III: Implementation hints +============================== + + * ndo_fix_features: + +All dependencies between features should be resolved here. The resulting +set can be reduced further by networking core imposed limitations (as coded +in netdev_fix_features()). For this reason it is safer to disable a feature +when its dependencies are not met instead of forcing the dependency on. + +This callback should not modify hardware nor driver state (should be +stateless). It can be called multiple times between successive +ndo_set_features calls. + +Callback must not alter features contained in NETIF_F_SOFT_FEATURES or +NETIF_F_NEVER_CHANGE sets. The exception is NETIF_F_VLAN_CHALLENGED but +care must be taken as the change won't affect already configured VLANs. + + * ndo_set_features: + +Hardware should be reconfigured to match passed feature set. The set +should not be altered unless some error condition happens that can't +be reliably detected in ndo_fix_features. In this case, the callback +should update netdev->features to match resulting hardware state. +Errors returned are not (and cannot be) propagated anywhere except dmesg. +(Note: successful return is zero, >0 means silent error.) + + + +Part IV: Features +================= + +For current list of features, see include/linux/netdev_features.h. +This section describes semantics of some of them. + + * Transmit checksumming + +For complete description, see comments near the top of include/linux/skbuff.h. + +Note: NETIF_F_HW_CSUM is a superset of NETIF_F_IP_CSUM + NETIF_F_IPV6_CSUM. +It means that device can fill TCP/UDP-like checksum anywhere in the packets +whatever headers there might be. + + * Transmit TCP segmentation offload + +NETIF_F_TSO_ECN means that hardware can properly split packets with CWR bit +set, be it TCPv4 (when NETIF_F_TSO is enabled) or TCPv6 (NETIF_F_TSO6). + + * Transmit UDP segmentation offload + +NETIF_F_GSO_UDP_L4 accepts a single UDP header with a payload that exceeds +gso_size. On segmentation, it segments the payload on gso_size boundaries and +replicates the network and UDP headers (fixing up the last one if less than +gso_size). + + * Transmit DMA from high memory + +On platforms where this is relevant, NETIF_F_HIGHDMA signals that +ndo_start_xmit can handle skbs with frags in high memory. + + * Transmit scatter-gather + +Those features say that ndo_start_xmit can handle fragmented skbs: +NETIF_F_SG --- paged skbs (skb_shinfo()->frags), NETIF_F_FRAGLIST --- +chained skbs (skb->next/prev list). + + * Software features + +Features contained in NETIF_F_SOFT_FEATURES are features of networking +stack. Driver should not change behaviour based on them. + + * LLTX driver (deprecated for hardware drivers) + +NETIF_F_LLTX is meant to be used by drivers that don't need locking at all, +e.g. software tunnels. + +This is also used in a few legacy drivers that implement their +own locking, don't use it for new (hardware) drivers. + + * netns-local device + +NETIF_F_NETNS_LOCAL is set for devices that are not allowed to move between +network namespaces (e.g. loopback). + +Don't use it in drivers. + + * VLAN challenged + +NETIF_F_VLAN_CHALLENGED should be set for devices which can't cope with VLAN +headers. Some drivers set this because the cards can't handle the bigger MTU. +[FIXME: Those cases could be fixed in VLAN code by allowing only reduced-MTU +VLANs. This may be not useful, though.] + +* rx-fcs + +This requests that the NIC append the Ethernet Frame Checksum (FCS) +to the end of the skb data. This allows sniffers and other tools to +read the CRC recorded by the NIC on receipt of the packet. + +* rx-all + +This requests that the NIC receive all possible frames, including errored +frames (such as bad FCS, etc). This can be helpful when sniffing a link with +bad packets on it. Some NICs may receive more packets if also put into normal +PROMISC mode. + +* rx-gro-hw + +This requests that the NIC enables Hardware GRO (generic receive offload). +Hardware GRO is basically the exact reverse of TSO, and is generally +stricter than Hardware LRO. A packet stream merged by Hardware GRO must +be re-segmentable by GSO or TSO back to the exact original packet stream. +Hardware GRO is dependent on RXCSUM since every packet successfully merged +by hardware must also have the checksum verified by hardware. diff --git a/Documentation/networking/netdev-features.txt b/Documentation/networking/netdev-features.txt deleted file mode 100644 index 58dd1c1e3c65..000000000000 --- a/Documentation/networking/netdev-features.txt +++ /dev/null @@ -1,181 +0,0 @@ -Netdev features mess and how to get out from it alive -===================================================== - -Author: - Michał Mirosław - - - - Part I: Feature sets -====================== - -Long gone are the days when a network card would just take and give packets -verbatim. Today's devices add multiple features and bugs (read: offloads) -that relieve an OS of various tasks like generating and checking checksums, -splitting packets, classifying them. Those capabilities and their state -are commonly referred to as netdev features in Linux kernel world. - -There are currently three sets of features relevant to the driver, and -one used internally by network core: - - 1. netdev->hw_features set contains features whose state may possibly - be changed (enabled or disabled) for a particular device by user's - request. This set should be initialized in ndo_init callback and not - changed later. - - 2. netdev->features set contains features which are currently enabled - for a device. This should be changed only by network core or in - error paths of ndo_set_features callback. - - 3. netdev->vlan_features set contains features whose state is inherited - by child VLAN devices (limits netdev->features set). This is currently - used for all VLAN devices whether tags are stripped or inserted in - hardware or software. - - 4. netdev->wanted_features set contains feature set requested by user. - This set is filtered by ndo_fix_features callback whenever it or - some device-specific conditions change. This set is internal to - networking core and should not be referenced in drivers. - - - - Part II: Controlling enabled features -======================================= - -When current feature set (netdev->features) is to be changed, new set -is calculated and filtered by calling ndo_fix_features callback -and netdev_fix_features(). If the resulting set differs from current -set, it is passed to ndo_set_features callback and (if the callback -returns success) replaces value stored in netdev->features. -NETDEV_FEAT_CHANGE notification is issued after that whenever current -set might have changed. - -The following events trigger recalculation: - 1. device's registration, after ndo_init returned success - 2. user requested changes in features state - 3. netdev_update_features() is called - -ndo_*_features callbacks are called with rtnl_lock held. Missing callbacks -are treated as always returning success. - -A driver that wants to trigger recalculation must do so by calling -netdev_update_features() while holding rtnl_lock. This should not be done -from ndo_*_features callbacks. netdev->features should not be modified by -driver except by means of ndo_fix_features callback. - - - - Part III: Implementation hints -================================ - - * ndo_fix_features: - -All dependencies between features should be resolved here. The resulting -set can be reduced further by networking core imposed limitations (as coded -in netdev_fix_features()). For this reason it is safer to disable a feature -when its dependencies are not met instead of forcing the dependency on. - -This callback should not modify hardware nor driver state (should be -stateless). It can be called multiple times between successive -ndo_set_features calls. - -Callback must not alter features contained in NETIF_F_SOFT_FEATURES or -NETIF_F_NEVER_CHANGE sets. The exception is NETIF_F_VLAN_CHALLENGED but -care must be taken as the change won't affect already configured VLANs. - - * ndo_set_features: - -Hardware should be reconfigured to match passed feature set. The set -should not be altered unless some error condition happens that can't -be reliably detected in ndo_fix_features. In this case, the callback -should update netdev->features to match resulting hardware state. -Errors returned are not (and cannot be) propagated anywhere except dmesg. -(Note: successful return is zero, >0 means silent error.) - - - - Part IV: Features -=================== - -For current list of features, see include/linux/netdev_features.h. -This section describes semantics of some of them. - - * Transmit checksumming - -For complete description, see comments near the top of include/linux/skbuff.h. - -Note: NETIF_F_HW_CSUM is a superset of NETIF_F_IP_CSUM + NETIF_F_IPV6_CSUM. -It means that device can fill TCP/UDP-like checksum anywhere in the packets -whatever headers there might be. - - * Transmit TCP segmentation offload - -NETIF_F_TSO_ECN means that hardware can properly split packets with CWR bit -set, be it TCPv4 (when NETIF_F_TSO is enabled) or TCPv6 (NETIF_F_TSO6). - - * Transmit UDP segmentation offload - -NETIF_F_GSO_UDP_L4 accepts a single UDP header with a payload that exceeds -gso_size. On segmentation, it segments the payload on gso_size boundaries and -replicates the network and UDP headers (fixing up the last one if less than -gso_size). - - * Transmit DMA from high memory - -On platforms where this is relevant, NETIF_F_HIGHDMA signals that -ndo_start_xmit can handle skbs with frags in high memory. - - * Transmit scatter-gather - -Those features say that ndo_start_xmit can handle fragmented skbs: -NETIF_F_SG --- paged skbs (skb_shinfo()->frags), NETIF_F_FRAGLIST --- -chained skbs (skb->next/prev list). - - * Software features - -Features contained in NETIF_F_SOFT_FEATURES are features of networking -stack. Driver should not change behaviour based on them. - - * LLTX driver (deprecated for hardware drivers) - -NETIF_F_LLTX is meant to be used by drivers that don't need locking at all, -e.g. software tunnels. - -This is also used in a few legacy drivers that implement their -own locking, don't use it for new (hardware) drivers. - - * netns-local device - -NETIF_F_NETNS_LOCAL is set for devices that are not allowed to move between -network namespaces (e.g. loopback). - -Don't use it in drivers. - - * VLAN challenged - -NETIF_F_VLAN_CHALLENGED should be set for devices which can't cope with VLAN -headers. Some drivers set this because the cards can't handle the bigger MTU. -[FIXME: Those cases could be fixed in VLAN code by allowing only reduced-MTU -VLANs. This may be not useful, though.] - -* rx-fcs - -This requests that the NIC append the Ethernet Frame Checksum (FCS) -to the end of the skb data. This allows sniffers and other tools to -read the CRC recorded by the NIC on receipt of the packet. - -* rx-all - -This requests that the NIC receive all possible frames, including errored -frames (such as bad FCS, etc). This can be helpful when sniffing a link with -bad packets on it. Some NICs may receive more packets if also put into normal -PROMISC mode. - -* rx-gro-hw - -This requests that the NIC enables Hardware GRO (generic receive offload). -Hardware GRO is basically the exact reverse of TSO, and is generally -stricter than Hardware LRO. A packet stream merged by Hardware GRO must -be re-segmentable by GSO or TSO back to the exact original packet stream. -Hardware GRO is dependent on RXCSUM since every packet successfully merged -by hardware must also have the checksum verified by hardware. diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 9d53c5ad272c..2cc3cf80b49a 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -89,7 +89,7 @@ enum { * Add your fresh new feature above and remember to update * netdev_features_strings[] in net/core/ethtool.c and maybe * some feature mask #defines below. Please also describe it - * in Documentation/networking/netdev-features.txt. + * in Documentation/networking/netdev-features.rst. */ /**/NETDEV_FEATURE_COUNT -- cgit v1.2.3-59-g8ed1b From 482a4360c56a1ecb5ea54c00db647b0012d787cf Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:04 +0200 Subject: docs: networking: convert netdevices.txt to ReST - add SPDX header; - adjust title markup; - mark lists as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/can.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/netdevices.rst | 111 ++++++++++++++++++++++++++++++++ Documentation/networking/netdevices.txt | 104 ------------------------------ 4 files changed, 113 insertions(+), 105 deletions(-) create mode 100644 Documentation/networking/netdevices.rst delete mode 100644 Documentation/networking/netdevices.txt diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst index 2fd0b51a8c52..ff05cbd05e0d 100644 --- a/Documentation/networking/can.rst +++ b/Documentation/networking/can.rst @@ -1058,7 +1058,7 @@ drivers you mainly have to deal with: - TX: Put the CAN frame from the socket buffer to the CAN controller. - RX: Put the CAN frame from the CAN controller to the socket buffer. -See e.g. at Documentation/networking/netdevices.txt . The differences +See e.g. at Documentation/networking/netdevices.rst . The differences for writing CAN network device driver are described below: diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 4c6aa3db97d4..5a320553ffba 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -82,6 +82,7 @@ Contents: multiqueue netconsole netdev-features + netdevices .. only:: subproject and html diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst new file mode 100644 index 000000000000..5a85fcc80c76 --- /dev/null +++ b/Documentation/networking/netdevices.rst @@ -0,0 +1,111 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Network Devices, the Kernel, and You! +===================================== + + +Introduction +============ +The following is a random collection of documentation regarding +network devices. + +struct net_device allocation rules +================================== +Network device structures need to persist even after module is unloaded and +must be allocated with alloc_netdev_mqs() and friends. +If device has registered successfully, it will be freed on last use +by free_netdev(). This is required to handle the pathologic case cleanly +(example: rmmod mydriver features this will be + called without holding netif_tx_lock. In this case the driver + has to lock by itself when needed. + The locking there should also properly protect against + set_rx_mode. WARNING: use of NETIF_F_LLTX is deprecated. + Don't use it for new drivers. + + Context: Process with BHs disabled or BH (timer), + will be called with interrupts disabled by netconsole. + + Return codes: + + * NETDEV_TX_OK everything ok. + * NETDEV_TX_BUSY Cannot transmit packet, try later + Usually a bug, means queue start/stop flow control is broken in + the driver. Note: the driver must NOT put the skb in its DMA ring. + +ndo_tx_timeout: + Synchronization: netif_tx_lock spinlock; all TX queues frozen. + Context: BHs disabled + Notes: netif_queue_stopped() is guaranteed true + +ndo_set_rx_mode: + Synchronization: netif_addr_lock spinlock. + Context: BHs disabled + +struct napi_struct synchronization rules +======================================== +napi->poll: + Synchronization: + NAPI_STATE_SCHED bit in napi->state. Device + driver's ndo_stop method will invoke napi_disable() on + all NAPI instances which will do a sleeping poll on the + NAPI_STATE_SCHED napi->state bit, waiting for all pending + NAPI activity to cease. + + Context: + softirq + will be called with interrupts disabled by netconsole. diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt deleted file mode 100644 index 7fec2061a334..000000000000 --- a/Documentation/networking/netdevices.txt +++ /dev/null @@ -1,104 +0,0 @@ - -Network Devices, the Kernel, and You! - - -Introduction -============ -The following is a random collection of documentation regarding -network devices. - -struct net_device allocation rules -================================== -Network device structures need to persist even after module is unloaded and -must be allocated with alloc_netdev_mqs() and friends. -If device has registered successfully, it will be freed on last use -by free_netdev(). This is required to handle the pathologic case cleanly -(example: rmmod mydriver features this will be - called without holding netif_tx_lock. In this case the driver - has to lock by itself when needed. - The locking there should also properly protect against - set_rx_mode. WARNING: use of NETIF_F_LLTX is deprecated. - Don't use it for new drivers. - - Context: Process with BHs disabled or BH (timer), - will be called with interrupts disabled by netconsole. - - Return codes: - o NETDEV_TX_OK everything ok. - o NETDEV_TX_BUSY Cannot transmit packet, try later - Usually a bug, means queue start/stop flow control is broken in - the driver. Note: the driver must NOT put the skb in its DMA ring. - -ndo_tx_timeout: - Synchronization: netif_tx_lock spinlock; all TX queues frozen. - Context: BHs disabled - Notes: netif_queue_stopped() is guaranteed true - -ndo_set_rx_mode: - Synchronization: netif_addr_lock spinlock. - Context: BHs disabled - -struct napi_struct synchronization rules -======================================== -napi->poll: - Synchronization: NAPI_STATE_SCHED bit in napi->state. Device - driver's ndo_stop method will invoke napi_disable() on - all NAPI instances which will do a sleeping poll on the - NAPI_STATE_SCHED napi->state bit, waiting for all pending - NAPI activity to cease. - Context: softirq - will be called with interrupts disabled by netconsole. -- cgit v1.2.3-59-g8ed1b From 0191533087a3cfbe02a0bde9939fa978cb9761fd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:05 +0200 Subject: docs: networking: convert netfilter-sysctl.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - add a chapter markup; - mark tables as such; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/netfilter-sysctl.rst | 17 +++++++++++++++++ Documentation/networking/netfilter-sysctl.txt | 10 ---------- 3 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 Documentation/networking/netfilter-sysctl.rst delete mode 100644 Documentation/networking/netfilter-sysctl.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5a320553ffba..1ae0cbef8c04 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -83,6 +83,7 @@ Contents: netconsole netdev-features netdevices + netfilter-sysctl .. only:: subproject and html diff --git a/Documentation/networking/netfilter-sysctl.rst b/Documentation/networking/netfilter-sysctl.rst new file mode 100644 index 000000000000..beb6d7b275d4 --- /dev/null +++ b/Documentation/networking/netfilter-sysctl.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +Netfilter Sysfs variables +========================= + +/proc/sys/net/netfilter/* Variables: +==================================== + +nf_log_all_netns - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + By default, only init_net namespace can log packets into kernel log + with LOG target; this aims to prevent containers from flooding host + kernel log. If enabled, this target also works in other network + namespaces. This variable is only accessible from init_net. diff --git a/Documentation/networking/netfilter-sysctl.txt b/Documentation/networking/netfilter-sysctl.txt deleted file mode 100644 index 55791e50e169..000000000000 --- a/Documentation/networking/netfilter-sysctl.txt +++ /dev/null @@ -1,10 +0,0 @@ -/proc/sys/net/netfilter/* Variables: - -nf_log_all_netns - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - By default, only init_net namespace can log packets into kernel log - with LOG target; this aims to prevent containers from flooding host - kernel log. If enabled, this target also works in other network - namespaces. This variable is only accessible from init_net. -- cgit v1.2.3-59-g8ed1b From c4d5dff60f0a142937b52626653314f3d0a18420 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:06 +0200 Subject: docs: networking: convert netif-msg.txt to ReST - add SPDX header; - adjust title and chapter markups; - mark lists as such; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/netif-msg.rst | 95 ++++++++++++++++++++++++++++++++++ Documentation/networking/netif-msg.txt | 79 ---------------------------- 3 files changed, 96 insertions(+), 79 deletions(-) create mode 100644 Documentation/networking/netif-msg.rst delete mode 100644 Documentation/networking/netif-msg.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 1ae0cbef8c04..d98509f15363 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -84,6 +84,7 @@ Contents: netdev-features netdevices netfilter-sysctl + netif-msg .. only:: subproject and html diff --git a/Documentation/networking/netif-msg.rst b/Documentation/networking/netif-msg.rst new file mode 100644 index 000000000000..b20d265a734d --- /dev/null +++ b/Documentation/networking/netif-msg.rst @@ -0,0 +1,95 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +NETIF Msg Level +=============== + +The design of the network interface message level setting. + +History +------- + + The design of the debugging message interface was guided and + constrained by backwards compatibility previous practice. It is useful + to understand the history and evolution in order to understand current + practice and relate it to older driver source code. + + From the beginning of Linux, each network device driver has had a local + integer variable that controls the debug message level. The message + level ranged from 0 to 7, and monotonically increased in verbosity. + + The message level was not precisely defined past level 3, but were + always implemented within +-1 of the specified level. Drivers tended + to shed the more verbose level messages as they matured. + + - 0 Minimal messages, only essential information on fatal errors. + - 1 Standard messages, initialization status. No run-time messages + - 2 Special media selection messages, generally timer-driver. + - 3 Interface starts and stops, including normal status messages + - 4 Tx and Rx frame error messages, and abnormal driver operation + - 5 Tx packet queue information, interrupt events. + - 6 Status on each completed Tx packet and received Rx packets + - 7 Initial contents of Tx and Rx packets + + Initially this message level variable was uniquely named in each driver + e.g. "lance_debug", so that a kernel symbolic debugger could locate and + modify the setting. When kernel modules became common, the variables + were consistently renamed to "debug" and allowed to be set as a module + parameter. + + This approach worked well. However there is always a demand for + additional features. Over the years the following emerged as + reasonable and easily implemented enhancements + + - Using an ioctl() call to modify the level. + - Per-interface rather than per-driver message level setting. + - More selective control over the type of messages emitted. + + The netif_msg recommendation adds these features with only a minor + complexity and code size increase. + + The recommendation is the following points + + - Retaining the per-driver integer variable "debug" as a module + parameter with a default level of '1'. + + - Adding a per-interface private variable named "msg_enable". The + variable is a bit map rather than a level, and is initialized as:: + + 1 << debug + + Or more precisely:: + + debug < 0 ? 0 : 1 << min(sizeof(int)-1, debug) + + Messages should changes from:: + + if (debug > 1) + printk(MSG_DEBUG "%s: ... + + to:: + + if (np->msg_enable & NETIF_MSG_LINK) + printk(MSG_DEBUG "%s: ... + + +The set of message levels is named + + + ========= =================== ============ + Old level Name Bit position + ========= =================== ============ + 0 NETIF_MSG_DRV 0x0001 + 1 NETIF_MSG_PROBE 0x0002 + 2 NETIF_MSG_LINK 0x0004 + 2 NETIF_MSG_TIMER 0x0004 + 3 NETIF_MSG_IFDOWN 0x0008 + 3 NETIF_MSG_IFUP 0x0008 + 4 NETIF_MSG_RX_ERR 0x0010 + 4 NETIF_MSG_TX_ERR 0x0010 + 5 NETIF_MSG_TX_QUEUED 0x0020 + 5 NETIF_MSG_INTR 0x0020 + 6 NETIF_MSG_TX_DONE 0x0040 + 6 NETIF_MSG_RX_STATUS 0x0040 + 7 NETIF_MSG_PKTDATA 0x0080 + ========= =================== ============ diff --git a/Documentation/networking/netif-msg.txt b/Documentation/networking/netif-msg.txt deleted file mode 100644 index c967ddb90d0b..000000000000 --- a/Documentation/networking/netif-msg.txt +++ /dev/null @@ -1,79 +0,0 @@ - -________________ -NETIF Msg Level - -The design of the network interface message level setting. - -History - - The design of the debugging message interface was guided and - constrained by backwards compatibility previous practice. It is useful - to understand the history and evolution in order to understand current - practice and relate it to older driver source code. - - From the beginning of Linux, each network device driver has had a local - integer variable that controls the debug message level. The message - level ranged from 0 to 7, and monotonically increased in verbosity. - - The message level was not precisely defined past level 3, but were - always implemented within +-1 of the specified level. Drivers tended - to shed the more verbose level messages as they matured. - 0 Minimal messages, only essential information on fatal errors. - 1 Standard messages, initialization status. No run-time messages - 2 Special media selection messages, generally timer-driver. - 3 Interface starts and stops, including normal status messages - 4 Tx and Rx frame error messages, and abnormal driver operation - 5 Tx packet queue information, interrupt events. - 6 Status on each completed Tx packet and received Rx packets - 7 Initial contents of Tx and Rx packets - - Initially this message level variable was uniquely named in each driver - e.g. "lance_debug", so that a kernel symbolic debugger could locate and - modify the setting. When kernel modules became common, the variables - were consistently renamed to "debug" and allowed to be set as a module - parameter. - - This approach worked well. However there is always a demand for - additional features. Over the years the following emerged as - reasonable and easily implemented enhancements - Using an ioctl() call to modify the level. - Per-interface rather than per-driver message level setting. - More selective control over the type of messages emitted. - - The netif_msg recommendation adds these features with only a minor - complexity and code size increase. - - The recommendation is the following points - Retaining the per-driver integer variable "debug" as a module - parameter with a default level of '1'. - - Adding a per-interface private variable named "msg_enable". The - variable is a bit map rather than a level, and is initialized as - 1 << debug - Or more precisely - debug < 0 ? 0 : 1 << min(sizeof(int)-1, debug) - - Messages should changes from - if (debug > 1) - printk(MSG_DEBUG "%s: ... - to - if (np->msg_enable & NETIF_MSG_LINK) - printk(MSG_DEBUG "%s: ... - - -The set of message levels is named - Old level Name Bit position - 0 NETIF_MSG_DRV 0x0001 - 1 NETIF_MSG_PROBE 0x0002 - 2 NETIF_MSG_LINK 0x0004 - 2 NETIF_MSG_TIMER 0x0004 - 3 NETIF_MSG_IFDOWN 0x0008 - 3 NETIF_MSG_IFUP 0x0008 - 4 NETIF_MSG_RX_ERR 0x0010 - 4 NETIF_MSG_TX_ERR 0x0010 - 5 NETIF_MSG_TX_QUEUED 0x0020 - 5 NETIF_MSG_INTR 0x0020 - 6 NETIF_MSG_TX_DONE 0x0040 - 6 NETIF_MSG_RX_STATUS 0x0040 - 7 NETIF_MSG_PKTDATA 0x0080 - -- cgit v1.2.3-59-g8ed1b From 13df433f8c1333cd0f60e5e1878380a37576118a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:07 +0200 Subject: docs: networking: convert nf_conntrack-sysctl.txt to ReST - add SPDX header; - add a document title; - mark lists as such; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/nf_conntrack-sysctl.rst | 179 +++++++++++++++++++++++ Documentation/networking/nf_conntrack-sysctl.txt | 172 ---------------------- 3 files changed, 180 insertions(+), 172 deletions(-) create mode 100644 Documentation/networking/nf_conntrack-sysctl.rst delete mode 100644 Documentation/networking/nf_conntrack-sysctl.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index d98509f15363..e5128bb7e7df 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -85,6 +85,7 @@ Contents: netdevices netfilter-sysctl netif-msg + nf_conntrack-sysctl .. only:: subproject and html diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst new file mode 100644 index 000000000000..11a9b76786cb --- /dev/null +++ b/Documentation/networking/nf_conntrack-sysctl.rst @@ -0,0 +1,179 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== +Netfilter Conntrack Sysfs variables +=================================== + +/proc/sys/net/netfilter/nf_conntrack_* Variables: +================================================= + +nf_conntrack_acct - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + Enable connection tracking flow accounting. 64-bit byte and packet + counters per flow are added. + +nf_conntrack_buckets - INTEGER + Size of hash table. If not specified as parameter during module + loading, the default size is calculated by dividing total memory + by 16384 to determine the number of buckets but the hash table will + never have fewer than 32 and limited to 16384 buckets. For systems + with more than 4GB of memory it will be 65536 buckets. + This sysctl is only writeable in the initial net namespace. + +nf_conntrack_checksum - BOOLEAN + - 0 - disabled + - not 0 - enabled (default) + + Verify checksum of incoming packets. Packets with bad checksums are + in INVALID state. If this is enabled, such packets will not be + considered for connection tracking. + +nf_conntrack_count - INTEGER (read-only) + Number of currently allocated flow entries. + +nf_conntrack_events - BOOLEAN + - 0 - disabled + - not 0 - enabled (default) + + If this option is enabled, the connection tracking code will + provide userspace with connection tracking events via ctnetlink. + +nf_conntrack_expect_max - INTEGER + Maximum size of expectation table. Default value is + nf_conntrack_buckets / 256. Minimum is 1. + +nf_conntrack_frag6_high_thresh - INTEGER + default 262144 + + Maximum memory used to reassemble IPv6 fragments. When + nf_conntrack_frag6_high_thresh bytes of memory is allocated for this + purpose, the fragment handler will toss packets until + nf_conntrack_frag6_low_thresh is reached. + +nf_conntrack_frag6_low_thresh - INTEGER + default 196608 + + See nf_conntrack_frag6_low_thresh + +nf_conntrack_frag6_timeout - INTEGER (seconds) + default 60 + + Time to keep an IPv6 fragment in memory. + +nf_conntrack_generic_timeout - INTEGER (seconds) + default 600 + + Default for generic timeout. This refers to layer 4 unknown/unsupported + protocols. + +nf_conntrack_helper - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + Enable automatic conntrack helper assignment. + If disabled it is required to set up iptables rules to assign + helpers to connections. See the CT target description in the + iptables-extensions(8) man page for further information. + +nf_conntrack_icmp_timeout - INTEGER (seconds) + default 30 + + Default for ICMP timeout. + +nf_conntrack_icmpv6_timeout - INTEGER (seconds) + default 30 + + Default for ICMP6 timeout. + +nf_conntrack_log_invalid - INTEGER + - 0 - disable (default) + - 1 - log ICMP packets + - 6 - log TCP packets + - 17 - log UDP packets + - 33 - log DCCP packets + - 41 - log ICMPv6 packets + - 136 - log UDPLITE packets + - 255 - log packets of any protocol + + Log invalid packets of a type specified by value. + +nf_conntrack_max - INTEGER + Size of connection tracking table. Default value is + nf_conntrack_buckets value * 4. + +nf_conntrack_tcp_be_liberal - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + Be conservative in what you do, be liberal in what you accept from others. + If it's non-zero, we mark only out of window RST segments as INVALID. + +nf_conntrack_tcp_loose - BOOLEAN + - 0 - disabled + - not 0 - enabled (default) + + If it is set to zero, we disable picking up already established + connections. + +nf_conntrack_tcp_max_retrans - INTEGER + default 3 + + Maximum number of packets that can be retransmitted without + received an (acceptable) ACK from the destination. If this number + is reached, a shorter timer will be started. + +nf_conntrack_tcp_timeout_close - INTEGER (seconds) + default 10 + +nf_conntrack_tcp_timeout_close_wait - INTEGER (seconds) + default 60 + +nf_conntrack_tcp_timeout_established - INTEGER (seconds) + default 432000 (5 days) + +nf_conntrack_tcp_timeout_fin_wait - INTEGER (seconds) + default 120 + +nf_conntrack_tcp_timeout_last_ack - INTEGER (seconds) + default 30 + +nf_conntrack_tcp_timeout_max_retrans - INTEGER (seconds) + default 300 + +nf_conntrack_tcp_timeout_syn_recv - INTEGER (seconds) + default 60 + +nf_conntrack_tcp_timeout_syn_sent - INTEGER (seconds) + default 120 + +nf_conntrack_tcp_timeout_time_wait - INTEGER (seconds) + default 120 + +nf_conntrack_tcp_timeout_unacknowledged - INTEGER (seconds) + default 300 + +nf_conntrack_timestamp - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + Enable connection tracking flow timestamping. + +nf_conntrack_udp_timeout - INTEGER (seconds) + default 30 + +nf_conntrack_udp_timeout_stream - INTEGER (seconds) + default 120 + + This extended timeout will be used in case there is an UDP stream + detected. + +nf_conntrack_gre_timeout - INTEGER (seconds) + default 30 + +nf_conntrack_gre_timeout_stream - INTEGER (seconds) + default 180 + + This extended timeout will be used in case there is an GRE stream + detected. diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt deleted file mode 100644 index f75c2ce6e136..000000000000 --- a/Documentation/networking/nf_conntrack-sysctl.txt +++ /dev/null @@ -1,172 +0,0 @@ -/proc/sys/net/netfilter/nf_conntrack_* Variables: - -nf_conntrack_acct - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - Enable connection tracking flow accounting. 64-bit byte and packet - counters per flow are added. - -nf_conntrack_buckets - INTEGER - Size of hash table. If not specified as parameter during module - loading, the default size is calculated by dividing total memory - by 16384 to determine the number of buckets but the hash table will - never have fewer than 32 and limited to 16384 buckets. For systems - with more than 4GB of memory it will be 65536 buckets. - This sysctl is only writeable in the initial net namespace. - -nf_conntrack_checksum - BOOLEAN - 0 - disabled - not 0 - enabled (default) - - Verify checksum of incoming packets. Packets with bad checksums are - in INVALID state. If this is enabled, such packets will not be - considered for connection tracking. - -nf_conntrack_count - INTEGER (read-only) - Number of currently allocated flow entries. - -nf_conntrack_events - BOOLEAN - 0 - disabled - not 0 - enabled (default) - - If this option is enabled, the connection tracking code will - provide userspace with connection tracking events via ctnetlink. - -nf_conntrack_expect_max - INTEGER - Maximum size of expectation table. Default value is - nf_conntrack_buckets / 256. Minimum is 1. - -nf_conntrack_frag6_high_thresh - INTEGER - default 262144 - - Maximum memory used to reassemble IPv6 fragments. When - nf_conntrack_frag6_high_thresh bytes of memory is allocated for this - purpose, the fragment handler will toss packets until - nf_conntrack_frag6_low_thresh is reached. - -nf_conntrack_frag6_low_thresh - INTEGER - default 196608 - - See nf_conntrack_frag6_low_thresh - -nf_conntrack_frag6_timeout - INTEGER (seconds) - default 60 - - Time to keep an IPv6 fragment in memory. - -nf_conntrack_generic_timeout - INTEGER (seconds) - default 600 - - Default for generic timeout. This refers to layer 4 unknown/unsupported - protocols. - -nf_conntrack_helper - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - Enable automatic conntrack helper assignment. - If disabled it is required to set up iptables rules to assign - helpers to connections. See the CT target description in the - iptables-extensions(8) man page for further information. - -nf_conntrack_icmp_timeout - INTEGER (seconds) - default 30 - - Default for ICMP timeout. - -nf_conntrack_icmpv6_timeout - INTEGER (seconds) - default 30 - - Default for ICMP6 timeout. - -nf_conntrack_log_invalid - INTEGER - 0 - disable (default) - 1 - log ICMP packets - 6 - log TCP packets - 17 - log UDP packets - 33 - log DCCP packets - 41 - log ICMPv6 packets - 136 - log UDPLITE packets - 255 - log packets of any protocol - - Log invalid packets of a type specified by value. - -nf_conntrack_max - INTEGER - Size of connection tracking table. Default value is - nf_conntrack_buckets value * 4. - -nf_conntrack_tcp_be_liberal - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - Be conservative in what you do, be liberal in what you accept from others. - If it's non-zero, we mark only out of window RST segments as INVALID. - -nf_conntrack_tcp_loose - BOOLEAN - 0 - disabled - not 0 - enabled (default) - - If it is set to zero, we disable picking up already established - connections. - -nf_conntrack_tcp_max_retrans - INTEGER - default 3 - - Maximum number of packets that can be retransmitted without - received an (acceptable) ACK from the destination. If this number - is reached, a shorter timer will be started. - -nf_conntrack_tcp_timeout_close - INTEGER (seconds) - default 10 - -nf_conntrack_tcp_timeout_close_wait - INTEGER (seconds) - default 60 - -nf_conntrack_tcp_timeout_established - INTEGER (seconds) - default 432000 (5 days) - -nf_conntrack_tcp_timeout_fin_wait - INTEGER (seconds) - default 120 - -nf_conntrack_tcp_timeout_last_ack - INTEGER (seconds) - default 30 - -nf_conntrack_tcp_timeout_max_retrans - INTEGER (seconds) - default 300 - -nf_conntrack_tcp_timeout_syn_recv - INTEGER (seconds) - default 60 - -nf_conntrack_tcp_timeout_syn_sent - INTEGER (seconds) - default 120 - -nf_conntrack_tcp_timeout_time_wait - INTEGER (seconds) - default 120 - -nf_conntrack_tcp_timeout_unacknowledged - INTEGER (seconds) - default 300 - -nf_conntrack_timestamp - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - Enable connection tracking flow timestamping. - -nf_conntrack_udp_timeout - INTEGER (seconds) - default 30 - -nf_conntrack_udp_timeout_stream - INTEGER (seconds) - default 120 - - This extended timeout will be used in case there is an UDP stream - detected. - -nf_conntrack_gre_timeout - INTEGER (seconds) - default 30 - -nf_conntrack_gre_timeout_stream - INTEGER (seconds) - default 180 - - This extended timeout will be used in case there is an GRE stream - detected. -- cgit v1.2.3-59-g8ed1b From aa3764276a4bc1a927ab8feaad5a255234763d9d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:08 +0200 Subject: docs: networking: convert nf_flowtable.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - add notes markups; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/nf_flowtable.rst | 117 ++++++++++++++++++++++++++++++ Documentation/networking/nf_flowtable.txt | 112 ---------------------------- 3 files changed, 118 insertions(+), 112 deletions(-) create mode 100644 Documentation/networking/nf_flowtable.rst delete mode 100644 Documentation/networking/nf_flowtable.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e5128bb7e7df..c4e8a43741be 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -86,6 +86,7 @@ Contents: netfilter-sysctl netif-msg nf_conntrack-sysctl + nf_flowtable .. only:: subproject and html diff --git a/Documentation/networking/nf_flowtable.rst b/Documentation/networking/nf_flowtable.rst new file mode 100644 index 000000000000..b6e1fa141aae --- /dev/null +++ b/Documentation/networking/nf_flowtable.rst @@ -0,0 +1,117 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================== +Netfilter's flowtable infrastructure +==================================== + +This documentation describes the software flowtable infrastructure available in +Netfilter since Linux kernel 4.16. + +Overview +-------- + +Initial packets follow the classic forwarding path, once the flow enters the +established state according to the conntrack semantics (ie. we have seen traffic +in both directions), then you can decide to offload the flow to the flowtable +from the forward chain via the 'flow offload' action available in nftables. + +Packets that find an entry in the flowtable (ie. flowtable hit) are sent to the +output netdevice via neigh_xmit(), hence, they bypass the classic forwarding +path (the visible effect is that you do not see these packets from any of the +netfilter hooks coming after the ingress). In case of flowtable miss, the packet +follows the classic forward path. + +The flowtable uses a resizable hashtable, lookups are based on the following +7-tuple selectors: source, destination, layer 3 and layer 4 protocols, source +and destination ports and the input interface (useful in case there are several +conntrack zones in place). + +Flowtables are populated via the 'flow offload' nftables action, so the user can +selectively specify what flows are placed into the flow table. Hence, packets +follow the classic forwarding path unless the user explicitly instruct packets +to use this new alternative forwarding path via nftables policy. + +This is represented in Fig.1, which describes the classic forwarding path +including the Netfilter hooks and the flowtable fastpath bypass. + +:: + + userspace process + ^ | + | | + _____|____ ____\/___ + / \ / \ + | input | | output | + \__________/ \_________/ + ^ | + | | + _________ __________ --------- _____\/_____ + / \ / \ |Routing | / \ + --> ingress ---> prerouting ---> |decision| | postrouting |--> neigh_xmit + \_________/ \__________/ ---------- \____________/ ^ + | ^ | ^ | + flowtable | ____\/___ | | + | | / \ | | + __\/___ | | forward |------------ | + |-----| | \_________/ | + |-----| | 'flow offload' rule | + |-----| | adds entry to | + |_____| | flowtable | + | | | + / \ | | + /hit\_no_| | + \ ? / | + \ / | + |__yes_________________fastpath bypass ____________________________| + + Fig.1 Netfilter hooks and flowtable interactions + +The flowtable entry also stores the NAT configuration, so all packets are +mangled according to the NAT policy that matches the initial packets that went +through the classic forwarding path. The TTL is decremented before calling +neigh_xmit(). Fragmented traffic is passed up to follow the classic forwarding +path given that the transport selectors are missing, therefore flowtable lookup +is not possible. + +Example configuration +--------------------- + +Enabling the flowtable bypass is relatively easy, you only need to create a +flowtable and add one rule to your forward chain:: + + table inet x { + flowtable f { + hook ingress priority 0; devices = { eth0, eth1 }; + } + chain y { + type filter hook forward priority 0; policy accept; + ip protocol tcp flow offload @f + counter packets 0 bytes 0 + } + } + +This example adds the flowtable 'f' to the ingress hook of the eth0 and eth1 +netdevices. You can create as many flowtables as you want in case you need to +perform resource partitioning. The flowtable priority defines the order in which +hooks are run in the pipeline, this is convenient in case you already have a +nftables ingress chain (make sure the flowtable priority is smaller than the +nftables ingress chain hence the flowtable runs before in the pipeline). + +The 'flow offload' action from the forward chain 'y' adds an entry to the +flowtable for the TCP syn-ack packet coming in the reply direction. Once the +flow is offloaded, you will observe that the counter rule in the example above +does not get updated for the packets that are being forwarded through the +forwarding bypass. + +More reading +------------ + +This documentation is based on the LWN.net articles [1]_\ [2]_. Rafal Milecki +also made a very complete and comprehensive summary called "A state of network +acceleration" that describes how things were before this infrastructure was +mailined [3]_ and it also makes a rough summary of this work [4]_. + +.. [1] https://lwn.net/Articles/738214/ +.. [2] https://lwn.net/Articles/742164/ +.. [3] http://lists.infradead.org/pipermail/lede-dev/2018-January/010830.html +.. [4] http://lists.infradead.org/pipermail/lede-dev/2018-January/010829.html diff --git a/Documentation/networking/nf_flowtable.txt b/Documentation/networking/nf_flowtable.txt deleted file mode 100644 index 0bf32d1121be..000000000000 --- a/Documentation/networking/nf_flowtable.txt +++ /dev/null @@ -1,112 +0,0 @@ -Netfilter's flowtable infrastructure -==================================== - -This documentation describes the software flowtable infrastructure available in -Netfilter since Linux kernel 4.16. - -Overview --------- - -Initial packets follow the classic forwarding path, once the flow enters the -established state according to the conntrack semantics (ie. we have seen traffic -in both directions), then you can decide to offload the flow to the flowtable -from the forward chain via the 'flow offload' action available in nftables. - -Packets that find an entry in the flowtable (ie. flowtable hit) are sent to the -output netdevice via neigh_xmit(), hence, they bypass the classic forwarding -path (the visible effect is that you do not see these packets from any of the -netfilter hooks coming after the ingress). In case of flowtable miss, the packet -follows the classic forward path. - -The flowtable uses a resizable hashtable, lookups are based on the following -7-tuple selectors: source, destination, layer 3 and layer 4 protocols, source -and destination ports and the input interface (useful in case there are several -conntrack zones in place). - -Flowtables are populated via the 'flow offload' nftables action, so the user can -selectively specify what flows are placed into the flow table. Hence, packets -follow the classic forwarding path unless the user explicitly instruct packets -to use this new alternative forwarding path via nftables policy. - -This is represented in Fig.1, which describes the classic forwarding path -including the Netfilter hooks and the flowtable fastpath bypass. - - userspace process - ^ | - | | - _____|____ ____\/___ - / \ / \ - | input | | output | - \__________/ \_________/ - ^ | - | | - _________ __________ --------- _____\/_____ - / \ / \ |Routing | / \ - --> ingress ---> prerouting ---> |decision| | postrouting |--> neigh_xmit - \_________/ \__________/ ---------- \____________/ ^ - | ^ | ^ | - flowtable | ____\/___ | | - | | / \ | | - __\/___ | | forward |------------ | - |-----| | \_________/ | - |-----| | 'flow offload' rule | - |-----| | adds entry to | - |_____| | flowtable | - | | | - / \ | | - /hit\_no_| | - \ ? / | - \ / | - |__yes_________________fastpath bypass ____________________________| - - Fig.1 Netfilter hooks and flowtable interactions - -The flowtable entry also stores the NAT configuration, so all packets are -mangled according to the NAT policy that matches the initial packets that went -through the classic forwarding path. The TTL is decremented before calling -neigh_xmit(). Fragmented traffic is passed up to follow the classic forwarding -path given that the transport selectors are missing, therefore flowtable lookup -is not possible. - -Example configuration ---------------------- - -Enabling the flowtable bypass is relatively easy, you only need to create a -flowtable and add one rule to your forward chain. - - table inet x { - flowtable f { - hook ingress priority 0; devices = { eth0, eth1 }; - } - chain y { - type filter hook forward priority 0; policy accept; - ip protocol tcp flow offload @f - counter packets 0 bytes 0 - } - } - -This example adds the flowtable 'f' to the ingress hook of the eth0 and eth1 -netdevices. You can create as many flowtables as you want in case you need to -perform resource partitioning. The flowtable priority defines the order in which -hooks are run in the pipeline, this is convenient in case you already have a -nftables ingress chain (make sure the flowtable priority is smaller than the -nftables ingress chain hence the flowtable runs before in the pipeline). - -The 'flow offload' action from the forward chain 'y' adds an entry to the -flowtable for the TCP syn-ack packet coming in the reply direction. Once the -flow is offloaded, you will observe that the counter rule in the example above -does not get updated for the packets that are being forwarded through the -forwarding bypass. - -More reading ------------- - -This documentation is based on the LWN.net articles [1][2]. Rafal Milecki also -made a very complete and comprehensive summary called "A state of network -acceleration" that describes how things were before this infrastructure was -mailined [3] and it also makes a rough summary of this work [4]. - -[1] https://lwn.net/Articles/738214/ -[2] https://lwn.net/Articles/742164/ -[3] http://lists.infradead.org/pipermail/lede-dev/2018-January/010830.html -[4] http://lists.infradead.org/pipermail/lede-dev/2018-January/010829.html -- cgit v1.2.3-59-g8ed1b From 63893472d753e97204331e568a5e70290054cb38 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:09 +0200 Subject: docs: networking: convert openvswitch.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/openvswitch.rst | 251 +++++++++++++++++++++++++++++++ Documentation/networking/openvswitch.txt | 248 ------------------------------ 3 files changed, 252 insertions(+), 248 deletions(-) create mode 100644 Documentation/networking/openvswitch.rst delete mode 100644 Documentation/networking/openvswitch.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c4e8a43741be..b7f558480aca 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -87,6 +87,7 @@ Contents: netif-msg nf_conntrack-sysctl nf_flowtable + openvswitch .. only:: subproject and html diff --git a/Documentation/networking/openvswitch.rst b/Documentation/networking/openvswitch.rst new file mode 100644 index 000000000000..1a8353dbf1b6 --- /dev/null +++ b/Documentation/networking/openvswitch.rst @@ -0,0 +1,251 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================= +Open vSwitch datapath developer documentation +============================================= + +The Open vSwitch kernel module allows flexible userspace control over +flow-level packet processing on selected network devices. It can be +used to implement a plain Ethernet switch, network device bonding, +VLAN processing, network access control, flow-based network control, +and so on. + +The kernel module implements multiple "datapaths" (analogous to +bridges), each of which can have multiple "vports" (analogous to ports +within a bridge). Each datapath also has associated with it a "flow +table" that userspace populates with "flows" that map from keys based +on packet headers and metadata to sets of actions. The most common +action forwards the packet to another vport; other actions are also +implemented. + +When a packet arrives on a vport, the kernel module processes it by +extracting its flow key and looking it up in the flow table. If there +is a matching flow, it executes the associated actions. If there is +no match, it queues the packet to userspace for processing (as part of +its processing, userspace will likely set up a flow to handle further +packets of the same type entirely in-kernel). + + +Flow key compatibility +---------------------- + +Network protocols evolve over time. New protocols become important +and existing protocols lose their prominence. For the Open vSwitch +kernel module to remain relevant, it must be possible for newer +versions to parse additional protocols as part of the flow key. It +might even be desirable, someday, to drop support for parsing +protocols that have become obsolete. Therefore, the Netlink interface +to Open vSwitch is designed to allow carefully written userspace +applications to work with any version of the flow key, past or future. + +To support this forward and backward compatibility, whenever the +kernel module passes a packet to userspace, it also passes along the +flow key that it parsed from the packet. Userspace then extracts its +own notion of a flow key from the packet and compares it against the +kernel-provided version: + + - If userspace's notion of the flow key for the packet matches the + kernel's, then nothing special is necessary. + + - If the kernel's flow key includes more fields than the userspace + version of the flow key, for example if the kernel decoded IPv6 + headers but userspace stopped at the Ethernet type (because it + does not understand IPv6), then again nothing special is + necessary. Userspace can still set up a flow in the usual way, + as long as it uses the kernel-provided flow key to do it. + + - If the userspace flow key includes more fields than the + kernel's, for example if userspace decoded an IPv6 header but + the kernel stopped at the Ethernet type, then userspace can + forward the packet manually, without setting up a flow in the + kernel. This case is bad for performance because every packet + that the kernel considers part of the flow must go to userspace, + but the forwarding behavior is correct. (If userspace can + determine that the values of the extra fields would not affect + forwarding behavior, then it could set up a flow anyway.) + +How flow keys evolve over time is important to making this work, so +the following sections go into detail. + + +Flow key format +--------------- + +A flow key is passed over a Netlink socket as a sequence of Netlink +attributes. Some attributes represent packet metadata, defined as any +information about a packet that cannot be extracted from the packet +itself, e.g. the vport on which the packet was received. Most +attributes, however, are extracted from headers within the packet, +e.g. source and destination addresses from Ethernet, IP, or TCP +headers. + +The header file defines the exact format of the +flow key attributes. For informal explanatory purposes here, we write +them as comma-separated strings, with parentheses indicating arguments +and nesting. For example, the following could represent a flow key +corresponding to a TCP packet that arrived on vport 1:: + + in_port(1), eth(src=e0:91:f5:21:d0:b2, dst=00:02:e3:0f:80:a4), + eth_type(0x0800), ipv4(src=172.16.0.20, dst=172.18.0.52, proto=17, tos=0, + frag=no), tcp(src=49163, dst=80) + +Often we ellipsize arguments not important to the discussion, e.g.:: + + in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...) + + +Wildcarded flow key format +-------------------------- + +A wildcarded flow is described with two sequences of Netlink attributes +passed over the Netlink socket. A flow key, exactly as described above, and an +optional corresponding flow mask. + +A wildcarded flow can represent a group of exact match flows. Each '1' bit +in the mask specifies a exact match with the corresponding bit in the flow key. +A '0' bit specifies a don't care bit, which will match either a '1' or '0' bit +of a incoming packet. Using wildcarded flow can improve the flow set up rate +by reduce the number of new flows need to be processed by the user space program. + +Support for the mask Netlink attribute is optional for both the kernel and user +space program. The kernel can ignore the mask attribute, installing an exact +match flow, or reduce the number of don't care bits in the kernel to less than +what was specified by the user space program. In this case, variations in bits +that the kernel does not implement will simply result in additional flow setups. +The kernel module will also work with user space programs that neither support +nor supply flow mask attributes. + +Since the kernel may ignore or modify wildcard bits, it can be difficult for +the userspace program to know exactly what matches are installed. There are +two possible approaches: reactively install flows as they miss the kernel +flow table (and therefore not attempt to determine wildcard changes at all) +or use the kernel's response messages to determine the installed wildcards. + +When interacting with userspace, the kernel should maintain the match portion +of the key exactly as originally installed. This will provides a handle to +identify the flow for all future operations. However, when reporting the +mask of an installed flow, the mask should include any restrictions imposed +by the kernel. + +The behavior when using overlapping wildcarded flows is undefined. It is the +responsibility of the user space program to ensure that any incoming packet +can match at most one flow, wildcarded or not. The current implementation +performs best-effort detection of overlapping wildcarded flows and may reject +some but not all of them. However, this behavior may change in future versions. + + +Unique flow identifiers +----------------------- + +An alternative to using the original match portion of a key as the handle for +flow identification is a unique flow identifier, or "UFID". UFIDs are optional +for both the kernel and user space program. + +User space programs that support UFID are expected to provide it during flow +setup in addition to the flow, then refer to the flow using the UFID for all +future operations. The kernel is not required to index flows by the original +flow key if a UFID is specified. + + +Basic rule for evolving flow keys +--------------------------------- + +Some care is needed to really maintain forward and backward +compatibility for applications that follow the rules listed under +"Flow key compatibility" above. + +The basic rule is obvious:: + + ================================================================== + New network protocol support must only supplement existing flow + key attributes. It must not change the meaning of already defined + flow key attributes. + ================================================================== + +This rule does have less-obvious consequences so it is worth working +through a few examples. Suppose, for example, that the kernel module +did not already implement VLAN parsing. Instead, it just interpreted +the 802.1Q TPID (0x8100) as the Ethertype then stopped parsing the +packet. The flow key for any packet with an 802.1Q header would look +essentially like this, ignoring metadata:: + + eth(...), eth_type(0x8100) + +Naively, to add VLAN support, it makes sense to add a new "vlan" flow +key attribute to contain the VLAN tag, then continue to decode the +encapsulated headers beyond the VLAN tag using the existing field +definitions. With this change, a TCP packet in VLAN 10 would have a +flow key much like this:: + + eth(...), vlan(vid=10, pcp=0), eth_type(0x0800), ip(proto=6, ...), tcp(...) + +But this change would negatively affect a userspace application that +has not been updated to understand the new "vlan" flow key attribute. +The application could, following the flow compatibility rules above, +ignore the "vlan" attribute that it does not understand and therefore +assume that the flow contained IP packets. This is a bad assumption +(the flow only contains IP packets if one parses and skips over the +802.1Q header) and it could cause the application's behavior to change +across kernel versions even though it follows the compatibility rules. + +The solution is to use a set of nested attributes. This is, for +example, why 802.1Q support uses nested attributes. A TCP packet in +VLAN 10 is actually expressed as:: + + eth(...), eth_type(0x8100), vlan(vid=10, pcp=0), encap(eth_type(0x0800), + ip(proto=6, ...), tcp(...))) + +Notice how the "eth_type", "ip", and "tcp" flow key attributes are +nested inside the "encap" attribute. Thus, an application that does +not understand the "vlan" key will not see either of those attributes +and therefore will not misinterpret them. (Also, the outer eth_type +is still 0x8100, not changed to 0x0800.) + +Handling malformed packets +-------------------------- + +Don't drop packets in the kernel for malformed protocol headers, bad +checksums, etc. This would prevent userspace from implementing a +simple Ethernet switch that forwards every packet. + +Instead, in such a case, include an attribute with "empty" content. +It doesn't matter if the empty content could be valid protocol values, +as long as those values are rarely seen in practice, because userspace +can always forward all packets with those values to userspace and +handle them individually. + +For example, consider a packet that contains an IP header that +indicates protocol 6 for TCP, but which is truncated just after the IP +header, so that the TCP header is missing. The flow key for this +packet would include a tcp attribute with all-zero src and dst, like +this:: + + eth(...), eth_type(0x0800), ip(proto=6, ...), tcp(src=0, dst=0) + +As another example, consider a packet with an Ethernet type of 0x8100, +indicating that a VLAN TCI should follow, but which is truncated just +after the Ethernet type. The flow key for this packet would include +an all-zero-bits vlan and an empty encap attribute, like this:: + + eth(...), eth_type(0x8100), vlan(0), encap() + +Unlike a TCP packet with source and destination ports 0, an +all-zero-bits VLAN TCI is not that rare, so the CFI bit (aka +VLAN_TAG_PRESENT inside the kernel) is ordinarily set in a vlan +attribute expressly to allow this situation to be distinguished. +Thus, the flow key in this second example unambiguously indicates a +missing or malformed VLAN TCI. + +Other rules +----------- + +The other rules for flow keys are much less subtle: + + - Duplicate attributes are not allowed at a given nesting level. + + - Ordering of attributes is not significant. + + - When the kernel sends a given flow key to userspace, it always + composes it the same way. This allows userspace to hash and + compare entire flow keys that it may not be able to fully + interpret. diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt deleted file mode 100644 index b3b9ac61d29d..000000000000 --- a/Documentation/networking/openvswitch.txt +++ /dev/null @@ -1,248 +0,0 @@ -Open vSwitch datapath developer documentation -============================================= - -The Open vSwitch kernel module allows flexible userspace control over -flow-level packet processing on selected network devices. It can be -used to implement a plain Ethernet switch, network device bonding, -VLAN processing, network access control, flow-based network control, -and so on. - -The kernel module implements multiple "datapaths" (analogous to -bridges), each of which can have multiple "vports" (analogous to ports -within a bridge). Each datapath also has associated with it a "flow -table" that userspace populates with "flows" that map from keys based -on packet headers and metadata to sets of actions. The most common -action forwards the packet to another vport; other actions are also -implemented. - -When a packet arrives on a vport, the kernel module processes it by -extracting its flow key and looking it up in the flow table. If there -is a matching flow, it executes the associated actions. If there is -no match, it queues the packet to userspace for processing (as part of -its processing, userspace will likely set up a flow to handle further -packets of the same type entirely in-kernel). - - -Flow key compatibility ----------------------- - -Network protocols evolve over time. New protocols become important -and existing protocols lose their prominence. For the Open vSwitch -kernel module to remain relevant, it must be possible for newer -versions to parse additional protocols as part of the flow key. It -might even be desirable, someday, to drop support for parsing -protocols that have become obsolete. Therefore, the Netlink interface -to Open vSwitch is designed to allow carefully written userspace -applications to work with any version of the flow key, past or future. - -To support this forward and backward compatibility, whenever the -kernel module passes a packet to userspace, it also passes along the -flow key that it parsed from the packet. Userspace then extracts its -own notion of a flow key from the packet and compares it against the -kernel-provided version: - - - If userspace's notion of the flow key for the packet matches the - kernel's, then nothing special is necessary. - - - If the kernel's flow key includes more fields than the userspace - version of the flow key, for example if the kernel decoded IPv6 - headers but userspace stopped at the Ethernet type (because it - does not understand IPv6), then again nothing special is - necessary. Userspace can still set up a flow in the usual way, - as long as it uses the kernel-provided flow key to do it. - - - If the userspace flow key includes more fields than the - kernel's, for example if userspace decoded an IPv6 header but - the kernel stopped at the Ethernet type, then userspace can - forward the packet manually, without setting up a flow in the - kernel. This case is bad for performance because every packet - that the kernel considers part of the flow must go to userspace, - but the forwarding behavior is correct. (If userspace can - determine that the values of the extra fields would not affect - forwarding behavior, then it could set up a flow anyway.) - -How flow keys evolve over time is important to making this work, so -the following sections go into detail. - - -Flow key format ---------------- - -A flow key is passed over a Netlink socket as a sequence of Netlink -attributes. Some attributes represent packet metadata, defined as any -information about a packet that cannot be extracted from the packet -itself, e.g. the vport on which the packet was received. Most -attributes, however, are extracted from headers within the packet, -e.g. source and destination addresses from Ethernet, IP, or TCP -headers. - -The header file defines the exact format of the -flow key attributes. For informal explanatory purposes here, we write -them as comma-separated strings, with parentheses indicating arguments -and nesting. For example, the following could represent a flow key -corresponding to a TCP packet that arrived on vport 1: - - in_port(1), eth(src=e0:91:f5:21:d0:b2, dst=00:02:e3:0f:80:a4), - eth_type(0x0800), ipv4(src=172.16.0.20, dst=172.18.0.52, proto=17, tos=0, - frag=no), tcp(src=49163, dst=80) - -Often we ellipsize arguments not important to the discussion, e.g.: - - in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...) - - -Wildcarded flow key format --------------------------- - -A wildcarded flow is described with two sequences of Netlink attributes -passed over the Netlink socket. A flow key, exactly as described above, and an -optional corresponding flow mask. - -A wildcarded flow can represent a group of exact match flows. Each '1' bit -in the mask specifies a exact match with the corresponding bit in the flow key. -A '0' bit specifies a don't care bit, which will match either a '1' or '0' bit -of a incoming packet. Using wildcarded flow can improve the flow set up rate -by reduce the number of new flows need to be processed by the user space program. - -Support for the mask Netlink attribute is optional for both the kernel and user -space program. The kernel can ignore the mask attribute, installing an exact -match flow, or reduce the number of don't care bits in the kernel to less than -what was specified by the user space program. In this case, variations in bits -that the kernel does not implement will simply result in additional flow setups. -The kernel module will also work with user space programs that neither support -nor supply flow mask attributes. - -Since the kernel may ignore or modify wildcard bits, it can be difficult for -the userspace program to know exactly what matches are installed. There are -two possible approaches: reactively install flows as they miss the kernel -flow table (and therefore not attempt to determine wildcard changes at all) -or use the kernel's response messages to determine the installed wildcards. - -When interacting with userspace, the kernel should maintain the match portion -of the key exactly as originally installed. This will provides a handle to -identify the flow for all future operations. However, when reporting the -mask of an installed flow, the mask should include any restrictions imposed -by the kernel. - -The behavior when using overlapping wildcarded flows is undefined. It is the -responsibility of the user space program to ensure that any incoming packet -can match at most one flow, wildcarded or not. The current implementation -performs best-effort detection of overlapping wildcarded flows and may reject -some but not all of them. However, this behavior may change in future versions. - - -Unique flow identifiers ------------------------ - -An alternative to using the original match portion of a key as the handle for -flow identification is a unique flow identifier, or "UFID". UFIDs are optional -for both the kernel and user space program. - -User space programs that support UFID are expected to provide it during flow -setup in addition to the flow, then refer to the flow using the UFID for all -future operations. The kernel is not required to index flows by the original -flow key if a UFID is specified. - - -Basic rule for evolving flow keys ---------------------------------- - -Some care is needed to really maintain forward and backward -compatibility for applications that follow the rules listed under -"Flow key compatibility" above. - -The basic rule is obvious: - - ------------------------------------------------------------------ - New network protocol support must only supplement existing flow - key attributes. It must not change the meaning of already defined - flow key attributes. - ------------------------------------------------------------------ - -This rule does have less-obvious consequences so it is worth working -through a few examples. Suppose, for example, that the kernel module -did not already implement VLAN parsing. Instead, it just interpreted -the 802.1Q TPID (0x8100) as the Ethertype then stopped parsing the -packet. The flow key for any packet with an 802.1Q header would look -essentially like this, ignoring metadata: - - eth(...), eth_type(0x8100) - -Naively, to add VLAN support, it makes sense to add a new "vlan" flow -key attribute to contain the VLAN tag, then continue to decode the -encapsulated headers beyond the VLAN tag using the existing field -definitions. With this change, a TCP packet in VLAN 10 would have a -flow key much like this: - - eth(...), vlan(vid=10, pcp=0), eth_type(0x0800), ip(proto=6, ...), tcp(...) - -But this change would negatively affect a userspace application that -has not been updated to understand the new "vlan" flow key attribute. -The application could, following the flow compatibility rules above, -ignore the "vlan" attribute that it does not understand and therefore -assume that the flow contained IP packets. This is a bad assumption -(the flow only contains IP packets if one parses and skips over the -802.1Q header) and it could cause the application's behavior to change -across kernel versions even though it follows the compatibility rules. - -The solution is to use a set of nested attributes. This is, for -example, why 802.1Q support uses nested attributes. A TCP packet in -VLAN 10 is actually expressed as: - - eth(...), eth_type(0x8100), vlan(vid=10, pcp=0), encap(eth_type(0x0800), - ip(proto=6, ...), tcp(...))) - -Notice how the "eth_type", "ip", and "tcp" flow key attributes are -nested inside the "encap" attribute. Thus, an application that does -not understand the "vlan" key will not see either of those attributes -and therefore will not misinterpret them. (Also, the outer eth_type -is still 0x8100, not changed to 0x0800.) - -Handling malformed packets --------------------------- - -Don't drop packets in the kernel for malformed protocol headers, bad -checksums, etc. This would prevent userspace from implementing a -simple Ethernet switch that forwards every packet. - -Instead, in such a case, include an attribute with "empty" content. -It doesn't matter if the empty content could be valid protocol values, -as long as those values are rarely seen in practice, because userspace -can always forward all packets with those values to userspace and -handle them individually. - -For example, consider a packet that contains an IP header that -indicates protocol 6 for TCP, but which is truncated just after the IP -header, so that the TCP header is missing. The flow key for this -packet would include a tcp attribute with all-zero src and dst, like -this: - - eth(...), eth_type(0x0800), ip(proto=6, ...), tcp(src=0, dst=0) - -As another example, consider a packet with an Ethernet type of 0x8100, -indicating that a VLAN TCI should follow, but which is truncated just -after the Ethernet type. The flow key for this packet would include -an all-zero-bits vlan and an empty encap attribute, like this: - - eth(...), eth_type(0x8100), vlan(0), encap() - -Unlike a TCP packet with source and destination ports 0, an -all-zero-bits VLAN TCI is not that rare, so the CFI bit (aka -VLAN_TAG_PRESENT inside the kernel) is ordinarily set in a vlan -attribute expressly to allow this situation to be distinguished. -Thus, the flow key in this second example unambiguously indicates a -missing or malformed VLAN TCI. - -Other rules ------------ - -The other rules for flow keys are much less subtle: - - - Duplicate attributes are not allowed at a given nesting level. - - - Ordering of attributes is not significant. - - - When the kernel sends a given flow key to userspace, it always - composes it the same way. This allows userspace to hash and - compare entire flow keys that it may not be able to fully - interpret. -- cgit v1.2.3-59-g8ed1b From f5c39ef3299f4efaab4d1bb410406de5909c1687 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:10 +0200 Subject: docs: networking: convert operstates.txt to ReST - add SPDX header; - add a document title; - adjust chapters, adding proper markups; - mark lists as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/operstates.rst | 185 ++++++++++++++++++++++++++++++++ Documentation/networking/operstates.txt | 164 ---------------------------- 3 files changed, 186 insertions(+), 164 deletions(-) create mode 100644 Documentation/networking/operstates.rst delete mode 100644 Documentation/networking/operstates.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b7f558480aca..028a36821b9a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -88,6 +88,7 @@ Contents: nf_conntrack-sysctl nf_flowtable openvswitch + operstates .. only:: subproject and html diff --git a/Documentation/networking/operstates.rst b/Documentation/networking/operstates.rst new file mode 100644 index 000000000000..9c918f7cb0e8 --- /dev/null +++ b/Documentation/networking/operstates.rst @@ -0,0 +1,185 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +Operational States +================== + + +1. Introduction +=============== + +Linux distinguishes between administrative and operational state of an +interface. Administrative state is the result of "ip link set dev + up or down" and reflects whether the administrator wants to use +the device for traffic. + +However, an interface is not usable just because the admin enabled it +- ethernet requires to be plugged into the switch and, depending on +a site's networking policy and configuration, an 802.1X authentication +to be performed before user data can be transferred. Operational state +shows the ability of an interface to transmit this user data. + +Thanks to 802.1X, userspace must be granted the possibility to +influence operational state. To accommodate this, operational state is +split into two parts: Two flags that can be set by the driver only, and +a RFC2863 compatible state that is derived from these flags, a policy, +and changeable from userspace under certain rules. + + +2. Querying from userspace +========================== + +Both admin and operational state can be queried via the netlink +operation RTM_GETLINK. It is also possible to subscribe to RTNLGRP_LINK +to be notified of updates while the interface is admin up. This is +important for setting from userspace. + +These values contain interface state: + +ifinfomsg::if_flags & IFF_UP: + Interface is admin up + +ifinfomsg::if_flags & IFF_RUNNING: + Interface is in RFC2863 operational state UP or UNKNOWN. This is for + backward compatibility, routing daemons, dhcp clients can use this + flag to determine whether they should use the interface. + +ifinfomsg::if_flags & IFF_LOWER_UP: + Driver has signaled netif_carrier_on() + +ifinfomsg::if_flags & IFF_DORMANT: + Driver has signaled netif_dormant_on() + +TLV IFLA_OPERSTATE +------------------ + +contains RFC2863 state of the interface in numeric representation: + +IF_OPER_UNKNOWN (0): + Interface is in unknown state, neither driver nor userspace has set + operational state. Interface must be considered for user data as + setting operational state has not been implemented in every driver. + +IF_OPER_NOTPRESENT (1): + Unused in current kernel (notpresent interfaces normally disappear), + just a numerical placeholder. + +IF_OPER_DOWN (2): + Interface is unable to transfer data on L1, f.e. ethernet is not + plugged or interface is ADMIN down. + +IF_OPER_LOWERLAYERDOWN (3): + Interfaces stacked on an interface that is IF_OPER_DOWN show this + state (f.e. VLAN). + +IF_OPER_TESTING (4): + Unused in current kernel. + +IF_OPER_DORMANT (5): + Interface is L1 up, but waiting for an external event, f.e. for a + protocol to establish. (802.1X) + +IF_OPER_UP (6): + Interface is operational up and can be used. + +This TLV can also be queried via sysfs. + +TLV IFLA_LINKMODE +----------------- + +contains link policy. This is needed for userspace interaction +described below. + +This TLV can also be queried via sysfs. + + +3. Kernel driver API +==================== + +Kernel drivers have access to two flags that map to IFF_LOWER_UP and +IFF_DORMANT. These flags can be set from everywhere, even from +interrupts. It is guaranteed that only the driver has write access, +however, if different layers of the driver manipulate the same flag, +the driver has to provide the synchronisation needed. + +__LINK_STATE_NOCARRIER, maps to !IFF_LOWER_UP: + +The driver uses netif_carrier_on() to clear and netif_carrier_off() to +set this flag. On netif_carrier_off(), the scheduler stops sending +packets. The name 'carrier' and the inversion are historical, think of +it as lower layer. + +Note that for certain kind of soft-devices, which are not managing any +real hardware, it is possible to set this bit from userspace. One +should use TVL IFLA_CARRIER to do so. + +netif_carrier_ok() can be used to query that bit. + +__LINK_STATE_DORMANT, maps to IFF_DORMANT: + +Set by the driver to express that the device cannot yet be used +because some driver controlled protocol establishment has to +complete. Corresponding functions are netif_dormant_on() to set the +flag, netif_dormant_off() to clear it and netif_dormant() to query. + +On device allocation, both flags __LINK_STATE_NOCARRIER and +__LINK_STATE_DORMANT are cleared, so the effective state is equivalent +to netif_carrier_ok() and !netif_dormant(). + + +Whenever the driver CHANGES one of these flags, a workqueue event is +scheduled to translate the flag combination to IFLA_OPERSTATE as +follows: + +!netif_carrier_ok(): + IF_OPER_LOWERLAYERDOWN if the interface is stacked, IF_OPER_DOWN + otherwise. Kernel can recognise stacked interfaces because their + ifindex != iflink. + +netif_carrier_ok() && netif_dormant(): + IF_OPER_DORMANT + +netif_carrier_ok() && !netif_dormant(): + IF_OPER_UP if userspace interaction is disabled. Otherwise + IF_OPER_DORMANT with the possibility for userspace to initiate the + IF_OPER_UP transition afterwards. + + +4. Setting from userspace +========================= + +Applications have to use the netlink interface to influence the +RFC2863 operational state of an interface. Setting IFLA_LINKMODE to 1 +via RTM_SETLINK instructs the kernel that an interface should go to +IF_OPER_DORMANT instead of IF_OPER_UP when the combination +netif_carrier_ok() && !netif_dormant() is set by the +driver. Afterwards, the userspace application can set IFLA_OPERSTATE +to IF_OPER_DORMANT or IF_OPER_UP as long as the driver does not set +netif_carrier_off() or netif_dormant_on(). Changes made by userspace +are multicasted on the netlink group RTNLGRP_LINK. + +So basically a 802.1X supplicant interacts with the kernel like this: + +- subscribe to RTNLGRP_LINK +- set IFLA_LINKMODE to 1 via RTM_SETLINK +- query RTM_GETLINK once to get initial state +- if initial flags are not (IFF_LOWER_UP && !IFF_DORMANT), wait until + netlink multicast signals this state +- do 802.1X, eventually abort if flags go down again +- send RTM_SETLINK to set operstate to IF_OPER_UP if authentication + succeeds, IF_OPER_DORMANT otherwise +- see how operstate and IFF_RUNNING is echoed via netlink multicast +- set interface back to IF_OPER_DORMANT if 802.1X reauthentication + fails +- restart if kernel changes IFF_LOWER_UP or IFF_DORMANT flag + +if supplicant goes down, bring back IFLA_LINKMODE to 0 and +IFLA_OPERSTATE to a sane value. + +A routing daemon or dhcp client just needs to care for IFF_RUNNING or +waiting for operstate to go IF_OPER_UP/IF_OPER_UNKNOWN before +considering the interface / querying a DHCP address. + + +For technical questions and/or comments please e-mail to Stefan Rompf +(stefan at loplof.de). diff --git a/Documentation/networking/operstates.txt b/Documentation/networking/operstates.txt deleted file mode 100644 index b203d1334822..000000000000 --- a/Documentation/networking/operstates.txt +++ /dev/null @@ -1,164 +0,0 @@ - -1. Introduction - -Linux distinguishes between administrative and operational state of an -interface. Administrative state is the result of "ip link set dev - up or down" and reflects whether the administrator wants to use -the device for traffic. - -However, an interface is not usable just because the admin enabled it -- ethernet requires to be plugged into the switch and, depending on -a site's networking policy and configuration, an 802.1X authentication -to be performed before user data can be transferred. Operational state -shows the ability of an interface to transmit this user data. - -Thanks to 802.1X, userspace must be granted the possibility to -influence operational state. To accommodate this, operational state is -split into two parts: Two flags that can be set by the driver only, and -a RFC2863 compatible state that is derived from these flags, a policy, -and changeable from userspace under certain rules. - - -2. Querying from userspace - -Both admin and operational state can be queried via the netlink -operation RTM_GETLINK. It is also possible to subscribe to RTNLGRP_LINK -to be notified of updates while the interface is admin up. This is -important for setting from userspace. - -These values contain interface state: - -ifinfomsg::if_flags & IFF_UP: - Interface is admin up -ifinfomsg::if_flags & IFF_RUNNING: - Interface is in RFC2863 operational state UP or UNKNOWN. This is for - backward compatibility, routing daemons, dhcp clients can use this - flag to determine whether they should use the interface. -ifinfomsg::if_flags & IFF_LOWER_UP: - Driver has signaled netif_carrier_on() -ifinfomsg::if_flags & IFF_DORMANT: - Driver has signaled netif_dormant_on() - -TLV IFLA_OPERSTATE - -contains RFC2863 state of the interface in numeric representation: - -IF_OPER_UNKNOWN (0): - Interface is in unknown state, neither driver nor userspace has set - operational state. Interface must be considered for user data as - setting operational state has not been implemented in every driver. -IF_OPER_NOTPRESENT (1): - Unused in current kernel (notpresent interfaces normally disappear), - just a numerical placeholder. -IF_OPER_DOWN (2): - Interface is unable to transfer data on L1, f.e. ethernet is not - plugged or interface is ADMIN down. -IF_OPER_LOWERLAYERDOWN (3): - Interfaces stacked on an interface that is IF_OPER_DOWN show this - state (f.e. VLAN). -IF_OPER_TESTING (4): - Unused in current kernel. -IF_OPER_DORMANT (5): - Interface is L1 up, but waiting for an external event, f.e. for a - protocol to establish. (802.1X) -IF_OPER_UP (6): - Interface is operational up and can be used. - -This TLV can also be queried via sysfs. - -TLV IFLA_LINKMODE - -contains link policy. This is needed for userspace interaction -described below. - -This TLV can also be queried via sysfs. - - -3. Kernel driver API - -Kernel drivers have access to two flags that map to IFF_LOWER_UP and -IFF_DORMANT. These flags can be set from everywhere, even from -interrupts. It is guaranteed that only the driver has write access, -however, if different layers of the driver manipulate the same flag, -the driver has to provide the synchronisation needed. - -__LINK_STATE_NOCARRIER, maps to !IFF_LOWER_UP: - -The driver uses netif_carrier_on() to clear and netif_carrier_off() to -set this flag. On netif_carrier_off(), the scheduler stops sending -packets. The name 'carrier' and the inversion are historical, think of -it as lower layer. - -Note that for certain kind of soft-devices, which are not managing any -real hardware, it is possible to set this bit from userspace. One -should use TVL IFLA_CARRIER to do so. - -netif_carrier_ok() can be used to query that bit. - -__LINK_STATE_DORMANT, maps to IFF_DORMANT: - -Set by the driver to express that the device cannot yet be used -because some driver controlled protocol establishment has to -complete. Corresponding functions are netif_dormant_on() to set the -flag, netif_dormant_off() to clear it and netif_dormant() to query. - -On device allocation, both flags __LINK_STATE_NOCARRIER and -__LINK_STATE_DORMANT are cleared, so the effective state is equivalent -to netif_carrier_ok() and !netif_dormant(). - - -Whenever the driver CHANGES one of these flags, a workqueue event is -scheduled to translate the flag combination to IFLA_OPERSTATE as -follows: - -!netif_carrier_ok(): - IF_OPER_LOWERLAYERDOWN if the interface is stacked, IF_OPER_DOWN - otherwise. Kernel can recognise stacked interfaces because their - ifindex != iflink. - -netif_carrier_ok() && netif_dormant(): - IF_OPER_DORMANT - -netif_carrier_ok() && !netif_dormant(): - IF_OPER_UP if userspace interaction is disabled. Otherwise - IF_OPER_DORMANT with the possibility for userspace to initiate the - IF_OPER_UP transition afterwards. - - -4. Setting from userspace - -Applications have to use the netlink interface to influence the -RFC2863 operational state of an interface. Setting IFLA_LINKMODE to 1 -via RTM_SETLINK instructs the kernel that an interface should go to -IF_OPER_DORMANT instead of IF_OPER_UP when the combination -netif_carrier_ok() && !netif_dormant() is set by the -driver. Afterwards, the userspace application can set IFLA_OPERSTATE -to IF_OPER_DORMANT or IF_OPER_UP as long as the driver does not set -netif_carrier_off() or netif_dormant_on(). Changes made by userspace -are multicasted on the netlink group RTNLGRP_LINK. - -So basically a 802.1X supplicant interacts with the kernel like this: - --subscribe to RTNLGRP_LINK --set IFLA_LINKMODE to 1 via RTM_SETLINK --query RTM_GETLINK once to get initial state --if initial flags are not (IFF_LOWER_UP && !IFF_DORMANT), wait until - netlink multicast signals this state --do 802.1X, eventually abort if flags go down again --send RTM_SETLINK to set operstate to IF_OPER_UP if authentication - succeeds, IF_OPER_DORMANT otherwise --see how operstate and IFF_RUNNING is echoed via netlink multicast --set interface back to IF_OPER_DORMANT if 802.1X reauthentication - fails --restart if kernel changes IFF_LOWER_UP or IFF_DORMANT flag - -if supplicant goes down, bring back IFLA_LINKMODE to 0 and -IFLA_OPERSTATE to a sane value. - -A routing daemon or dhcp client just needs to care for IFF_RUNNING or -waiting for operstate to go IF_OPER_UP/IF_OPER_UNKNOWN before -considering the interface / querying a DHCP address. - - -For technical questions and/or comments please e-mail to Stefan Rompf -(stefan at loplof.de). -- cgit v1.2.3-59-g8ed1b From 4ba7bc9f2de6b230da2e38e6317de4b5ed91f46b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:11 +0200 Subject: docs: networking: convert packet_mmap.txt to ReST This patch has a big diff, but most are due to whitespaces. Yet, the conversion is similar to other files under networking: - add SPDX header; - add a document title; - adjust titles and chapters, adding proper markups; - mark lists as such; - mark tables as such; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/packet_mmap.rst | 1084 ++++++++++++++++++++++++++++++ Documentation/networking/packet_mmap.txt | 1061 ----------------------------- 3 files changed, 1085 insertions(+), 1061 deletions(-) create mode 100644 Documentation/networking/packet_mmap.rst delete mode 100644 Documentation/networking/packet_mmap.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 028a36821b9a..8262b535a83e 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -89,6 +89,7 @@ Contents: nf_flowtable openvswitch operstates + packet_mmap .. only:: subproject and html diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst new file mode 100644 index 000000000000..5f213d17652f --- /dev/null +++ b/Documentation/networking/packet_mmap.rst @@ -0,0 +1,1084 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========== +Packet MMAP +=========== + +Abstract +======== + +This file documents the mmap() facility available with the PACKET +socket interface on 2.4/2.6/3.x kernels. This type of sockets is used for + +i) capture network traffic with utilities like tcpdump, +ii) transmit network traffic, or any other that needs raw + access to network interface. + +Howto can be found at: + + https://sites.google.com/site/packetmmap/ + +Please send your comments to + - Ulisses Alonso Camaró + - Johann Baudy + +Why use PACKET_MMAP +=================== + +In Linux 2.4/2.6/3.x if PACKET_MMAP is not enabled, the capture process is very +inefficient. It uses very limited buffers and requires one system call to +capture each packet, it requires two if you want to get packet's timestamp +(like libpcap always does). + +In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size +configurable circular buffer mapped in user space that can be used to either +send or receive packets. This way reading packets just needs to wait for them, +most of the time there is no need to issue a single system call. Concerning +transmission, multiple packets can be sent through one system call to get the +highest bandwidth. By using a shared buffer between the kernel and the user +also has the benefit of minimizing packet copies. + +It's fine to use PACKET_MMAP to improve the performance of the capture and +transmission process, but it isn't everything. At least, if you are capturing +at high speeds (this is relative to the cpu speed), you should check if the +device driver of your network interface card supports some sort of interrupt +load mitigation or (even better) if it supports NAPI, also make sure it is +enabled. For transmission, check the MTU (Maximum Transmission Unit) used and +supported by devices of your network. CPU IRQ pinning of your network interface +card can also be an advantage. + +How to use mmap() to improve capture process +============================================ + +From the user standpoint, you should use the higher level libpcap library, which +is a de facto standard, portable across nearly all operating systems +including Win32. + +Packet MMAP support was integrated into libpcap around the time of version 1.3.0; +TPACKET_V3 support was added in version 1.5.0 + +How to use mmap() directly to improve capture process +===================================================== + +From the system calls stand point, the use of PACKET_MMAP involves +the following process:: + + + [setup] socket() -------> creation of the capture socket + setsockopt() ---> allocation of the circular buffer (ring) + option: PACKET_RX_RING + mmap() ---------> mapping of the allocated buffer to the + user process + + [capture] poll() ---------> to wait for incoming packets + + [shutdown] close() --------> destruction of the capture socket and + deallocation of all associated + resources. + + +socket creation and destruction is straight forward, and is done +the same way with or without PACKET_MMAP:: + + int fd = socket(PF_PACKET, mode, htons(ETH_P_ALL)); + +where mode is SOCK_RAW for the raw interface were link level +information can be captured or SOCK_DGRAM for the cooked +interface where link level information capture is not +supported and a link level pseudo-header is provided +by the kernel. + +The destruction of the socket and all associated resources +is done by a simple call to close(fd). + +Similarly as without PACKET_MMAP, it is possible to use one socket +for capture and transmission. This can be done by mapping the +allocated RX and TX buffer ring with a single mmap() call. +See "Mapping and use of the circular buffer (ring)". + +Next I will describe PACKET_MMAP settings and its constraints, +also the mapping of the circular buffer in the user process and +the use of this buffer. + +How to use mmap() directly to improve transmission process +========================================================== +Transmission process is similar to capture as shown below:: + + [setup] socket() -------> creation of the transmission socket + setsockopt() ---> allocation of the circular buffer (ring) + option: PACKET_TX_RING + bind() ---------> bind transmission socket with a network interface + mmap() ---------> mapping of the allocated buffer to the + user process + + [transmission] poll() ---------> wait for free packets (optional) + send() ---------> send all packets that are set as ready in + the ring + The flag MSG_DONTWAIT can be used to return + before end of transfer. + + [shutdown] close() --------> destruction of the transmission socket and + deallocation of all associated resources. + +Socket creation and destruction is also straight forward, and is done +the same way as in capturing described in the previous paragraph:: + + int fd = socket(PF_PACKET, mode, 0); + +The protocol can optionally be 0 in case we only want to transmit +via this socket, which avoids an expensive call to packet_rcv(). +In this case, you also need to bind(2) the TX_RING with sll_protocol = 0 +set. Otherwise, htons(ETH_P_ALL) or any other protocol, for example. + +Binding the socket to your network interface is mandatory (with zero copy) to +know the header size of frames used in the circular buffer. + +As capture, each frame contains two parts:: + + -------------------- + | struct tpacket_hdr | Header. It contains the status of + | | of this frame + |--------------------| + | data buffer | + . . Data that will be sent over the network interface. + . . + -------------------- + + bind() associates the socket to your network interface thanks to + sll_ifindex parameter of struct sockaddr_ll. + + Initialization example:: + + struct sockaddr_ll my_addr; + struct ifreq s_ifr; + ... + + strncpy (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name)); + + /* get interface index of eth0 */ + ioctl(this->socket, SIOCGIFINDEX, &s_ifr); + + /* fill sockaddr_ll struct to prepare binding */ + my_addr.sll_family = AF_PACKET; + my_addr.sll_protocol = htons(ETH_P_ALL); + my_addr.sll_ifindex = s_ifr.ifr_ifindex; + + /* bind socket to eth0 */ + bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll)); + + A complete tutorial is available at: https://sites.google.com/site/packetmmap/ + +By default, the user should put data at:: + + frame base + TPACKET_HDRLEN - sizeof(struct sockaddr_ll) + +So, whatever you choose for the socket mode (SOCK_DGRAM or SOCK_RAW), +the beginning of the user data will be at:: + + frame base + TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + +If you wish to put user data at a custom offset from the beginning of +the frame (for payload alignment with SOCK_RAW mode for instance) you +can set tp_net (with SOCK_DGRAM) or tp_mac (with SOCK_RAW). In order +to make this work it must be enabled previously with setsockopt() +and the PACKET_TX_HAS_OFF option. + +PACKET_MMAP settings +==================== + +To setup PACKET_MMAP from user level code is done with a call like + + - Capture process:: + + setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req)) + + - Transmission process:: + + setsockopt(fd, SOL_PACKET, PACKET_TX_RING, (void *) &req, sizeof(req)) + +The most significant argument in the previous call is the req parameter, +this parameter must to have the following structure:: + + struct tpacket_req + { + unsigned int tp_block_size; /* Minimal size of contiguous block */ + unsigned int tp_block_nr; /* Number of blocks */ + unsigned int tp_frame_size; /* Size of frame */ + unsigned int tp_frame_nr; /* Total number of frames */ + }; + +This structure is defined in /usr/include/linux/if_packet.h and establishes a +circular buffer (ring) of unswappable memory. +Being mapped in the capture process allows reading the captured frames and +related meta-information like timestamps without requiring a system call. + +Frames are grouped in blocks. Each block is a physically contiguous +region of memory and holds tp_block_size/tp_frame_size frames. The total number +of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because:: + + frames_per_block = tp_block_size/tp_frame_size + +indeed, packet_set_ring checks that the following condition is true:: + + frames_per_block * tp_block_nr == tp_frame_nr + +Lets see an example, with the following values:: + + tp_block_size= 4096 + tp_frame_size= 2048 + tp_block_nr = 4 + tp_frame_nr = 8 + +we will get the following buffer structure:: + + block #1 block #2 + +---------+---------+ +---------+---------+ + | frame 1 | frame 2 | | frame 3 | frame 4 | + +---------+---------+ +---------+---------+ + + block #3 block #4 + +---------+---------+ +---------+---------+ + | frame 5 | frame 6 | | frame 7 | frame 8 | + +---------+---------+ +---------+---------+ + +A frame can be of any size with the only condition it can fit in a block. A block +can only hold an integer number of frames, or in other words, a frame cannot +be spawned across two blocks, so there are some details you have to take into +account when choosing the frame_size. See "Mapping and use of the circular +buffer (ring)". + +PACKET_MMAP setting constraints +=============================== + +In kernel versions prior to 2.4.26 (for the 2.4 branch) and 2.6.5 (2.6 branch), +the PACKET_MMAP buffer could hold only 32768 frames in a 32 bit architecture or +16384 in a 64 bit architecture. For information on these kernel versions +see http://pusa.uv.es/~ulisses/packet_mmap/packet_mmap.pre-2.4.26_2.6.5.txt + +Block size limit +---------------- + +As stated earlier, each block is a contiguous physical region of memory. These +memory regions are allocated with calls to the __get_free_pages() function. As +the name indicates, this function allocates pages of memory, and the second +argument is "order" or a power of two number of pages, that is +(for PAGE_SIZE == 4096) order=0 ==> 4096 bytes, order=1 ==> 8192 bytes, +order=2 ==> 16384 bytes, etc. The maximum size of a +region allocated by __get_free_pages is determined by the MAX_ORDER macro. More +precisely the limit can be calculated as:: + + PAGE_SIZE << MAX_ORDER + + In a i386 architecture PAGE_SIZE is 4096 bytes + In a 2.4/i386 kernel MAX_ORDER is 10 + In a 2.6/i386 kernel MAX_ORDER is 11 + +So get_free_pages can allocate as much as 4MB or 8MB in a 2.4/2.6 kernel +respectively, with an i386 architecture. + +User space programs can include /usr/include/sys/user.h and +/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_ORDER declarations. + +The pagesize can also be determined dynamically with the getpagesize (2) +system call. + +Block number limit +------------------ + +To understand the constraints of PACKET_MMAP, we have to see the structure +used to hold the pointers to each block. + +Currently, this structure is a dynamically allocated vector with kmalloc +called pg_vec, its size limits the number of blocks that can be allocated:: + + +---+---+---+---+ + | x | x | x | x | + +---+---+---+---+ + | | | | + | | | v + | | v block #4 + | v block #3 + v block #2 + block #1 + +kmalloc allocates any number of bytes of physically contiguous memory from +a pool of pre-determined sizes. This pool of memory is maintained by the slab +allocator which is at the end the responsible for doing the allocation and +hence which imposes the maximum memory that kmalloc can allocate. + +In a 2.4/2.6 kernel and the i386 architecture, the limit is 131072 bytes. The +predetermined sizes that kmalloc uses can be checked in the "size-" +entries of /proc/slabinfo + +In a 32 bit architecture, pointers are 4 bytes long, so the total number of +pointers to blocks is:: + + 131072/4 = 32768 blocks + +PACKET_MMAP buffer size calculator +================================== + +Definitions: + +============== ================================================================ + is the maximum size of allocable with kmalloc + (see /proc/slabinfo) + depends on the architecture -- ``sizeof(void *)`` + depends on the architecture -- PAGE_SIZE or getpagesize (2) + is the value defined with MAX_ORDER + it's an upper bound of frame's capture size (more on this later) +============== ================================================================ + +from these definitions we will derive:: + + = / + = << + +so, the max buffer size is:: + + * + +and, the number of frames be:: + + * / + +Suppose the following parameters, which apply for 2.6 kernel and an +i386 architecture:: + + = 131072 bytes + = 4 bytes + = 4096 bytes + = 11 + +and a value for of 2048 bytes. These parameters will yield:: + + = 131072/4 = 32768 blocks + = 4096 << 11 = 8 MiB. + +and hence the buffer will have a 262144 MiB size. So it can hold +262144 MiB / 2048 bytes = 134217728 frames + +Actually, this buffer size is not possible with an i386 architecture. +Remember that the memory is allocated in kernel space, in the case of +an i386 kernel's memory size is limited to 1GiB. + +All memory allocations are not freed until the socket is closed. The memory +allocations are done with GFP_KERNEL priority, this basically means that +the allocation can wait and swap other process' memory in order to allocate +the necessary memory, so normally limits can be reached. + +Other constraints +----------------- + +If you check the source code you will see that what I draw here as a frame +is not only the link level frame. At the beginning of each frame there is a +header called struct tpacket_hdr used in PACKET_MMAP to hold link level's frame +meta information like timestamp. So what we draw here a frame it's really +the following (from include/linux/if_packet.h):: + + /* + Frame structure: + + - Start. Frame must be aligned to TPACKET_ALIGNMENT=16 + - struct tpacket_hdr + - pad to TPACKET_ALIGNMENT=16 + - struct sockaddr_ll + - Gap, chosen so that packet data (Start+tp_net) aligns to + TPACKET_ALIGNMENT=16 + - Start+tp_mac: [ Optional MAC header ] + - Start+tp_net: Packet data, aligned to TPACKET_ALIGNMENT=16. + - Pad to align to TPACKET_ALIGNMENT=16 + */ + +The following are conditions that are checked in packet_set_ring + + - tp_block_size must be a multiple of PAGE_SIZE (1) + - tp_frame_size must be greater than TPACKET_HDRLEN (obvious) + - tp_frame_size must be a multiple of TPACKET_ALIGNMENT + - tp_frame_nr must be exactly frames_per_block*tp_block_nr + +Note that tp_block_size should be chosen to be a power of two or there will +be a waste of memory. + +Mapping and use of the circular buffer (ring) +--------------------------------------------- + +The mapping of the buffer in the user process is done with the conventional +mmap function. Even the circular buffer is compound of several physically +discontiguous blocks of memory, they are contiguous to the user space, hence +just one call to mmap is needed:: + + mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + +If tp_frame_size is a divisor of tp_block_size frames will be +contiguously spaced by tp_frame_size bytes. If not, each +tp_block_size/tp_frame_size frames there will be a gap between +the frames. This is because a frame cannot be spawn across two +blocks. + +To use one socket for capture and transmission, the mapping of both the +RX and TX buffer ring has to be done with one call to mmap:: + + ... + setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &foo, sizeof(foo)); + setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &bar, sizeof(bar)); + ... + rx_ring = mmap(0, size * 2, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + tx_ring = rx_ring + size; + +RX must be the first as the kernel maps the TX ring memory right +after the RX one. + +At the beginning of each frame there is an status field (see +struct tpacket_hdr). If this field is 0 means that the frame is ready +to be used for the kernel, If not, there is a frame the user can read +and the following flags apply: + +Capture process +^^^^^^^^^^^^^^^ + + from include/linux/if_packet.h + + #define TP_STATUS_COPY (1 << 1) + #define TP_STATUS_LOSING (1 << 2) + #define TP_STATUS_CSUMNOTREADY (1 << 3) + #define TP_STATUS_CSUM_VALID (1 << 7) + +====================== ======================================================= +TP_STATUS_COPY This flag indicates that the frame (and associated + meta information) has been truncated because it's + larger than tp_frame_size. This packet can be + read entirely with recvfrom(). + + In order to make this work it must to be + enabled previously with setsockopt() and + the PACKET_COPY_THRESH option. + + The number of frames that can be buffered to + be read with recvfrom is limited like a normal socket. + See the SO_RCVBUF option in the socket (7) man page. + +TP_STATUS_LOSING indicates there were packet drops from last time + statistics where checked with getsockopt() and + the PACKET_STATISTICS option. + +TP_STATUS_CSUMNOTREADY currently it's used for outgoing IP packets which + its checksum will be done in hardware. So while + reading the packet we should not try to check the + checksum. + +TP_STATUS_CSUM_VALID This flag indicates that at least the transport + header checksum of the packet has been already + validated on the kernel side. If the flag is not set + then we are free to check the checksum by ourselves + provided that TP_STATUS_CSUMNOTREADY is also not set. +====================== ======================================================= + +for convenience there are also the following defines:: + + #define TP_STATUS_KERNEL 0 + #define TP_STATUS_USER 1 + +The kernel initializes all frames to TP_STATUS_KERNEL, when the kernel +receives a packet it puts in the buffer and updates the status with +at least the TP_STATUS_USER flag. Then the user can read the packet, +once the packet is read the user must zero the status field, so the kernel +can use again that frame buffer. + +The user can use poll (any other variant should apply too) to check if new +packets are in the ring:: + + struct pollfd pfd; + + pfd.fd = fd; + pfd.revents = 0; + pfd.events = POLLIN|POLLRDNORM|POLLERR; + + if (status == TP_STATUS_KERNEL) + retval = poll(&pfd, 1, timeout); + +It doesn't incur in a race condition to first check the status value and +then poll for frames. + +Transmission process +^^^^^^^^^^^^^^^^^^^^ + +Those defines are also used for transmission:: + + #define TP_STATUS_AVAILABLE 0 // Frame is available + #define TP_STATUS_SEND_REQUEST 1 // Frame will be sent on next send() + #define TP_STATUS_SENDING 2 // Frame is currently in transmission + #define TP_STATUS_WRONG_FORMAT 4 // Frame format is not correct + +First, the kernel initializes all frames to TP_STATUS_AVAILABLE. To send a +packet, the user fills a data buffer of an available frame, sets tp_len to +current data buffer size and sets its status field to TP_STATUS_SEND_REQUEST. +This can be done on multiple frames. Once the user is ready to transmit, it +calls send(). Then all buffers with status equal to TP_STATUS_SEND_REQUEST are +forwarded to the network device. The kernel updates each status of sent +frames with TP_STATUS_SENDING until the end of transfer. + +At the end of each transfer, buffer status returns to TP_STATUS_AVAILABLE. + +:: + + header->tp_len = in_i_size; + header->tp_status = TP_STATUS_SEND_REQUEST; + retval = send(this->socket, NULL, 0, 0); + +The user can also use poll() to check if a buffer is available: + +(status == TP_STATUS_SENDING) + +:: + + struct pollfd pfd; + pfd.fd = fd; + pfd.revents = 0; + pfd.events = POLLOUT; + retval = poll(&pfd, 1, timeout); + +What TPACKET versions are available and when to use them? +========================================================= + +:: + + int val = tpacket_version; + setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val)); + getsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val)); + +where 'tpacket_version' can be TPACKET_V1 (default), TPACKET_V2, TPACKET_V3. + +TPACKET_V1: + - Default if not otherwise specified by setsockopt(2) + - RX_RING, TX_RING available + +TPACKET_V1 --> TPACKET_V2: + - Made 64 bit clean due to unsigned long usage in TPACKET_V1 + structures, thus this also works on 64 bit kernel with 32 bit + userspace and the like + - Timestamp resolution in nanoseconds instead of microseconds + - RX_RING, TX_RING available + - VLAN metadata information available for packets + (TP_STATUS_VLAN_VALID, TP_STATUS_VLAN_TPID_VALID), + in the tpacket2_hdr structure: + + - TP_STATUS_VLAN_VALID bit being set into the tp_status field indicates + that the tp_vlan_tci field has valid VLAN TCI value + - TP_STATUS_VLAN_TPID_VALID bit being set into the tp_status field + indicates that the tp_vlan_tpid field has valid VLAN TPID value + + - How to switch to TPACKET_V2: + + 1. Replace struct tpacket_hdr by struct tpacket2_hdr + 2. Query header len and save + 3. Set protocol version to 2, set up ring as usual + 4. For getting the sockaddr_ll, + use ``(void *)hdr + TPACKET_ALIGN(hdrlen)`` instead of + ``(void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr))`` + +TPACKET_V2 --> TPACKET_V3: + - Flexible buffer implementation for RX_RING: + 1. Blocks can be configured with non-static frame-size + 2. Read/poll is at a block-level (as opposed to packet-level) + 3. Added poll timeout to avoid indefinite user-space wait + on idle links + 4. Added user-configurable knobs: + + 4.1 block::timeout + 4.2 tpkt_hdr::sk_rxhash + + - RX Hash data available in user space + - TX_RING semantics are conceptually similar to TPACKET_V2; + use tpacket3_hdr instead of tpacket2_hdr, and TPACKET3_HDRLEN + instead of TPACKET2_HDRLEN. In the current implementation, + the tp_next_offset field in the tpacket3_hdr MUST be set to + zero, indicating that the ring does not hold variable sized frames. + Packets with non-zero values of tp_next_offset will be dropped. + +AF_PACKET fanout mode +===================== + +In the AF_PACKET fanout mode, packet reception can be load balanced among +processes. This also works in combination with mmap(2) on packet sockets. + +Currently implemented fanout policies are: + + - PACKET_FANOUT_HASH: schedule to socket by skb's packet hash + - PACKET_FANOUT_LB: schedule to socket by round-robin + - PACKET_FANOUT_CPU: schedule to socket by CPU packet arrives on + - PACKET_FANOUT_RND: schedule to socket by random selection + - PACKET_FANOUT_ROLLOVER: if one socket is full, rollover to another + - PACKET_FANOUT_QM: schedule to socket by skbs recorded queue_mapping + +Minimal example code by David S. Miller (try things like "./test eth0 hash", +"./test eth0 lb", etc.):: + + #include + #include + #include + #include + + #include + #include + #include + #include + + #include + + #include + #include + + #include + + static const char *device_name; + static int fanout_type; + static int fanout_id; + + #ifndef PACKET_FANOUT + # define PACKET_FANOUT 18 + # define PACKET_FANOUT_HASH 0 + # define PACKET_FANOUT_LB 1 + #endif + + static int setup_socket(void) + { + int err, fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP)); + struct sockaddr_ll ll; + struct ifreq ifr; + int fanout_arg; + + if (fd < 0) { + perror("socket"); + return EXIT_FAILURE; + } + + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, device_name); + err = ioctl(fd, SIOCGIFINDEX, &ifr); + if (err < 0) { + perror("SIOCGIFINDEX"); + return EXIT_FAILURE; + } + + memset(&ll, 0, sizeof(ll)); + ll.sll_family = AF_PACKET; + ll.sll_ifindex = ifr.ifr_ifindex; + err = bind(fd, (struct sockaddr *) &ll, sizeof(ll)); + if (err < 0) { + perror("bind"); + return EXIT_FAILURE; + } + + fanout_arg = (fanout_id | (fanout_type << 16)); + err = setsockopt(fd, SOL_PACKET, PACKET_FANOUT, + &fanout_arg, sizeof(fanout_arg)); + if (err) { + perror("setsockopt"); + return EXIT_FAILURE; + } + + return fd; + } + + static void fanout_thread(void) + { + int fd = setup_socket(); + int limit = 10000; + + if (fd < 0) + exit(fd); + + while (limit-- > 0) { + char buf[1600]; + int err; + + err = read(fd, buf, sizeof(buf)); + if (err < 0) { + perror("read"); + exit(EXIT_FAILURE); + } + if ((limit % 10) == 0) + fprintf(stdout, "(%d) \n", getpid()); + } + + fprintf(stdout, "%d: Received 10000 packets\n", getpid()); + + close(fd); + exit(0); + } + + int main(int argc, char **argp) + { + int fd, err; + int i; + + if (argc != 3) { + fprintf(stderr, "Usage: %s INTERFACE {hash|lb}\n", argp[0]); + return EXIT_FAILURE; + } + + if (!strcmp(argp[2], "hash")) + fanout_type = PACKET_FANOUT_HASH; + else if (!strcmp(argp[2], "lb")) + fanout_type = PACKET_FANOUT_LB; + else { + fprintf(stderr, "Unknown fanout type [%s]\n", argp[2]); + exit(EXIT_FAILURE); + } + + device_name = argp[1]; + fanout_id = getpid() & 0xffff; + + for (i = 0; i < 4; i++) { + pid_t pid = fork(); + + switch (pid) { + case 0: + fanout_thread(); + + case -1: + perror("fork"); + exit(EXIT_FAILURE); + } + } + + for (i = 0; i < 4; i++) { + int status; + + wait(&status); + } + + return 0; + } + +AF_PACKET TPACKET_V3 example +============================ + +AF_PACKET's TPACKET_V3 ring buffer can be configured to use non-static frame +sizes by doing it's own memory management. It is based on blocks where polling +works on a per block basis instead of per ring as in TPACKET_V2 and predecessor. + +It is said that TPACKET_V3 brings the following benefits: + + * ~15% - 20% reduction in CPU-usage + * ~20% increase in packet capture rate + * ~2x increase in packet density + * Port aggregation analysis + * Non static frame size to capture entire packet payload + +So it seems to be a good candidate to be used with packet fanout. + +Minimal example code by Daniel Borkmann based on Chetan Loke's lolpcap (compile +it with gcc -Wall -O2 blob.c, and try things like "./a.out eth0", etc.):: + + /* Written from scratch, but kernel-to-user space API usage + * dissected from lolpcap: + * Copyright 2011, Chetan Loke + * License: GPL, version 2.0 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #ifndef likely + # define likely(x) __builtin_expect(!!(x), 1) + #endif + #ifndef unlikely + # define unlikely(x) __builtin_expect(!!(x), 0) + #endif + + struct block_desc { + uint32_t version; + uint32_t offset_to_priv; + struct tpacket_hdr_v1 h1; + }; + + struct ring { + struct iovec *rd; + uint8_t *map; + struct tpacket_req3 req; + }; + + static unsigned long packets_total = 0, bytes_total = 0; + static sig_atomic_t sigint = 0; + + static void sighandler(int num) + { + sigint = 1; + } + + static int setup_socket(struct ring *ring, char *netdev) + { + int err, i, fd, v = TPACKET_V3; + struct sockaddr_ll ll; + unsigned int blocksiz = 1 << 22, framesiz = 1 << 11; + unsigned int blocknum = 64; + + fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (fd < 0) { + perror("socket"); + exit(1); + } + + err = setsockopt(fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v)); + if (err < 0) { + perror("setsockopt"); + exit(1); + } + + memset(&ring->req, 0, sizeof(ring->req)); + ring->req.tp_block_size = blocksiz; + ring->req.tp_frame_size = framesiz; + ring->req.tp_block_nr = blocknum; + ring->req.tp_frame_nr = (blocksiz * blocknum) / framesiz; + ring->req.tp_retire_blk_tov = 60; + ring->req.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH; + + err = setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &ring->req, + sizeof(ring->req)); + if (err < 0) { + perror("setsockopt"); + exit(1); + } + + ring->map = mmap(NULL, ring->req.tp_block_size * ring->req.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, 0); + if (ring->map == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + ring->rd = malloc(ring->req.tp_block_nr * sizeof(*ring->rd)); + assert(ring->rd); + for (i = 0; i < ring->req.tp_block_nr; ++i) { + ring->rd[i].iov_base = ring->map + (i * ring->req.tp_block_size); + ring->rd[i].iov_len = ring->req.tp_block_size; + } + + memset(&ll, 0, sizeof(ll)); + ll.sll_family = PF_PACKET; + ll.sll_protocol = htons(ETH_P_ALL); + ll.sll_ifindex = if_nametoindex(netdev); + ll.sll_hatype = 0; + ll.sll_pkttype = 0; + ll.sll_halen = 0; + + err = bind(fd, (struct sockaddr *) &ll, sizeof(ll)); + if (err < 0) { + perror("bind"); + exit(1); + } + + return fd; + } + + static void display(struct tpacket3_hdr *ppd) + { + struct ethhdr *eth = (struct ethhdr *) ((uint8_t *) ppd + ppd->tp_mac); + struct iphdr *ip = (struct iphdr *) ((uint8_t *) eth + ETH_HLEN); + + if (eth->h_proto == htons(ETH_P_IP)) { + struct sockaddr_in ss, sd; + char sbuff[NI_MAXHOST], dbuff[NI_MAXHOST]; + + memset(&ss, 0, sizeof(ss)); + ss.sin_family = PF_INET; + ss.sin_addr.s_addr = ip->saddr; + getnameinfo((struct sockaddr *) &ss, sizeof(ss), + sbuff, sizeof(sbuff), NULL, 0, NI_NUMERICHOST); + + memset(&sd, 0, sizeof(sd)); + sd.sin_family = PF_INET; + sd.sin_addr.s_addr = ip->daddr; + getnameinfo((struct sockaddr *) &sd, sizeof(sd), + dbuff, sizeof(dbuff), NULL, 0, NI_NUMERICHOST); + + printf("%s -> %s, ", sbuff, dbuff); + } + + printf("rxhash: 0x%x\n", ppd->hv1.tp_rxhash); + } + + static void walk_block(struct block_desc *pbd, const int block_num) + { + int num_pkts = pbd->h1.num_pkts, i; + unsigned long bytes = 0; + struct tpacket3_hdr *ppd; + + ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd + + pbd->h1.offset_to_first_pkt); + for (i = 0; i < num_pkts; ++i) { + bytes += ppd->tp_snaplen; + display(ppd); + + ppd = (struct tpacket3_hdr *) ((uint8_t *) ppd + + ppd->tp_next_offset); + } + + packets_total += num_pkts; + bytes_total += bytes; + } + + static void flush_block(struct block_desc *pbd) + { + pbd->h1.block_status = TP_STATUS_KERNEL; + } + + static void teardown_socket(struct ring *ring, int fd) + { + munmap(ring->map, ring->req.tp_block_size * ring->req.tp_block_nr); + free(ring->rd); + close(fd); + } + + int main(int argc, char **argp) + { + int fd, err; + socklen_t len; + struct ring ring; + struct pollfd pfd; + unsigned int block_num = 0, blocks = 64; + struct block_desc *pbd; + struct tpacket_stats_v3 stats; + + if (argc != 2) { + fprintf(stderr, "Usage: %s INTERFACE\n", argp[0]); + return EXIT_FAILURE; + } + + signal(SIGINT, sighandler); + + memset(&ring, 0, sizeof(ring)); + fd = setup_socket(&ring, argp[argc - 1]); + assert(fd > 0); + + memset(&pfd, 0, sizeof(pfd)); + pfd.fd = fd; + pfd.events = POLLIN | POLLERR; + pfd.revents = 0; + + while (likely(!sigint)) { + pbd = (struct block_desc *) ring.rd[block_num].iov_base; + + if ((pbd->h1.block_status & TP_STATUS_USER) == 0) { + poll(&pfd, 1, -1); + continue; + } + + walk_block(pbd, block_num); + flush_block(pbd); + block_num = (block_num + 1) % blocks; + } + + len = sizeof(stats); + err = getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len); + if (err < 0) { + perror("getsockopt"); + exit(1); + } + + fflush(stdout); + printf("\nReceived %u packets, %lu bytes, %u dropped, freeze_q_cnt: %u\n", + stats.tp_packets, bytes_total, stats.tp_drops, + stats.tp_freeze_q_cnt); + + teardown_socket(&ring, fd); + return 0; + } + +PACKET_QDISC_BYPASS +=================== + +If there is a requirement to load the network with many packets in a similar +fashion as pktgen does, you might set the following option after socket +creation:: + + int one = 1; + setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one)); + +This has the side-effect, that packets sent through PF_PACKET will bypass the +kernel's qdisc layer and are forcedly pushed to the driver directly. Meaning, +packet are not buffered, tc disciplines are ignored, increased loss can occur +and such packets are also not visible to other PF_PACKET sockets anymore. So, +you have been warned; generally, this can be useful for stress testing various +components of a system. + +On default, PACKET_QDISC_BYPASS is disabled and needs to be explicitly enabled +on PF_PACKET sockets. + +PACKET_TIMESTAMP +================ + +The PACKET_TIMESTAMP setting determines the source of the timestamp in +the packet meta information for mmap(2)ed RX_RING and TX_RINGs. If your +NIC is capable of timestamping packets in hardware, you can request those +hardware timestamps to be used. Note: you may need to enable the generation +of hardware timestamps with SIOCSHWTSTAMP (see related information from +Documentation/networking/timestamping.txt). + +PACKET_TIMESTAMP accepts the same integer bit field as SO_TIMESTAMPING:: + + int req = SOF_TIMESTAMPING_RAW_HARDWARE; + setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req)) + +For the mmap(2)ed ring buffers, such timestamps are stored in the +``tpacket{,2,3}_hdr`` structure's tp_sec and ``tp_{n,u}sec`` members. +To determine what kind of timestamp has been reported, the tp_status field +is binary or'ed with the following possible bits ... + +:: + + TP_STATUS_TS_RAW_HARDWARE + TP_STATUS_TS_SOFTWARE + +... that are equivalent to its ``SOF_TIMESTAMPING_*`` counterparts. For the +RX_RING, if neither is set (i.e. PACKET_TIMESTAMP is not set), then a +software fallback was invoked *within* PF_PACKET's processing code (less +precise). + +Getting timestamps for the TX_RING works as follows: i) fill the ring frames, +ii) call sendto() e.g. in blocking mode, iii) wait for status of relevant +frames to be updated resp. the frame handed over to the application, iv) walk +through the frames to pick up the individual hw/sw timestamps. + +Only (!) if transmit timestamping is enabled, then these bits are combined +with binary | with TP_STATUS_AVAILABLE, so you must check for that in your +application (e.g. !(tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)) +in a first step to see if the frame belongs to the application, and then +one can extract the type of timestamp in a second step from tp_status)! + +If you don't care about them, thus having it disabled, checking for +TP_STATUS_AVAILABLE resp. TP_STATUS_WRONG_FORMAT is sufficient. If in the +TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec +members do not contain a valid value. For TX_RINGs, by default no timestamp +is generated! + +See include/linux/net_tstamp.h and Documentation/networking/timestamping.txt +for more information on hardware timestamps. + +Miscellaneous bits +================== + +- Packet sockets work well together with Linux socket filters, thus you also + might want to have a look at Documentation/networking/filter.txt + +THANKS +====== + + Jesse Brandeburg, for fixing my grammathical/spelling errors diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt deleted file mode 100644 index 494614573c67..000000000000 --- a/Documentation/networking/packet_mmap.txt +++ /dev/null @@ -1,1061 +0,0 @@ --------------------------------------------------------------------------------- -+ ABSTRACT --------------------------------------------------------------------------------- - -This file documents the mmap() facility available with the PACKET -socket interface on 2.4/2.6/3.x kernels. This type of sockets is used for -i) capture network traffic with utilities like tcpdump, ii) transmit network -traffic, or any other that needs raw access to network interface. - -Howto can be found at: - https://sites.google.com/site/packetmmap/ - -Please send your comments to - Ulisses Alonso Camaró - Johann Baudy - -------------------------------------------------------------------------------- -+ Why use PACKET_MMAP --------------------------------------------------------------------------------- - -In Linux 2.4/2.6/3.x if PACKET_MMAP is not enabled, the capture process is very -inefficient. It uses very limited buffers and requires one system call to -capture each packet, it requires two if you want to get packet's timestamp -(like libpcap always does). - -In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size -configurable circular buffer mapped in user space that can be used to either -send or receive packets. This way reading packets just needs to wait for them, -most of the time there is no need to issue a single system call. Concerning -transmission, multiple packets can be sent through one system call to get the -highest bandwidth. By using a shared buffer between the kernel and the user -also has the benefit of minimizing packet copies. - -It's fine to use PACKET_MMAP to improve the performance of the capture and -transmission process, but it isn't everything. At least, if you are capturing -at high speeds (this is relative to the cpu speed), you should check if the -device driver of your network interface card supports some sort of interrupt -load mitigation or (even better) if it supports NAPI, also make sure it is -enabled. For transmission, check the MTU (Maximum Transmission Unit) used and -supported by devices of your network. CPU IRQ pinning of your network interface -card can also be an advantage. - --------------------------------------------------------------------------------- -+ How to use mmap() to improve capture process --------------------------------------------------------------------------------- - -From the user standpoint, you should use the higher level libpcap library, which -is a de facto standard, portable across nearly all operating systems -including Win32. - -Packet MMAP support was integrated into libpcap around the time of version 1.3.0; -TPACKET_V3 support was added in version 1.5.0 - --------------------------------------------------------------------------------- -+ How to use mmap() directly to improve capture process --------------------------------------------------------------------------------- - -From the system calls stand point, the use of PACKET_MMAP involves -the following process: - - -[setup] socket() -------> creation of the capture socket - setsockopt() ---> allocation of the circular buffer (ring) - option: PACKET_RX_RING - mmap() ---------> mapping of the allocated buffer to the - user process - -[capture] poll() ---------> to wait for incoming packets - -[shutdown] close() --------> destruction of the capture socket and - deallocation of all associated - resources. - - -socket creation and destruction is straight forward, and is done -the same way with or without PACKET_MMAP: - - int fd = socket(PF_PACKET, mode, htons(ETH_P_ALL)); - -where mode is SOCK_RAW for the raw interface were link level -information can be captured or SOCK_DGRAM for the cooked -interface where link level information capture is not -supported and a link level pseudo-header is provided -by the kernel. - -The destruction of the socket and all associated resources -is done by a simple call to close(fd). - -Similarly as without PACKET_MMAP, it is possible to use one socket -for capture and transmission. This can be done by mapping the -allocated RX and TX buffer ring with a single mmap() call. -See "Mapping and use of the circular buffer (ring)". - -Next I will describe PACKET_MMAP settings and its constraints, -also the mapping of the circular buffer in the user process and -the use of this buffer. - --------------------------------------------------------------------------------- -+ How to use mmap() directly to improve transmission process --------------------------------------------------------------------------------- -Transmission process is similar to capture as shown below. - -[setup] socket() -------> creation of the transmission socket - setsockopt() ---> allocation of the circular buffer (ring) - option: PACKET_TX_RING - bind() ---------> bind transmission socket with a network interface - mmap() ---------> mapping of the allocated buffer to the - user process - -[transmission] poll() ---------> wait for free packets (optional) - send() ---------> send all packets that are set as ready in - the ring - The flag MSG_DONTWAIT can be used to return - before end of transfer. - -[shutdown] close() --------> destruction of the transmission socket and - deallocation of all associated resources. - -Socket creation and destruction is also straight forward, and is done -the same way as in capturing described in the previous paragraph: - - int fd = socket(PF_PACKET, mode, 0); - -The protocol can optionally be 0 in case we only want to transmit -via this socket, which avoids an expensive call to packet_rcv(). -In this case, you also need to bind(2) the TX_RING with sll_protocol = 0 -set. Otherwise, htons(ETH_P_ALL) or any other protocol, for example. - -Binding the socket to your network interface is mandatory (with zero copy) to -know the header size of frames used in the circular buffer. - -As capture, each frame contains two parts: - - -------------------- -| struct tpacket_hdr | Header. It contains the status of -| | of this frame -|--------------------| -| data buffer | -. . Data that will be sent over the network interface. -. . - -------------------- - - bind() associates the socket to your network interface thanks to - sll_ifindex parameter of struct sockaddr_ll. - - Initialization example: - - struct sockaddr_ll my_addr; - struct ifreq s_ifr; - ... - - strncpy (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name)); - - /* get interface index of eth0 */ - ioctl(this->socket, SIOCGIFINDEX, &s_ifr); - - /* fill sockaddr_ll struct to prepare binding */ - my_addr.sll_family = AF_PACKET; - my_addr.sll_protocol = htons(ETH_P_ALL); - my_addr.sll_ifindex = s_ifr.ifr_ifindex; - - /* bind socket to eth0 */ - bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll)); - - A complete tutorial is available at: https://sites.google.com/site/packetmmap/ - -By default, the user should put data at : - frame base + TPACKET_HDRLEN - sizeof(struct sockaddr_ll) - -So, whatever you choose for the socket mode (SOCK_DGRAM or SOCK_RAW), -the beginning of the user data will be at : - frame base + TPACKET_ALIGN(sizeof(struct tpacket_hdr)) - -If you wish to put user data at a custom offset from the beginning of -the frame (for payload alignment with SOCK_RAW mode for instance) you -can set tp_net (with SOCK_DGRAM) or tp_mac (with SOCK_RAW). In order -to make this work it must be enabled previously with setsockopt() -and the PACKET_TX_HAS_OFF option. - --------------------------------------------------------------------------------- -+ PACKET_MMAP settings --------------------------------------------------------------------------------- - -To setup PACKET_MMAP from user level code is done with a call like - - - Capture process - setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req)) - - Transmission process - setsockopt(fd, SOL_PACKET, PACKET_TX_RING, (void *) &req, sizeof(req)) - -The most significant argument in the previous call is the req parameter, -this parameter must to have the following structure: - - struct tpacket_req - { - unsigned int tp_block_size; /* Minimal size of contiguous block */ - unsigned int tp_block_nr; /* Number of blocks */ - unsigned int tp_frame_size; /* Size of frame */ - unsigned int tp_frame_nr; /* Total number of frames */ - }; - -This structure is defined in /usr/include/linux/if_packet.h and establishes a -circular buffer (ring) of unswappable memory. -Being mapped in the capture process allows reading the captured frames and -related meta-information like timestamps without requiring a system call. - -Frames are grouped in blocks. Each block is a physically contiguous -region of memory and holds tp_block_size/tp_frame_size frames. The total number -of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because - - frames_per_block = tp_block_size/tp_frame_size - -indeed, packet_set_ring checks that the following condition is true - - frames_per_block * tp_block_nr == tp_frame_nr - -Lets see an example, with the following values: - - tp_block_size= 4096 - tp_frame_size= 2048 - tp_block_nr = 4 - tp_frame_nr = 8 - -we will get the following buffer structure: - - block #1 block #2 -+---------+---------+ +---------+---------+ -| frame 1 | frame 2 | | frame 3 | frame 4 | -+---------+---------+ +---------+---------+ - - block #3 block #4 -+---------+---------+ +---------+---------+ -| frame 5 | frame 6 | | frame 7 | frame 8 | -+---------+---------+ +---------+---------+ - -A frame can be of any size with the only condition it can fit in a block. A block -can only hold an integer number of frames, or in other words, a frame cannot -be spawned across two blocks, so there are some details you have to take into -account when choosing the frame_size. See "Mapping and use of the circular -buffer (ring)". - --------------------------------------------------------------------------------- -+ PACKET_MMAP setting constraints --------------------------------------------------------------------------------- - -In kernel versions prior to 2.4.26 (for the 2.4 branch) and 2.6.5 (2.6 branch), -the PACKET_MMAP buffer could hold only 32768 frames in a 32 bit architecture or -16384 in a 64 bit architecture. For information on these kernel versions -see http://pusa.uv.es/~ulisses/packet_mmap/packet_mmap.pre-2.4.26_2.6.5.txt - - Block size limit ------------------- - -As stated earlier, each block is a contiguous physical region of memory. These -memory regions are allocated with calls to the __get_free_pages() function. As -the name indicates, this function allocates pages of memory, and the second -argument is "order" or a power of two number of pages, that is -(for PAGE_SIZE == 4096) order=0 ==> 4096 bytes, order=1 ==> 8192 bytes, -order=2 ==> 16384 bytes, etc. The maximum size of a -region allocated by __get_free_pages is determined by the MAX_ORDER macro. More -precisely the limit can be calculated as: - - PAGE_SIZE << MAX_ORDER - - In a i386 architecture PAGE_SIZE is 4096 bytes - In a 2.4/i386 kernel MAX_ORDER is 10 - In a 2.6/i386 kernel MAX_ORDER is 11 - -So get_free_pages can allocate as much as 4MB or 8MB in a 2.4/2.6 kernel -respectively, with an i386 architecture. - -User space programs can include /usr/include/sys/user.h and -/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_ORDER declarations. - -The pagesize can also be determined dynamically with the getpagesize (2) -system call. - - Block number limit --------------------- - -To understand the constraints of PACKET_MMAP, we have to see the structure -used to hold the pointers to each block. - -Currently, this structure is a dynamically allocated vector with kmalloc -called pg_vec, its size limits the number of blocks that can be allocated. - - +---+---+---+---+ - | x | x | x | x | - +---+---+---+---+ - | | | | - | | | v - | | v block #4 - | v block #3 - v block #2 - block #1 - -kmalloc allocates any number of bytes of physically contiguous memory from -a pool of pre-determined sizes. This pool of memory is maintained by the slab -allocator which is at the end the responsible for doing the allocation and -hence which imposes the maximum memory that kmalloc can allocate. - -In a 2.4/2.6 kernel and the i386 architecture, the limit is 131072 bytes. The -predetermined sizes that kmalloc uses can be checked in the "size-" -entries of /proc/slabinfo - -In a 32 bit architecture, pointers are 4 bytes long, so the total number of -pointers to blocks is - - 131072/4 = 32768 blocks - - PACKET_MMAP buffer size calculator ------------------------------------- - -Definitions: - - : is the maximum size of allocable with kmalloc (see /proc/slabinfo) -: depends on the architecture -- sizeof(void *) - : depends on the architecture -- PAGE_SIZE or getpagesize (2) - : is the value defined with MAX_ORDER - : it's an upper bound of frame's capture size (more on this later) - -from these definitions we will derive - - = / - = << - -so, the max buffer size is - - * - -and, the number of frames be - - * / - -Suppose the following parameters, which apply for 2.6 kernel and an -i386 architecture: - - = 131072 bytes - = 4 bytes - = 4096 bytes - = 11 - -and a value for of 2048 bytes. These parameters will yield - - = 131072/4 = 32768 blocks - = 4096 << 11 = 8 MiB. - -and hence the buffer will have a 262144 MiB size. So it can hold -262144 MiB / 2048 bytes = 134217728 frames - -Actually, this buffer size is not possible with an i386 architecture. -Remember that the memory is allocated in kernel space, in the case of -an i386 kernel's memory size is limited to 1GiB. - -All memory allocations are not freed until the socket is closed. The memory -allocations are done with GFP_KERNEL priority, this basically means that -the allocation can wait and swap other process' memory in order to allocate -the necessary memory, so normally limits can be reached. - - Other constraints -------------------- - -If you check the source code you will see that what I draw here as a frame -is not only the link level frame. At the beginning of each frame there is a -header called struct tpacket_hdr used in PACKET_MMAP to hold link level's frame -meta information like timestamp. So what we draw here a frame it's really -the following (from include/linux/if_packet.h): - -/* - Frame structure: - - - Start. Frame must be aligned to TPACKET_ALIGNMENT=16 - - struct tpacket_hdr - - pad to TPACKET_ALIGNMENT=16 - - struct sockaddr_ll - - Gap, chosen so that packet data (Start+tp_net) aligns to - TPACKET_ALIGNMENT=16 - - Start+tp_mac: [ Optional MAC header ] - - Start+tp_net: Packet data, aligned to TPACKET_ALIGNMENT=16. - - Pad to align to TPACKET_ALIGNMENT=16 - */ - - The following are conditions that are checked in packet_set_ring - - tp_block_size must be a multiple of PAGE_SIZE (1) - tp_frame_size must be greater than TPACKET_HDRLEN (obvious) - tp_frame_size must be a multiple of TPACKET_ALIGNMENT - tp_frame_nr must be exactly frames_per_block*tp_block_nr - -Note that tp_block_size should be chosen to be a power of two or there will -be a waste of memory. - --------------------------------------------------------------------------------- -+ Mapping and use of the circular buffer (ring) --------------------------------------------------------------------------------- - -The mapping of the buffer in the user process is done with the conventional -mmap function. Even the circular buffer is compound of several physically -discontiguous blocks of memory, they are contiguous to the user space, hence -just one call to mmap is needed: - - mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); - -If tp_frame_size is a divisor of tp_block_size frames will be -contiguously spaced by tp_frame_size bytes. If not, each -tp_block_size/tp_frame_size frames there will be a gap between -the frames. This is because a frame cannot be spawn across two -blocks. - -To use one socket for capture and transmission, the mapping of both the -RX and TX buffer ring has to be done with one call to mmap: - - ... - setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &foo, sizeof(foo)); - setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &bar, sizeof(bar)); - ... - rx_ring = mmap(0, size * 2, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); - tx_ring = rx_ring + size; - -RX must be the first as the kernel maps the TX ring memory right -after the RX one. - -At the beginning of each frame there is an status field (see -struct tpacket_hdr). If this field is 0 means that the frame is ready -to be used for the kernel, If not, there is a frame the user can read -and the following flags apply: - -+++ Capture process: - from include/linux/if_packet.h - - #define TP_STATUS_COPY (1 << 1) - #define TP_STATUS_LOSING (1 << 2) - #define TP_STATUS_CSUMNOTREADY (1 << 3) - #define TP_STATUS_CSUM_VALID (1 << 7) - -TP_STATUS_COPY : This flag indicates that the frame (and associated - meta information) has been truncated because it's - larger than tp_frame_size. This packet can be - read entirely with recvfrom(). - - In order to make this work it must to be - enabled previously with setsockopt() and - the PACKET_COPY_THRESH option. - - The number of frames that can be buffered to - be read with recvfrom is limited like a normal socket. - See the SO_RCVBUF option in the socket (7) man page. - -TP_STATUS_LOSING : indicates there were packet drops from last time - statistics where checked with getsockopt() and - the PACKET_STATISTICS option. - -TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which - its checksum will be done in hardware. So while - reading the packet we should not try to check the - checksum. - -TP_STATUS_CSUM_VALID : This flag indicates that at least the transport - header checksum of the packet has been already - validated on the kernel side. If the flag is not set - then we are free to check the checksum by ourselves - provided that TP_STATUS_CSUMNOTREADY is also not set. - -for convenience there are also the following defines: - - #define TP_STATUS_KERNEL 0 - #define TP_STATUS_USER 1 - -The kernel initializes all frames to TP_STATUS_KERNEL, when the kernel -receives a packet it puts in the buffer and updates the status with -at least the TP_STATUS_USER flag. Then the user can read the packet, -once the packet is read the user must zero the status field, so the kernel -can use again that frame buffer. - -The user can use poll (any other variant should apply too) to check if new -packets are in the ring: - - struct pollfd pfd; - - pfd.fd = fd; - pfd.revents = 0; - pfd.events = POLLIN|POLLRDNORM|POLLERR; - - if (status == TP_STATUS_KERNEL) - retval = poll(&pfd, 1, timeout); - -It doesn't incur in a race condition to first check the status value and -then poll for frames. - -++ Transmission process -Those defines are also used for transmission: - - #define TP_STATUS_AVAILABLE 0 // Frame is available - #define TP_STATUS_SEND_REQUEST 1 // Frame will be sent on next send() - #define TP_STATUS_SENDING 2 // Frame is currently in transmission - #define TP_STATUS_WRONG_FORMAT 4 // Frame format is not correct - -First, the kernel initializes all frames to TP_STATUS_AVAILABLE. To send a -packet, the user fills a data buffer of an available frame, sets tp_len to -current data buffer size and sets its status field to TP_STATUS_SEND_REQUEST. -This can be done on multiple frames. Once the user is ready to transmit, it -calls send(). Then all buffers with status equal to TP_STATUS_SEND_REQUEST are -forwarded to the network device. The kernel updates each status of sent -frames with TP_STATUS_SENDING until the end of transfer. -At the end of each transfer, buffer status returns to TP_STATUS_AVAILABLE. - - header->tp_len = in_i_size; - header->tp_status = TP_STATUS_SEND_REQUEST; - retval = send(this->socket, NULL, 0, 0); - -The user can also use poll() to check if a buffer is available: -(status == TP_STATUS_SENDING) - - struct pollfd pfd; - pfd.fd = fd; - pfd.revents = 0; - pfd.events = POLLOUT; - retval = poll(&pfd, 1, timeout); - -------------------------------------------------------------------------------- -+ What TPACKET versions are available and when to use them? -------------------------------------------------------------------------------- - - int val = tpacket_version; - setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val)); - getsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val)); - -where 'tpacket_version' can be TPACKET_V1 (default), TPACKET_V2, TPACKET_V3. - -TPACKET_V1: - - Default if not otherwise specified by setsockopt(2) - - RX_RING, TX_RING available - -TPACKET_V1 --> TPACKET_V2: - - Made 64 bit clean due to unsigned long usage in TPACKET_V1 - structures, thus this also works on 64 bit kernel with 32 bit - userspace and the like - - Timestamp resolution in nanoseconds instead of microseconds - - RX_RING, TX_RING available - - VLAN metadata information available for packets - (TP_STATUS_VLAN_VALID, TP_STATUS_VLAN_TPID_VALID), - in the tpacket2_hdr structure: - - TP_STATUS_VLAN_VALID bit being set into the tp_status field indicates - that the tp_vlan_tci field has valid VLAN TCI value - - TP_STATUS_VLAN_TPID_VALID bit being set into the tp_status field - indicates that the tp_vlan_tpid field has valid VLAN TPID value - - How to switch to TPACKET_V2: - 1. Replace struct tpacket_hdr by struct tpacket2_hdr - 2. Query header len and save - 3. Set protocol version to 2, set up ring as usual - 4. For getting the sockaddr_ll, - use (void *)hdr + TPACKET_ALIGN(hdrlen) instead of - (void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr)) - -TPACKET_V2 --> TPACKET_V3: - - Flexible buffer implementation for RX_RING: - 1. Blocks can be configured with non-static frame-size - 2. Read/poll is at a block-level (as opposed to packet-level) - 3. Added poll timeout to avoid indefinite user-space wait - on idle links - 4. Added user-configurable knobs: - 4.1 block::timeout - 4.2 tpkt_hdr::sk_rxhash - - RX Hash data available in user space - - TX_RING semantics are conceptually similar to TPACKET_V2; - use tpacket3_hdr instead of tpacket2_hdr, and TPACKET3_HDRLEN - instead of TPACKET2_HDRLEN. In the current implementation, - the tp_next_offset field in the tpacket3_hdr MUST be set to - zero, indicating that the ring does not hold variable sized frames. - Packets with non-zero values of tp_next_offset will be dropped. - -------------------------------------------------------------------------------- -+ AF_PACKET fanout mode -------------------------------------------------------------------------------- - -In the AF_PACKET fanout mode, packet reception can be load balanced among -processes. This also works in combination with mmap(2) on packet sockets. - -Currently implemented fanout policies are: - - - PACKET_FANOUT_HASH: schedule to socket by skb's packet hash - - PACKET_FANOUT_LB: schedule to socket by round-robin - - PACKET_FANOUT_CPU: schedule to socket by CPU packet arrives on - - PACKET_FANOUT_RND: schedule to socket by random selection - - PACKET_FANOUT_ROLLOVER: if one socket is full, rollover to another - - PACKET_FANOUT_QM: schedule to socket by skbs recorded queue_mapping - -Minimal example code by David S. Miller (try things like "./test eth0 hash", -"./test eth0 lb", etc.): - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include - -#include - -static const char *device_name; -static int fanout_type; -static int fanout_id; - -#ifndef PACKET_FANOUT -# define PACKET_FANOUT 18 -# define PACKET_FANOUT_HASH 0 -# define PACKET_FANOUT_LB 1 -#endif - -static int setup_socket(void) -{ - int err, fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP)); - struct sockaddr_ll ll; - struct ifreq ifr; - int fanout_arg; - - if (fd < 0) { - perror("socket"); - return EXIT_FAILURE; - } - - memset(&ifr, 0, sizeof(ifr)); - strcpy(ifr.ifr_name, device_name); - err = ioctl(fd, SIOCGIFINDEX, &ifr); - if (err < 0) { - perror("SIOCGIFINDEX"); - return EXIT_FAILURE; - } - - memset(&ll, 0, sizeof(ll)); - ll.sll_family = AF_PACKET; - ll.sll_ifindex = ifr.ifr_ifindex; - err = bind(fd, (struct sockaddr *) &ll, sizeof(ll)); - if (err < 0) { - perror("bind"); - return EXIT_FAILURE; - } - - fanout_arg = (fanout_id | (fanout_type << 16)); - err = setsockopt(fd, SOL_PACKET, PACKET_FANOUT, - &fanout_arg, sizeof(fanout_arg)); - if (err) { - perror("setsockopt"); - return EXIT_FAILURE; - } - - return fd; -} - -static void fanout_thread(void) -{ - int fd = setup_socket(); - int limit = 10000; - - if (fd < 0) - exit(fd); - - while (limit-- > 0) { - char buf[1600]; - int err; - - err = read(fd, buf, sizeof(buf)); - if (err < 0) { - perror("read"); - exit(EXIT_FAILURE); - } - if ((limit % 10) == 0) - fprintf(stdout, "(%d) \n", getpid()); - } - - fprintf(stdout, "%d: Received 10000 packets\n", getpid()); - - close(fd); - exit(0); -} - -int main(int argc, char **argp) -{ - int fd, err; - int i; - - if (argc != 3) { - fprintf(stderr, "Usage: %s INTERFACE {hash|lb}\n", argp[0]); - return EXIT_FAILURE; - } - - if (!strcmp(argp[2], "hash")) - fanout_type = PACKET_FANOUT_HASH; - else if (!strcmp(argp[2], "lb")) - fanout_type = PACKET_FANOUT_LB; - else { - fprintf(stderr, "Unknown fanout type [%s]\n", argp[2]); - exit(EXIT_FAILURE); - } - - device_name = argp[1]; - fanout_id = getpid() & 0xffff; - - for (i = 0; i < 4; i++) { - pid_t pid = fork(); - - switch (pid) { - case 0: - fanout_thread(); - - case -1: - perror("fork"); - exit(EXIT_FAILURE); - } - } - - for (i = 0; i < 4; i++) { - int status; - - wait(&status); - } - - return 0; -} - -------------------------------------------------------------------------------- -+ AF_PACKET TPACKET_V3 example -------------------------------------------------------------------------------- - -AF_PACKET's TPACKET_V3 ring buffer can be configured to use non-static frame -sizes by doing it's own memory management. It is based on blocks where polling -works on a per block basis instead of per ring as in TPACKET_V2 and predecessor. - -It is said that TPACKET_V3 brings the following benefits: - *) ~15 - 20% reduction in CPU-usage - *) ~20% increase in packet capture rate - *) ~2x increase in packet density - *) Port aggregation analysis - *) Non static frame size to capture entire packet payload - -So it seems to be a good candidate to be used with packet fanout. - -Minimal example code by Daniel Borkmann based on Chetan Loke's lolpcap (compile -it with gcc -Wall -O2 blob.c, and try things like "./a.out eth0", etc.): - -/* Written from scratch, but kernel-to-user space API usage - * dissected from lolpcap: - * Copyright 2011, Chetan Loke - * License: GPL, version 2.0 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef likely -# define likely(x) __builtin_expect(!!(x), 1) -#endif -#ifndef unlikely -# define unlikely(x) __builtin_expect(!!(x), 0) -#endif - -struct block_desc { - uint32_t version; - uint32_t offset_to_priv; - struct tpacket_hdr_v1 h1; -}; - -struct ring { - struct iovec *rd; - uint8_t *map; - struct tpacket_req3 req; -}; - -static unsigned long packets_total = 0, bytes_total = 0; -static sig_atomic_t sigint = 0; - -static void sighandler(int num) -{ - sigint = 1; -} - -static int setup_socket(struct ring *ring, char *netdev) -{ - int err, i, fd, v = TPACKET_V3; - struct sockaddr_ll ll; - unsigned int blocksiz = 1 << 22, framesiz = 1 << 11; - unsigned int blocknum = 64; - - fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); - if (fd < 0) { - perror("socket"); - exit(1); - } - - err = setsockopt(fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v)); - if (err < 0) { - perror("setsockopt"); - exit(1); - } - - memset(&ring->req, 0, sizeof(ring->req)); - ring->req.tp_block_size = blocksiz; - ring->req.tp_frame_size = framesiz; - ring->req.tp_block_nr = blocknum; - ring->req.tp_frame_nr = (blocksiz * blocknum) / framesiz; - ring->req.tp_retire_blk_tov = 60; - ring->req.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH; - - err = setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &ring->req, - sizeof(ring->req)); - if (err < 0) { - perror("setsockopt"); - exit(1); - } - - ring->map = mmap(NULL, ring->req.tp_block_size * ring->req.tp_block_nr, - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, 0); - if (ring->map == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - ring->rd = malloc(ring->req.tp_block_nr * sizeof(*ring->rd)); - assert(ring->rd); - for (i = 0; i < ring->req.tp_block_nr; ++i) { - ring->rd[i].iov_base = ring->map + (i * ring->req.tp_block_size); - ring->rd[i].iov_len = ring->req.tp_block_size; - } - - memset(&ll, 0, sizeof(ll)); - ll.sll_family = PF_PACKET; - ll.sll_protocol = htons(ETH_P_ALL); - ll.sll_ifindex = if_nametoindex(netdev); - ll.sll_hatype = 0; - ll.sll_pkttype = 0; - ll.sll_halen = 0; - - err = bind(fd, (struct sockaddr *) &ll, sizeof(ll)); - if (err < 0) { - perror("bind"); - exit(1); - } - - return fd; -} - -static void display(struct tpacket3_hdr *ppd) -{ - struct ethhdr *eth = (struct ethhdr *) ((uint8_t *) ppd + ppd->tp_mac); - struct iphdr *ip = (struct iphdr *) ((uint8_t *) eth + ETH_HLEN); - - if (eth->h_proto == htons(ETH_P_IP)) { - struct sockaddr_in ss, sd; - char sbuff[NI_MAXHOST], dbuff[NI_MAXHOST]; - - memset(&ss, 0, sizeof(ss)); - ss.sin_family = PF_INET; - ss.sin_addr.s_addr = ip->saddr; - getnameinfo((struct sockaddr *) &ss, sizeof(ss), - sbuff, sizeof(sbuff), NULL, 0, NI_NUMERICHOST); - - memset(&sd, 0, sizeof(sd)); - sd.sin_family = PF_INET; - sd.sin_addr.s_addr = ip->daddr; - getnameinfo((struct sockaddr *) &sd, sizeof(sd), - dbuff, sizeof(dbuff), NULL, 0, NI_NUMERICHOST); - - printf("%s -> %s, ", sbuff, dbuff); - } - - printf("rxhash: 0x%x\n", ppd->hv1.tp_rxhash); -} - -static void walk_block(struct block_desc *pbd, const int block_num) -{ - int num_pkts = pbd->h1.num_pkts, i; - unsigned long bytes = 0; - struct tpacket3_hdr *ppd; - - ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd + - pbd->h1.offset_to_first_pkt); - for (i = 0; i < num_pkts; ++i) { - bytes += ppd->tp_snaplen; - display(ppd); - - ppd = (struct tpacket3_hdr *) ((uint8_t *) ppd + - ppd->tp_next_offset); - } - - packets_total += num_pkts; - bytes_total += bytes; -} - -static void flush_block(struct block_desc *pbd) -{ - pbd->h1.block_status = TP_STATUS_KERNEL; -} - -static void teardown_socket(struct ring *ring, int fd) -{ - munmap(ring->map, ring->req.tp_block_size * ring->req.tp_block_nr); - free(ring->rd); - close(fd); -} - -int main(int argc, char **argp) -{ - int fd, err; - socklen_t len; - struct ring ring; - struct pollfd pfd; - unsigned int block_num = 0, blocks = 64; - struct block_desc *pbd; - struct tpacket_stats_v3 stats; - - if (argc != 2) { - fprintf(stderr, "Usage: %s INTERFACE\n", argp[0]); - return EXIT_FAILURE; - } - - signal(SIGINT, sighandler); - - memset(&ring, 0, sizeof(ring)); - fd = setup_socket(&ring, argp[argc - 1]); - assert(fd > 0); - - memset(&pfd, 0, sizeof(pfd)); - pfd.fd = fd; - pfd.events = POLLIN | POLLERR; - pfd.revents = 0; - - while (likely(!sigint)) { - pbd = (struct block_desc *) ring.rd[block_num].iov_base; - - if ((pbd->h1.block_status & TP_STATUS_USER) == 0) { - poll(&pfd, 1, -1); - continue; - } - - walk_block(pbd, block_num); - flush_block(pbd); - block_num = (block_num + 1) % blocks; - } - - len = sizeof(stats); - err = getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len); - if (err < 0) { - perror("getsockopt"); - exit(1); - } - - fflush(stdout); - printf("\nReceived %u packets, %lu bytes, %u dropped, freeze_q_cnt: %u\n", - stats.tp_packets, bytes_total, stats.tp_drops, - stats.tp_freeze_q_cnt); - - teardown_socket(&ring, fd); - return 0; -} - -------------------------------------------------------------------------------- -+ PACKET_QDISC_BYPASS -------------------------------------------------------------------------------- - -If there is a requirement to load the network with many packets in a similar -fashion as pktgen does, you might set the following option after socket -creation: - - int one = 1; - setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one)); - -This has the side-effect, that packets sent through PF_PACKET will bypass the -kernel's qdisc layer and are forcedly pushed to the driver directly. Meaning, -packet are not buffered, tc disciplines are ignored, increased loss can occur -and such packets are also not visible to other PF_PACKET sockets anymore. So, -you have been warned; generally, this can be useful for stress testing various -components of a system. - -On default, PACKET_QDISC_BYPASS is disabled and needs to be explicitly enabled -on PF_PACKET sockets. - -------------------------------------------------------------------------------- -+ PACKET_TIMESTAMP -------------------------------------------------------------------------------- - -The PACKET_TIMESTAMP setting determines the source of the timestamp in -the packet meta information for mmap(2)ed RX_RING and TX_RINGs. If your -NIC is capable of timestamping packets in hardware, you can request those -hardware timestamps to be used. Note: you may need to enable the generation -of hardware timestamps with SIOCSHWTSTAMP (see related information from -Documentation/networking/timestamping.txt). - -PACKET_TIMESTAMP accepts the same integer bit field as SO_TIMESTAMPING: - - int req = SOF_TIMESTAMPING_RAW_HARDWARE; - setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req)) - -For the mmap(2)ed ring buffers, such timestamps are stored in the -tpacket{,2,3}_hdr structure's tp_sec and tp_{n,u}sec members. To determine -what kind of timestamp has been reported, the tp_status field is binary |'ed -with the following possible bits ... - - TP_STATUS_TS_RAW_HARDWARE - TP_STATUS_TS_SOFTWARE - -... that are equivalent to its SOF_TIMESTAMPING_* counterparts. For the -RX_RING, if neither is set (i.e. PACKET_TIMESTAMP is not set), then a -software fallback was invoked *within* PF_PACKET's processing code (less -precise). - -Getting timestamps for the TX_RING works as follows: i) fill the ring frames, -ii) call sendto() e.g. in blocking mode, iii) wait for status of relevant -frames to be updated resp. the frame handed over to the application, iv) walk -through the frames to pick up the individual hw/sw timestamps. - -Only (!) if transmit timestamping is enabled, then these bits are combined -with binary | with TP_STATUS_AVAILABLE, so you must check for that in your -application (e.g. !(tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)) -in a first step to see if the frame belongs to the application, and then -one can extract the type of timestamp in a second step from tp_status)! - -If you don't care about them, thus having it disabled, checking for -TP_STATUS_AVAILABLE resp. TP_STATUS_WRONG_FORMAT is sufficient. If in the -TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec -members do not contain a valid value. For TX_RINGs, by default no timestamp -is generated! - -See include/linux/net_tstamp.h and Documentation/networking/timestamping.txt -for more information on hardware timestamps. - -------------------------------------------------------------------------------- -+ Miscellaneous bits -------------------------------------------------------------------------------- - -- Packet sockets work well together with Linux socket filters, thus you also - might want to have a look at Documentation/networking/filter.rst - --------------------------------------------------------------------------------- -+ THANKS --------------------------------------------------------------------------------- - - Jesse Brandeburg, for fixing my grammathical/spelling errors - -- cgit v1.2.3-59-g8ed1b From 6e94eaaa400d66f13e25e071926047ef2e3d21e3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:12 +0200 Subject: docs: networking: convert phonet.txt to ReST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add SPDX header; - adjust title markup; - use copyright symbol; - add notes markups; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Acked-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/packet_mmap.rst | 2 +- Documentation/networking/phonet.rst | 230 +++++++++++++++++++++++++++++++ Documentation/networking/phonet.txt | 214 ---------------------------- MAINTAINERS | 2 +- 5 files changed, 233 insertions(+), 216 deletions(-) create mode 100644 Documentation/networking/phonet.rst delete mode 100644 Documentation/networking/phonet.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 8262b535a83e..e460026331c6 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -90,6 +90,7 @@ Contents: openvswitch operstates packet_mmap + phonet .. only:: subproject and html diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst index 5f213d17652f..884c7222b9e9 100644 --- a/Documentation/networking/packet_mmap.rst +++ b/Documentation/networking/packet_mmap.rst @@ -1076,7 +1076,7 @@ Miscellaneous bits ================== - Packet sockets work well together with Linux socket filters, thus you also - might want to have a look at Documentation/networking/filter.txt + might want to have a look at Documentation/networking/filter.rst THANKS ====== diff --git a/Documentation/networking/phonet.rst b/Documentation/networking/phonet.rst new file mode 100644 index 000000000000..8668dcbc5e6a --- /dev/null +++ b/Documentation/networking/phonet.rst @@ -0,0 +1,230 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +============================ +Linux Phonet protocol family +============================ + +Introduction +------------ + +Phonet is a packet protocol used by Nokia cellular modems for both IPC +and RPC. With the Linux Phonet socket family, Linux host processes can +receive and send messages from/to the modem, or any other external +device attached to the modem. The modem takes care of routing. + +Phonet packets can be exchanged through various hardware connections +depending on the device, such as: + + - USB with the CDC Phonet interface, + - infrared, + - Bluetooth, + - an RS232 serial port (with a dedicated "FBUS" line discipline), + - the SSI bus with some TI OMAP processors. + + +Packets format +-------------- + +Phonet packets have a common header as follows:: + + struct phonethdr { + uint8_t pn_media; /* Media type (link-layer identifier) */ + uint8_t pn_rdev; /* Receiver device ID */ + uint8_t pn_sdev; /* Sender device ID */ + uint8_t pn_res; /* Resource ID or function */ + uint16_t pn_length; /* Big-endian message byte length (minus 6) */ + uint8_t pn_robj; /* Receiver object ID */ + uint8_t pn_sobj; /* Sender object ID */ + }; + +On Linux, the link-layer header includes the pn_media byte (see below). +The next 7 bytes are part of the network-layer header. + +The device ID is split: the 6 higher-order bits constitute the device +address, while the 2 lower-order bits are used for multiplexing, as are +the 8-bit object identifiers. As such, Phonet can be considered as a +network layer with 6 bits of address space and 10 bits for transport +protocol (much like port numbers in IP world). + +The modem always has address number zero. All other device have a their +own 6-bit address. + + +Link layer +---------- + +Phonet links are always point-to-point links. The link layer header +consists of a single Phonet media type byte. It uniquely identifies the +link through which the packet is transmitted, from the modem's +perspective. Each Phonet network device shall prepend and set the media +type byte as appropriate. For convenience, a common phonet_header_ops +link-layer header operations structure is provided. It sets the +media type according to the network device hardware address. + +Linux Phonet network interfaces support a dedicated link layer packets +type (ETH_P_PHONET) which is out of the Ethernet type range. They can +only send and receive Phonet packets. + +The virtual TUN tunnel device driver can also be used for Phonet. This +requires IFF_TUN mode, _without_ the IFF_NO_PI flag. In this case, +there is no link-layer header, so there is no Phonet media type byte. + +Note that Phonet interfaces are not allowed to re-order packets, so +only the (default) Linux FIFO qdisc should be used with them. + + +Network layer +------------- + +The Phonet socket address family maps the Phonet packet header:: + + struct sockaddr_pn { + sa_family_t spn_family; /* AF_PHONET */ + uint8_t spn_obj; /* Object ID */ + uint8_t spn_dev; /* Device ID */ + uint8_t spn_resource; /* Resource or function */ + uint8_t spn_zero[...]; /* Padding */ + }; + +The resource field is only used when sending and receiving; +It is ignored by bind() and getsockname(). + + +Low-level datagram protocol +--------------------------- + +Applications can send Phonet messages using the Phonet datagram socket +protocol from the PF_PHONET family. Each socket is bound to one of the +2^10 object IDs available, and can send and receive packets with any +other peer. + +:: + + struct sockaddr_pn addr = { .spn_family = AF_PHONET, }; + ssize_t len; + socklen_t addrlen = sizeof(addr); + int fd; + + fd = socket(PF_PHONET, SOCK_DGRAM, 0); + bind(fd, (struct sockaddr *)&addr, sizeof(addr)); + /* ... */ + + sendto(fd, msg, msglen, 0, (struct sockaddr *)&addr, sizeof(addr)); + len = recvfrom(fd, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &addrlen); + +This protocol follows the SOCK_DGRAM connection-less semantics. +However, connect() and getpeername() are not supported, as they did +not seem useful with Phonet usages (could be added easily). + + +Resource subscription +--------------------- + +A Phonet datagram socket can be subscribed to any number of 8-bits +Phonet resources, as follow:: + + uint32_t res = 0xXX; + ioctl(fd, SIOCPNADDRESOURCE, &res); + +Subscription is similarly cancelled using the SIOCPNDELRESOURCE I/O +control request, or when the socket is closed. + +Note that no more than one socket can be subcribed to any given +resource at a time. If not, ioctl() will return EBUSY. + + +Phonet Pipe protocol +-------------------- + +The Phonet Pipe protocol is a simple sequenced packets protocol +with end-to-end congestion control. It uses the passive listening +socket paradigm. The listening socket is bound to an unique free object +ID. Each listening socket can handle up to 255 simultaneous +connections, one per accept()'d socket. + +:: + + int lfd, cfd; + + lfd = socket(PF_PHONET, SOCK_SEQPACKET, PN_PROTO_PIPE); + listen (lfd, INT_MAX); + + /* ... */ + cfd = accept(lfd, NULL, NULL); + for (;;) + { + char buf[...]; + ssize_t len = read(cfd, buf, sizeof(buf)); + + /* ... */ + + write(cfd, msg, msglen); + } + +Connections are traditionally established between two endpoints by a +"third party" application. This means that both endpoints are passive. + + +As of Linux kernel version 2.6.39, it is also possible to connect +two endpoints directly, using connect() on the active side. This is +intended to support the newer Nokia Wireless Modem API, as found in +e.g. the Nokia Slim Modem in the ST-Ericsson U8500 platform:: + + struct sockaddr_spn spn; + int fd; + + fd = socket(PF_PHONET, SOCK_SEQPACKET, PN_PROTO_PIPE); + memset(&spn, 0, sizeof(spn)); + spn.spn_family = AF_PHONET; + spn.spn_obj = ...; + spn.spn_dev = ...; + spn.spn_resource = 0xD9; + connect(fd, (struct sockaddr *)&spn, sizeof(spn)); + /* normal I/O here ... */ + close(fd); + + +.. Warning: + + When polling a connected pipe socket for writability, there is an + intrinsic race condition whereby writability might be lost between the + polling and the writing system calls. In this case, the socket will + block until write becomes possible again, unless non-blocking mode + is enabled. + + +The pipe protocol provides two socket options at the SOL_PNPIPE level: + + PNPIPE_ENCAP accepts one integer value (int) of: + + PNPIPE_ENCAP_NONE: + The socket operates normally (default). + + PNPIPE_ENCAP_IP: + The socket is used as a backend for a virtual IP + interface. This requires CAP_NET_ADMIN capability. GPRS data + support on Nokia modems can use this. Note that the socket cannot + be reliably poll()'d or read() from while in this mode. + + PNPIPE_IFINDEX + is a read-only integer value. It contains the + interface index of the network interface created by PNPIPE_ENCAP, + or zero if encapsulation is off. + + PNPIPE_HANDLE + is a read-only integer value. It contains the underlying + identifier ("pipe handle") of the pipe. This is only defined for + socket descriptors that are already connected or being connected. + + +Authors +------- + +Linux Phonet was initially written by Sakari Ailus. + +Other contributors include Mikä Liljeberg, Andras Domokos, +Carlos Chinea and Rémi Denis-Courmont. + +Copyright |copy| 2008 Nokia Corporation. diff --git a/Documentation/networking/phonet.txt b/Documentation/networking/phonet.txt deleted file mode 100644 index 81003581f47a..000000000000 --- a/Documentation/networking/phonet.txt +++ /dev/null @@ -1,214 +0,0 @@ -Linux Phonet protocol family -============================ - -Introduction ------------- - -Phonet is a packet protocol used by Nokia cellular modems for both IPC -and RPC. With the Linux Phonet socket family, Linux host processes can -receive and send messages from/to the modem, or any other external -device attached to the modem. The modem takes care of routing. - -Phonet packets can be exchanged through various hardware connections -depending on the device, such as: - - USB with the CDC Phonet interface, - - infrared, - - Bluetooth, - - an RS232 serial port (with a dedicated "FBUS" line discipline), - - the SSI bus with some TI OMAP processors. - - -Packets format --------------- - -Phonet packets have a common header as follows: - - struct phonethdr { - uint8_t pn_media; /* Media type (link-layer identifier) */ - uint8_t pn_rdev; /* Receiver device ID */ - uint8_t pn_sdev; /* Sender device ID */ - uint8_t pn_res; /* Resource ID or function */ - uint16_t pn_length; /* Big-endian message byte length (minus 6) */ - uint8_t pn_robj; /* Receiver object ID */ - uint8_t pn_sobj; /* Sender object ID */ - }; - -On Linux, the link-layer header includes the pn_media byte (see below). -The next 7 bytes are part of the network-layer header. - -The device ID is split: the 6 higher-order bits constitute the device -address, while the 2 lower-order bits are used for multiplexing, as are -the 8-bit object identifiers. As such, Phonet can be considered as a -network layer with 6 bits of address space and 10 bits for transport -protocol (much like port numbers in IP world). - -The modem always has address number zero. All other device have a their -own 6-bit address. - - -Link layer ----------- - -Phonet links are always point-to-point links. The link layer header -consists of a single Phonet media type byte. It uniquely identifies the -link through which the packet is transmitted, from the modem's -perspective. Each Phonet network device shall prepend and set the media -type byte as appropriate. For convenience, a common phonet_header_ops -link-layer header operations structure is provided. It sets the -media type according to the network device hardware address. - -Linux Phonet network interfaces support a dedicated link layer packets -type (ETH_P_PHONET) which is out of the Ethernet type range. They can -only send and receive Phonet packets. - -The virtual TUN tunnel device driver can also be used for Phonet. This -requires IFF_TUN mode, _without_ the IFF_NO_PI flag. In this case, -there is no link-layer header, so there is no Phonet media type byte. - -Note that Phonet interfaces are not allowed to re-order packets, so -only the (default) Linux FIFO qdisc should be used with them. - - -Network layer -------------- - -The Phonet socket address family maps the Phonet packet header: - - struct sockaddr_pn { - sa_family_t spn_family; /* AF_PHONET */ - uint8_t spn_obj; /* Object ID */ - uint8_t spn_dev; /* Device ID */ - uint8_t spn_resource; /* Resource or function */ - uint8_t spn_zero[...]; /* Padding */ - }; - -The resource field is only used when sending and receiving; -It is ignored by bind() and getsockname(). - - -Low-level datagram protocol ---------------------------- - -Applications can send Phonet messages using the Phonet datagram socket -protocol from the PF_PHONET family. Each socket is bound to one of the -2^10 object IDs available, and can send and receive packets with any -other peer. - - struct sockaddr_pn addr = { .spn_family = AF_PHONET, }; - ssize_t len; - socklen_t addrlen = sizeof(addr); - int fd; - - fd = socket(PF_PHONET, SOCK_DGRAM, 0); - bind(fd, (struct sockaddr *)&addr, sizeof(addr)); - /* ... */ - - sendto(fd, msg, msglen, 0, (struct sockaddr *)&addr, sizeof(addr)); - len = recvfrom(fd, buf, sizeof(buf), 0, - (struct sockaddr *)&addr, &addrlen); - -This protocol follows the SOCK_DGRAM connection-less semantics. -However, connect() and getpeername() are not supported, as they did -not seem useful with Phonet usages (could be added easily). - - -Resource subscription ---------------------- - -A Phonet datagram socket can be subscribed to any number of 8-bits -Phonet resources, as follow: - - uint32_t res = 0xXX; - ioctl(fd, SIOCPNADDRESOURCE, &res); - -Subscription is similarly cancelled using the SIOCPNDELRESOURCE I/O -control request, or when the socket is closed. - -Note that no more than one socket can be subcribed to any given -resource at a time. If not, ioctl() will return EBUSY. - - -Phonet Pipe protocol --------------------- - -The Phonet Pipe protocol is a simple sequenced packets protocol -with end-to-end congestion control. It uses the passive listening -socket paradigm. The listening socket is bound to an unique free object -ID. Each listening socket can handle up to 255 simultaneous -connections, one per accept()'d socket. - - int lfd, cfd; - - lfd = socket(PF_PHONET, SOCK_SEQPACKET, PN_PROTO_PIPE); - listen (lfd, INT_MAX); - - /* ... */ - cfd = accept(lfd, NULL, NULL); - for (;;) - { - char buf[...]; - ssize_t len = read(cfd, buf, sizeof(buf)); - - /* ... */ - - write(cfd, msg, msglen); - } - -Connections are traditionally established between two endpoints by a -"third party" application. This means that both endpoints are passive. - - -As of Linux kernel version 2.6.39, it is also possible to connect -two endpoints directly, using connect() on the active side. This is -intended to support the newer Nokia Wireless Modem API, as found in -e.g. the Nokia Slim Modem in the ST-Ericsson U8500 platform: - - struct sockaddr_spn spn; - int fd; - - fd = socket(PF_PHONET, SOCK_SEQPACKET, PN_PROTO_PIPE); - memset(&spn, 0, sizeof(spn)); - spn.spn_family = AF_PHONET; - spn.spn_obj = ...; - spn.spn_dev = ...; - spn.spn_resource = 0xD9; - connect(fd, (struct sockaddr *)&spn, sizeof(spn)); - /* normal I/O here ... */ - close(fd); - - -WARNING: -When polling a connected pipe socket for writability, there is an -intrinsic race condition whereby writability might be lost between the -polling and the writing system calls. In this case, the socket will -block until write becomes possible again, unless non-blocking mode -is enabled. - - -The pipe protocol provides two socket options at the SOL_PNPIPE level: - - PNPIPE_ENCAP accepts one integer value (int) of: - - PNPIPE_ENCAP_NONE: The socket operates normally (default). - - PNPIPE_ENCAP_IP: The socket is used as a backend for a virtual IP - interface. This requires CAP_NET_ADMIN capability. GPRS data - support on Nokia modems can use this. Note that the socket cannot - be reliably poll()'d or read() from while in this mode. - - PNPIPE_IFINDEX is a read-only integer value. It contains the - interface index of the network interface created by PNPIPE_ENCAP, - or zero if encapsulation is off. - - PNPIPE_HANDLE is a read-only integer value. It contains the underlying - identifier ("pipe handle") of the pipe. This is only defined for - socket descriptors that are already connected or being connected. - - -Authors -------- - -Linux Phonet was initially written by Sakari Ailus. -Other contributors include Mikä Liljeberg, Andras Domokos, -Carlos Chinea and Rémi Denis-Courmont. -Copyright (C) 2008 Nokia Corporation. diff --git a/MAINTAINERS b/MAINTAINERS index 33bfc9e4aead..785f56e5f210 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13262,7 +13262,7 @@ F: drivers/input/joystick/pxrc.c PHONET PROTOCOL M: Remi Denis-Courmont S: Supported -F: Documentation/networking/phonet.txt +F: Documentation/networking/phonet.rst F: include/linux/phonet.h F: include/net/phonet/ F: include/uapi/linux/phonet.h -- cgit v1.2.3-59-g8ed1b From c1e4535f24bcfeef55a7ed409a5f50548e284426 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:13 +0200 Subject: docs: networking: convert pktgen.txt to ReST - add SPDX header; - adjust title markup; - use bold markups on a few places; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/pktgen.rst | 412 ++++++++++++++++++++++++++++++++++++ Documentation/networking/pktgen.txt | 400 ---------------------------------- net/Kconfig | 2 +- net/core/pktgen.c | 2 +- samples/pktgen/README.rst | 2 +- 6 files changed, 416 insertions(+), 403 deletions(-) create mode 100644 Documentation/networking/pktgen.rst delete mode 100644 Documentation/networking/pktgen.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e460026331c6..696181a96e3c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -91,6 +91,7 @@ Contents: operstates packet_mmap phonet + pktgen .. only:: subproject and html diff --git a/Documentation/networking/pktgen.rst b/Documentation/networking/pktgen.rst new file mode 100644 index 000000000000..7afa1c9f1183 --- /dev/null +++ b/Documentation/networking/pktgen.rst @@ -0,0 +1,412 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================== +HOWTO for the linux packet generator +==================================== + +Enable CONFIG_NET_PKTGEN to compile and build pktgen either in-kernel +or as a module. A module is preferred; modprobe pktgen if needed. Once +running, pktgen creates a thread for each CPU with affinity to that CPU. +Monitoring and controlling is done via /proc. It is easiest to select a +suitable sample script and configure that. + +On a dual CPU:: + + ps aux | grep pkt + root 129 0.3 0.0 0 0 ? SW 2003 523:20 [kpktgend_0] + root 130 0.3 0.0 0 0 ? SW 2003 509:50 [kpktgend_1] + + +For monitoring and control pktgen creates:: + + /proc/net/pktgen/pgctrl + /proc/net/pktgen/kpktgend_X + /proc/net/pktgen/ethX + + +Tuning NIC for max performance +============================== + +The default NIC settings are (likely) not tuned for pktgen's artificial +overload type of benchmarking, as this could hurt the normal use-case. + +Specifically increasing the TX ring buffer in the NIC:: + + # ethtool -G ethX tx 1024 + +A larger TX ring can improve pktgen's performance, while it can hurt +in the general case, 1) because the TX ring buffer might get larger +than the CPU's L1/L2 cache, 2) because it allows more queueing in the +NIC HW layer (which is bad for bufferbloat). + +One should hesitate to conclude that packets/descriptors in the HW +TX ring cause delay. Drivers usually delay cleaning up the +ring-buffers for various performance reasons, and packets stalling +the TX ring might just be waiting for cleanup. + +This cleanup issue is specifically the case for the driver ixgbe +(Intel 82599 chip). This driver (ixgbe) combines TX+RX ring cleanups, +and the cleanup interval is affected by the ethtool --coalesce setting +of parameter "rx-usecs". + +For ixgbe use e.g. "30" resulting in approx 33K interrupts/sec (1/30*10^6):: + + # ethtool -C ethX rx-usecs 30 + + +Kernel threads +============== +Pktgen creates a thread for each CPU with affinity to that CPU. +Which is controlled through procfile /proc/net/pktgen/kpktgend_X. + +Example: /proc/net/pktgen/kpktgend_0:: + + Running: + Stopped: eth4@0 + Result: OK: add_device=eth4@0 + +Most important are the devices assigned to the thread. + +The two basic thread commands are: + + * add_device DEVICE@NAME -- adds a single device + * rem_device_all -- remove all associated devices + +When adding a device to a thread, a corresponding procfile is created +which is used for configuring this device. Thus, device names need to +be unique. + +To support adding the same device to multiple threads, which is useful +with multi queue NICs, the device naming scheme is extended with "@": +device@something + +The part after "@" can be anything, but it is custom to use the thread +number. + +Viewing devices +=============== + +The Params section holds configured information. The Current section +holds running statistics. The Result is printed after a run or after +interruption. Example:: + + /proc/net/pktgen/eth4@0 + + Params: count 100000 min_pkt_size: 60 max_pkt_size: 60 + frags: 0 delay: 0 clone_skb: 64 ifname: eth4@0 + flows: 0 flowlen: 0 + queue_map_min: 0 queue_map_max: 0 + dst_min: 192.168.81.2 dst_max: + src_min: src_max: + src_mac: 90:e2:ba:0a:56:b4 dst_mac: 00:1b:21:3c:9d:f8 + udp_src_min: 9 udp_src_max: 109 udp_dst_min: 9 udp_dst_max: 9 + src_mac_count: 0 dst_mac_count: 0 + Flags: UDPSRC_RND NO_TIMESTAMP QUEUE_MAP_CPU + Current: + pkts-sofar: 100000 errors: 0 + started: 623913381008us stopped: 623913396439us idle: 25us + seq_num: 100001 cur_dst_mac_offset: 0 cur_src_mac_offset: 0 + cur_saddr: 192.168.8.3 cur_daddr: 192.168.81.2 + cur_udp_dst: 9 cur_udp_src: 42 + cur_queue_map: 0 + flows: 0 + Result: OK: 15430(c15405+d25) usec, 100000 (60byte,0frags) + 6480562pps 3110Mb/sec (3110669760bps) errors: 0 + + +Configuring devices +=================== +This is done via the /proc interface, and most easily done via pgset +as defined in the sample scripts. +You need to specify PGDEV environment variable to use functions from sample +scripts, i.e.:: + + export PGDEV=/proc/net/pktgen/eth4@0 + source samples/pktgen/functions.sh + +Examples:: + + pg_ctrl start starts injection. + pg_ctrl stop aborts injection. Also, ^C aborts generator. + + pgset "clone_skb 1" sets the number of copies of the same packet + pgset "clone_skb 0" use single SKB for all transmits + pgset "burst 8" uses xmit_more API to queue 8 copies of the same + packet and update HW tx queue tail pointer once. + "burst 1" is the default + pgset "pkt_size 9014" sets packet size to 9014 + pgset "frags 5" packet will consist of 5 fragments + pgset "count 200000" sets number of packets to send, set to zero + for continuous sends until explicitly stopped. + + pgset "delay 5000" adds delay to hard_start_xmit(). nanoseconds + + pgset "dst 10.0.0.1" sets IP destination address + (BEWARE! This generator is very aggressive!) + + pgset "dst_min 10.0.0.1" Same as dst + pgset "dst_max 10.0.0.254" Set the maximum destination IP. + pgset "src_min 10.0.0.1" Set the minimum (or only) source IP. + pgset "src_max 10.0.0.254" Set the maximum source IP. + pgset "dst6 fec0::1" IPV6 destination address + pgset "src6 fec0::2" IPV6 source address + pgset "dstmac 00:00:00:00:00:00" sets MAC destination address + pgset "srcmac 00:00:00:00:00:00" sets MAC source address + + pgset "queue_map_min 0" Sets the min value of tx queue interval + pgset "queue_map_max 7" Sets the max value of tx queue interval, for multiqueue devices + To select queue 1 of a given device, + use queue_map_min=1 and queue_map_max=1 + + pgset "src_mac_count 1" Sets the number of MACs we'll range through. + The 'minimum' MAC is what you set with srcmac. + + pgset "dst_mac_count 1" Sets the number of MACs we'll range through. + The 'minimum' MAC is what you set with dstmac. + + pgset "flag [name]" Set a flag to determine behaviour. Current flags + are: IPSRC_RND # IP source is random (between min/max) + IPDST_RND # IP destination is random + UDPSRC_RND, UDPDST_RND, + MACSRC_RND, MACDST_RND + TXSIZE_RND, IPV6, + MPLS_RND, VID_RND, SVID_RND + FLOW_SEQ, + QUEUE_MAP_RND # queue map random + QUEUE_MAP_CPU # queue map mirrors smp_processor_id() + UDPCSUM, + IPSEC # IPsec encapsulation (needs CONFIG_XFRM) + NODE_ALLOC # node specific memory allocation + NO_TIMESTAMP # disable timestamping + pgset 'flag ![name]' Clear a flag to determine behaviour. + Note that you might need to use single quote in + interactive mode, so that your shell wouldn't expand + the specified flag as a history command. + + pgset "spi [SPI_VALUE]" Set specific SA used to transform packet. + + pgset "udp_src_min 9" set UDP source port min, If < udp_src_max, then + cycle through the port range. + + pgset "udp_src_max 9" set UDP source port max. + pgset "udp_dst_min 9" set UDP destination port min, If < udp_dst_max, then + cycle through the port range. + pgset "udp_dst_max 9" set UDP destination port max. + + pgset "mpls 0001000a,0002000a,0000000a" set MPLS labels (in this example + outer label=16,middle label=32, + inner label=0 (IPv4 NULL)) Note that + there must be no spaces between the + arguments. Leading zeros are required. + Do not set the bottom of stack bit, + that's done automatically. If you do + set the bottom of stack bit, that + indicates that you want to randomly + generate that address and the flag + MPLS_RND will be turned on. You + can have any mix of random and fixed + labels in the label stack. + + pgset "mpls 0" turn off mpls (or any invalid argument works too!) + + pgset "vlan_id 77" set VLAN ID 0-4095 + pgset "vlan_p 3" set priority bit 0-7 (default 0) + pgset "vlan_cfi 0" set canonical format identifier 0-1 (default 0) + + pgset "svlan_id 22" set SVLAN ID 0-4095 + pgset "svlan_p 3" set priority bit 0-7 (default 0) + pgset "svlan_cfi 0" set canonical format identifier 0-1 (default 0) + + pgset "vlan_id 9999" > 4095 remove vlan and svlan tags + pgset "svlan 9999" > 4095 remove svlan tag + + + pgset "tos XX" set former IPv4 TOS field (e.g. "tos 28" for AF11 no ECN, default 00) + pgset "traffic_class XX" set former IPv6 TRAFFIC CLASS (e.g. "traffic_class B8" for EF no ECN, default 00) + + pgset "rate 300M" set rate to 300 Mb/s + pgset "ratep 1000000" set rate to 1Mpps + + pgset "xmit_mode netif_receive" RX inject into stack netif_receive_skb() + Works with "burst" but not with "clone_skb". + Default xmit_mode is "start_xmit". + +Sample scripts +============== + +A collection of tutorial scripts and helpers for pktgen is in the +samples/pktgen directory. The helper parameters.sh file support easy +and consistent parameter parsing across the sample scripts. + +Usage example and help:: + + ./pktgen_sample01_simple.sh -i eth4 -m 00:1B:21:3C:9D:F8 -d 192.168.8.2 + +Usage::: + + ./pktgen_sample01_simple.sh [-vx] -i ethX + + -i : ($DEV) output interface/device (required) + -s : ($PKT_SIZE) packet size + -d : ($DEST_IP) destination IP + -m : ($DST_MAC) destination MAC-addr + -t : ($THREADS) threads to start + -c : ($SKB_CLONE) SKB clones send before alloc new SKB + -b : ($BURST) HW level bursting of SKBs + -v : ($VERBOSE) verbose + -x : ($DEBUG) debug + +The global variables being set are also listed. E.g. the required +interface/device parameter "-i" sets variable $DEV. Copy the +pktgen_sampleXX scripts and modify them to fit your own needs. + +The old scripts:: + + pktgen.conf-1-2 # 1 CPU 2 dev + pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS + pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6 + pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS + pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows. + + +Interrupt affinity +=================== +Note that when adding devices to a specific CPU it is a good idea to +also assign /proc/irq/XX/smp_affinity so that the TX interrupts are bound +to the same CPU. This reduces cache bouncing when freeing skbs. + +Plus using the device flag QUEUE_MAP_CPU, which maps the SKBs TX queue +to the running threads CPU (directly from smp_processor_id()). + +Enable IPsec +============ +Default IPsec transformation with ESP encapsulation plus transport mode +can be enabled by simply setting:: + + pgset "flag IPSEC" + pgset "flows 1" + +To avoid breaking existing testbed scripts for using AH type and tunnel mode, +you can use "pgset spi SPI_VALUE" to specify which transformation mode +to employ. + + +Current commands and configuration options +========================================== + +**Pgcontrol commands**:: + + start + stop + reset + +**Thread commands**:: + + add_device + rem_device_all + + +**Device commands**:: + + count + clone_skb + burst + debug + + frags + delay + + src_mac_count + dst_mac_count + + pkt_size + min_pkt_size + max_pkt_size + + queue_map_min + queue_map_max + skb_priority + + tos (ipv4) + traffic_class (ipv6) + + mpls + + udp_src_min + udp_src_max + + udp_dst_min + udp_dst_max + + node + + flag + IPSRC_RND + IPDST_RND + UDPSRC_RND + UDPDST_RND + MACSRC_RND + MACDST_RND + TXSIZE_RND + IPV6 + MPLS_RND + VID_RND + SVID_RND + FLOW_SEQ + QUEUE_MAP_RND + QUEUE_MAP_CPU + UDPCSUM + IPSEC + NODE_ALLOC + NO_TIMESTAMP + + spi (ipsec) + + dst_min + dst_max + + src_min + src_max + + dst_mac + src_mac + + clear_counters + + src6 + dst6 + dst6_max + dst6_min + + flows + flowlen + + rate + ratep + + xmit_mode + + vlan_cfi + vlan_id + vlan_p + + svlan_cfi + svlan_id + svlan_p + + +References: + +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/ +- tp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/ + +Paper from Linux-Kongress in Erlangen 2004. +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf + +Thanks to: + +Grant Grundler for testing on IA-64 and parisc, Harald Welte, Lennert Buytenhek +Stephen Hemminger, Andi Kleen, Dave Miller and many others. + + +Good luck with the linux net-development. diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt deleted file mode 100644 index d2fd78f85aa4..000000000000 --- a/Documentation/networking/pktgen.txt +++ /dev/null @@ -1,400 +0,0 @@ - - - HOWTO for the linux packet generator - ------------------------------------ - -Enable CONFIG_NET_PKTGEN to compile and build pktgen either in-kernel -or as a module. A module is preferred; modprobe pktgen if needed. Once -running, pktgen creates a thread for each CPU with affinity to that CPU. -Monitoring and controlling is done via /proc. It is easiest to select a -suitable sample script and configure that. - -On a dual CPU: - -ps aux | grep pkt -root 129 0.3 0.0 0 0 ? SW 2003 523:20 [kpktgend_0] -root 130 0.3 0.0 0 0 ? SW 2003 509:50 [kpktgend_1] - - -For monitoring and control pktgen creates: - /proc/net/pktgen/pgctrl - /proc/net/pktgen/kpktgend_X - /proc/net/pktgen/ethX - - -Tuning NIC for max performance -============================== - -The default NIC settings are (likely) not tuned for pktgen's artificial -overload type of benchmarking, as this could hurt the normal use-case. - -Specifically increasing the TX ring buffer in the NIC: - # ethtool -G ethX tx 1024 - -A larger TX ring can improve pktgen's performance, while it can hurt -in the general case, 1) because the TX ring buffer might get larger -than the CPU's L1/L2 cache, 2) because it allows more queueing in the -NIC HW layer (which is bad for bufferbloat). - -One should hesitate to conclude that packets/descriptors in the HW -TX ring cause delay. Drivers usually delay cleaning up the -ring-buffers for various performance reasons, and packets stalling -the TX ring might just be waiting for cleanup. - -This cleanup issue is specifically the case for the driver ixgbe -(Intel 82599 chip). This driver (ixgbe) combines TX+RX ring cleanups, -and the cleanup interval is affected by the ethtool --coalesce setting -of parameter "rx-usecs". - -For ixgbe use e.g. "30" resulting in approx 33K interrupts/sec (1/30*10^6): - # ethtool -C ethX rx-usecs 30 - - -Kernel threads -============== -Pktgen creates a thread for each CPU with affinity to that CPU. -Which is controlled through procfile /proc/net/pktgen/kpktgend_X. - -Example: /proc/net/pktgen/kpktgend_0 - - Running: - Stopped: eth4@0 - Result: OK: add_device=eth4@0 - -Most important are the devices assigned to the thread. - -The two basic thread commands are: - * add_device DEVICE@NAME -- adds a single device - * rem_device_all -- remove all associated devices - -When adding a device to a thread, a corresponding procfile is created -which is used for configuring this device. Thus, device names need to -be unique. - -To support adding the same device to multiple threads, which is useful -with multi queue NICs, the device naming scheme is extended with "@": - device@something - -The part after "@" can be anything, but it is custom to use the thread -number. - -Viewing devices -=============== - -The Params section holds configured information. The Current section -holds running statistics. The Result is printed after a run or after -interruption. Example: - -/proc/net/pktgen/eth4@0 - - Params: count 100000 min_pkt_size: 60 max_pkt_size: 60 - frags: 0 delay: 0 clone_skb: 64 ifname: eth4@0 - flows: 0 flowlen: 0 - queue_map_min: 0 queue_map_max: 0 - dst_min: 192.168.81.2 dst_max: - src_min: src_max: - src_mac: 90:e2:ba:0a:56:b4 dst_mac: 00:1b:21:3c:9d:f8 - udp_src_min: 9 udp_src_max: 109 udp_dst_min: 9 udp_dst_max: 9 - src_mac_count: 0 dst_mac_count: 0 - Flags: UDPSRC_RND NO_TIMESTAMP QUEUE_MAP_CPU - Current: - pkts-sofar: 100000 errors: 0 - started: 623913381008us stopped: 623913396439us idle: 25us - seq_num: 100001 cur_dst_mac_offset: 0 cur_src_mac_offset: 0 - cur_saddr: 192.168.8.3 cur_daddr: 192.168.81.2 - cur_udp_dst: 9 cur_udp_src: 42 - cur_queue_map: 0 - flows: 0 - Result: OK: 15430(c15405+d25) usec, 100000 (60byte,0frags) - 6480562pps 3110Mb/sec (3110669760bps) errors: 0 - - -Configuring devices -=================== -This is done via the /proc interface, and most easily done via pgset -as defined in the sample scripts. -You need to specify PGDEV environment variable to use functions from sample -scripts, i.e.: -export PGDEV=/proc/net/pktgen/eth4@0 -source samples/pktgen/functions.sh - -Examples: - - pg_ctrl start starts injection. - pg_ctrl stop aborts injection. Also, ^C aborts generator. - - pgset "clone_skb 1" sets the number of copies of the same packet - pgset "clone_skb 0" use single SKB for all transmits - pgset "burst 8" uses xmit_more API to queue 8 copies of the same - packet and update HW tx queue tail pointer once. - "burst 1" is the default - pgset "pkt_size 9014" sets packet size to 9014 - pgset "frags 5" packet will consist of 5 fragments - pgset "count 200000" sets number of packets to send, set to zero - for continuous sends until explicitly stopped. - - pgset "delay 5000" adds delay to hard_start_xmit(). nanoseconds - - pgset "dst 10.0.0.1" sets IP destination address - (BEWARE! This generator is very aggressive!) - - pgset "dst_min 10.0.0.1" Same as dst - pgset "dst_max 10.0.0.254" Set the maximum destination IP. - pgset "src_min 10.0.0.1" Set the minimum (or only) source IP. - pgset "src_max 10.0.0.254" Set the maximum source IP. - pgset "dst6 fec0::1" IPV6 destination address - pgset "src6 fec0::2" IPV6 source address - pgset "dstmac 00:00:00:00:00:00" sets MAC destination address - pgset "srcmac 00:00:00:00:00:00" sets MAC source address - - pgset "queue_map_min 0" Sets the min value of tx queue interval - pgset "queue_map_max 7" Sets the max value of tx queue interval, for multiqueue devices - To select queue 1 of a given device, - use queue_map_min=1 and queue_map_max=1 - - pgset "src_mac_count 1" Sets the number of MACs we'll range through. - The 'minimum' MAC is what you set with srcmac. - - pgset "dst_mac_count 1" Sets the number of MACs we'll range through. - The 'minimum' MAC is what you set with dstmac. - - pgset "flag [name]" Set a flag to determine behaviour. Current flags - are: IPSRC_RND # IP source is random (between min/max) - IPDST_RND # IP destination is random - UDPSRC_RND, UDPDST_RND, - MACSRC_RND, MACDST_RND - TXSIZE_RND, IPV6, - MPLS_RND, VID_RND, SVID_RND - FLOW_SEQ, - QUEUE_MAP_RND # queue map random - QUEUE_MAP_CPU # queue map mirrors smp_processor_id() - UDPCSUM, - IPSEC # IPsec encapsulation (needs CONFIG_XFRM) - NODE_ALLOC # node specific memory allocation - NO_TIMESTAMP # disable timestamping - pgset 'flag ![name]' Clear a flag to determine behaviour. - Note that you might need to use single quote in - interactive mode, so that your shell wouldn't expand - the specified flag as a history command. - - pgset "spi [SPI_VALUE]" Set specific SA used to transform packet. - - pgset "udp_src_min 9" set UDP source port min, If < udp_src_max, then - cycle through the port range. - - pgset "udp_src_max 9" set UDP source port max. - pgset "udp_dst_min 9" set UDP destination port min, If < udp_dst_max, then - cycle through the port range. - pgset "udp_dst_max 9" set UDP destination port max. - - pgset "mpls 0001000a,0002000a,0000000a" set MPLS labels (in this example - outer label=16,middle label=32, - inner label=0 (IPv4 NULL)) Note that - there must be no spaces between the - arguments. Leading zeros are required. - Do not set the bottom of stack bit, - that's done automatically. If you do - set the bottom of stack bit, that - indicates that you want to randomly - generate that address and the flag - MPLS_RND will be turned on. You - can have any mix of random and fixed - labels in the label stack. - - pgset "mpls 0" turn off mpls (or any invalid argument works too!) - - pgset "vlan_id 77" set VLAN ID 0-4095 - pgset "vlan_p 3" set priority bit 0-7 (default 0) - pgset "vlan_cfi 0" set canonical format identifier 0-1 (default 0) - - pgset "svlan_id 22" set SVLAN ID 0-4095 - pgset "svlan_p 3" set priority bit 0-7 (default 0) - pgset "svlan_cfi 0" set canonical format identifier 0-1 (default 0) - - pgset "vlan_id 9999" > 4095 remove vlan and svlan tags - pgset "svlan 9999" > 4095 remove svlan tag - - - pgset "tos XX" set former IPv4 TOS field (e.g. "tos 28" for AF11 no ECN, default 00) - pgset "traffic_class XX" set former IPv6 TRAFFIC CLASS (e.g. "traffic_class B8" for EF no ECN, default 00) - - pgset "rate 300M" set rate to 300 Mb/s - pgset "ratep 1000000" set rate to 1Mpps - - pgset "xmit_mode netif_receive" RX inject into stack netif_receive_skb() - Works with "burst" but not with "clone_skb". - Default xmit_mode is "start_xmit". - -Sample scripts -============== - -A collection of tutorial scripts and helpers for pktgen is in the -samples/pktgen directory. The helper parameters.sh file support easy -and consistent parameter parsing across the sample scripts. - -Usage example and help: - ./pktgen_sample01_simple.sh -i eth4 -m 00:1B:21:3C:9D:F8 -d 192.168.8.2 - -Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX - -i : ($DEV) output interface/device (required) - -s : ($PKT_SIZE) packet size - -d : ($DEST_IP) destination IP - -m : ($DST_MAC) destination MAC-addr - -t : ($THREADS) threads to start - -c : ($SKB_CLONE) SKB clones send before alloc new SKB - -b : ($BURST) HW level bursting of SKBs - -v : ($VERBOSE) verbose - -x : ($DEBUG) debug - -The global variables being set are also listed. E.g. the required -interface/device parameter "-i" sets variable $DEV. Copy the -pktgen_sampleXX scripts and modify them to fit your own needs. - -The old scripts: - -pktgen.conf-1-2 # 1 CPU 2 dev -pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS -pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6 -pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS -pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows. - - -Interrupt affinity -=================== -Note that when adding devices to a specific CPU it is a good idea to -also assign /proc/irq/XX/smp_affinity so that the TX interrupts are bound -to the same CPU. This reduces cache bouncing when freeing skbs. - -Plus using the device flag QUEUE_MAP_CPU, which maps the SKBs TX queue -to the running threads CPU (directly from smp_processor_id()). - -Enable IPsec -============ -Default IPsec transformation with ESP encapsulation plus transport mode -can be enabled by simply setting: - -pgset "flag IPSEC" -pgset "flows 1" - -To avoid breaking existing testbed scripts for using AH type and tunnel mode, -you can use "pgset spi SPI_VALUE" to specify which transformation mode -to employ. - - -Current commands and configuration options -========================================== - -** Pgcontrol commands: - -start -stop -reset - -** Thread commands: - -add_device -rem_device_all - - -** Device commands: - -count -clone_skb -burst -debug - -frags -delay - -src_mac_count -dst_mac_count - -pkt_size -min_pkt_size -max_pkt_size - -queue_map_min -queue_map_max -skb_priority - -tos (ipv4) -traffic_class (ipv6) - -mpls - -udp_src_min -udp_src_max - -udp_dst_min -udp_dst_max - -node - -flag - IPSRC_RND - IPDST_RND - UDPSRC_RND - UDPDST_RND - MACSRC_RND - MACDST_RND - TXSIZE_RND - IPV6 - MPLS_RND - VID_RND - SVID_RND - FLOW_SEQ - QUEUE_MAP_RND - QUEUE_MAP_CPU - UDPCSUM - IPSEC - NODE_ALLOC - NO_TIMESTAMP - -spi (ipsec) - -dst_min -dst_max - -src_min -src_max - -dst_mac -src_mac - -clear_counters - -src6 -dst6 -dst6_max -dst6_min - -flows -flowlen - -rate -ratep - -xmit_mode - -vlan_cfi -vlan_id -vlan_p - -svlan_cfi -svlan_id -svlan_p - - -References: -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/ -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/ - -Paper from Linux-Kongress in Erlangen 2004. -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf - -Thanks to: -Grant Grundler for testing on IA-64 and parisc, Harald Welte, Lennert Buytenhek -Stephen Hemminger, Andi Kleen, Dave Miller and many others. - - -Good luck with the linux net-development. diff --git a/net/Kconfig b/net/Kconfig index 8b1f85820a6b..c5ba2d180c43 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -344,7 +344,7 @@ config NET_PKTGEN what was just said, you don't need it: say N. Documentation on how to use the packet generator can be found - at . + at . To compile this code as a module, choose M here: the module will be called pktgen. diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 08e2811b5274..b53b6d38c4df 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -56,7 +56,7 @@ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) * * 021124 Finished major redesign and rewrite for new functionality. - * See Documentation/networking/pktgen.txt for how to use this. + * See Documentation/networking/pktgen.rst for how to use this. * * The new operation: * For each CPU one thread/process is created at start. This process checks diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst index 3f6483e8b2df..f9c53ca5cf93 100644 --- a/samples/pktgen/README.rst +++ b/samples/pktgen/README.rst @@ -3,7 +3,7 @@ Sample and benchmark scripts for pktgen (packet generator) This directory contains some pktgen sample and benchmark scripts, that can easily be copied and adjusted for your own use-case. -General doc is located in kernel: Documentation/networking/pktgen.txt +General doc is located in kernel: Documentation/networking/pktgen.rst Helper include files ==================== -- cgit v1.2.3-59-g8ed1b From 32c01266c0aab060550f592b3fe59405be8ab022 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:14 +0200 Subject: docs: networking: convert PLIP.txt to ReST - add SPDX header; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/PLIP.txt | 215 ----------------------------------- Documentation/networking/index.rst | 1 + Documentation/networking/plip.rst | 222 +++++++++++++++++++++++++++++++++++++ drivers/net/plip/Kconfig | 2 +- 4 files changed, 224 insertions(+), 216 deletions(-) delete mode 100644 Documentation/networking/PLIP.txt create mode 100644 Documentation/networking/plip.rst diff --git a/Documentation/networking/PLIP.txt b/Documentation/networking/PLIP.txt deleted file mode 100644 index ad7e3f7c3bbf..000000000000 --- a/Documentation/networking/PLIP.txt +++ /dev/null @@ -1,215 +0,0 @@ -PLIP: The Parallel Line Internet Protocol Device - -Donald Becker (becker@super.org) -I.D.A. Supercomputing Research Center, Bowie MD 20715 - -At some point T. Thorn will probably contribute text, -Tommy Thorn (tthorn@daimi.aau.dk) - -PLIP Introduction ------------------ - -This document describes the parallel port packet pusher for Net/LGX. -This device interface allows a point-to-point connection between two -parallel ports to appear as a IP network interface. - -What is PLIP? -============= - -PLIP is Parallel Line IP, that is, the transportation of IP packages -over a parallel port. In the case of a PC, the obvious choice is the -printer port. PLIP is a non-standard, but [can use] uses the standard -LapLink null-printer cable [can also work in turbo mode, with a PLIP -cable]. [The protocol used to pack IP packages, is a simple one -initiated by Crynwr.] - -Advantages of PLIP -================== - -It's cheap, it's available everywhere, and it's easy. - -The PLIP cable is all that's needed to connect two Linux boxes, and it -can be built for very few bucks. - -Connecting two Linux boxes takes only a second's decision and a few -minutes' work, no need to search for a [supported] netcard. This might -even be especially important in the case of notebooks, where netcards -are not easily available. - -Not requiring a netcard also means that apart from connecting the -cables, everything else is software configuration [which in principle -could be made very easy.] - -Disadvantages of PLIP -===================== - -Doesn't work over a modem, like SLIP and PPP. Limited range, 15 m. -Can only be used to connect three (?) Linux boxes. Doesn't connect to -an existing Ethernet. Isn't standard (not even de facto standard, like -SLIP). - -Performance -=========== - -PLIP easily outperforms Ethernet cards....(ups, I was dreaming, but -it *is* getting late. EOB) - -PLIP driver details -------------------- - -The Linux PLIP driver is an implementation of the original Crynwr protocol, -that uses the parallel port subsystem of the kernel in order to properly -share parallel ports between PLIP and other services. - -IRQs and trigger timeouts -========================= - -When a parallel port used for a PLIP driver has an IRQ configured to it, the -PLIP driver is signaled whenever data is sent to it via the cable, such that -when no data is available, the driver isn't being used. - -However, on some machines it is hard, if not impossible, to configure an IRQ -to a certain parallel port, mainly because it is used by some other device. -On these machines, the PLIP driver can be used in IRQ-less mode, where -the PLIP driver would constantly poll the parallel port for data waiting, -and if such data is available, process it. This mode is less efficient than -the IRQ mode, because the driver has to check the parallel port many times -per second, even when no data at all is sent. Some rough measurements -indicate that there isn't a noticeable performance drop when using IRQ-less -mode as compared to IRQ mode as far as the data transfer speed is involved. -There is a performance drop on the machine hosting the driver. - -When the PLIP driver is used in IRQ mode, the timeout used for triggering a -data transfer (the maximal time the PLIP driver would allow the other side -before announcing a timeout, when trying to handshake a transfer of some -data) is, by default, 500usec. As IRQ delivery is more or less immediate, -this timeout is quite sufficient. - -When in IRQ-less mode, the PLIP driver polls the parallel port HZ times -per second (where HZ is typically 100 on most platforms, and 1024 on an -Alpha, as of this writing). Between two such polls, there are 10^6/HZ usecs. -On an i386, for example, 10^6/100 = 10000usec. It is easy to see that it is -quite possible for the trigger timeout to expire between two such polls, as -the timeout is only 500usec long. As a result, it is required to change the -trigger timeout on the *other* side of a PLIP connection, to about -10^6/HZ usecs. If both sides of a PLIP connection are used in IRQ-less mode, -this timeout is required on both sides. - -It appears that in practice, the trigger timeout can be shorter than in the -above calculation. It isn't an important issue, unless the wire is faulty, -in which case a long timeout would stall the machine when, for whatever -reason, bits are dropped. - -A utility that can perform this change in Linux is plipconfig, which is part -of the net-tools package (its location can be found in the -Documentation/Changes file). An example command would be -'plipconfig plipX trigger 10000', where plipX is the appropriate -PLIP device. - -PLIP hardware interconnection ------------------------------ - -PLIP uses several different data transfer methods. The first (and the -only one implemented in the early version of the code) uses a standard -printer "null" cable to transfer data four bits at a time using -data bit outputs connected to status bit inputs. - -The second data transfer method relies on both machines having -bi-directional parallel ports, rather than output-only ``printer'' -ports. This allows byte-wide transfers and avoids reconstructing -nibbles into bytes, leading to much faster transfers. - -Parallel Transfer Mode 0 Cable -============================== - -The cable for the first transfer mode is a standard -printer "null" cable which transfers data four bits at a time using -data bit outputs of the first port (machine T) connected to the -status bit inputs of the second port (machine R). There are five -status inputs, and they are used as four data inputs and a clock (data -strobe) input, arranged so that the data input bits appear as contiguous -bits with standard status register implementation. - -A cable that implements this protocol is available commercially as a -"Null Printer" or "Turbo Laplink" cable. It can be constructed with -two DB-25 male connectors symmetrically connected as follows: - - STROBE output 1* - D0->ERROR 2 - 15 15 - 2 - D1->SLCT 3 - 13 13 - 3 - D2->PAPOUT 4 - 12 12 - 4 - D3->ACK 5 - 10 10 - 5 - D4->BUSY 6 - 11 11 - 6 - D5,D6,D7 are 7*, 8*, 9* - AUTOFD output 14* - INIT output 16* - SLCTIN 17 - 17 - extra grounds are 18*,19*,20*,21*,22*,23*,24* - GROUND 25 - 25 -* Do not connect these pins on either end - -If the cable you are using has a metallic shield it should be -connected to the metallic DB-25 shell at one end only. - -Parallel Transfer Mode 1 -======================== - -The second data transfer method relies on both machines having -bi-directional parallel ports, rather than output-only ``printer'' -ports. This allows byte-wide transfers, and avoids reconstructing -nibbles into bytes. This cable should not be used on unidirectional -``printer'' (as opposed to ``parallel'') ports or when the machine -isn't configured for PLIP, as it will result in output driver -conflicts and the (unlikely) possibility of damage. - -The cable for this transfer mode should be constructed as follows: - - STROBE->BUSY 1 - 11 - D0->D0 2 - 2 - D1->D1 3 - 3 - D2->D2 4 - 4 - D3->D3 5 - 5 - D4->D4 6 - 6 - D5->D5 7 - 7 - D6->D6 8 - 8 - D7->D7 9 - 9 - INIT -> ACK 16 - 10 - AUTOFD->PAPOUT 14 - 12 - SLCT->SLCTIN 13 - 17 - GND->ERROR 18 - 15 - extra grounds are 19*,20*,21*,22*,23*,24* - GROUND 25 - 25 -* Do not connect these pins on either end - -Once again, if the cable you are using has a metallic shield it should -be connected to the metallic DB-25 shell at one end only. - -PLIP Mode 0 transfer protocol -============================= - -The PLIP driver is compatible with the "Crynwr" parallel port transfer -standard in Mode 0. That standard specifies the following protocol: - - send header nibble '0x8' - count-low octet - count-high octet - ... data octets - checksum octet - -Each octet is sent as - - >4)&0x0F)> - -To start a transfer the transmitting machine outputs a nibble 0x08. -That raises the ACK line, triggering an interrupt in the receiving -machine. The receiving machine disables interrupts and raises its own ACK -line. - -Restated: - -(OUT is bit 0-4, OUT.j is bit j from OUT. IN likewise) -Send_Byte: - OUT := low nibble, OUT.4 := 1 - WAIT FOR IN.4 = 1 - OUT := high nibble, OUT.4 := 0 - WAIT FOR IN.4 = 0 diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 696181a96e3c..18bb10239cad 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -92,6 +92,7 @@ Contents: packet_mmap phonet pktgen + plip .. only:: subproject and html diff --git a/Documentation/networking/plip.rst b/Documentation/networking/plip.rst new file mode 100644 index 000000000000..0eda745050ff --- /dev/null +++ b/Documentation/networking/plip.rst @@ -0,0 +1,222 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================================ +PLIP: The Parallel Line Internet Protocol Device +================================================ + +Donald Becker (becker@super.org) +I.D.A. Supercomputing Research Center, Bowie MD 20715 + +At some point T. Thorn will probably contribute text, +Tommy Thorn (tthorn@daimi.aau.dk) + +PLIP Introduction +----------------- + +This document describes the parallel port packet pusher for Net/LGX. +This device interface allows a point-to-point connection between two +parallel ports to appear as a IP network interface. + +What is PLIP? +============= + +PLIP is Parallel Line IP, that is, the transportation of IP packages +over a parallel port. In the case of a PC, the obvious choice is the +printer port. PLIP is a non-standard, but [can use] uses the standard +LapLink null-printer cable [can also work in turbo mode, with a PLIP +cable]. [The protocol used to pack IP packages, is a simple one +initiated by Crynwr.] + +Advantages of PLIP +================== + +It's cheap, it's available everywhere, and it's easy. + +The PLIP cable is all that's needed to connect two Linux boxes, and it +can be built for very few bucks. + +Connecting two Linux boxes takes only a second's decision and a few +minutes' work, no need to search for a [supported] netcard. This might +even be especially important in the case of notebooks, where netcards +are not easily available. + +Not requiring a netcard also means that apart from connecting the +cables, everything else is software configuration [which in principle +could be made very easy.] + +Disadvantages of PLIP +===================== + +Doesn't work over a modem, like SLIP and PPP. Limited range, 15 m. +Can only be used to connect three (?) Linux boxes. Doesn't connect to +an existing Ethernet. Isn't standard (not even de facto standard, like +SLIP). + +Performance +=========== + +PLIP easily outperforms Ethernet cards....(ups, I was dreaming, but +it *is* getting late. EOB) + +PLIP driver details +------------------- + +The Linux PLIP driver is an implementation of the original Crynwr protocol, +that uses the parallel port subsystem of the kernel in order to properly +share parallel ports between PLIP and other services. + +IRQs and trigger timeouts +========================= + +When a parallel port used for a PLIP driver has an IRQ configured to it, the +PLIP driver is signaled whenever data is sent to it via the cable, such that +when no data is available, the driver isn't being used. + +However, on some machines it is hard, if not impossible, to configure an IRQ +to a certain parallel port, mainly because it is used by some other device. +On these machines, the PLIP driver can be used in IRQ-less mode, where +the PLIP driver would constantly poll the parallel port for data waiting, +and if such data is available, process it. This mode is less efficient than +the IRQ mode, because the driver has to check the parallel port many times +per second, even when no data at all is sent. Some rough measurements +indicate that there isn't a noticeable performance drop when using IRQ-less +mode as compared to IRQ mode as far as the data transfer speed is involved. +There is a performance drop on the machine hosting the driver. + +When the PLIP driver is used in IRQ mode, the timeout used for triggering a +data transfer (the maximal time the PLIP driver would allow the other side +before announcing a timeout, when trying to handshake a transfer of some +data) is, by default, 500usec. As IRQ delivery is more or less immediate, +this timeout is quite sufficient. + +When in IRQ-less mode, the PLIP driver polls the parallel port HZ times +per second (where HZ is typically 100 on most platforms, and 1024 on an +Alpha, as of this writing). Between two such polls, there are 10^6/HZ usecs. +On an i386, for example, 10^6/100 = 10000usec. It is easy to see that it is +quite possible for the trigger timeout to expire between two such polls, as +the timeout is only 500usec long. As a result, it is required to change the +trigger timeout on the *other* side of a PLIP connection, to about +10^6/HZ usecs. If both sides of a PLIP connection are used in IRQ-less mode, +this timeout is required on both sides. + +It appears that in practice, the trigger timeout can be shorter than in the +above calculation. It isn't an important issue, unless the wire is faulty, +in which case a long timeout would stall the machine when, for whatever +reason, bits are dropped. + +A utility that can perform this change in Linux is plipconfig, which is part +of the net-tools package (its location can be found in the +Documentation/Changes file). An example command would be +'plipconfig plipX trigger 10000', where plipX is the appropriate +PLIP device. + +PLIP hardware interconnection +----------------------------- + +PLIP uses several different data transfer methods. The first (and the +only one implemented in the early version of the code) uses a standard +printer "null" cable to transfer data four bits at a time using +data bit outputs connected to status bit inputs. + +The second data transfer method relies on both machines having +bi-directional parallel ports, rather than output-only ``printer`` +ports. This allows byte-wide transfers and avoids reconstructing +nibbles into bytes, leading to much faster transfers. + +Parallel Transfer Mode 0 Cable +============================== + +The cable for the first transfer mode is a standard +printer "null" cable which transfers data four bits at a time using +data bit outputs of the first port (machine T) connected to the +status bit inputs of the second port (machine R). There are five +status inputs, and they are used as four data inputs and a clock (data +strobe) input, arranged so that the data input bits appear as contiguous +bits with standard status register implementation. + +A cable that implements this protocol is available commercially as a +"Null Printer" or "Turbo Laplink" cable. It can be constructed with +two DB-25 male connectors symmetrically connected as follows:: + + STROBE output 1* + D0->ERROR 2 - 15 15 - 2 + D1->SLCT 3 - 13 13 - 3 + D2->PAPOUT 4 - 12 12 - 4 + D3->ACK 5 - 10 10 - 5 + D4->BUSY 6 - 11 11 - 6 + D5,D6,D7 are 7*, 8*, 9* + AUTOFD output 14* + INIT output 16* + SLCTIN 17 - 17 + extra grounds are 18*,19*,20*,21*,22*,23*,24* + GROUND 25 - 25 + + * Do not connect these pins on either end + +If the cable you are using has a metallic shield it should be +connected to the metallic DB-25 shell at one end only. + +Parallel Transfer Mode 1 +======================== + +The second data transfer method relies on both machines having +bi-directional parallel ports, rather than output-only ``printer`` +ports. This allows byte-wide transfers, and avoids reconstructing +nibbles into bytes. This cable should not be used on unidirectional +``printer`` (as opposed to ``parallel``) ports or when the machine +isn't configured for PLIP, as it will result in output driver +conflicts and the (unlikely) possibility of damage. + +The cable for this transfer mode should be constructed as follows:: + + STROBE->BUSY 1 - 11 + D0->D0 2 - 2 + D1->D1 3 - 3 + D2->D2 4 - 4 + D3->D3 5 - 5 + D4->D4 6 - 6 + D5->D5 7 - 7 + D6->D6 8 - 8 + D7->D7 9 - 9 + INIT -> ACK 16 - 10 + AUTOFD->PAPOUT 14 - 12 + SLCT->SLCTIN 13 - 17 + GND->ERROR 18 - 15 + extra grounds are 19*,20*,21*,22*,23*,24* + GROUND 25 - 25 + + * Do not connect these pins on either end + +Once again, if the cable you are using has a metallic shield it should +be connected to the metallic DB-25 shell at one end only. + +PLIP Mode 0 transfer protocol +============================= + +The PLIP driver is compatible with the "Crynwr" parallel port transfer +standard in Mode 0. That standard specifies the following protocol:: + + send header nibble '0x8' + count-low octet + count-high octet + ... data octets + checksum octet + +Each octet is sent as:: + + + >4)&0x0F)> + +To start a transfer the transmitting machine outputs a nibble 0x08. +That raises the ACK line, triggering an interrupt in the receiving +machine. The receiving machine disables interrupts and raises its own ACK +line. + +Restated:: + + (OUT is bit 0-4, OUT.j is bit j from OUT. IN likewise) + Send_Byte: + OUT := low nibble, OUT.4 := 1 + WAIT FOR IN.4 = 1 + OUT := high nibble, OUT.4 := 0 + WAIT FOR IN.4 = 0 diff --git a/drivers/net/plip/Kconfig b/drivers/net/plip/Kconfig index b41035be2d51..e03556d1d0c2 100644 --- a/drivers/net/plip/Kconfig +++ b/drivers/net/plip/Kconfig @@ -21,7 +21,7 @@ config PLIP bits at a time (mode 0) or with special PLIP cables, to be used on bidirectional parallel ports only, which can transmit 8 bits at a time (mode 1); you can find the wiring of these cables in - . The cables can be up to + . The cables can be up to 15m long. Mode 0 works also if one of the machines runs DOS/Windows and has some PLIP software installed, e.g. the Crynwr PLIP packet driver () -- cgit v1.2.3-59-g8ed1b From 71120802ebeda1e645baf673b958978c4000a695 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:15 +0200 Subject: docs: networking: convert ppp_generic.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/ppp_generic.rst | 440 +++++++++++++++++++++++++++++++ Documentation/networking/ppp_generic.txt | 428 ------------------------------ 3 files changed, 441 insertions(+), 428 deletions(-) create mode 100644 Documentation/networking/ppp_generic.rst delete mode 100644 Documentation/networking/ppp_generic.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 18bb10239cad..f89535871481 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -93,6 +93,7 @@ Contents: phonet pktgen plip + ppp_generic .. only:: subproject and html diff --git a/Documentation/networking/ppp_generic.rst b/Documentation/networking/ppp_generic.rst new file mode 100644 index 000000000000..e60504377900 --- /dev/null +++ b/Documentation/networking/ppp_generic.rst @@ -0,0 +1,440 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================== +PPP Generic Driver and Channel Interface +======================================== + + Paul Mackerras + paulus@samba.org + + 7 Feb 2002 + +The generic PPP driver in linux-2.4 provides an implementation of the +functionality which is of use in any PPP implementation, including: + +* the network interface unit (ppp0 etc.) +* the interface to the networking code +* PPP multilink: splitting datagrams between multiple links, and + ordering and combining received fragments +* the interface to pppd, via a /dev/ppp character device +* packet compression and decompression +* TCP/IP header compression and decompression +* detecting network traffic for demand dialling and for idle timeouts +* simple packet filtering + +For sending and receiving PPP frames, the generic PPP driver calls on +the services of PPP ``channels``. A PPP channel encapsulates a +mechanism for transporting PPP frames from one machine to another. A +PPP channel implementation can be arbitrarily complex internally but +has a very simple interface with the generic PPP code: it merely has +to be able to send PPP frames, receive PPP frames, and optionally +handle ioctl requests. Currently there are PPP channel +implementations for asynchronous serial ports, synchronous serial +ports, and for PPP over ethernet. + +This architecture makes it possible to implement PPP multilink in a +natural and straightforward way, by allowing more than one channel to +be linked to each ppp network interface unit. The generic layer is +responsible for splitting datagrams on transmit and recombining them +on receive. + + +PPP channel API +--------------- + +See include/linux/ppp_channel.h for the declaration of the types and +functions used to communicate between the generic PPP layer and PPP +channels. + +Each channel has to provide two functions to the generic PPP layer, +via the ppp_channel.ops pointer: + +* start_xmit() is called by the generic layer when it has a frame to + send. The channel has the option of rejecting the frame for + flow-control reasons. In this case, start_xmit() should return 0 + and the channel should call the ppp_output_wakeup() function at a + later time when it can accept frames again, and the generic layer + will then attempt to retransmit the rejected frame(s). If the frame + is accepted, the start_xmit() function should return 1. + +* ioctl() provides an interface which can be used by a user-space + program to control aspects of the channel's behaviour. This + procedure will be called when a user-space program does an ioctl + system call on an instance of /dev/ppp which is bound to the + channel. (Usually it would only be pppd which would do this.) + +The generic PPP layer provides seven functions to channels: + +* ppp_register_channel() is called when a channel has been created, to + notify the PPP generic layer of its presence. For example, setting + a serial port to the PPPDISC line discipline causes the ppp_async + channel code to call this function. + +* ppp_unregister_channel() is called when a channel is to be + destroyed. For example, the ppp_async channel code calls this when + a hangup is detected on the serial port. + +* ppp_output_wakeup() is called by a channel when it has previously + rejected a call to its start_xmit function, and can now accept more + packets. + +* ppp_input() is called by a channel when it has received a complete + PPP frame. + +* ppp_input_error() is called by a channel when it has detected that a + frame has been lost or dropped (for example, because of a FCS (frame + check sequence) error). + +* ppp_channel_index() returns the channel index assigned by the PPP + generic layer to this channel. The channel should provide some way + (e.g. an ioctl) to transmit this back to user-space, as user-space + will need it to attach an instance of /dev/ppp to this channel. + +* ppp_unit_number() returns the unit number of the ppp network + interface to which this channel is connected, or -1 if the channel + is not connected. + +Connecting a channel to the ppp generic layer is initiated from the +channel code, rather than from the generic layer. The channel is +expected to have some way for a user-level process to control it +independently of the ppp generic layer. For example, with the +ppp_async channel, this is provided by the file descriptor to the +serial port. + +Generally a user-level process will initialize the underlying +communications medium and prepare it to do PPP. For example, with an +async tty, this can involve setting the tty speed and modes, issuing +modem commands, and then going through some sort of dialog with the +remote system to invoke PPP service there. We refer to this process +as ``discovery``. Then the user-level process tells the medium to +become a PPP channel and register itself with the generic PPP layer. +The channel then has to report the channel number assigned to it back +to the user-level process. From that point, the PPP negotiation code +in the PPP daemon (pppd) can take over and perform the PPP +negotiation, accessing the channel through the /dev/ppp interface. + +At the interface to the PPP generic layer, PPP frames are stored in +skbuff structures and start with the two-byte PPP protocol number. +The frame does *not* include the 0xff ``address`` byte or the 0x03 +``control`` byte that are optionally used in async PPP. Nor is there +any escaping of control characters, nor are there any FCS or framing +characters included. That is all the responsibility of the channel +code, if it is needed for the particular medium. That is, the skbuffs +presented to the start_xmit() function contain only the 2-byte +protocol number and the data, and the skbuffs presented to ppp_input() +must be in the same format. + +The channel must provide an instance of a ppp_channel struct to +represent the channel. The channel is free to use the ``private`` field +however it wishes. The channel should initialize the ``mtu`` and +``hdrlen`` fields before calling ppp_register_channel() and not change +them until after ppp_unregister_channel() returns. The ``mtu`` field +represents the maximum size of the data part of the PPP frames, that +is, it does not include the 2-byte protocol number. + +If the channel needs some headroom in the skbuffs presented to it for +transmission (i.e., some space free in the skbuff data area before the +start of the PPP frame), it should set the ``hdrlen`` field of the +ppp_channel struct to the amount of headroom required. The generic +PPP layer will attempt to provide that much headroom but the channel +should still check if there is sufficient headroom and copy the skbuff +if there isn't. + +On the input side, channels should ideally provide at least 2 bytes of +headroom in the skbuffs presented to ppp_input(). The generic PPP +code does not require this but will be more efficient if this is done. + + +Buffering and flow control +-------------------------- + +The generic PPP layer has been designed to minimize the amount of data +that it buffers in the transmit direction. It maintains a queue of +transmit packets for the PPP unit (network interface device) plus a +queue of transmit packets for each attached channel. Normally the +transmit queue for the unit will contain at most one packet; the +exceptions are when pppd sends packets by writing to /dev/ppp, and +when the core networking code calls the generic layer's start_xmit() +function with the queue stopped, i.e. when the generic layer has +called netif_stop_queue(), which only happens on a transmit timeout. +The start_xmit function always accepts and queues the packet which it +is asked to transmit. + +Transmit packets are dequeued from the PPP unit transmit queue and +then subjected to TCP/IP header compression and packet compression +(Deflate or BSD-Compress compression), as appropriate. After this +point the packets can no longer be reordered, as the decompression +algorithms rely on receiving compressed packets in the same order that +they were generated. + +If multilink is not in use, this packet is then passed to the attached +channel's start_xmit() function. If the channel refuses to take +the packet, the generic layer saves it for later transmission. The +generic layer will call the channel's start_xmit() function again +when the channel calls ppp_output_wakeup() or when the core +networking code calls the generic layer's start_xmit() function +again. The generic layer contains no timeout and retransmission +logic; it relies on the core networking code for that. + +If multilink is in use, the generic layer divides the packet into one +or more fragments and puts a multilink header on each fragment. It +decides how many fragments to use based on the length of the packet +and the number of channels which are potentially able to accept a +fragment at the moment. A channel is potentially able to accept a +fragment if it doesn't have any fragments currently queued up for it +to transmit. The channel may still refuse a fragment; in this case +the fragment is queued up for the channel to transmit later. This +scheme has the effect that more fragments are given to higher- +bandwidth channels. It also means that under light load, the generic +layer will tend to fragment large packets across all the channels, +thus reducing latency, while under heavy load, packets will tend to be +transmitted as single fragments, thus reducing the overhead of +fragmentation. + + +SMP safety +---------- + +The PPP generic layer has been designed to be SMP-safe. Locks are +used around accesses to the internal data structures where necessary +to ensure their integrity. As part of this, the generic layer +requires that the channels adhere to certain requirements and in turn +provides certain guarantees to the channels. Essentially the channels +are required to provide the appropriate locking on the ppp_channel +structures that form the basis of the communication between the +channel and the generic layer. This is because the channel provides +the storage for the ppp_channel structure, and so the channel is +required to provide the guarantee that this storage exists and is +valid at the appropriate times. + +The generic layer requires these guarantees from the channel: + +* The ppp_channel object must exist from the time that + ppp_register_channel() is called until after the call to + ppp_unregister_channel() returns. + +* No thread may be in a call to any of ppp_input(), ppp_input_error(), + ppp_output_wakeup(), ppp_channel_index() or ppp_unit_number() for a + channel at the time that ppp_unregister_channel() is called for that + channel. + +* ppp_register_channel() and ppp_unregister_channel() must be called + from process context, not interrupt or softirq/BH context. + +* The remaining generic layer functions may be called at softirq/BH + level but must not be called from a hardware interrupt handler. + +* The generic layer may call the channel start_xmit() function at + softirq/BH level but will not call it at interrupt level. Thus the + start_xmit() function may not block. + +* The generic layer will only call the channel ioctl() function in + process context. + +The generic layer provides these guarantees to the channels: + +* The generic layer will not call the start_xmit() function for a + channel while any thread is already executing in that function for + that channel. + +* The generic layer will not call the ioctl() function for a channel + while any thread is already executing in that function for that + channel. + +* By the time a call to ppp_unregister_channel() returns, no thread + will be executing in a call from the generic layer to that channel's + start_xmit() or ioctl() function, and the generic layer will not + call either of those functions subsequently. + + +Interface to pppd +----------------- + +The PPP generic layer exports a character device interface called +/dev/ppp. This is used by pppd to control PPP interface units and +channels. Although there is only one /dev/ppp, each open instance of +/dev/ppp acts independently and can be attached either to a PPP unit +or a PPP channel. This is achieved using the file->private_data field +to point to a separate object for each open instance of /dev/ppp. In +this way an effect similar to Solaris' clone open is obtained, +allowing us to control an arbitrary number of PPP interfaces and +channels without having to fill up /dev with hundreds of device names. + +When /dev/ppp is opened, a new instance is created which is initially +unattached. Using an ioctl call, it can then be attached to an +existing unit, attached to a newly-created unit, or attached to an +existing channel. An instance attached to a unit can be used to send +and receive PPP control frames, using the read() and write() system +calls, along with poll() if necessary. Similarly, an instance +attached to a channel can be used to send and receive PPP frames on +that channel. + +In multilink terms, the unit represents the bundle, while the channels +represent the individual physical links. Thus, a PPP frame sent by a +write to the unit (i.e., to an instance of /dev/ppp attached to the +unit) will be subject to bundle-level compression and to fragmentation +across the individual links (if multilink is in use). In contrast, a +PPP frame sent by a write to the channel will be sent as-is on that +channel, without any multilink header. + +A channel is not initially attached to any unit. In this state it can +be used for PPP negotiation but not for the transfer of data packets. +It can then be connected to a PPP unit with an ioctl call, which +makes it available to send and receive data packets for that unit. + +The ioctl calls which are available on an instance of /dev/ppp depend +on whether it is unattached, attached to a PPP interface, or attached +to a PPP channel. The ioctl calls which are available on an +unattached instance are: + +* PPPIOCNEWUNIT creates a new PPP interface and makes this /dev/ppp + instance the "owner" of the interface. The argument should point to + an int which is the desired unit number if >= 0, or -1 to assign the + lowest unused unit number. Being the owner of the interface means + that the interface will be shut down if this instance of /dev/ppp is + closed. + +* PPPIOCATTACH attaches this instance to an existing PPP interface. + The argument should point to an int containing the unit number. + This does not make this instance the owner of the PPP interface. + +* PPPIOCATTCHAN attaches this instance to an existing PPP channel. + The argument should point to an int containing the channel number. + +The ioctl calls available on an instance of /dev/ppp attached to a +channel are: + +* PPPIOCCONNECT connects this channel to a PPP interface. The + argument should point to an int containing the interface unit + number. It will return an EINVAL error if the channel is already + connected to an interface, or ENXIO if the requested interface does + not exist. + +* PPPIOCDISCONN disconnects this channel from the PPP interface that + it is connected to. It will return an EINVAL error if the channel + is not connected to an interface. + +* All other ioctl commands are passed to the channel ioctl() function. + +The ioctl calls that are available on an instance that is attached to +an interface unit are: + +* PPPIOCSMRU sets the MRU (maximum receive unit) for the interface. + The argument should point to an int containing the new MRU value. + +* PPPIOCSFLAGS sets flags which control the operation of the + interface. The argument should be a pointer to an int containing + the new flags value. The bits in the flags value that can be set + are: + + ================ ======================================== + SC_COMP_TCP enable transmit TCP header compression + SC_NO_TCP_CCID disable connection-id compression for + TCP header compression + SC_REJ_COMP_TCP disable receive TCP header decompression + SC_CCP_OPEN Compression Control Protocol (CCP) is + open, so inspect CCP packets + SC_CCP_UP CCP is up, may (de)compress packets + SC_LOOP_TRAFFIC send IP traffic to pppd + SC_MULTILINK enable PPP multilink fragmentation on + transmitted packets + SC_MP_SHORTSEQ expect short multilink sequence + numbers on received multilink fragments + SC_MP_XSHORTSEQ transmit short multilink sequence nos. + ================ ======================================== + + The values of these flags are defined in . Note + that the values of the SC_MULTILINK, SC_MP_SHORTSEQ and + SC_MP_XSHORTSEQ bits are ignored if the CONFIG_PPP_MULTILINK option + is not selected. + +* PPPIOCGFLAGS returns the value of the status/control flags for the + interface unit. The argument should point to an int where the ioctl + will store the flags value. As well as the values listed above for + PPPIOCSFLAGS, the following bits may be set in the returned value: + + ================ ========================================= + SC_COMP_RUN CCP compressor is running + SC_DECOMP_RUN CCP decompressor is running + SC_DC_ERROR CCP decompressor detected non-fatal error + SC_DC_FERROR CCP decompressor detected fatal error + ================ ========================================= + +* PPPIOCSCOMPRESS sets the parameters for packet compression or + decompression. The argument should point to a ppp_option_data + structure (defined in ), which contains a + pointer/length pair which should describe a block of memory + containing a CCP option specifying a compression method and its + parameters. The ppp_option_data struct also contains a ``transmit`` + field. If this is 0, the ioctl will affect the receive path, + otherwise the transmit path. + +* PPPIOCGUNIT returns, in the int pointed to by the argument, the unit + number of this interface unit. + +* PPPIOCSDEBUG sets the debug flags for the interface to the value in + the int pointed to by the argument. Only the least significant bit + is used; if this is 1 the generic layer will print some debug + messages during its operation. This is only intended for debugging + the generic PPP layer code; it is generally not helpful for working + out why a PPP connection is failing. + +* PPPIOCGDEBUG returns the debug flags for the interface in the int + pointed to by the argument. + +* PPPIOCGIDLE returns the time, in seconds, since the last data + packets were sent and received. The argument should point to a + ppp_idle structure (defined in ). If the + CONFIG_PPP_FILTER option is enabled, the set of packets which reset + the transmit and receive idle timers is restricted to those which + pass the ``active`` packet filter. + Two versions of this command exist, to deal with user space + expecting times as either 32-bit or 64-bit time_t seconds. + +* PPPIOCSMAXCID sets the maximum connection-ID parameter (and thus the + number of connection slots) for the TCP header compressor and + decompressor. The lower 16 bits of the int pointed to by the + argument specify the maximum connection-ID for the compressor. If + the upper 16 bits of that int are non-zero, they specify the maximum + connection-ID for the decompressor, otherwise the decompressor's + maximum connection-ID is set to 15. + +* PPPIOCSNPMODE sets the network-protocol mode for a given network + protocol. The argument should point to an npioctl struct (defined + in ). The ``protocol`` field gives the PPP protocol + number for the protocol to be affected, and the ``mode`` field + specifies what to do with packets for that protocol: + + ============= ============================================== + NPMODE_PASS normal operation, transmit and receive packets + NPMODE_DROP silently drop packets for this protocol + NPMODE_ERROR drop packets and return an error on transmit + NPMODE_QUEUE queue up packets for transmit, drop received + packets + ============= ============================================== + + At present NPMODE_ERROR and NPMODE_QUEUE have the same effect as + NPMODE_DROP. + +* PPPIOCGNPMODE returns the network-protocol mode for a given + protocol. The argument should point to an npioctl struct with the + ``protocol`` field set to the PPP protocol number for the protocol of + interest. On return the ``mode`` field will be set to the network- + protocol mode for that protocol. + +* PPPIOCSPASS and PPPIOCSACTIVE set the ``pass`` and ``active`` packet + filters. These ioctls are only available if the CONFIG_PPP_FILTER + option is selected. The argument should point to a sock_fprog + structure (defined in ) containing the compiled BPF + instructions for the filter. Packets are dropped if they fail the + ``pass`` filter; otherwise, if they fail the ``active`` filter they are + passed but they do not reset the transmit or receive idle timer. + +* PPPIOCSMRRU enables or disables multilink processing for received + packets and sets the multilink MRRU (maximum reconstructed receive + unit). The argument should point to an int containing the new MRRU + value. If the MRRU value is 0, processing of received multilink + fragments is disabled. This ioctl is only available if the + CONFIG_PPP_MULTILINK option is selected. + +Last modified: 7-feb-2002 diff --git a/Documentation/networking/ppp_generic.txt b/Documentation/networking/ppp_generic.txt deleted file mode 100644 index fd563aff5fc9..000000000000 --- a/Documentation/networking/ppp_generic.txt +++ /dev/null @@ -1,428 +0,0 @@ - PPP Generic Driver and Channel Interface - ---------------------------------------- - - Paul Mackerras - paulus@samba.org - 7 Feb 2002 - -The generic PPP driver in linux-2.4 provides an implementation of the -functionality which is of use in any PPP implementation, including: - -* the network interface unit (ppp0 etc.) -* the interface to the networking code -* PPP multilink: splitting datagrams between multiple links, and - ordering and combining received fragments -* the interface to pppd, via a /dev/ppp character device -* packet compression and decompression -* TCP/IP header compression and decompression -* detecting network traffic for demand dialling and for idle timeouts -* simple packet filtering - -For sending and receiving PPP frames, the generic PPP driver calls on -the services of PPP `channels'. A PPP channel encapsulates a -mechanism for transporting PPP frames from one machine to another. A -PPP channel implementation can be arbitrarily complex internally but -has a very simple interface with the generic PPP code: it merely has -to be able to send PPP frames, receive PPP frames, and optionally -handle ioctl requests. Currently there are PPP channel -implementations for asynchronous serial ports, synchronous serial -ports, and for PPP over ethernet. - -This architecture makes it possible to implement PPP multilink in a -natural and straightforward way, by allowing more than one channel to -be linked to each ppp network interface unit. The generic layer is -responsible for splitting datagrams on transmit and recombining them -on receive. - - -PPP channel API ---------------- - -See include/linux/ppp_channel.h for the declaration of the types and -functions used to communicate between the generic PPP layer and PPP -channels. - -Each channel has to provide two functions to the generic PPP layer, -via the ppp_channel.ops pointer: - -* start_xmit() is called by the generic layer when it has a frame to - send. The channel has the option of rejecting the frame for - flow-control reasons. In this case, start_xmit() should return 0 - and the channel should call the ppp_output_wakeup() function at a - later time when it can accept frames again, and the generic layer - will then attempt to retransmit the rejected frame(s). If the frame - is accepted, the start_xmit() function should return 1. - -* ioctl() provides an interface which can be used by a user-space - program to control aspects of the channel's behaviour. This - procedure will be called when a user-space program does an ioctl - system call on an instance of /dev/ppp which is bound to the - channel. (Usually it would only be pppd which would do this.) - -The generic PPP layer provides seven functions to channels: - -* ppp_register_channel() is called when a channel has been created, to - notify the PPP generic layer of its presence. For example, setting - a serial port to the PPPDISC line discipline causes the ppp_async - channel code to call this function. - -* ppp_unregister_channel() is called when a channel is to be - destroyed. For example, the ppp_async channel code calls this when - a hangup is detected on the serial port. - -* ppp_output_wakeup() is called by a channel when it has previously - rejected a call to its start_xmit function, and can now accept more - packets. - -* ppp_input() is called by a channel when it has received a complete - PPP frame. - -* ppp_input_error() is called by a channel when it has detected that a - frame has been lost or dropped (for example, because of a FCS (frame - check sequence) error). - -* ppp_channel_index() returns the channel index assigned by the PPP - generic layer to this channel. The channel should provide some way - (e.g. an ioctl) to transmit this back to user-space, as user-space - will need it to attach an instance of /dev/ppp to this channel. - -* ppp_unit_number() returns the unit number of the ppp network - interface to which this channel is connected, or -1 if the channel - is not connected. - -Connecting a channel to the ppp generic layer is initiated from the -channel code, rather than from the generic layer. The channel is -expected to have some way for a user-level process to control it -independently of the ppp generic layer. For example, with the -ppp_async channel, this is provided by the file descriptor to the -serial port. - -Generally a user-level process will initialize the underlying -communications medium and prepare it to do PPP. For example, with an -async tty, this can involve setting the tty speed and modes, issuing -modem commands, and then going through some sort of dialog with the -remote system to invoke PPP service there. We refer to this process -as `discovery'. Then the user-level process tells the medium to -become a PPP channel and register itself with the generic PPP layer. -The channel then has to report the channel number assigned to it back -to the user-level process. From that point, the PPP negotiation code -in the PPP daemon (pppd) can take over and perform the PPP -negotiation, accessing the channel through the /dev/ppp interface. - -At the interface to the PPP generic layer, PPP frames are stored in -skbuff structures and start with the two-byte PPP protocol number. -The frame does *not* include the 0xff `address' byte or the 0x03 -`control' byte that are optionally used in async PPP. Nor is there -any escaping of control characters, nor are there any FCS or framing -characters included. That is all the responsibility of the channel -code, if it is needed for the particular medium. That is, the skbuffs -presented to the start_xmit() function contain only the 2-byte -protocol number and the data, and the skbuffs presented to ppp_input() -must be in the same format. - -The channel must provide an instance of a ppp_channel struct to -represent the channel. The channel is free to use the `private' field -however it wishes. The channel should initialize the `mtu' and -`hdrlen' fields before calling ppp_register_channel() and not change -them until after ppp_unregister_channel() returns. The `mtu' field -represents the maximum size of the data part of the PPP frames, that -is, it does not include the 2-byte protocol number. - -If the channel needs some headroom in the skbuffs presented to it for -transmission (i.e., some space free in the skbuff data area before the -start of the PPP frame), it should set the `hdrlen' field of the -ppp_channel struct to the amount of headroom required. The generic -PPP layer will attempt to provide that much headroom but the channel -should still check if there is sufficient headroom and copy the skbuff -if there isn't. - -On the input side, channels should ideally provide at least 2 bytes of -headroom in the skbuffs presented to ppp_input(). The generic PPP -code does not require this but will be more efficient if this is done. - - -Buffering and flow control --------------------------- - -The generic PPP layer has been designed to minimize the amount of data -that it buffers in the transmit direction. It maintains a queue of -transmit packets for the PPP unit (network interface device) plus a -queue of transmit packets for each attached channel. Normally the -transmit queue for the unit will contain at most one packet; the -exceptions are when pppd sends packets by writing to /dev/ppp, and -when the core networking code calls the generic layer's start_xmit() -function with the queue stopped, i.e. when the generic layer has -called netif_stop_queue(), which only happens on a transmit timeout. -The start_xmit function always accepts and queues the packet which it -is asked to transmit. - -Transmit packets are dequeued from the PPP unit transmit queue and -then subjected to TCP/IP header compression and packet compression -(Deflate or BSD-Compress compression), as appropriate. After this -point the packets can no longer be reordered, as the decompression -algorithms rely on receiving compressed packets in the same order that -they were generated. - -If multilink is not in use, this packet is then passed to the attached -channel's start_xmit() function. If the channel refuses to take -the packet, the generic layer saves it for later transmission. The -generic layer will call the channel's start_xmit() function again -when the channel calls ppp_output_wakeup() or when the core -networking code calls the generic layer's start_xmit() function -again. The generic layer contains no timeout and retransmission -logic; it relies on the core networking code for that. - -If multilink is in use, the generic layer divides the packet into one -or more fragments and puts a multilink header on each fragment. It -decides how many fragments to use based on the length of the packet -and the number of channels which are potentially able to accept a -fragment at the moment. A channel is potentially able to accept a -fragment if it doesn't have any fragments currently queued up for it -to transmit. The channel may still refuse a fragment; in this case -the fragment is queued up for the channel to transmit later. This -scheme has the effect that more fragments are given to higher- -bandwidth channels. It also means that under light load, the generic -layer will tend to fragment large packets across all the channels, -thus reducing latency, while under heavy load, packets will tend to be -transmitted as single fragments, thus reducing the overhead of -fragmentation. - - -SMP safety ----------- - -The PPP generic layer has been designed to be SMP-safe. Locks are -used around accesses to the internal data structures where necessary -to ensure their integrity. As part of this, the generic layer -requires that the channels adhere to certain requirements and in turn -provides certain guarantees to the channels. Essentially the channels -are required to provide the appropriate locking on the ppp_channel -structures that form the basis of the communication between the -channel and the generic layer. This is because the channel provides -the storage for the ppp_channel structure, and so the channel is -required to provide the guarantee that this storage exists and is -valid at the appropriate times. - -The generic layer requires these guarantees from the channel: - -* The ppp_channel object must exist from the time that - ppp_register_channel() is called until after the call to - ppp_unregister_channel() returns. - -* No thread may be in a call to any of ppp_input(), ppp_input_error(), - ppp_output_wakeup(), ppp_channel_index() or ppp_unit_number() for a - channel at the time that ppp_unregister_channel() is called for that - channel. - -* ppp_register_channel() and ppp_unregister_channel() must be called - from process context, not interrupt or softirq/BH context. - -* The remaining generic layer functions may be called at softirq/BH - level but must not be called from a hardware interrupt handler. - -* The generic layer may call the channel start_xmit() function at - softirq/BH level but will not call it at interrupt level. Thus the - start_xmit() function may not block. - -* The generic layer will only call the channel ioctl() function in - process context. - -The generic layer provides these guarantees to the channels: - -* The generic layer will not call the start_xmit() function for a - channel while any thread is already executing in that function for - that channel. - -* The generic layer will not call the ioctl() function for a channel - while any thread is already executing in that function for that - channel. - -* By the time a call to ppp_unregister_channel() returns, no thread - will be executing in a call from the generic layer to that channel's - start_xmit() or ioctl() function, and the generic layer will not - call either of those functions subsequently. - - -Interface to pppd ------------------ - -The PPP generic layer exports a character device interface called -/dev/ppp. This is used by pppd to control PPP interface units and -channels. Although there is only one /dev/ppp, each open instance of -/dev/ppp acts independently and can be attached either to a PPP unit -or a PPP channel. This is achieved using the file->private_data field -to point to a separate object for each open instance of /dev/ppp. In -this way an effect similar to Solaris' clone open is obtained, -allowing us to control an arbitrary number of PPP interfaces and -channels without having to fill up /dev with hundreds of device names. - -When /dev/ppp is opened, a new instance is created which is initially -unattached. Using an ioctl call, it can then be attached to an -existing unit, attached to a newly-created unit, or attached to an -existing channel. An instance attached to a unit can be used to send -and receive PPP control frames, using the read() and write() system -calls, along with poll() if necessary. Similarly, an instance -attached to a channel can be used to send and receive PPP frames on -that channel. - -In multilink terms, the unit represents the bundle, while the channels -represent the individual physical links. Thus, a PPP frame sent by a -write to the unit (i.e., to an instance of /dev/ppp attached to the -unit) will be subject to bundle-level compression and to fragmentation -across the individual links (if multilink is in use). In contrast, a -PPP frame sent by a write to the channel will be sent as-is on that -channel, without any multilink header. - -A channel is not initially attached to any unit. In this state it can -be used for PPP negotiation but not for the transfer of data packets. -It can then be connected to a PPP unit with an ioctl call, which -makes it available to send and receive data packets for that unit. - -The ioctl calls which are available on an instance of /dev/ppp depend -on whether it is unattached, attached to a PPP interface, or attached -to a PPP channel. The ioctl calls which are available on an -unattached instance are: - -* PPPIOCNEWUNIT creates a new PPP interface and makes this /dev/ppp - instance the "owner" of the interface. The argument should point to - an int which is the desired unit number if >= 0, or -1 to assign the - lowest unused unit number. Being the owner of the interface means - that the interface will be shut down if this instance of /dev/ppp is - closed. - -* PPPIOCATTACH attaches this instance to an existing PPP interface. - The argument should point to an int containing the unit number. - This does not make this instance the owner of the PPP interface. - -* PPPIOCATTCHAN attaches this instance to an existing PPP channel. - The argument should point to an int containing the channel number. - -The ioctl calls available on an instance of /dev/ppp attached to a -channel are: - -* PPPIOCCONNECT connects this channel to a PPP interface. The - argument should point to an int containing the interface unit - number. It will return an EINVAL error if the channel is already - connected to an interface, or ENXIO if the requested interface does - not exist. - -* PPPIOCDISCONN disconnects this channel from the PPP interface that - it is connected to. It will return an EINVAL error if the channel - is not connected to an interface. - -* All other ioctl commands are passed to the channel ioctl() function. - -The ioctl calls that are available on an instance that is attached to -an interface unit are: - -* PPPIOCSMRU sets the MRU (maximum receive unit) for the interface. - The argument should point to an int containing the new MRU value. - -* PPPIOCSFLAGS sets flags which control the operation of the - interface. The argument should be a pointer to an int containing - the new flags value. The bits in the flags value that can be set - are: - SC_COMP_TCP enable transmit TCP header compression - SC_NO_TCP_CCID disable connection-id compression for - TCP header compression - SC_REJ_COMP_TCP disable receive TCP header decompression - SC_CCP_OPEN Compression Control Protocol (CCP) is - open, so inspect CCP packets - SC_CCP_UP CCP is up, may (de)compress packets - SC_LOOP_TRAFFIC send IP traffic to pppd - SC_MULTILINK enable PPP multilink fragmentation on - transmitted packets - SC_MP_SHORTSEQ expect short multilink sequence - numbers on received multilink fragments - SC_MP_XSHORTSEQ transmit short multilink sequence nos. - - The values of these flags are defined in . Note - that the values of the SC_MULTILINK, SC_MP_SHORTSEQ and - SC_MP_XSHORTSEQ bits are ignored if the CONFIG_PPP_MULTILINK option - is not selected. - -* PPPIOCGFLAGS returns the value of the status/control flags for the - interface unit. The argument should point to an int where the ioctl - will store the flags value. As well as the values listed above for - PPPIOCSFLAGS, the following bits may be set in the returned value: - SC_COMP_RUN CCP compressor is running - SC_DECOMP_RUN CCP decompressor is running - SC_DC_ERROR CCP decompressor detected non-fatal error - SC_DC_FERROR CCP decompressor detected fatal error - -* PPPIOCSCOMPRESS sets the parameters for packet compression or - decompression. The argument should point to a ppp_option_data - structure (defined in ), which contains a - pointer/length pair which should describe a block of memory - containing a CCP option specifying a compression method and its - parameters. The ppp_option_data struct also contains a `transmit' - field. If this is 0, the ioctl will affect the receive path, - otherwise the transmit path. - -* PPPIOCGUNIT returns, in the int pointed to by the argument, the unit - number of this interface unit. - -* PPPIOCSDEBUG sets the debug flags for the interface to the value in - the int pointed to by the argument. Only the least significant bit - is used; if this is 1 the generic layer will print some debug - messages during its operation. This is only intended for debugging - the generic PPP layer code; it is generally not helpful for working - out why a PPP connection is failing. - -* PPPIOCGDEBUG returns the debug flags for the interface in the int - pointed to by the argument. - -* PPPIOCGIDLE returns the time, in seconds, since the last data - packets were sent and received. The argument should point to a - ppp_idle structure (defined in ). If the - CONFIG_PPP_FILTER option is enabled, the set of packets which reset - the transmit and receive idle timers is restricted to those which - pass the `active' packet filter. - Two versions of this command exist, to deal with user space - expecting times as either 32-bit or 64-bit time_t seconds. - -* PPPIOCSMAXCID sets the maximum connection-ID parameter (and thus the - number of connection slots) for the TCP header compressor and - decompressor. The lower 16 bits of the int pointed to by the - argument specify the maximum connection-ID for the compressor. If - the upper 16 bits of that int are non-zero, they specify the maximum - connection-ID for the decompressor, otherwise the decompressor's - maximum connection-ID is set to 15. - -* PPPIOCSNPMODE sets the network-protocol mode for a given network - protocol. The argument should point to an npioctl struct (defined - in ). The `protocol' field gives the PPP protocol - number for the protocol to be affected, and the `mode' field - specifies what to do with packets for that protocol: - - NPMODE_PASS normal operation, transmit and receive packets - NPMODE_DROP silently drop packets for this protocol - NPMODE_ERROR drop packets and return an error on transmit - NPMODE_QUEUE queue up packets for transmit, drop received - packets - - At present NPMODE_ERROR and NPMODE_QUEUE have the same effect as - NPMODE_DROP. - -* PPPIOCGNPMODE returns the network-protocol mode for a given - protocol. The argument should point to an npioctl struct with the - `protocol' field set to the PPP protocol number for the protocol of - interest. On return the `mode' field will be set to the network- - protocol mode for that protocol. - -* PPPIOCSPASS and PPPIOCSACTIVE set the `pass' and `active' packet - filters. These ioctls are only available if the CONFIG_PPP_FILTER - option is selected. The argument should point to a sock_fprog - structure (defined in ) containing the compiled BPF - instructions for the filter. Packets are dropped if they fail the - `pass' filter; otherwise, if they fail the `active' filter they are - passed but they do not reset the transmit or receive idle timer. - -* PPPIOCSMRRU enables or disables multilink processing for received - packets and sets the multilink MRRU (maximum reconstructed receive - unit). The argument should point to an int containing the new MRRU - value. If the MRRU value is 0, processing of received multilink - fragments is disabled. This ioctl is only available if the - CONFIG_PPP_MULTILINK option is selected. - -Last modified: 7-feb-2002 -- cgit v1.2.3-59-g8ed1b From 832619012c972dc1ca0723ad66ffff6e6e4cf5e0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:16 +0200 Subject: docs: networking: convert proc_net_tcp.txt to ReST - add SPDX header; - add a document title; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/proc_net_tcp.rst | 57 +++++++++++++++++++++++++++++++ Documentation/networking/proc_net_tcp.txt | 48 -------------------------- 3 files changed, 58 insertions(+), 48 deletions(-) create mode 100644 Documentation/networking/proc_net_tcp.rst delete mode 100644 Documentation/networking/proc_net_tcp.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index f89535871481..0da7eb0ec85a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -94,6 +94,7 @@ Contents: pktgen plip ppp_generic + proc_net_tcp .. only:: subproject and html diff --git a/Documentation/networking/proc_net_tcp.rst b/Documentation/networking/proc_net_tcp.rst new file mode 100644 index 000000000000..7d9dfe36af45 --- /dev/null +++ b/Documentation/networking/proc_net_tcp.rst @@ -0,0 +1,57 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================ +The proc/net/tcp and proc/net/tcp6 variables +============================================ + +This document describes the interfaces /proc/net/tcp and /proc/net/tcp6. +Note that these interfaces are deprecated in favor of tcp_diag. + +These /proc interfaces provide information about currently active TCP +connections, and are implemented by tcp4_seq_show() in net/ipv4/tcp_ipv4.c +and tcp6_seq_show() in net/ipv6/tcp_ipv6.c, respectively. + +It will first list all listening TCP sockets, and next list all established +TCP connections. A typical entry of /proc/net/tcp would look like this (split +up into 3 parts because of the length of the line):: + + 46: 010310AC:9C4C 030310AC:1770 01 + | | | | | |--> connection state + | | | | |------> remote TCP port number + | | | |-------------> remote IPv4 address + | | |--------------------> local TCP port number + | |---------------------------> local IPv4 address + |----------------------------------> number of entry + + 00000150:00000000 01:00000019 00000000 + | | | | |--> number of unrecovered RTO timeouts + | | | |----------> number of jiffies until timer expires + | | |----------------> timer_active (see below) + | |----------------------> receive-queue + |-------------------------------> transmit-queue + + 1000 0 54165785 4 cd1e6040 25 4 27 3 -1 + | | | | | | | | | |--> slow start size threshold, + | | | | | | | | | or -1 if the threshold + | | | | | | | | | is >= 0xFFFF + | | | | | | | | |----> sending congestion window + | | | | | | | |-------> (ack.quick<<1)|ack.pingpong + | | | | | | |---------> Predicted tick of soft clock + | | | | | | (delayed ACK control data) + | | | | | |------------> retransmit timeout + | | | | |------------------> location of socket in memory + | | | |-----------------------> socket reference count + | | |-----------------------------> inode + | |----------------------------------> unanswered 0-window probes + |---------------------------------------------> uid + +timer_active: + + == ================================================================ + 0 no timer is pending + 1 retransmit-timer is pending + 2 another timer (e.g. delayed ack or keepalive) is pending + 3 this is a socket in TIME_WAIT state. Not all fields will contain + data (or even exist) + 4 zero window probe timer is pending + == ================================================================ diff --git a/Documentation/networking/proc_net_tcp.txt b/Documentation/networking/proc_net_tcp.txt deleted file mode 100644 index 4a79209e77a7..000000000000 --- a/Documentation/networking/proc_net_tcp.txt +++ /dev/null @@ -1,48 +0,0 @@ -This document describes the interfaces /proc/net/tcp and /proc/net/tcp6. -Note that these interfaces are deprecated in favor of tcp_diag. - -These /proc interfaces provide information about currently active TCP -connections, and are implemented by tcp4_seq_show() in net/ipv4/tcp_ipv4.c -and tcp6_seq_show() in net/ipv6/tcp_ipv6.c, respectively. - -It will first list all listening TCP sockets, and next list all established -TCP connections. A typical entry of /proc/net/tcp would look like this (split -up into 3 parts because of the length of the line): - - 46: 010310AC:9C4C 030310AC:1770 01 - | | | | | |--> connection state - | | | | |------> remote TCP port number - | | | |-------------> remote IPv4 address - | | |--------------------> local TCP port number - | |---------------------------> local IPv4 address - |----------------------------------> number of entry - - 00000150:00000000 01:00000019 00000000 - | | | | |--> number of unrecovered RTO timeouts - | | | |----------> number of jiffies until timer expires - | | |----------------> timer_active (see below) - | |----------------------> receive-queue - |-------------------------------> transmit-queue - - 1000 0 54165785 4 cd1e6040 25 4 27 3 -1 - | | | | | | | | | |--> slow start size threshold, - | | | | | | | | | or -1 if the threshold - | | | | | | | | | is >= 0xFFFF - | | | | | | | | |----> sending congestion window - | | | | | | | |-------> (ack.quick<<1)|ack.pingpong - | | | | | | |---------> Predicted tick of soft clock - | | | | | | (delayed ACK control data) - | | | | | |------------> retransmit timeout - | | | | |------------------> location of socket in memory - | | | |-----------------------> socket reference count - | | |-----------------------------> inode - | |----------------------------------> unanswered 0-window probes - |---------------------------------------------> uid - -timer_active: - 0 no timer is pending - 1 retransmit-timer is pending - 2 another timer (e.g. delayed ack or keepalive) is pending - 3 this is a socket in TIME_WAIT state. Not all fields will contain - data (or even exist) - 4 zero window probe timer is pending -- cgit v1.2.3-59-g8ed1b From 66d495d0a5aecd1692f6b5e3190de14f9a31e14b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:17 +0200 Subject: docs: networking: convert radiotap-headers.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/mac80211-injection.rst | 2 +- Documentation/networking/radiotap-headers.rst | 159 ++++++++++++++++++++++++ Documentation/networking/radiotap-headers.txt | 152 ---------------------- include/net/cfg80211.h | 2 +- net/wireless/radiotap.c | 2 +- 6 files changed, 163 insertions(+), 155 deletions(-) create mode 100644 Documentation/networking/radiotap-headers.rst delete mode 100644 Documentation/networking/radiotap-headers.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 0da7eb0ec85a..85bc52d0b3a6 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -95,6 +95,7 @@ Contents: plip ppp_generic proc_net_tcp + radiotap-headers .. only:: subproject and html diff --git a/Documentation/networking/mac80211-injection.rst b/Documentation/networking/mac80211-injection.rst index 75d4edcae852..be65f886ff1f 100644 --- a/Documentation/networking/mac80211-injection.rst +++ b/Documentation/networking/mac80211-injection.rst @@ -13,7 +13,7 @@ following format:: [ payload ] The radiotap format is discussed in -./Documentation/networking/radiotap-headers.txt. +./Documentation/networking/radiotap-headers.rst. Despite many radiotap parameters being currently defined, most only make sense to appear on received packets. The following information is parsed from the diff --git a/Documentation/networking/radiotap-headers.rst b/Documentation/networking/radiotap-headers.rst new file mode 100644 index 000000000000..1a1bd1ec0650 --- /dev/null +++ b/Documentation/networking/radiotap-headers.rst @@ -0,0 +1,159 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +How to use radiotap headers +=========================== + +Pointer to the radiotap include file +------------------------------------ + +Radiotap headers are variable-length and extensible, you can get most of the +information you need to know on them from:: + + ./include/net/ieee80211_radiotap.h + +This document gives an overview and warns on some corner cases. + + +Structure of the header +----------------------- + +There is a fixed portion at the start which contains a u32 bitmap that defines +if the possible argument associated with that bit is present or not. So if b0 +of the it_present member of ieee80211_radiotap_header is set, it means that +the header for argument index 0 (IEEE80211_RADIOTAP_TSFT) is present in the +argument area. + +:: + + < 8-byte ieee80211_radiotap_header > + [ ] + [ ... ] + +At the moment there are only 13 possible argument indexes defined, but in case +we run out of space in the u32 it_present member, it is defined that b31 set +indicates that there is another u32 bitmap following (shown as "possible +argument bitmap extensions..." above), and the start of the arguments is moved +forward 4 bytes each time. + +Note also that the it_len member __le16 is set to the total number of bytes +covered by the ieee80211_radiotap_header and any arguments following. + + +Requirements for arguments +-------------------------- + +After the fixed part of the header, the arguments follow for each argument +index whose matching bit is set in the it_present member of +ieee80211_radiotap_header. + + - the arguments are all stored little-endian! + + - the argument payload for a given argument index has a fixed size. So + IEEE80211_RADIOTAP_TSFT being present always indicates an 8-byte argument is + present. See the comments in ./include/net/ieee80211_radiotap.h for a nice + breakdown of all the argument sizes + + - the arguments must be aligned to a boundary of the argument size using + padding. So a u16 argument must start on the next u16 boundary if it isn't + already on one, a u32 must start on the next u32 boundary and so on. + + - "alignment" is relative to the start of the ieee80211_radiotap_header, ie, + the first byte of the radiotap header. The absolute alignment of that first + byte isn't defined. So even if the whole radiotap header is starting at, eg, + address 0x00000003, still the first byte of the radiotap header is treated as + 0 for alignment purposes. + + - the above point that there may be no absolute alignment for multibyte + entities in the fixed radiotap header or the argument region means that you + have to take special evasive action when trying to access these multibyte + entities. Some arches like Blackfin cannot deal with an attempt to + dereference, eg, a u16 pointer that is pointing to an odd address. Instead + you have to use a kernel API get_unaligned() to dereference the pointer, + which will do it bytewise on the arches that require that. + + - The arguments for a given argument index can be a compound of multiple types + together. For example IEEE80211_RADIOTAP_CHANNEL has an argument payload + consisting of two u16s of total length 4. When this happens, the padding + rule is applied dealing with a u16, NOT dealing with a 4-byte single entity. + + +Example valid radiotap header +----------------------------- + +:: + + 0x00, 0x00, // <-- radiotap version + pad byte + 0x0b, 0x00, // <- radiotap header length + 0x04, 0x0c, 0x00, 0x00, // <-- bitmap + 0x6c, // <-- rate (in 500kHz units) + 0x0c, //<-- tx power + 0x01 //<-- antenna + + +Using the Radiotap Parser +------------------------- + +If you are having to parse a radiotap struct, you can radically simplify the +job by using the radiotap parser that lives in net/wireless/radiotap.c and has +its prototypes available in include/net/cfg80211.h. You use it like this:: + + #include + + /* buf points to the start of the radiotap header part */ + + int MyFunction(u8 * buf, int buflen) + { + int pkt_rate_100kHz = 0, antenna = 0, pwr = 0; + struct ieee80211_radiotap_iterator iterator; + int ret = ieee80211_radiotap_iterator_init(&iterator, buf, buflen); + + while (!ret) { + + ret = ieee80211_radiotap_iterator_next(&iterator); + + if (ret) + continue; + + /* see if this argument is something we can use */ + + switch (iterator.this_arg_index) { + /* + * You must take care when dereferencing iterator.this_arg + * for multibyte types... the pointer is not aligned. Use + * get_unaligned((type *)iterator.this_arg) to dereference + * iterator.this_arg for type "type" safely on all arches. + */ + case IEEE80211_RADIOTAP_RATE: + /* radiotap "rate" u8 is in + * 500kbps units, eg, 0x02=1Mbps + */ + pkt_rate_100kHz = (*iterator.this_arg) * 5; + break; + + case IEEE80211_RADIOTAP_ANTENNA: + /* radiotap uses 0 for 1st ant */ + antenna = *iterator.this_arg); + break; + + case IEEE80211_RADIOTAP_DBM_TX_POWER: + pwr = *iterator.this_arg; + break; + + default: + break; + } + } /* while more rt headers */ + + if (ret != -ENOENT) + return TXRX_DROP; + + /* discard the radiotap header part */ + buf += iterator.max_length; + buflen -= iterator.max_length; + + ... + + } + +Andy Green diff --git a/Documentation/networking/radiotap-headers.txt b/Documentation/networking/radiotap-headers.txt deleted file mode 100644 index 953331c7984f..000000000000 --- a/Documentation/networking/radiotap-headers.txt +++ /dev/null @@ -1,152 +0,0 @@ -How to use radiotap headers -=========================== - -Pointer to the radiotap include file ------------------------------------- - -Radiotap headers are variable-length and extensible, you can get most of the -information you need to know on them from: - -./include/net/ieee80211_radiotap.h - -This document gives an overview and warns on some corner cases. - - -Structure of the header ------------------------ - -There is a fixed portion at the start which contains a u32 bitmap that defines -if the possible argument associated with that bit is present or not. So if b0 -of the it_present member of ieee80211_radiotap_header is set, it means that -the header for argument index 0 (IEEE80211_RADIOTAP_TSFT) is present in the -argument area. - - < 8-byte ieee80211_radiotap_header > - [ ] - [ ... ] - -At the moment there are only 13 possible argument indexes defined, but in case -we run out of space in the u32 it_present member, it is defined that b31 set -indicates that there is another u32 bitmap following (shown as "possible -argument bitmap extensions..." above), and the start of the arguments is moved -forward 4 bytes each time. - -Note also that the it_len member __le16 is set to the total number of bytes -covered by the ieee80211_radiotap_header and any arguments following. - - -Requirements for arguments --------------------------- - -After the fixed part of the header, the arguments follow for each argument -index whose matching bit is set in the it_present member of -ieee80211_radiotap_header. - - - the arguments are all stored little-endian! - - - the argument payload for a given argument index has a fixed size. So - IEEE80211_RADIOTAP_TSFT being present always indicates an 8-byte argument is - present. See the comments in ./include/net/ieee80211_radiotap.h for a nice - breakdown of all the argument sizes - - - the arguments must be aligned to a boundary of the argument size using - padding. So a u16 argument must start on the next u16 boundary if it isn't - already on one, a u32 must start on the next u32 boundary and so on. - - - "alignment" is relative to the start of the ieee80211_radiotap_header, ie, - the first byte of the radiotap header. The absolute alignment of that first - byte isn't defined. So even if the whole radiotap header is starting at, eg, - address 0x00000003, still the first byte of the radiotap header is treated as - 0 for alignment purposes. - - - the above point that there may be no absolute alignment for multibyte - entities in the fixed radiotap header or the argument region means that you - have to take special evasive action when trying to access these multibyte - entities. Some arches like Blackfin cannot deal with an attempt to - dereference, eg, a u16 pointer that is pointing to an odd address. Instead - you have to use a kernel API get_unaligned() to dereference the pointer, - which will do it bytewise on the arches that require that. - - - The arguments for a given argument index can be a compound of multiple types - together. For example IEEE80211_RADIOTAP_CHANNEL has an argument payload - consisting of two u16s of total length 4. When this happens, the padding - rule is applied dealing with a u16, NOT dealing with a 4-byte single entity. - - -Example valid radiotap header ------------------------------ - - 0x00, 0x00, // <-- radiotap version + pad byte - 0x0b, 0x00, // <- radiotap header length - 0x04, 0x0c, 0x00, 0x00, // <-- bitmap - 0x6c, // <-- rate (in 500kHz units) - 0x0c, //<-- tx power - 0x01 //<-- antenna - - -Using the Radiotap Parser -------------------------- - -If you are having to parse a radiotap struct, you can radically simplify the -job by using the radiotap parser that lives in net/wireless/radiotap.c and has -its prototypes available in include/net/cfg80211.h. You use it like this: - -#include - -/* buf points to the start of the radiotap header part */ - -int MyFunction(u8 * buf, int buflen) -{ - int pkt_rate_100kHz = 0, antenna = 0, pwr = 0; - struct ieee80211_radiotap_iterator iterator; - int ret = ieee80211_radiotap_iterator_init(&iterator, buf, buflen); - - while (!ret) { - - ret = ieee80211_radiotap_iterator_next(&iterator); - - if (ret) - continue; - - /* see if this argument is something we can use */ - - switch (iterator.this_arg_index) { - /* - * You must take care when dereferencing iterator.this_arg - * for multibyte types... the pointer is not aligned. Use - * get_unaligned((type *)iterator.this_arg) to dereference - * iterator.this_arg for type "type" safely on all arches. - */ - case IEEE80211_RADIOTAP_RATE: - /* radiotap "rate" u8 is in - * 500kbps units, eg, 0x02=1Mbps - */ - pkt_rate_100kHz = (*iterator.this_arg) * 5; - break; - - case IEEE80211_RADIOTAP_ANTENNA: - /* radiotap uses 0 for 1st ant */ - antenna = *iterator.this_arg); - break; - - case IEEE80211_RADIOTAP_DBM_TX_POWER: - pwr = *iterator.this_arg; - break; - - default: - break; - } - } /* while more rt headers */ - - if (ret != -ENOENT) - return TXRX_DROP; - - /* discard the radiotap header part */ - buf += iterator.max_length; - buflen -= iterator.max_length; - - ... - -} - -Andy Green diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 70e48f66dac8..46ac80423b28 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5211,7 +5211,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, * Radiotap parsing functions -- for controlled injection support * * Implemented in net/wireless/radiotap.c - * Documentation in Documentation/networking/radiotap-headers.txt + * Documentation in Documentation/networking/radiotap-headers.rst */ struct radiotap_align_size { diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 6582d155e2fc..d5e28239e030 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -90,7 +90,7 @@ static const struct ieee80211_radiotap_namespace radiotap_ns = { * iterator.this_arg for type "type" safely on all arches. * * Example code: - * See Documentation/networking/radiotap-headers.txt + * See Documentation/networking/radiotap-headers.rst */ int ieee80211_radiotap_iterator_init( -- cgit v1.2.3-59-g8ed1b From 8c6e172002987202c17756e06e84c7e4d8916c44 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:18 +0200 Subject: docs: networking: convert ray_cs.txt to ReST - add SPDX header; - use copyright symbol; - add a document title; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/ray_cs.rst | 165 ++++++++++++++++++++++++++++++++++++ Documentation/networking/ray_cs.txt | 150 -------------------------------- drivers/net/wireless/Kconfig | 2 +- 4 files changed, 167 insertions(+), 151 deletions(-) create mode 100644 Documentation/networking/ray_cs.rst delete mode 100644 Documentation/networking/ray_cs.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 85bc52d0b3a6..b7e35b0d905c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -96,6 +96,7 @@ Contents: ppp_generic proc_net_tcp radiotap-headers + ray_cs .. only:: subproject and html diff --git a/Documentation/networking/ray_cs.rst b/Documentation/networking/ray_cs.rst new file mode 100644 index 000000000000..9a46d1ae8f20 --- /dev/null +++ b/Documentation/networking/ray_cs.rst @@ -0,0 +1,165 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: + +========================= +Raylink wireless LAN card +========================= + +September 21, 1999 + +Copyright |copy| 1998 Corey Thomas (corey@world.std.com) + +This file is the documentation for the Raylink Wireless LAN card driver for +Linux. The Raylink wireless LAN card is a PCMCIA card which provides IEEE +802.11 compatible wireless network connectivity at 1 and 2 megabits/second. +See http://www.raytheon.com/micro/raylink/ for more information on the Raylink +card. This driver is in early development and does have bugs. See the known +bugs and limitations at the end of this document for more information. +This driver also works with WebGear's Aviator 2.4 and Aviator Pro +wireless LAN cards. + +As of kernel 2.3.18, the ray_cs driver is part of the Linux kernel +source. My web page for the development of ray_cs is at +http://web.ralinktech.com/ralink/Home/Support/Linux.html +and I can be emailed at corey@world.std.com + +The kernel driver is based on ray_cs-1.62.tgz + +The driver at my web page is intended to be used as an add on to +David Hinds pcmcia package. All the command line parameters are +available when compiled as a module. When built into the kernel, only +the essid= string parameter is available via the kernel command line. +This will change after the method of sorting out parameters for all +the PCMCIA drivers is agreed upon. If you must have a built in driver +with nondefault parameters, they can be edited in +/usr/src/linux/drivers/net/pcmcia/ray_cs.c. Searching for module_param +will find them all. + +Information on card services is available at: + + http://pcmcia-cs.sourceforge.net/ + + +Card services user programs are still required for PCMCIA devices. +pcmcia-cs-3.1.1 or greater is required for the kernel version of +the driver. + +Currently, ray_cs is not part of David Hinds card services package, +so the following magic is required. + +At the end of the /etc/pcmcia/config.opts file, add the line: +source ./ray_cs.opts +This will make card services read the ray_cs.opts file +when starting. Create the file /etc/pcmcia/ray_cs.opts containing the +following:: + + #### start of /etc/pcmcia/ray_cs.opts ################### + # Configuration options for Raylink Wireless LAN PCMCIA card + device "ray_cs" + class "network" module "misc/ray_cs" + + card "RayLink PC Card WLAN Adapter" + manfid 0x01a6, 0x0000 + bind "ray_cs" + + module "misc/ray_cs" opts "" + #### end of /etc/pcmcia/ray_cs.opts ##################### + + +To join an existing network with +different parameters, contact the network administrator for the +configuration information, and edit /etc/pcmcia/ray_cs.opts. +Add the parameters below between the empty quotes. + +Parameters for ray_cs driver which may be specified in ray_cs.opts: + +=============== =============== ============================================= +bc integer 0 = normal mode (802.11 timing), + 1 = slow down inter frame timing to allow + operation with older breezecom access + points. + +beacon_period integer beacon period in Kilo-microseconds, + + legal values = must be integer multiple + of hop dwell + + default = 256 + +country integer 1 = USA (default), + 2 = Europe, + 3 = Japan, + 4 = Korea, + 5 = Spain, + 6 = France, + 7 = Israel, + 8 = Australia + +essid string ESS ID - network name to join + + string with maximum length of 32 chars + default value = "ADHOC_ESSID" + +hop_dwell integer hop dwell time in Kilo-microseconds + + legal values = 16,32,64,128(default),256 + +irq_mask integer linux standard 16 bit value 1bit/IRQ + + lsb is IRQ 0, bit 1 is IRQ 1 etc. + Used to restrict choice of IRQ's to use. + Recommended method for controlling + interrupts is in /etc/pcmcia/config.opts + +net_type integer 0 (default) = adhoc network, + 1 = infrastructure + +phy_addr string string containing new MAC address in + hex, must start with x eg + x00008f123456 + +psm integer 0 = continuously active, + 1 = power save mode (not useful yet) + +pc_debug integer (0-5) larger values for more verbose + logging. Replaces ray_debug. + +ray_debug integer Replaced with pc_debug + +ray_mem_speed integer defaults to 500 + +sniffer integer 0 = not sniffer (default), + 1 = sniffer which can be used to record all + network traffic using tcpdump or similar, + but no normal network use is allowed. + +translate integer 0 = no translation (encapsulate frames), + 1 = translation (RFC1042/802.1) +=============== =============== ============================================= + +More on sniffer mode: + +tcpdump does not understand 802.11 headers, so it can't +interpret the contents, but it can record to a file. This is only +useful for debugging 802.11 lowlevel protocols that are not visible to +linux. If you want to watch ftp xfers, or do similar things, you +don't need to use sniffer mode. Also, some packet types are never +sent up by the card, so you will never see them (ack, rts, cts, probe +etc.) There is a simple program (showcap) included in the ray_cs +package which parses the 802.11 headers. + +Known Problems and missing features + + Does not work with non x86 + + Does not work with SMP + + Support for defragmenting frames is not yet debugged, and in + fact is known to not work. I have never encountered a net set + up to fragment, but still, it should be fixed. + + The ioctl support is incomplete. The hardware address cannot be set + using ifconfig yet. If a different hardware address is needed, it may + be set using the phy_addr parameter in ray_cs.opts. This requires + a card insertion to take effect. diff --git a/Documentation/networking/ray_cs.txt b/Documentation/networking/ray_cs.txt deleted file mode 100644 index c0c12307ed9d..000000000000 --- a/Documentation/networking/ray_cs.txt +++ /dev/null @@ -1,150 +0,0 @@ -September 21, 1999 - -Copyright (c) 1998 Corey Thomas (corey@world.std.com) - -This file is the documentation for the Raylink Wireless LAN card driver for -Linux. The Raylink wireless LAN card is a PCMCIA card which provides IEEE -802.11 compatible wireless network connectivity at 1 and 2 megabits/second. -See http://www.raytheon.com/micro/raylink/ for more information on the Raylink -card. This driver is in early development and does have bugs. See the known -bugs and limitations at the end of this document for more information. -This driver also works with WebGear's Aviator 2.4 and Aviator Pro -wireless LAN cards. - -As of kernel 2.3.18, the ray_cs driver is part of the Linux kernel -source. My web page for the development of ray_cs is at -http://web.ralinktech.com/ralink/Home/Support/Linux.html -and I can be emailed at corey@world.std.com - -The kernel driver is based on ray_cs-1.62.tgz - -The driver at my web page is intended to be used as an add on to -David Hinds pcmcia package. All the command line parameters are -available when compiled as a module. When built into the kernel, only -the essid= string parameter is available via the kernel command line. -This will change after the method of sorting out parameters for all -the PCMCIA drivers is agreed upon. If you must have a built in driver -with nondefault parameters, they can be edited in -/usr/src/linux/drivers/net/pcmcia/ray_cs.c. Searching for module_param -will find them all. - -Information on card services is available at: - http://pcmcia-cs.sourceforge.net/ - - -Card services user programs are still required for PCMCIA devices. -pcmcia-cs-3.1.1 or greater is required for the kernel version of -the driver. - -Currently, ray_cs is not part of David Hinds card services package, -so the following magic is required. - -At the end of the /etc/pcmcia/config.opts file, add the line: -source ./ray_cs.opts -This will make card services read the ray_cs.opts file -when starting. Create the file /etc/pcmcia/ray_cs.opts containing the -following: - -#### start of /etc/pcmcia/ray_cs.opts ################### -# Configuration options for Raylink Wireless LAN PCMCIA card -device "ray_cs" - class "network" module "misc/ray_cs" - -card "RayLink PC Card WLAN Adapter" - manfid 0x01a6, 0x0000 - bind "ray_cs" - -module "misc/ray_cs" opts "" -#### end of /etc/pcmcia/ray_cs.opts ##################### - - -To join an existing network with -different parameters, contact the network administrator for the -configuration information, and edit /etc/pcmcia/ray_cs.opts. -Add the parameters below between the empty quotes. - -Parameters for ray_cs driver which may be specified in ray_cs.opts: - -bc integer 0 = normal mode (802.11 timing) - 1 = slow down inter frame timing to allow - operation with older breezecom access - points. - -beacon_period integer beacon period in Kilo-microseconds - legal values = must be integer multiple - of hop dwell - default = 256 - -country integer 1 = USA (default) - 2 = Europe - 3 = Japan - 4 = Korea - 5 = Spain - 6 = France - 7 = Israel - 8 = Australia - -essid string ESS ID - network name to join - string with maximum length of 32 chars - default value = "ADHOC_ESSID" - -hop_dwell integer hop dwell time in Kilo-microseconds - legal values = 16,32,64,128(default),256 - -irq_mask integer linux standard 16 bit value 1bit/IRQ - lsb is IRQ 0, bit 1 is IRQ 1 etc. - Used to restrict choice of IRQ's to use. - Recommended method for controlling - interrupts is in /etc/pcmcia/config.opts - -net_type integer 0 (default) = adhoc network, - 1 = infrastructure - -phy_addr string string containing new MAC address in - hex, must start with x eg - x00008f123456 - -psm integer 0 = continuously active - 1 = power save mode (not useful yet) - -pc_debug integer (0-5) larger values for more verbose - logging. Replaces ray_debug. - -ray_debug integer Replaced with pc_debug - -ray_mem_speed integer defaults to 500 - -sniffer integer 0 = not sniffer (default) - 1 = sniffer which can be used to record all - network traffic using tcpdump or similar, - but no normal network use is allowed. - -translate integer 0 = no translation (encapsulate frames) - 1 = translation (RFC1042/802.1) - - -More on sniffer mode: - -tcpdump does not understand 802.11 headers, so it can't -interpret the contents, but it can record to a file. This is only -useful for debugging 802.11 lowlevel protocols that are not visible to -linux. If you want to watch ftp xfers, or do similar things, you -don't need to use sniffer mode. Also, some packet types are never -sent up by the card, so you will never see them (ack, rts, cts, probe -etc.) There is a simple program (showcap) included in the ray_cs -package which parses the 802.11 headers. - -Known Problems and missing features - - Does not work with non x86 - - Does not work with SMP - - Support for defragmenting frames is not yet debugged, and in - fact is known to not work. I have never encountered a net set - up to fragment, but still, it should be fixed. - - The ioctl support is incomplete. The hardware address cannot be set - using ifconfig yet. If a different hardware address is needed, it may - be set using the phy_addr parameter in ray_cs.opts. This requires - a card insertion to take effect. diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig index 1c98d781ae49..15b0ad171f4c 100644 --- a/drivers/net/wireless/Kconfig +++ b/drivers/net/wireless/Kconfig @@ -57,7 +57,7 @@ config PCMCIA_RAYCS ---help--- Say Y here if you intend to attach an Aviator/Raytheon PCMCIA (PC-card) wireless Ethernet networking card to your computer. - Please read the file for + Please read the file for details. To compile this driver as a module, choose M here: the module will be -- cgit v1.2.3-59-g8ed1b From bad5b6e223e8409c860c0574d5239ee4348f06b3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:19 +0200 Subject: docs: networking: convert rds.txt to ReST - add SPDX header; - add a document title; - mark code blocks and literals as such; - mark tables as such; - mark lists as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/rds.rst | 448 +++++++++++++++++++++++++++++++++++++ Documentation/networking/rds.txt | 423 ---------------------------------- MAINTAINERS | 2 +- 4 files changed, 450 insertions(+), 424 deletions(-) create mode 100644 Documentation/networking/rds.rst delete mode 100644 Documentation/networking/rds.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b7e35b0d905c..e63a2cb2e4cb 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -97,6 +97,7 @@ Contents: proc_net_tcp radiotap-headers ray_cs + rds .. only:: subproject and html diff --git a/Documentation/networking/rds.rst b/Documentation/networking/rds.rst new file mode 100644 index 000000000000..44936c27ab3a --- /dev/null +++ b/Documentation/networking/rds.rst @@ -0,0 +1,448 @@ +.. SPDX-License-Identifier: GPL-2.0 + +== +RDS +=== + +Overview +======== + +This readme tries to provide some background on the hows and whys of RDS, +and will hopefully help you find your way around the code. + +In addition, please see this email about RDS origins: +http://oss.oracle.com/pipermail/rds-devel/2007-November/000228.html + +RDS Architecture +================ + +RDS provides reliable, ordered datagram delivery by using a single +reliable connection between any two nodes in the cluster. This allows +applications to use a single socket to talk to any other process in the +cluster - so in a cluster with N processes you need N sockets, in contrast +to N*N if you use a connection-oriented socket transport like TCP. + +RDS is not Infiniband-specific; it was designed to support different +transports. The current implementation used to support RDS over TCP as well +as IB. + +The high-level semantics of RDS from the application's point of view are + + * Addressing + + RDS uses IPv4 addresses and 16bit port numbers to identify + the end point of a connection. All socket operations that involve + passing addresses between kernel and user space generally + use a struct sockaddr_in. + + The fact that IPv4 addresses are used does not mean the underlying + transport has to be IP-based. In fact, RDS over IB uses a + reliable IB connection; the IP address is used exclusively to + locate the remote node's GID (by ARPing for the given IP). + + The port space is entirely independent of UDP, TCP or any other + protocol. + + * Socket interface + + RDS sockets work *mostly* as you would expect from a BSD + socket. The next section will cover the details. At any rate, + all I/O is performed through the standard BSD socket API. + Some additions like zerocopy support are implemented through + control messages, while other extensions use the getsockopt/ + setsockopt calls. + + Sockets must be bound before you can send or receive data. + This is needed because binding also selects a transport and + attaches it to the socket. Once bound, the transport assignment + does not change. RDS will tolerate IPs moving around (eg in + a active-active HA scenario), but only as long as the address + doesn't move to a different transport. + + * sysctls + + RDS supports a number of sysctls in /proc/sys/net/rds + + +Socket Interface +================ + + AF_RDS, PF_RDS, SOL_RDS + AF_RDS and PF_RDS are the domain type to be used with socket(2) + to create RDS sockets. SOL_RDS is the socket-level to be used + with setsockopt(2) and getsockopt(2) for RDS specific socket + options. + + fd = socket(PF_RDS, SOCK_SEQPACKET, 0); + This creates a new, unbound RDS socket. + + setsockopt(SOL_SOCKET): send and receive buffer size + RDS honors the send and receive buffer size socket options. + You are not allowed to queue more than SO_SNDSIZE bytes to + a socket. A message is queued when sendmsg is called, and + it leaves the queue when the remote system acknowledges + its arrival. + + The SO_RCVSIZE option controls the maximum receive queue length. + This is a soft limit rather than a hard limit - RDS will + continue to accept and queue incoming messages, even if that + takes the queue length over the limit. However, it will also + mark the port as "congested" and send a congestion update to + the source node. The source node is supposed to throttle any + processes sending to this congested port. + + bind(fd, &sockaddr_in, ...) + This binds the socket to a local IP address and port, and a + transport, if one has not already been selected via the + SO_RDS_TRANSPORT socket option + + sendmsg(fd, ...) + Sends a message to the indicated recipient. The kernel will + transparently establish the underlying reliable connection + if it isn't up yet. + + An attempt to send a message that exceeds SO_SNDSIZE will + return with -EMSGSIZE + + An attempt to send a message that would take the total number + of queued bytes over the SO_SNDSIZE threshold will return + EAGAIN. + + An attempt to send a message to a destination that is marked + as "congested" will return ENOBUFS. + + recvmsg(fd, ...) + Receives a message that was queued to this socket. The sockets + recv queue accounting is adjusted, and if the queue length + drops below SO_SNDSIZE, the port is marked uncongested, and + a congestion update is sent to all peers. + + Applications can ask the RDS kernel module to receive + notifications via control messages (for instance, there is a + notification when a congestion update arrived, or when a RDMA + operation completes). These notifications are received through + the msg.msg_control buffer of struct msghdr. The format of the + messages is described in manpages. + + poll(fd) + RDS supports the poll interface to allow the application + to implement async I/O. + + POLLIN handling is pretty straightforward. When there's an + incoming message queued to the socket, or a pending notification, + we signal POLLIN. + + POLLOUT is a little harder. Since you can essentially send + to any destination, RDS will always signal POLLOUT as long as + there's room on the send queue (ie the number of bytes queued + is less than the sendbuf size). + + However, the kernel will refuse to accept messages to + a destination marked congested - in this case you will loop + forever if you rely on poll to tell you what to do. + This isn't a trivial problem, but applications can deal with + this - by using congestion notifications, and by checking for + ENOBUFS errors returned by sendmsg. + + setsockopt(SOL_RDS, RDS_CANCEL_SENT_TO, &sockaddr_in) + This allows the application to discard all messages queued to a + specific destination on this particular socket. + + This allows the application to cancel outstanding messages if + it detects a timeout. For instance, if it tried to send a message, + and the remote host is unreachable, RDS will keep trying forever. + The application may decide it's not worth it, and cancel the + operation. In this case, it would use RDS_CANCEL_SENT_TO to + nuke any pending messages. + + ``setsockopt(fd, SOL_RDS, SO_RDS_TRANSPORT, (int *)&transport ..), getsockopt(fd, SOL_RDS, SO_RDS_TRANSPORT, (int *)&transport ..)`` + Set or read an integer defining the underlying + encapsulating transport to be used for RDS packets on the + socket. When setting the option, integer argument may be + one of RDS_TRANS_TCP or RDS_TRANS_IB. When retrieving the + value, RDS_TRANS_NONE will be returned on an unbound socket. + This socket option may only be set exactly once on the socket, + prior to binding it via the bind(2) system call. Attempts to + set SO_RDS_TRANSPORT on a socket for which the transport has + been previously attached explicitly (by SO_RDS_TRANSPORT) or + implicitly (via bind(2)) will return an error of EOPNOTSUPP. + An attempt to set SO_RDS_TRANSPORT to RDS_TRANS_NONE will + always return EINVAL. + +RDMA for RDS +============ + + see rds-rdma(7) manpage (available in rds-tools) + + +Congestion Notifications +======================== + + see rds(7) manpage + + +RDS Protocol +============ + + Message header + + The message header is a 'struct rds_header' (see rds.h): + + Fields: + + h_sequence: + per-packet sequence number + h_ack: + piggybacked acknowledgment of last packet received + h_len: + length of data, not including header + h_sport: + source port + h_dport: + destination port + h_flags: + Can be: + + ============= ================================== + CONG_BITMAP this is a congestion update bitmap + ACK_REQUIRED receiver must ack this packet + RETRANSMITTED packet has previously been sent + ============= ================================== + + h_credit: + indicate to other end of connection that + it has more credits available (i.e. there is + more send room) + h_padding[4]: + unused, for future use + h_csum: + header checksum + h_exthdr: + optional data can be passed here. This is currently used for + passing RDMA-related information. + + ACK and retransmit handling + + One might think that with reliable IB connections you wouldn't need + to ack messages that have been received. The problem is that IB + hardware generates an ack message before it has DMAed the message + into memory. This creates a potential message loss if the HCA is + disabled for any reason between when it sends the ack and before + the message is DMAed and processed. This is only a potential issue + if another HCA is available for fail-over. + + Sending an ack immediately would allow the sender to free the sent + message from their send queue quickly, but could cause excessive + traffic to be used for acks. RDS piggybacks acks on sent data + packets. Ack-only packets are reduced by only allowing one to be + in flight at a time, and by the sender only asking for acks when + its send buffers start to fill up. All retransmissions are also + acked. + + Flow Control + + RDS's IB transport uses a credit-based mechanism to verify that + there is space in the peer's receive buffers for more data. This + eliminates the need for hardware retries on the connection. + + Congestion + + Messages waiting in the receive queue on the receiving socket + are accounted against the sockets SO_RCVBUF option value. Only + the payload bytes in the message are accounted for. If the + number of bytes queued equals or exceeds rcvbuf then the socket + is congested. All sends attempted to this socket's address + should return block or return -EWOULDBLOCK. + + Applications are expected to be reasonably tuned such that this + situation very rarely occurs. An application encountering this + "back-pressure" is considered a bug. + + This is implemented by having each node maintain bitmaps which + indicate which ports on bound addresses are congested. As the + bitmap changes it is sent through all the connections which + terminate in the local address of the bitmap which changed. + + The bitmaps are allocated as connections are brought up. This + avoids allocation in the interrupt handling path which queues + sages on sockets. The dense bitmaps let transports send the + entire bitmap on any bitmap change reasonably efficiently. This + is much easier to implement than some finer-grained + communication of per-port congestion. The sender does a very + inexpensive bit test to test if the port it's about to send to + is congested or not. + + +RDS Transport Layer +=================== + + As mentioned above, RDS is not IB-specific. Its code is divided + into a general RDS layer and a transport layer. + + The general layer handles the socket API, congestion handling, + loopback, stats, usermem pinning, and the connection state machine. + + The transport layer handles the details of the transport. The IB + transport, for example, handles all the queue pairs, work requests, + CM event handlers, and other Infiniband details. + + +RDS Kernel Structures +===================== + + struct rds_message + aka possibly "rds_outgoing", the generic RDS layer copies data to + be sent and sets header fields as needed, based on the socket API. + This is then queued for the individual connection and sent by the + connection's transport. + + struct rds_incoming + a generic struct referring to incoming data that can be handed from + the transport to the general code and queued by the general code + while the socket is awoken. It is then passed back to the transport + code to handle the actual copy-to-user. + + struct rds_socket + per-socket information + + struct rds_connection + per-connection information + + struct rds_transport + pointers to transport-specific functions + + struct rds_statistics + non-transport-specific statistics + + struct rds_cong_map + wraps the raw congestion bitmap, contains rbnode, waitq, etc. + +Connection management +===================== + + Connections may be in UP, DOWN, CONNECTING, DISCONNECTING, and + ERROR states. + + The first time an attempt is made by an RDS socket to send data to + a node, a connection is allocated and connected. That connection is + then maintained forever -- if there are transport errors, the + connection will be dropped and re-established. + + Dropping a connection while packets are queued will cause queued or + partially-sent datagrams to be retransmitted when the connection is + re-established. + + +The send path +============= + + rds_sendmsg() + - struct rds_message built from incoming data + - CMSGs parsed (e.g. RDMA ops) + - transport connection alloced and connected if not already + - rds_message placed on send queue + - send worker awoken + + rds_send_worker() + - calls rds_send_xmit() until queue is empty + + rds_send_xmit() + - transmits congestion map if one is pending + - may set ACK_REQUIRED + - calls transport to send either non-RDMA or RDMA message + (RDMA ops never retransmitted) + + rds_ib_xmit() + - allocs work requests from send ring + - adds any new send credits available to peer (h_credits) + - maps the rds_message's sg list + - piggybacks ack + - populates work requests + - post send to connection's queue pair + +The recv path +============= + + rds_ib_recv_cq_comp_handler() + - looks at write completions + - unmaps recv buffer from device + - no errors, call rds_ib_process_recv() + - refill recv ring + + rds_ib_process_recv() + - validate header checksum + - copy header to rds_ib_incoming struct if start of a new datagram + - add to ibinc's fraglist + - if competed datagram: + - update cong map if datagram was cong update + - call rds_recv_incoming() otherwise + - note if ack is required + + rds_recv_incoming() + - drop duplicate packets + - respond to pings + - find the sock associated with this datagram + - add to sock queue + - wake up sock + - do some congestion calculations + rds_recvmsg + - copy data into user iovec + - handle CMSGs + - return to application + +Multipath RDS (mprds) +===================== + Mprds is multipathed-RDS, primarily intended for RDS-over-TCP + (though the concept can be extended to other transports). The classical + implementation of RDS-over-TCP is implemented by demultiplexing multiple + PF_RDS sockets between any 2 endpoints (where endpoint == [IP address, + port]) over a single TCP socket between the 2 IP addresses involved. This + has the limitation that it ends up funneling multiple RDS flows over a + single TCP flow, thus it is + (a) upper-bounded to the single-flow bandwidth, + (b) suffers from head-of-line blocking for all the RDS sockets. + + Better throughput (for a fixed small packet size, MTU) can be achieved + by having multiple TCP/IP flows per rds/tcp connection, i.e., multipathed + RDS (mprds). Each such TCP/IP flow constitutes a path for the rds/tcp + connection. RDS sockets will be attached to a path based on some hash + (e.g., of local address and RDS port number) and packets for that RDS + socket will be sent over the attached path using TCP to segment/reassemble + RDS datagrams on that path. + + Multipathed RDS is implemented by splitting the struct rds_connection into + a common (to all paths) part, and a per-path struct rds_conn_path. All + I/O workqs and reconnect threads are driven from the rds_conn_path. + Transports such as TCP that are multipath capable may then set up a + TCP socket per rds_conn_path, and this is managed by the transport via + the transport privatee cp_transport_data pointer. + + Transports announce themselves as multipath capable by setting the + t_mp_capable bit during registration with the rds core module. When the + transport is multipath-capable, rds_sendmsg() hashes outgoing traffic + across multiple paths. The outgoing hash is computed based on the + local address and port that the PF_RDS socket is bound to. + + Additionally, even if the transport is MP capable, we may be + peering with some node that does not support mprds, or supports + a different number of paths. As a result, the peering nodes need + to agree on the number of paths to be used for the connection. + This is done by sending out a control packet exchange before the + first data packet. The control packet exchange must have completed + prior to outgoing hash completion in rds_sendmsg() when the transport + is mutlipath capable. + + The control packet is an RDS ping packet (i.e., packet to rds dest + port 0) with the ping packet having a rds extension header option of + type RDS_EXTHDR_NPATHS, length 2 bytes, and the value is the + number of paths supported by the sender. The "probe" ping packet will + get sent from some reserved port, RDS_FLAG_PROBE_PORT (in ) + The receiver of a ping from RDS_FLAG_PROBE_PORT will thus immediately + be able to compute the min(sender_paths, rcvr_paths). The pong + sent in response to a probe-ping should contain the rcvr's npaths + when the rcvr is mprds-capable. + + If the rcvr is not mprds-capable, the exthdr in the ping will be + ignored. In this case the pong will not have any exthdrs, so the sender + of the probe-ping can default to single-path mprds. + diff --git a/Documentation/networking/rds.txt b/Documentation/networking/rds.txt deleted file mode 100644 index eec61694e894..000000000000 --- a/Documentation/networking/rds.txt +++ /dev/null @@ -1,423 +0,0 @@ - -Overview -======== - -This readme tries to provide some background on the hows and whys of RDS, -and will hopefully help you find your way around the code. - -In addition, please see this email about RDS origins: -http://oss.oracle.com/pipermail/rds-devel/2007-November/000228.html - -RDS Architecture -================ - -RDS provides reliable, ordered datagram delivery by using a single -reliable connection between any two nodes in the cluster. This allows -applications to use a single socket to talk to any other process in the -cluster - so in a cluster with N processes you need N sockets, in contrast -to N*N if you use a connection-oriented socket transport like TCP. - -RDS is not Infiniband-specific; it was designed to support different -transports. The current implementation used to support RDS over TCP as well -as IB. - -The high-level semantics of RDS from the application's point of view are - - * Addressing - RDS uses IPv4 addresses and 16bit port numbers to identify - the end point of a connection. All socket operations that involve - passing addresses between kernel and user space generally - use a struct sockaddr_in. - - The fact that IPv4 addresses are used does not mean the underlying - transport has to be IP-based. In fact, RDS over IB uses a - reliable IB connection; the IP address is used exclusively to - locate the remote node's GID (by ARPing for the given IP). - - The port space is entirely independent of UDP, TCP or any other - protocol. - - * Socket interface - RDS sockets work *mostly* as you would expect from a BSD - socket. The next section will cover the details. At any rate, - all I/O is performed through the standard BSD socket API. - Some additions like zerocopy support are implemented through - control messages, while other extensions use the getsockopt/ - setsockopt calls. - - Sockets must be bound before you can send or receive data. - This is needed because binding also selects a transport and - attaches it to the socket. Once bound, the transport assignment - does not change. RDS will tolerate IPs moving around (eg in - a active-active HA scenario), but only as long as the address - doesn't move to a different transport. - - * sysctls - RDS supports a number of sysctls in /proc/sys/net/rds - - -Socket Interface -================ - - AF_RDS, PF_RDS, SOL_RDS - AF_RDS and PF_RDS are the domain type to be used with socket(2) - to create RDS sockets. SOL_RDS is the socket-level to be used - with setsockopt(2) and getsockopt(2) for RDS specific socket - options. - - fd = socket(PF_RDS, SOCK_SEQPACKET, 0); - This creates a new, unbound RDS socket. - - setsockopt(SOL_SOCKET): send and receive buffer size - RDS honors the send and receive buffer size socket options. - You are not allowed to queue more than SO_SNDSIZE bytes to - a socket. A message is queued when sendmsg is called, and - it leaves the queue when the remote system acknowledges - its arrival. - - The SO_RCVSIZE option controls the maximum receive queue length. - This is a soft limit rather than a hard limit - RDS will - continue to accept and queue incoming messages, even if that - takes the queue length over the limit. However, it will also - mark the port as "congested" and send a congestion update to - the source node. The source node is supposed to throttle any - processes sending to this congested port. - - bind(fd, &sockaddr_in, ...) - This binds the socket to a local IP address and port, and a - transport, if one has not already been selected via the - SO_RDS_TRANSPORT socket option - - sendmsg(fd, ...) - Sends a message to the indicated recipient. The kernel will - transparently establish the underlying reliable connection - if it isn't up yet. - - An attempt to send a message that exceeds SO_SNDSIZE will - return with -EMSGSIZE - - An attempt to send a message that would take the total number - of queued bytes over the SO_SNDSIZE threshold will return - EAGAIN. - - An attempt to send a message to a destination that is marked - as "congested" will return ENOBUFS. - - recvmsg(fd, ...) - Receives a message that was queued to this socket. The sockets - recv queue accounting is adjusted, and if the queue length - drops below SO_SNDSIZE, the port is marked uncongested, and - a congestion update is sent to all peers. - - Applications can ask the RDS kernel module to receive - notifications via control messages (for instance, there is a - notification when a congestion update arrived, or when a RDMA - operation completes). These notifications are received through - the msg.msg_control buffer of struct msghdr. The format of the - messages is described in manpages. - - poll(fd) - RDS supports the poll interface to allow the application - to implement async I/O. - - POLLIN handling is pretty straightforward. When there's an - incoming message queued to the socket, or a pending notification, - we signal POLLIN. - - POLLOUT is a little harder. Since you can essentially send - to any destination, RDS will always signal POLLOUT as long as - there's room on the send queue (ie the number of bytes queued - is less than the sendbuf size). - - However, the kernel will refuse to accept messages to - a destination marked congested - in this case you will loop - forever if you rely on poll to tell you what to do. - This isn't a trivial problem, but applications can deal with - this - by using congestion notifications, and by checking for - ENOBUFS errors returned by sendmsg. - - setsockopt(SOL_RDS, RDS_CANCEL_SENT_TO, &sockaddr_in) - This allows the application to discard all messages queued to a - specific destination on this particular socket. - - This allows the application to cancel outstanding messages if - it detects a timeout. For instance, if it tried to send a message, - and the remote host is unreachable, RDS will keep trying forever. - The application may decide it's not worth it, and cancel the - operation. In this case, it would use RDS_CANCEL_SENT_TO to - nuke any pending messages. - - setsockopt(fd, SOL_RDS, SO_RDS_TRANSPORT, (int *)&transport ..) - getsockopt(fd, SOL_RDS, SO_RDS_TRANSPORT, (int *)&transport ..) - Set or read an integer defining the underlying - encapsulating transport to be used for RDS packets on the - socket. When setting the option, integer argument may be - one of RDS_TRANS_TCP or RDS_TRANS_IB. When retrieving the - value, RDS_TRANS_NONE will be returned on an unbound socket. - This socket option may only be set exactly once on the socket, - prior to binding it via the bind(2) system call. Attempts to - set SO_RDS_TRANSPORT on a socket for which the transport has - been previously attached explicitly (by SO_RDS_TRANSPORT) or - implicitly (via bind(2)) will return an error of EOPNOTSUPP. - An attempt to set SO_RDS_TRANSPORT to RDS_TRANS_NONE will - always return EINVAL. - -RDMA for RDS -============ - - see rds-rdma(7) manpage (available in rds-tools) - - -Congestion Notifications -======================== - - see rds(7) manpage - - -RDS Protocol -============ - - Message header - - The message header is a 'struct rds_header' (see rds.h): - Fields: - h_sequence: - per-packet sequence number - h_ack: - piggybacked acknowledgment of last packet received - h_len: - length of data, not including header - h_sport: - source port - h_dport: - destination port - h_flags: - CONG_BITMAP - this is a congestion update bitmap - ACK_REQUIRED - receiver must ack this packet - RETRANSMITTED - packet has previously been sent - h_credit: - indicate to other end of connection that - it has more credits available (i.e. there is - more send room) - h_padding[4]: - unused, for future use - h_csum: - header checksum - h_exthdr: - optional data can be passed here. This is currently used for - passing RDMA-related information. - - ACK and retransmit handling - - One might think that with reliable IB connections you wouldn't need - to ack messages that have been received. The problem is that IB - hardware generates an ack message before it has DMAed the message - into memory. This creates a potential message loss if the HCA is - disabled for any reason between when it sends the ack and before - the message is DMAed and processed. This is only a potential issue - if another HCA is available for fail-over. - - Sending an ack immediately would allow the sender to free the sent - message from their send queue quickly, but could cause excessive - traffic to be used for acks. RDS piggybacks acks on sent data - packets. Ack-only packets are reduced by only allowing one to be - in flight at a time, and by the sender only asking for acks when - its send buffers start to fill up. All retransmissions are also - acked. - - Flow Control - - RDS's IB transport uses a credit-based mechanism to verify that - there is space in the peer's receive buffers for more data. This - eliminates the need for hardware retries on the connection. - - Congestion - - Messages waiting in the receive queue on the receiving socket - are accounted against the sockets SO_RCVBUF option value. Only - the payload bytes in the message are accounted for. If the - number of bytes queued equals or exceeds rcvbuf then the socket - is congested. All sends attempted to this socket's address - should return block or return -EWOULDBLOCK. - - Applications are expected to be reasonably tuned such that this - situation very rarely occurs. An application encountering this - "back-pressure" is considered a bug. - - This is implemented by having each node maintain bitmaps which - indicate which ports on bound addresses are congested. As the - bitmap changes it is sent through all the connections which - terminate in the local address of the bitmap which changed. - - The bitmaps are allocated as connections are brought up. This - avoids allocation in the interrupt handling path which queues - sages on sockets. The dense bitmaps let transports send the - entire bitmap on any bitmap change reasonably efficiently. This - is much easier to implement than some finer-grained - communication of per-port congestion. The sender does a very - inexpensive bit test to test if the port it's about to send to - is congested or not. - - -RDS Transport Layer -================== - - As mentioned above, RDS is not IB-specific. Its code is divided - into a general RDS layer and a transport layer. - - The general layer handles the socket API, congestion handling, - loopback, stats, usermem pinning, and the connection state machine. - - The transport layer handles the details of the transport. The IB - transport, for example, handles all the queue pairs, work requests, - CM event handlers, and other Infiniband details. - - -RDS Kernel Structures -===================== - - struct rds_message - aka possibly "rds_outgoing", the generic RDS layer copies data to - be sent and sets header fields as needed, based on the socket API. - This is then queued for the individual connection and sent by the - connection's transport. - struct rds_incoming - a generic struct referring to incoming data that can be handed from - the transport to the general code and queued by the general code - while the socket is awoken. It is then passed back to the transport - code to handle the actual copy-to-user. - struct rds_socket - per-socket information - struct rds_connection - per-connection information - struct rds_transport - pointers to transport-specific functions - struct rds_statistics - non-transport-specific statistics - struct rds_cong_map - wraps the raw congestion bitmap, contains rbnode, waitq, etc. - -Connection management -===================== - - Connections may be in UP, DOWN, CONNECTING, DISCONNECTING, and - ERROR states. - - The first time an attempt is made by an RDS socket to send data to - a node, a connection is allocated and connected. That connection is - then maintained forever -- if there are transport errors, the - connection will be dropped and re-established. - - Dropping a connection while packets are queued will cause queued or - partially-sent datagrams to be retransmitted when the connection is - re-established. - - -The send path -============= - - rds_sendmsg() - struct rds_message built from incoming data - CMSGs parsed (e.g. RDMA ops) - transport connection alloced and connected if not already - rds_message placed on send queue - send worker awoken - rds_send_worker() - calls rds_send_xmit() until queue is empty - rds_send_xmit() - transmits congestion map if one is pending - may set ACK_REQUIRED - calls transport to send either non-RDMA or RDMA message - (RDMA ops never retransmitted) - rds_ib_xmit() - allocs work requests from send ring - adds any new send credits available to peer (h_credits) - maps the rds_message's sg list - piggybacks ack - populates work requests - post send to connection's queue pair - -The recv path -============= - - rds_ib_recv_cq_comp_handler() - looks at write completions - unmaps recv buffer from device - no errors, call rds_ib_process_recv() - refill recv ring - rds_ib_process_recv() - validate header checksum - copy header to rds_ib_incoming struct if start of a new datagram - add to ibinc's fraglist - if competed datagram: - update cong map if datagram was cong update - call rds_recv_incoming() otherwise - note if ack is required - rds_recv_incoming() - drop duplicate packets - respond to pings - find the sock associated with this datagram - add to sock queue - wake up sock - do some congestion calculations - rds_recvmsg - copy data into user iovec - handle CMSGs - return to application - -Multipath RDS (mprds) -===================== - Mprds is multipathed-RDS, primarily intended for RDS-over-TCP - (though the concept can be extended to other transports). The classical - implementation of RDS-over-TCP is implemented by demultiplexing multiple - PF_RDS sockets between any 2 endpoints (where endpoint == [IP address, - port]) over a single TCP socket between the 2 IP addresses involved. This - has the limitation that it ends up funneling multiple RDS flows over a - single TCP flow, thus it is - (a) upper-bounded to the single-flow bandwidth, - (b) suffers from head-of-line blocking for all the RDS sockets. - - Better throughput (for a fixed small packet size, MTU) can be achieved - by having multiple TCP/IP flows per rds/tcp connection, i.e., multipathed - RDS (mprds). Each such TCP/IP flow constitutes a path for the rds/tcp - connection. RDS sockets will be attached to a path based on some hash - (e.g., of local address and RDS port number) and packets for that RDS - socket will be sent over the attached path using TCP to segment/reassemble - RDS datagrams on that path. - - Multipathed RDS is implemented by splitting the struct rds_connection into - a common (to all paths) part, and a per-path struct rds_conn_path. All - I/O workqs and reconnect threads are driven from the rds_conn_path. - Transports such as TCP that are multipath capable may then set up a - TCP socket per rds_conn_path, and this is managed by the transport via - the transport privatee cp_transport_data pointer. - - Transports announce themselves as multipath capable by setting the - t_mp_capable bit during registration with the rds core module. When the - transport is multipath-capable, rds_sendmsg() hashes outgoing traffic - across multiple paths. The outgoing hash is computed based on the - local address and port that the PF_RDS socket is bound to. - - Additionally, even if the transport is MP capable, we may be - peering with some node that does not support mprds, or supports - a different number of paths. As a result, the peering nodes need - to agree on the number of paths to be used for the connection. - This is done by sending out a control packet exchange before the - first data packet. The control packet exchange must have completed - prior to outgoing hash completion in rds_sendmsg() when the transport - is mutlipath capable. - - The control packet is an RDS ping packet (i.e., packet to rds dest - port 0) with the ping packet having a rds extension header option of - type RDS_EXTHDR_NPATHS, length 2 bytes, and the value is the - number of paths supported by the sender. The "probe" ping packet will - get sent from some reserved port, RDS_FLAG_PROBE_PORT (in ) - The receiver of a ping from RDS_FLAG_PROBE_PORT will thus immediately - be able to compute the min(sender_paths, rcvr_paths). The pong - sent in response to a probe-ping should contain the rcvr's npaths - when the rcvr is mprds-capable. - - If the rcvr is not mprds-capable, the exthdr in the ping will be - ignored. In this case the pong will not have any exthdrs, so the sender - of the probe-ping can default to single-path mprds. - diff --git a/MAINTAINERS b/MAINTAINERS index 785f56e5f210..ea5dd3d1df9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14219,7 +14219,7 @@ L: linux-rdma@vger.kernel.org L: rds-devel@oss.oracle.com (moderated for non-subscribers) S: Supported W: https://oss.oracle.com/projects/rds/ -F: Documentation/networking/rds.txt +F: Documentation/networking/rds.rst F: net/rds/ RDT - RESOURCE ALLOCATION -- cgit v1.2.3-59-g8ed1b From 98661e0c579dbda0e0910185f752fddd95e2d29c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:20 +0200 Subject: docs: networking: convert regulatory.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/regulatory.rst | 209 ++++++++++++++++++++++++++++++++ Documentation/networking/regulatory.txt | 204 ------------------------------- MAINTAINERS | 2 +- 4 files changed, 211 insertions(+), 205 deletions(-) create mode 100644 Documentation/networking/regulatory.rst delete mode 100644 Documentation/networking/regulatory.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e63a2cb2e4cb..bc3b04a2edde 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -98,6 +98,7 @@ Contents: radiotap-headers ray_cs rds + regulatory .. only:: subproject and html diff --git a/Documentation/networking/regulatory.rst b/Documentation/networking/regulatory.rst new file mode 100644 index 000000000000..8701b91e81ee --- /dev/null +++ b/Documentation/networking/regulatory.rst @@ -0,0 +1,209 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +Linux wireless regulatory documentation +======================================= + +This document gives a brief review over how the Linux wireless +regulatory infrastructure works. + +More up to date information can be obtained at the project's web page: + +http://wireless.kernel.org/en/developers/Regulatory + +Keeping regulatory domains in userspace +--------------------------------------- + +Due to the dynamic nature of regulatory domains we keep them +in userspace and provide a framework for userspace to upload +to the kernel one regulatory domain to be used as the central +core regulatory domain all wireless devices should adhere to. + +How to get regulatory domains to the kernel +------------------------------------------- + +When the regulatory domain is first set up, the kernel will request a +database file (regulatory.db) containing all the regulatory rules. It +will then use that database when it needs to look up the rules for a +given country. + +How to get regulatory domains to the kernel (old CRDA solution) +--------------------------------------------------------------- + +Userspace gets a regulatory domain in the kernel by having +a userspace agent build it and send it via nl80211. Only +expected regulatory domains will be respected by the kernel. + +A currently available userspace agent which can accomplish this +is CRDA - central regulatory domain agent. Its documented here: + +http://wireless.kernel.org/en/developers/Regulatory/CRDA + +Essentially the kernel will send a udev event when it knows +it needs a new regulatory domain. A udev rule can be put in place +to trigger crda to send the respective regulatory domain for a +specific ISO/IEC 3166 alpha2. + +Below is an example udev rule which can be used: + +# Example file, should be put in /etc/udev/rules.d/regulatory.rules +KERNEL=="regulatory*", ACTION=="change", SUBSYSTEM=="platform", RUN+="/sbin/crda" + +The alpha2 is passed as an environment variable under the variable COUNTRY. + +Who asks for regulatory domains? +-------------------------------- + +* Users + +Users can use iw: + +http://wireless.kernel.org/en/users/Documentation/iw + +An example:: + + # set regulatory domain to "Costa Rica" + iw reg set CR + +This will request the kernel to set the regulatory domain to +the specificied alpha2. The kernel in turn will then ask userspace +to provide a regulatory domain for the alpha2 specified by the user +by sending a uevent. + +* Wireless subsystems for Country Information elements + +The kernel will send a uevent to inform userspace a new +regulatory domain is required. More on this to be added +as its integration is added. + +* Drivers + +If drivers determine they need a specific regulatory domain +set they can inform the wireless core using regulatory_hint(). +They have two options -- they either provide an alpha2 so that +crda can provide back a regulatory domain for that country or +they can build their own regulatory domain based on internal +custom knowledge so the wireless core can respect it. + +*Most* drivers will rely on the first mechanism of providing a +regulatory hint with an alpha2. For these drivers there is an additional +check that can be used to ensure compliance based on custom EEPROM +regulatory data. This additional check can be used by drivers by +registering on its struct wiphy a reg_notifier() callback. This notifier +is called when the core's regulatory domain has been changed. The driver +can use this to review the changes made and also review who made them +(driver, user, country IE) and determine what to allow based on its +internal EEPROM data. Devices drivers wishing to be capable of world +roaming should use this callback. More on world roaming will be +added to this document when its support is enabled. + +Device drivers who provide their own built regulatory domain +do not need a callback as the channels registered by them are +the only ones that will be allowed and therefore *additional* +channels cannot be enabled. + +Example code - drivers hinting an alpha2: +------------------------------------------ + +This example comes from the zd1211rw device driver. You can start +by having a mapping of your device's EEPROM country/regulatory +domain value to a specific alpha2 as follows:: + + static struct zd_reg_alpha2_map reg_alpha2_map[] = { + { ZD_REGDOMAIN_FCC, "US" }, + { ZD_REGDOMAIN_IC, "CA" }, + { ZD_REGDOMAIN_ETSI, "DE" }, /* Generic ETSI, use most restrictive */ + { ZD_REGDOMAIN_JAPAN, "JP" }, + { ZD_REGDOMAIN_JAPAN_ADD, "JP" }, + { ZD_REGDOMAIN_SPAIN, "ES" }, + { ZD_REGDOMAIN_FRANCE, "FR" }, + +Then you can define a routine to map your read EEPROM value to an alpha2, +as follows:: + + static int zd_reg2alpha2(u8 regdomain, char *alpha2) + { + unsigned int i; + struct zd_reg_alpha2_map *reg_map; + for (i = 0; i < ARRAY_SIZE(reg_alpha2_map); i++) { + reg_map = ®_alpha2_map[i]; + if (regdomain == reg_map->reg) { + alpha2[0] = reg_map->alpha2[0]; + alpha2[1] = reg_map->alpha2[1]; + return 0; + } + } + return 1; + } + +Lastly, you can then hint to the core of your discovered alpha2, if a match +was found. You need to do this after you have registered your wiphy. You +are expected to do this during initialization. + +:: + + r = zd_reg2alpha2(mac->regdomain, alpha2); + if (!r) + regulatory_hint(hw->wiphy, alpha2); + +Example code - drivers providing a built in regulatory domain: +-------------------------------------------------------------- + +[NOTE: This API is not currently available, it can be added when required] + +If you have regulatory information you can obtain from your +driver and you *need* to use this we let you build a regulatory domain +structure and pass it to the wireless core. To do this you should +kmalloc() a structure big enough to hold your regulatory domain +structure and you should then fill it with your data. Finally you simply +call regulatory_hint() with the regulatory domain structure in it. + +Bellow is a simple example, with a regulatory domain cached using the stack. +Your implementation may vary (read EEPROM cache instead, for example). + +Example cache of some regulatory domain:: + + struct ieee80211_regdomain mydriver_jp_regdom = { + .n_reg_rules = 3, + .alpha2 = "JP", + //.alpha2 = "99", /* If I have no alpha2 to map it to */ + .reg_rules = { + /* IEEE 802.11b/g, channels 1..14 */ + REG_RULE(2412-10, 2484+10, 40, 6, 20, 0), + /* IEEE 802.11a, channels 34..48 */ + REG_RULE(5170-10, 5240+10, 40, 6, 20, + NL80211_RRF_NO_IR), + /* IEEE 802.11a, channels 52..64 */ + REG_RULE(5260-10, 5320+10, 40, 6, 20, + NL80211_RRF_NO_IR| + NL80211_RRF_DFS), + } + }; + +Then in some part of your code after your wiphy has been registered:: + + struct ieee80211_regdomain *rd; + int size_of_regd; + int num_rules = mydriver_jp_regdom.n_reg_rules; + unsigned int i; + + size_of_regd = sizeof(struct ieee80211_regdomain) + + (num_rules * sizeof(struct ieee80211_reg_rule)); + + rd = kzalloc(size_of_regd, GFP_KERNEL); + if (!rd) + return -ENOMEM; + + memcpy(rd, &mydriver_jp_regdom, sizeof(struct ieee80211_regdomain)); + + for (i=0; i < num_rules; i++) + memcpy(&rd->reg_rules[i], + &mydriver_jp_regdom.reg_rules[i], + sizeof(struct ieee80211_reg_rule)); + regulatory_struct_hint(rd); + +Statically compiled regulatory database +--------------------------------------- + +When a database should be fixed into the kernel, it can be provided as a +firmware file at build time that is then linked into the kernel. diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt deleted file mode 100644 index 381e5b23d61d..000000000000 --- a/Documentation/networking/regulatory.txt +++ /dev/null @@ -1,204 +0,0 @@ -Linux wireless regulatory documentation ---------------------------------------- - -This document gives a brief review over how the Linux wireless -regulatory infrastructure works. - -More up to date information can be obtained at the project's web page: - -http://wireless.kernel.org/en/developers/Regulatory - -Keeping regulatory domains in userspace ---------------------------------------- - -Due to the dynamic nature of regulatory domains we keep them -in userspace and provide a framework for userspace to upload -to the kernel one regulatory domain to be used as the central -core regulatory domain all wireless devices should adhere to. - -How to get regulatory domains to the kernel -------------------------------------------- - -When the regulatory domain is first set up, the kernel will request a -database file (regulatory.db) containing all the regulatory rules. It -will then use that database when it needs to look up the rules for a -given country. - -How to get regulatory domains to the kernel (old CRDA solution) ---------------------------------------------------------------- - -Userspace gets a regulatory domain in the kernel by having -a userspace agent build it and send it via nl80211. Only -expected regulatory domains will be respected by the kernel. - -A currently available userspace agent which can accomplish this -is CRDA - central regulatory domain agent. Its documented here: - -http://wireless.kernel.org/en/developers/Regulatory/CRDA - -Essentially the kernel will send a udev event when it knows -it needs a new regulatory domain. A udev rule can be put in place -to trigger crda to send the respective regulatory domain for a -specific ISO/IEC 3166 alpha2. - -Below is an example udev rule which can be used: - -# Example file, should be put in /etc/udev/rules.d/regulatory.rules -KERNEL=="regulatory*", ACTION=="change", SUBSYSTEM=="platform", RUN+="/sbin/crda" - -The alpha2 is passed as an environment variable under the variable COUNTRY. - -Who asks for regulatory domains? --------------------------------- - -* Users - -Users can use iw: - -http://wireless.kernel.org/en/users/Documentation/iw - -An example: - - # set regulatory domain to "Costa Rica" - iw reg set CR - -This will request the kernel to set the regulatory domain to -the specificied alpha2. The kernel in turn will then ask userspace -to provide a regulatory domain for the alpha2 specified by the user -by sending a uevent. - -* Wireless subsystems for Country Information elements - -The kernel will send a uevent to inform userspace a new -regulatory domain is required. More on this to be added -as its integration is added. - -* Drivers - -If drivers determine they need a specific regulatory domain -set they can inform the wireless core using regulatory_hint(). -They have two options -- they either provide an alpha2 so that -crda can provide back a regulatory domain for that country or -they can build their own regulatory domain based on internal -custom knowledge so the wireless core can respect it. - -*Most* drivers will rely on the first mechanism of providing a -regulatory hint with an alpha2. For these drivers there is an additional -check that can be used to ensure compliance based on custom EEPROM -regulatory data. This additional check can be used by drivers by -registering on its struct wiphy a reg_notifier() callback. This notifier -is called when the core's regulatory domain has been changed. The driver -can use this to review the changes made and also review who made them -(driver, user, country IE) and determine what to allow based on its -internal EEPROM data. Devices drivers wishing to be capable of world -roaming should use this callback. More on world roaming will be -added to this document when its support is enabled. - -Device drivers who provide their own built regulatory domain -do not need a callback as the channels registered by them are -the only ones that will be allowed and therefore *additional* -channels cannot be enabled. - -Example code - drivers hinting an alpha2: ------------------------------------------- - -This example comes from the zd1211rw device driver. You can start -by having a mapping of your device's EEPROM country/regulatory -domain value to a specific alpha2 as follows: - -static struct zd_reg_alpha2_map reg_alpha2_map[] = { - { ZD_REGDOMAIN_FCC, "US" }, - { ZD_REGDOMAIN_IC, "CA" }, - { ZD_REGDOMAIN_ETSI, "DE" }, /* Generic ETSI, use most restrictive */ - { ZD_REGDOMAIN_JAPAN, "JP" }, - { ZD_REGDOMAIN_JAPAN_ADD, "JP" }, - { ZD_REGDOMAIN_SPAIN, "ES" }, - { ZD_REGDOMAIN_FRANCE, "FR" }, - -Then you can define a routine to map your read EEPROM value to an alpha2, -as follows: - -static int zd_reg2alpha2(u8 regdomain, char *alpha2) -{ - unsigned int i; - struct zd_reg_alpha2_map *reg_map; - for (i = 0; i < ARRAY_SIZE(reg_alpha2_map); i++) { - reg_map = ®_alpha2_map[i]; - if (regdomain == reg_map->reg) { - alpha2[0] = reg_map->alpha2[0]; - alpha2[1] = reg_map->alpha2[1]; - return 0; - } - } - return 1; -} - -Lastly, you can then hint to the core of your discovered alpha2, if a match -was found. You need to do this after you have registered your wiphy. You -are expected to do this during initialization. - - r = zd_reg2alpha2(mac->regdomain, alpha2); - if (!r) - regulatory_hint(hw->wiphy, alpha2); - -Example code - drivers providing a built in regulatory domain: --------------------------------------------------------------- - -[NOTE: This API is not currently available, it can be added when required] - -If you have regulatory information you can obtain from your -driver and you *need* to use this we let you build a regulatory domain -structure and pass it to the wireless core. To do this you should -kmalloc() a structure big enough to hold your regulatory domain -structure and you should then fill it with your data. Finally you simply -call regulatory_hint() with the regulatory domain structure in it. - -Bellow is a simple example, with a regulatory domain cached using the stack. -Your implementation may vary (read EEPROM cache instead, for example). - -Example cache of some regulatory domain - -struct ieee80211_regdomain mydriver_jp_regdom = { - .n_reg_rules = 3, - .alpha2 = "JP", - //.alpha2 = "99", /* If I have no alpha2 to map it to */ - .reg_rules = { - /* IEEE 802.11b/g, channels 1..14 */ - REG_RULE(2412-10, 2484+10, 40, 6, 20, 0), - /* IEEE 802.11a, channels 34..48 */ - REG_RULE(5170-10, 5240+10, 40, 6, 20, - NL80211_RRF_NO_IR), - /* IEEE 802.11a, channels 52..64 */ - REG_RULE(5260-10, 5320+10, 40, 6, 20, - NL80211_RRF_NO_IR| - NL80211_RRF_DFS), - } -}; - -Then in some part of your code after your wiphy has been registered: - - struct ieee80211_regdomain *rd; - int size_of_regd; - int num_rules = mydriver_jp_regdom.n_reg_rules; - unsigned int i; - - size_of_regd = sizeof(struct ieee80211_regdomain) + - (num_rules * sizeof(struct ieee80211_reg_rule)); - - rd = kzalloc(size_of_regd, GFP_KERNEL); - if (!rd) - return -ENOMEM; - - memcpy(rd, &mydriver_jp_regdom, sizeof(struct ieee80211_regdomain)); - - for (i=0; i < num_rules; i++) - memcpy(&rd->reg_rules[i], - &mydriver_jp_regdom.reg_rules[i], - sizeof(struct ieee80211_reg_rule)); - regulatory_struct_hint(rd); - -Statically compiled regulatory database ---------------------------------------- - -When a database should be fixed into the kernel, it can be provided as a -firmware file at build time that is then linked into the kernel. diff --git a/MAINTAINERS b/MAINTAINERS index ea5dd3d1df9d..b28823ab48c5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -193,7 +193,7 @@ W: https://wireless.wiki.kernel.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git F: Documentation/driver-api/80211/cfg80211.rst -F: Documentation/networking/regulatory.txt +F: Documentation/networking/regulatory.rst F: include/linux/ieee80211.h F: include/net/cfg80211.h F: include/net/ieee80211_radiotap.h -- cgit v1.2.3-59-g8ed1b From 9f72374cb5959556870be8078b128158edde5d3e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:21 +0200 Subject: docs: networking: convert rxrpc.txt to ReST - add SPDX header; - adjust title markup; - use autonumbered list markups; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/filesystems/afs.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/rxrpc.rst | 1169 ++++++++++++++++++++++++++++++++++++ Documentation/networking/rxrpc.txt | 1155 ----------------------------------- MAINTAINERS | 2 +- net/rxrpc/Kconfig | 6 +- net/rxrpc/sysctl.c | 2 +- 7 files changed, 1176 insertions(+), 1161 deletions(-) create mode 100644 Documentation/networking/rxrpc.rst delete mode 100644 Documentation/networking/rxrpc.txt diff --git a/Documentation/filesystems/afs.rst b/Documentation/filesystems/afs.rst index c4ec39a5966e..cada9464d6bd 100644 --- a/Documentation/filesystems/afs.rst +++ b/Documentation/filesystems/afs.rst @@ -70,7 +70,7 @@ list of volume location server IP addresses:: The first module is the AF_RXRPC network protocol driver. This provides the RxRPC remote operation protocol and may also be accessed from userspace. See: - Documentation/networking/rxrpc.txt + Documentation/networking/rxrpc.rst The second module is the kerberos RxRPC security driver, and the third module is the actual filesystem driver for the AFS filesystem. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index bc3b04a2edde..cd307b9601fa 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -99,6 +99,7 @@ Contents: ray_cs rds regulatory + rxrpc .. only:: subproject and html diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst new file mode 100644 index 000000000000..5ad35113d0f4 --- /dev/null +++ b/Documentation/networking/rxrpc.rst @@ -0,0 +1,1169 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +RxRPC Network Protocol +====================== + +The RxRPC protocol driver provides a reliable two-phase transport on top of UDP +that can be used to perform RxRPC remote operations. This is done over sockets +of AF_RXRPC family, using sendmsg() and recvmsg() with control data to send and +receive data, aborts and errors. + +Contents of this document: + + (#) Overview. + + (#) RxRPC protocol summary. + + (#) AF_RXRPC driver model. + + (#) Control messages. + + (#) Socket options. + + (#) Security. + + (#) Example client usage. + + (#) Example server usage. + + (#) AF_RXRPC kernel interface. + + (#) Configurable parameters. + + +Overview +======== + +RxRPC is a two-layer protocol. There is a session layer which provides +reliable virtual connections using UDP over IPv4 (or IPv6) as the transport +layer, but implements a real network protocol; and there's the presentation +layer which renders structured data to binary blobs and back again using XDR +(as does SunRPC):: + + +-------------+ + | Application | + +-------------+ + | XDR | Presentation + +-------------+ + | RxRPC | Session + +-------------+ + | UDP | Transport + +-------------+ + + +AF_RXRPC provides: + + (1) Part of an RxRPC facility for both kernel and userspace applications by + making the session part of it a Linux network protocol (AF_RXRPC). + + (2) A two-phase protocol. The client transmits a blob (the request) and then + receives a blob (the reply), and the server receives the request and then + transmits the reply. + + (3) Retention of the reusable bits of the transport system set up for one call + to speed up subsequent calls. + + (4) A secure protocol, using the Linux kernel's key retention facility to + manage security on the client end. The server end must of necessity be + more active in security negotiations. + +AF_RXRPC does not provide XDR marshalling/presentation facilities. That is +left to the application. AF_RXRPC only deals in blobs. Even the operation ID +is just the first four bytes of the request blob, and as such is beyond the +kernel's interest. + + +Sockets of AF_RXRPC family are: + + (1) created as type SOCK_DGRAM; + + (2) provided with a protocol of the type of underlying transport they're going + to use - currently only PF_INET is supported. + + +The Andrew File System (AFS) is an example of an application that uses this and +that has both kernel (filesystem) and userspace (utility) components. + + +RxRPC Protocol Summary +====================== + +An overview of the RxRPC protocol: + + (#) RxRPC sits on top of another networking protocol (UDP is the only option + currently), and uses this to provide network transport. UDP ports, for + example, provide transport endpoints. + + (#) RxRPC supports multiple virtual "connections" from any given transport + endpoint, thus allowing the endpoints to be shared, even to the same + remote endpoint. + + (#) Each connection goes to a particular "service". A connection may not go + to multiple services. A service may be considered the RxRPC equivalent of + a port number. AF_RXRPC permits multiple services to share an endpoint. + + (#) Client-originating packets are marked, thus a transport endpoint can be + shared between client and server connections (connections have a + direction). + + (#) Up to a billion connections may be supported concurrently between one + local transport endpoint and one service on one remote endpoint. An RxRPC + connection is described by seven numbers:: + + Local address } + Local port } Transport (UDP) address + Remote address } + Remote port } + Direction + Connection ID + Service ID + + (#) Each RxRPC operation is a "call". A connection may make up to four + billion calls, but only up to four calls may be in progress on a + connection at any one time. + + (#) Calls are two-phase and asymmetric: the client sends its request data, + which the service receives; then the service transmits the reply data + which the client receives. + + (#) The data blobs are of indefinite size, the end of a phase is marked with a + flag in the packet. The number of packets of data making up one blob may + not exceed 4 billion, however, as this would cause the sequence number to + wrap. + + (#) The first four bytes of the request data are the service operation ID. + + (#) Security is negotiated on a per-connection basis. The connection is + initiated by the first data packet on it arriving. If security is + requested, the server then issues a "challenge" and then the client + replies with a "response". If the response is successful, the security is + set for the lifetime of that connection, and all subsequent calls made + upon it use that same security. In the event that the server lets a + connection lapse before the client, the security will be renegotiated if + the client uses the connection again. + + (#) Calls use ACK packets to handle reliability. Data packets are also + explicitly sequenced per call. + + (#) There are two types of positive acknowledgment: hard-ACKs and soft-ACKs. + A hard-ACK indicates to the far side that all the data received to a point + has been received and processed; a soft-ACK indicates that the data has + been received but may yet be discarded and re-requested. The sender may + not discard any transmittable packets until they've been hard-ACK'd. + + (#) Reception of a reply data packet implicitly hard-ACK's all the data + packets that make up the request. + + (#) An call is complete when the request has been sent, the reply has been + received and the final hard-ACK on the last packet of the reply has + reached the server. + + (#) An call may be aborted by either end at any time up to its completion. + + +AF_RXRPC Driver Model +===================== + +About the AF_RXRPC driver: + + (#) The AF_RXRPC protocol transparently uses internal sockets of the transport + protocol to represent transport endpoints. + + (#) AF_RXRPC sockets map onto RxRPC connection bundles. Actual RxRPC + connections are handled transparently. One client socket may be used to + make multiple simultaneous calls to the same service. One server socket + may handle calls from many clients. + + (#) Additional parallel client connections will be initiated to support extra + concurrent calls, up to a tunable limit. + + (#) Each connection is retained for a certain amount of time [tunable] after + the last call currently using it has completed in case a new call is made + that could reuse it. + + (#) Each internal UDP socket is retained [tunable] for a certain amount of + time [tunable] after the last connection using it discarded, in case a new + connection is made that could use it. + + (#) A client-side connection is only shared between calls if they have have + the same key struct describing their security (and assuming the calls + would otherwise share the connection). Non-secured calls would also be + able to share connections with each other. + + (#) A server-side connection is shared if the client says it is. + + (#) ACK'ing is handled by the protocol driver automatically, including ping + replying. + + (#) SO_KEEPALIVE automatically pings the other side to keep the connection + alive [TODO]. + + (#) If an ICMP error is received, all calls affected by that error will be + aborted with an appropriate network error passed through recvmsg(). + + +Interaction with the user of the RxRPC socket: + + (#) A socket is made into a server socket by binding an address with a + non-zero service ID. + + (#) In the client, sending a request is achieved with one or more sendmsgs, + followed by the reply being received with one or more recvmsgs. + + (#) The first sendmsg for a request to be sent from a client contains a tag to + be used in all other sendmsgs or recvmsgs associated with that call. The + tag is carried in the control data. + + (#) connect() is used to supply a default destination address for a client + socket. This may be overridden by supplying an alternate address to the + first sendmsg() of a call (struct msghdr::msg_name). + + (#) If connect() is called on an unbound client, a random local port will + bound before the operation takes place. + + (#) A server socket may also be used to make client calls. To do this, the + first sendmsg() of the call must specify the target address. The server's + transport endpoint is used to send the packets. + + (#) Once the application has received the last message associated with a call, + the tag is guaranteed not to be seen again, and so it can be used to pin + client resources. A new call can then be initiated with the same tag + without fear of interference. + + (#) In the server, a request is received with one or more recvmsgs, then the + the reply is transmitted with one or more sendmsgs, and then the final ACK + is received with a last recvmsg. + + (#) When sending data for a call, sendmsg is given MSG_MORE if there's more + data to come on that call. + + (#) When receiving data for a call, recvmsg flags MSG_MORE if there's more + data to come for that call. + + (#) When receiving data or messages for a call, MSG_EOR is flagged by recvmsg + to indicate the terminal message for that call. + + (#) A call may be aborted by adding an abort control message to the control + data. Issuing an abort terminates the kernel's use of that call's tag. + Any messages waiting in the receive queue for that call will be discarded. + + (#) Aborts, busy notifications and challenge packets are delivered by recvmsg, + and control data messages will be set to indicate the context. Receiving + an abort or a busy message terminates the kernel's use of that call's tag. + + (#) The control data part of the msghdr struct is used for a number of things: + + (#) The tag of the intended or affected call. + + (#) Sending or receiving errors, aborts and busy notifications. + + (#) Notifications of incoming calls. + + (#) Sending debug requests and receiving debug replies [TODO]. + + (#) When the kernel has received and set up an incoming call, it sends a + message to server application to let it know there's a new call awaiting + its acceptance [recvmsg reports a special control message]. The server + application then uses sendmsg to assign a tag to the new call. Once that + is done, the first part of the request data will be delivered by recvmsg. + + (#) The server application has to provide the server socket with a keyring of + secret keys corresponding to the security types it permits. When a secure + connection is being set up, the kernel looks up the appropriate secret key + in the keyring and then sends a challenge packet to the client and + receives a response packet. The kernel then checks the authorisation of + the packet and either aborts the connection or sets up the security. + + (#) The name of the key a client will use to secure its communications is + nominated by a socket option. + + +Notes on sendmsg: + + (#) MSG_WAITALL can be set to tell sendmsg to ignore signals if the peer is + making progress at accepting packets within a reasonable time such that we + manage to queue up all the data for transmission. This requires the + client to accept at least one packet per 2*RTT time period. + + If this isn't set, sendmsg() will return immediately, either returning + EINTR/ERESTARTSYS if nothing was consumed or returning the amount of data + consumed. + + +Notes on recvmsg: + + (#) If there's a sequence of data messages belonging to a particular call on + the receive queue, then recvmsg will keep working through them until: + + (a) it meets the end of that call's received data, + + (b) it meets a non-data message, + + (c) it meets a message belonging to a different call, or + + (d) it fills the user buffer. + + If recvmsg is called in blocking mode, it will keep sleeping, awaiting the + reception of further data, until one of the above four conditions is met. + + (2) MSG_PEEK operates similarly, but will return immediately if it has put any + data in the buffer rather than sleeping until it can fill the buffer. + + (3) If a data message is only partially consumed in filling a user buffer, + then the remainder of that message will be left on the front of the queue + for the next taker. MSG_TRUNC will never be flagged. + + (4) If there is more data to be had on a call (it hasn't copied the last byte + of the last data message in that phase yet), then MSG_MORE will be + flagged. + + +Control Messages +================ + +AF_RXRPC makes use of control messages in sendmsg() and recvmsg() to multiplex +calls, to invoke certain actions and to report certain conditions. These are: + + ======================= === =========== =============================== + MESSAGE ID SRT DATA MEANING + ======================= === =========== =============================== + RXRPC_USER_CALL_ID sr- User ID App's call specifier + RXRPC_ABORT srt Abort code Abort code to issue/received + RXRPC_ACK -rt n/a Final ACK received + RXRPC_NET_ERROR -rt error num Network error on call + RXRPC_BUSY -rt n/a Call rejected (server busy) + RXRPC_LOCAL_ERROR -rt error num Local error encountered + RXRPC_NEW_CALL -r- n/a New call received + RXRPC_ACCEPT s-- n/a Accept new call + RXRPC_EXCLUSIVE_CALL s-- n/a Make an exclusive client call + RXRPC_UPGRADE_SERVICE s-- n/a Client call can be upgraded + RXRPC_TX_LENGTH s-- data len Total length of Tx data + ======================= === =========== =============================== + + (SRT = usable in Sendmsg / delivered by Recvmsg / Terminal message) + + (#) RXRPC_USER_CALL_ID + + This is used to indicate the application's call ID. It's an unsigned long + that the app specifies in the client by attaching it to the first data + message or in the server by passing it in association with an RXRPC_ACCEPT + message. recvmsg() passes it in conjunction with all messages except + those of the RXRPC_NEW_CALL message. + + (#) RXRPC_ABORT + + This is can be used by an application to abort a call by passing it to + sendmsg, or it can be delivered by recvmsg to indicate a remote abort was + received. Either way, it must be associated with an RXRPC_USER_CALL_ID to + specify the call affected. If an abort is being sent, then error EBADSLT + will be returned if there is no call with that user ID. + + (#) RXRPC_ACK + + This is delivered to a server application to indicate that the final ACK + of a call was received from the client. It will be associated with an + RXRPC_USER_CALL_ID to indicate the call that's now complete. + + (#) RXRPC_NET_ERROR + + This is delivered to an application to indicate that an ICMP error message + was encountered in the process of trying to talk to the peer. An + errno-class integer value will be included in the control message data + indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call + affected. + + (#) RXRPC_BUSY + + This is delivered to a client application to indicate that a call was + rejected by the server due to the server being busy. It will be + associated with an RXRPC_USER_CALL_ID to indicate the rejected call. + + (#) RXRPC_LOCAL_ERROR + + This is delivered to an application to indicate that a local error was + encountered and that a call has been aborted because of it. An + errno-class integer value will be included in the control message data + indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call + affected. + + (#) RXRPC_NEW_CALL + + This is delivered to indicate to a server application that a new call has + arrived and is awaiting acceptance. No user ID is associated with this, + as a user ID must subsequently be assigned by doing an RXRPC_ACCEPT. + + (#) RXRPC_ACCEPT + + This is used by a server application to attempt to accept a call and + assign it a user ID. It should be associated with an RXRPC_USER_CALL_ID + to indicate the user ID to be assigned. If there is no call to be + accepted (it may have timed out, been aborted, etc.), then sendmsg will + return error ENODATA. If the user ID is already in use by another call, + then error EBADSLT will be returned. + + (#) RXRPC_EXCLUSIVE_CALL + + This is used to indicate that a client call should be made on a one-off + connection. The connection is discarded once the call has terminated. + + (#) RXRPC_UPGRADE_SERVICE + + This is used to make a client call to probe if the specified service ID + may be upgraded by the server. The caller must check msg_name returned to + recvmsg() for the service ID actually in use. The operation probed must + be one that takes the same arguments in both services. + + Once this has been used to establish the upgrade capability (or lack + thereof) of the server, the service ID returned should be used for all + future communication to that server and RXRPC_UPGRADE_SERVICE should no + longer be set. + + (#) RXRPC_TX_LENGTH + + This is used to inform the kernel of the total amount of data that is + going to be transmitted by a call (whether in a client request or a + service response). If given, it allows the kernel to encrypt from the + userspace buffer directly to the packet buffers, rather than copying into + the buffer and then encrypting in place. This may only be given with the + first sendmsg() providing data for a call. EMSGSIZE will be generated if + the amount of data actually given is different. + + This takes a parameter of __s64 type that indicates how much will be + transmitted. This may not be less than zero. + +The symbol RXRPC__SUPPORTED is defined as one more than the highest control +message type supported. At run time this can be queried by means of the +RXRPC_SUPPORTED_CMSG socket option (see below). + + +============== +SOCKET OPTIONS +============== + +AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: + + (#) RXRPC_SECURITY_KEY + + This is used to specify the description of the key to be used. The key is + extracted from the calling process's keyrings with request_key() and + should be of "rxrpc" type. + + The optval pointer points to the description string, and optlen indicates + how long the string is, without the NUL terminator. + + (#) RXRPC_SECURITY_KEYRING + + Similar to above but specifies a keyring of server secret keys to use (key + type "keyring"). See the "Security" section. + + (#) RXRPC_EXCLUSIVE_CONNECTION + + This is used to request that new connections should be used for each call + made subsequently on this socket. optval should be NULL and optlen 0. + + (#) RXRPC_MIN_SECURITY_LEVEL + + This is used to specify the minimum security level required for calls on + this socket. optval must point to an int containing one of the following + values: + + (a) RXRPC_SECURITY_PLAIN + + Encrypted checksum only. + + (b) RXRPC_SECURITY_AUTH + + Encrypted checksum plus packet padded and first eight bytes of packet + encrypted - which includes the actual packet length. + + (c) RXRPC_SECURITY_ENCRYPTED + + Encrypted checksum plus entire packet padded and encrypted, including + actual packet length. + + (#) RXRPC_UPGRADEABLE_SERVICE + + This is used to indicate that a service socket with two bindings may + upgrade one bound service to the other if requested by the client. optval + must point to an array of two unsigned short ints. The first is the + service ID to upgrade from and the second the service ID to upgrade to. + + (#) RXRPC_SUPPORTED_CMSG + + This is a read-only option that writes an int into the buffer indicating + the highest control message type supported. + + +======== +SECURITY +======== + +Currently, only the kerberos 4 equivalent protocol has been implemented +(security index 2 - rxkad). This requires the rxkad module to be loaded and, +on the client, tickets of the appropriate type to be obtained from the AFS +kaserver or the kerberos server and installed as "rxrpc" type keys. This is +normally done using the klog program. An example simple klog program can be +found at: + + http://people.redhat.com/~dhowells/rxrpc/klog.c + +The payload provided to add_key() on the client should be of the following +form:: + + struct rxrpc_key_sec2_v1 { + uint16_t security_index; /* 2 */ + uint16_t ticket_length; /* length of ticket[] */ + uint32_t expiry; /* time at which expires */ + uint8_t kvno; /* key version number */ + uint8_t __pad[3]; + uint8_t session_key[8]; /* DES session key */ + uint8_t ticket[0]; /* the encrypted ticket */ + }; + +Where the ticket blob is just appended to the above structure. + + +For the server, keys of type "rxrpc_s" must be made available to the server. +They have a description of ":" (eg: "52:2" for an +rxkad key for the AFS VL service). When such a key is created, it should be +given the server's secret key as the instantiation data (see the example +below). + + add_key("rxrpc_s", "52:2", secret_key, 8, keyring); + +A keyring is passed to the server socket by naming it in a sockopt. The server +socket then looks the server secret keys up in this keyring when secure +incoming connections are made. This can be seen in an example program that can +be found at: + + http://people.redhat.com/~dhowells/rxrpc/listen.c + + +==================== +EXAMPLE CLIENT USAGE +==================== + +A client would issue an operation by: + + (1) An RxRPC socket is set up by:: + + client = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); + + Where the third parameter indicates the protocol family of the transport + socket used - usually IPv4 but it can also be IPv6 [TODO]. + + (2) A local address can optionally be bound:: + + struct sockaddr_rxrpc srx = { + .srx_family = AF_RXRPC, + .srx_service = 0, /* we're a client */ + .transport_type = SOCK_DGRAM, /* type of transport socket */ + .transport.sin_family = AF_INET, + .transport.sin_port = htons(7000), /* AFS callback */ + .transport.sin_address = 0, /* all local interfaces */ + }; + bind(client, &srx, sizeof(srx)); + + This specifies the local UDP port to be used. If not given, a random + non-privileged port will be used. A UDP port may be shared between + several unrelated RxRPC sockets. Security is handled on a basis of + per-RxRPC virtual connection. + + (3) The security is set:: + + const char *key = "AFS:cambridge.redhat.com"; + setsockopt(client, SOL_RXRPC, RXRPC_SECURITY_KEY, key, strlen(key)); + + This issues a request_key() to get the key representing the security + context. The minimum security level can be set:: + + unsigned int sec = RXRPC_SECURITY_ENCRYPTED; + setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, + &sec, sizeof(sec)); + + (4) The server to be contacted can then be specified (alternatively this can + be done through sendmsg):: + + struct sockaddr_rxrpc srx = { + .srx_family = AF_RXRPC, + .srx_service = VL_SERVICE_ID, + .transport_type = SOCK_DGRAM, /* type of transport socket */ + .transport.sin_family = AF_INET, + .transport.sin_port = htons(7005), /* AFS volume manager */ + .transport.sin_address = ..., + }; + connect(client, &srx, sizeof(srx)); + + (5) The request data should then be posted to the server socket using a series + of sendmsg() calls, each with the following control message attached: + + ================== =================================== + RXRPC_USER_CALL_ID specifies the user ID for this call + ================== =================================== + + MSG_MORE should be set in msghdr::msg_flags on all but the last part of + the request. Multiple requests may be made simultaneously. + + An RXRPC_TX_LENGTH control message can also be specified on the first + sendmsg() call. + + If a call is intended to go to a destination other than the default + specified through connect(), then msghdr::msg_name should be set on the + first request message of that call. + + (6) The reply data will then be posted to the server socket for recvmsg() to + pick up. MSG_MORE will be flagged by recvmsg() if there's more reply data + for a particular call to be read. MSG_EOR will be set on the terminal + read for a call. + + All data will be delivered with the following control message attached: + + RXRPC_USER_CALL_ID - specifies the user ID for this call + + If an abort or error occurred, this will be returned in the control data + buffer instead, and MSG_EOR will be flagged to indicate the end of that + call. + +A client may ask for a service ID it knows and ask that this be upgraded to a +better service if one is available by supplying RXRPC_UPGRADE_SERVICE on the +first sendmsg() of a call. The client should then check srx_service in the +msg_name filled in by recvmsg() when collecting the result. srx_service will +hold the same value as given to sendmsg() if the upgrade request was ignored by +the service - otherwise it will be altered to indicate the service ID the +server upgraded to. Note that the upgraded service ID is chosen by the server. +The caller has to wait until it sees the service ID in the reply before sending +any more calls (further calls to the same destination will be blocked until the +probe is concluded). + + +Example Server Usage +==================== + +A server would be set up to accept operations in the following manner: + + (1) An RxRPC socket is created by:: + + server = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); + + Where the third parameter indicates the address type of the transport + socket used - usually IPv4. + + (2) Security is set up if desired by giving the socket a keyring with server + secret keys in it:: + + keyring = add_key("keyring", "AFSkeys", NULL, 0, + KEY_SPEC_PROCESS_KEYRING); + + const char secret_key[8] = { + 0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 }; + add_key("rxrpc_s", "52:2", secret_key, 8, keyring); + + setsockopt(server, SOL_RXRPC, RXRPC_SECURITY_KEYRING, "AFSkeys", 7); + + The keyring can be manipulated after it has been given to the socket. This + permits the server to add more keys, replace keys, etc. while it is live. + + (3) A local address must then be bound:: + + struct sockaddr_rxrpc srx = { + .srx_family = AF_RXRPC, + .srx_service = VL_SERVICE_ID, /* RxRPC service ID */ + .transport_type = SOCK_DGRAM, /* type of transport socket */ + .transport.sin_family = AF_INET, + .transport.sin_port = htons(7000), /* AFS callback */ + .transport.sin_address = 0, /* all local interfaces */ + }; + bind(server, &srx, sizeof(srx)); + + More than one service ID may be bound to a socket, provided the transport + parameters are the same. The limit is currently two. To do this, bind() + should be called twice. + + (4) If service upgrading is required, first two service IDs must have been + bound and then the following option must be set:: + + unsigned short service_ids[2] = { from_ID, to_ID }; + setsockopt(server, SOL_RXRPC, RXRPC_UPGRADEABLE_SERVICE, + service_ids, sizeof(service_ids)); + + This will automatically upgrade connections on service from_ID to service + to_ID if they request it. This will be reflected in msg_name obtained + through recvmsg() when the request data is delivered to userspace. + + (5) The server is then set to listen out for incoming calls:: + + listen(server, 100); + + (6) The kernel notifies the server of pending incoming connections by sending + it a message for each. This is received with recvmsg() on the server + socket. It has no data, and has a single dataless control message + attached:: + + RXRPC_NEW_CALL + + The address that can be passed back by recvmsg() at this point should be + ignored since the call for which the message was posted may have gone by + the time it is accepted - in which case the first call still on the queue + will be accepted. + + (7) The server then accepts the new call by issuing a sendmsg() with two + pieces of control data and no actual data: + + ================== ============================== + RXRPC_ACCEPT indicate connection acceptance + RXRPC_USER_CALL_ID specify user ID for this call + ================== ============================== + + (8) The first request data packet will then be posted to the server socket for + recvmsg() to pick up. At that point, the RxRPC address for the call can + be read from the address fields in the msghdr struct. + + Subsequent request data will be posted to the server socket for recvmsg() + to collect as it arrives. All but the last piece of the request data will + be delivered with MSG_MORE flagged. + + All data will be delivered with the following control message attached: + + + ================== =================================== + RXRPC_USER_CALL_ID specifies the user ID for this call + ================== =================================== + + (9) The reply data should then be posted to the server socket using a series + of sendmsg() calls, each with the following control messages attached: + + ================== =================================== + RXRPC_USER_CALL_ID specifies the user ID for this call + ================== =================================== + + MSG_MORE should be set in msghdr::msg_flags on all but the last message + for a particular call. + +(10) The final ACK from the client will be posted for retrieval by recvmsg() + when it is received. It will take the form of a dataless message with two + control messages attached: + + ================== =================================== + RXRPC_USER_CALL_ID specifies the user ID for this call + RXRPC_ACK indicates final ACK (no data) + ================== =================================== + + MSG_EOR will be flagged to indicate that this is the final message for + this call. + +(11) Up to the point the final packet of reply data is sent, the call can be + aborted by calling sendmsg() with a dataless message with the following + control messages attached: + + ================== =================================== + RXRPC_USER_CALL_ID specifies the user ID for this call + RXRPC_ABORT indicates abort code (4 byte data) + ================== =================================== + + Any packets waiting in the socket's receive queue will be discarded if + this is issued. + +Note that all the communications for a particular service take place through +the one server socket, using control messages on sendmsg() and recvmsg() to +determine the call affected. + + +AF_RXRPC Kernel Interface +========================= + +The AF_RXRPC module also provides an interface for use by in-kernel utilities +such as the AFS filesystem. This permits such a utility to: + + (1) Use different keys directly on individual client calls on one socket + rather than having to open a whole slew of sockets, one for each key it + might want to use. + + (2) Avoid having RxRPC call request_key() at the point of issue of a call or + opening of a socket. Instead the utility is responsible for requesting a + key at the appropriate point. AFS, for instance, would do this during VFS + operations such as open() or unlink(). The key is then handed through + when the call is initiated. + + (3) Request the use of something other than GFP_KERNEL to allocate memory. + + (4) Avoid the overhead of using the recvmsg() call. RxRPC messages can be + intercepted before they get put into the socket Rx queue and the socket + buffers manipulated directly. + +To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, +bind an address as appropriate and listen if it's to be a server socket, but +then it passes this to the kernel interface functions. + +The kernel interface functions are as follows: + + (#) Begin a new client call:: + + struct rxrpc_call * + rxrpc_kernel_begin_call(struct socket *sock, + struct sockaddr_rxrpc *srx, + struct key *key, + unsigned long user_call_ID, + s64 tx_total_len, + gfp_t gfp, + rxrpc_notify_rx_t notify_rx, + bool upgrade, + bool intr, + unsigned int debug_id); + + This allocates the infrastructure to make a new RxRPC call and assigns + call and connection numbers. The call will be made on the UDP port that + the socket is bound to. The call will go to the destination address of a + connected client socket unless an alternative is supplied (srx is + non-NULL). + + If a key is supplied then this will be used to secure the call instead of + the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls + secured in this way will still share connections if at all possible. + + The user_call_ID is equivalent to that supplied to sendmsg() in the + control data buffer. It is entirely feasible to use this to point to a + kernel data structure. + + tx_total_len is the amount of data the caller is intending to transmit + with this call (or -1 if unknown at this point). Setting the data size + allows the kernel to encrypt directly to the packet buffers, thereby + saving a copy. The value may not be less than -1. + + notify_rx is a pointer to a function to be called when events such as + incoming data packets or remote aborts happen. + + upgrade should be set to true if a client operation should request that + the server upgrade the service to a better one. The resultant service ID + is returned by rxrpc_kernel_recv_data(). + + intr should be set to true if the call should be interruptible. If this + is not set, this function may not return until a channel has been + allocated; if it is set, the function may return -ERESTARTSYS. + + debug_id is the call debugging ID to be used for tracing. This can be + obtained by atomically incrementing rxrpc_debug_id. + + If this function is successful, an opaque reference to the RxRPC call is + returned. The caller now holds a reference on this and it must be + properly ended. + + (#) End a client call:: + + void rxrpc_kernel_end_call(struct socket *sock, + struct rxrpc_call *call); + + This is used to end a previously begun call. The user_call_ID is expunged + from AF_RXRPC's knowledge and will not be seen again in association with + the specified call. + + (#) Send data through a call:: + + typedef void (*rxrpc_notify_end_tx_t)(struct sock *sk, + unsigned long user_call_ID, + struct sk_buff *skb); + + int rxrpc_kernel_send_data(struct socket *sock, + struct rxrpc_call *call, + struct msghdr *msg, + size_t len, + rxrpc_notify_end_tx_t notify_end_rx); + + This is used to supply either the request part of a client call or the + reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the + data buffers to be used. msg_iov may not be NULL and must point + exclusively to in-kernel virtual addresses. msg.msg_flags may be given + MSG_MORE if there will be subsequent data sends for this call. + + The msg must not specify a destination address, control data or any flags + other than MSG_MORE. len is the total amount of data to transmit. + + notify_end_rx can be NULL or it can be used to specify a function to be + called when the call changes state to end the Tx phase. This function is + called with the call-state spinlock held to prevent any reply or final ACK + from being delivered first. + + (#) Receive data from a call:: + + int rxrpc_kernel_recv_data(struct socket *sock, + struct rxrpc_call *call, + void *buf, + size_t size, + size_t *_offset, + bool want_more, + u32 *_abort, + u16 *_service) + + This is used to receive data from either the reply part of a client call + or the request part of a service call. buf and size specify how much + data is desired and where to store it. *_offset is added on to buf and + subtracted from size internally; the amount copied into the buffer is + added to *_offset before returning. + + want_more should be true if further data will be required after this is + satisfied and false if this is the last item of the receive phase. + + There are three normal returns: 0 if the buffer was filled and want_more + was true; 1 if the buffer was filled, the last DATA packet has been + emptied and want_more was false; and -EAGAIN if the function needs to be + called again. + + If the last DATA packet is processed but the buffer contains less than + the amount requested, EBADMSG is returned. If want_more wasn't set, but + more data was available, EMSGSIZE is returned. + + If a remote ABORT is detected, the abort code received will be stored in + ``*_abort`` and ECONNABORTED will be returned. + + The service ID that the call ended up with is returned into *_service. + This can be used to see if a call got a service upgrade. + + (#) Abort a call?? + + :: + + void rxrpc_kernel_abort_call(struct socket *sock, + struct rxrpc_call *call, + u32 abort_code); + + This is used to abort a call if it's still in an abortable state. The + abort code specified will be placed in the ABORT message sent. + + (#) Intercept received RxRPC messages:: + + typedef void (*rxrpc_interceptor_t)(struct sock *sk, + unsigned long user_call_ID, + struct sk_buff *skb); + + void + rxrpc_kernel_intercept_rx_messages(struct socket *sock, + rxrpc_interceptor_t interceptor); + + This installs an interceptor function on the specified AF_RXRPC socket. + All messages that would otherwise wind up in the socket's Rx queue are + then diverted to this function. Note that care must be taken to process + the messages in the right order to maintain DATA message sequentiality. + + The interceptor function itself is provided with the address of the socket + and handling the incoming message, the ID assigned by the kernel utility + to the call and the socket buffer containing the message. + + The skb->mark field indicates the type of message: + + =============================== ======================================= + Mark Meaning + =============================== ======================================= + RXRPC_SKB_MARK_DATA Data message + RXRPC_SKB_MARK_FINAL_ACK Final ACK received for an incoming call + RXRPC_SKB_MARK_BUSY Client call rejected as server busy + RXRPC_SKB_MARK_REMOTE_ABORT Call aborted by peer + RXRPC_SKB_MARK_NET_ERROR Network error detected + RXRPC_SKB_MARK_LOCAL_ERROR Local error encountered + RXRPC_SKB_MARK_NEW_CALL New incoming call awaiting acceptance + =============================== ======================================= + + The remote abort message can be probed with rxrpc_kernel_get_abort_code(). + The two error messages can be probed with rxrpc_kernel_get_error_number(). + A new call can be accepted with rxrpc_kernel_accept_call(). + + Data messages can have their contents extracted with the usual bunch of + socket buffer manipulation functions. A data message can be determined to + be the last one in a sequence with rxrpc_kernel_is_data_last(). When a + data message has been used up, rxrpc_kernel_data_consumed() should be + called on it. + + Messages should be handled to rxrpc_kernel_free_skb() to dispose of. It + is possible to get extra refs on all types of message for later freeing, + but this may pin the state of a call until the message is finally freed. + + (#) Accept an incoming call:: + + struct rxrpc_call * + rxrpc_kernel_accept_call(struct socket *sock, + unsigned long user_call_ID); + + This is used to accept an incoming call and to assign it a call ID. This + function is similar to rxrpc_kernel_begin_call() and calls accepted must + be ended in the same way. + + If this function is successful, an opaque reference to the RxRPC call is + returned. The caller now holds a reference on this and it must be + properly ended. + + (#) Reject an incoming call:: + + int rxrpc_kernel_reject_call(struct socket *sock); + + This is used to reject the first incoming call on the socket's queue with + a BUSY message. -ENODATA is returned if there were no incoming calls. + Other errors may be returned if the call had been aborted (-ECONNABORTED) + or had timed out (-ETIME). + + (#) Allocate a null key for doing anonymous security:: + + struct key *rxrpc_get_null_key(const char *keyname); + + This is used to allocate a null RxRPC key that can be used to indicate + anonymous security for a particular domain. + + (#) Get the peer address of a call:: + + void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, + struct sockaddr_rxrpc *_srx); + + This is used to find the remote peer address of a call. + + (#) Set the total transmit data size on a call:: + + void rxrpc_kernel_set_tx_length(struct socket *sock, + struct rxrpc_call *call, + s64 tx_total_len); + + This sets the amount of data that the caller is intending to transmit on a + call. It's intended to be used for setting the reply size as the request + size should be set when the call is begun. tx_total_len may not be less + than zero. + + (#) Get call RTT:: + + u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call); + + Get the RTT time to the peer in use by a call. The value returned is in + nanoseconds. + + (#) Check call still alive:: + + bool rxrpc_kernel_check_life(struct socket *sock, + struct rxrpc_call *call, + u32 *_life); + void rxrpc_kernel_probe_life(struct socket *sock, + struct rxrpc_call *call); + + The first function passes back in ``*_life`` a number that is updated when + ACKs are received from the peer (notably including PING RESPONSE ACKs + which we can elicit by sending PING ACKs to see if the call still exists + on the server). The caller should compare the numbers of two calls to see + if the call is still alive after waiting for a suitable interval. It also + returns true as long as the call hasn't yet reached the completed state. + + This allows the caller to work out if the server is still contactable and + if the call is still alive on the server while waiting for the server to + process a client operation. + + The second function causes a ping ACK to be transmitted to try to provoke + the peer into responding, which would then cause the value returned by the + first function to change. Note that this must be called in TASK_RUNNING + state. + + (#) Get reply timestamp:: + + bool rxrpc_kernel_get_reply_time(struct socket *sock, + struct rxrpc_call *call, + ktime_t *_ts) + + This allows the timestamp on the first DATA packet of the reply of a + client call to be queried, provided that it is still in the Rx ring. If + successful, the timestamp will be stored into ``*_ts`` and true will be + returned; false will be returned otherwise. + + (#) Get remote client epoch:: + + u32 rxrpc_kernel_get_epoch(struct socket *sock, + struct rxrpc_call *call) + + This allows the epoch that's contained in packets of an incoming client + call to be queried. This value is returned. The function always + successful if the call is still in progress. It shouldn't be called once + the call has expired. Note that calling this on a local client call only + returns the local epoch. + + This value can be used to determine if the remote client has been + restarted as it shouldn't change otherwise. + + (#) Set the maxmimum lifespan on a call:: + + void rxrpc_kernel_set_max_life(struct socket *sock, + struct rxrpc_call *call, + unsigned long hard_timeout) + + This sets the maximum lifespan on a call to hard_timeout (which is in + jiffies). In the event of the timeout occurring, the call will be + aborted and -ETIME or -ETIMEDOUT will be returned. + + +Configurable Parameters +======================= + +The RxRPC protocol driver has a number of configurable parameters that can be +adjusted through sysctls in /proc/net/rxrpc/: + + (#) req_ack_delay + + The amount of time in milliseconds after receiving a packet with the + request-ack flag set before we honour the flag and actually send the + requested ack. + + Usually the other side won't stop sending packets until the advertised + reception window is full (to a maximum of 255 packets), so delaying the + ACK permits several packets to be ACK'd in one go. + + (#) soft_ack_delay + + The amount of time in milliseconds after receiving a new packet before we + generate a soft-ACK to tell the sender that it doesn't need to resend. + + (#) idle_ack_delay + + The amount of time in milliseconds after all the packets currently in the + received queue have been consumed before we generate a hard-ACK to tell + the sender it can free its buffers, assuming no other reason occurs that + we would send an ACK. + + (#) resend_timeout + + The amount of time in milliseconds after transmitting a packet before we + transmit it again, assuming no ACK is received from the receiver telling + us they got it. + + (#) max_call_lifetime + + The maximum amount of time in seconds that a call may be in progress + before we preemptively kill it. + + (#) dead_call_expiry + + The amount of time in seconds before we remove a dead call from the call + list. Dead calls are kept around for a little while for the purpose of + repeating ACK and ABORT packets. + + (#) connection_expiry + + The amount of time in seconds after a connection was last used before we + remove it from the connection list. While a connection is in existence, + it serves as a placeholder for negotiated security; when it is deleted, + the security must be renegotiated. + + (#) transport_expiry + + The amount of time in seconds after a transport was last used before we + remove it from the transport list. While a transport is in existence, it + serves to anchor the peer data and keeps the connection ID counter. + + (#) rxrpc_rx_window_size + + The size of the receive window in packets. This is the maximum number of + unconsumed received packets we're willing to hold in memory for any + particular call. + + (#) rxrpc_rx_mtu + + The maximum packet MTU size that we're willing to receive in bytes. This + indicates to the peer whether we're willing to accept jumbo packets. + + (#) rxrpc_rx_jumbo_max + + The maximum number of packets that we're willing to accept in a jumbo + packet. Non-terminal packets in a jumbo packet must contain a four byte + header plus exactly 1412 bytes of data. The terminal packet must contain + a four byte header plus any amount of data. In any event, a jumbo packet + may not exceed rxrpc_rx_mtu in size. diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt deleted file mode 100644 index 180e07d956a7..000000000000 --- a/Documentation/networking/rxrpc.txt +++ /dev/null @@ -1,1155 +0,0 @@ - ====================== - RxRPC NETWORK PROTOCOL - ====================== - -The RxRPC protocol driver provides a reliable two-phase transport on top of UDP -that can be used to perform RxRPC remote operations. This is done over sockets -of AF_RXRPC family, using sendmsg() and recvmsg() with control data to send and -receive data, aborts and errors. - -Contents of this document: - - (*) Overview. - - (*) RxRPC protocol summary. - - (*) AF_RXRPC driver model. - - (*) Control messages. - - (*) Socket options. - - (*) Security. - - (*) Example client usage. - - (*) Example server usage. - - (*) AF_RXRPC kernel interface. - - (*) Configurable parameters. - - -======== -OVERVIEW -======== - -RxRPC is a two-layer protocol. There is a session layer which provides -reliable virtual connections using UDP over IPv4 (or IPv6) as the transport -layer, but implements a real network protocol; and there's the presentation -layer which renders structured data to binary blobs and back again using XDR -(as does SunRPC): - - +-------------+ - | Application | - +-------------+ - | XDR | Presentation - +-------------+ - | RxRPC | Session - +-------------+ - | UDP | Transport - +-------------+ - - -AF_RXRPC provides: - - (1) Part of an RxRPC facility for both kernel and userspace applications by - making the session part of it a Linux network protocol (AF_RXRPC). - - (2) A two-phase protocol. The client transmits a blob (the request) and then - receives a blob (the reply), and the server receives the request and then - transmits the reply. - - (3) Retention of the reusable bits of the transport system set up for one call - to speed up subsequent calls. - - (4) A secure protocol, using the Linux kernel's key retention facility to - manage security on the client end. The server end must of necessity be - more active in security negotiations. - -AF_RXRPC does not provide XDR marshalling/presentation facilities. That is -left to the application. AF_RXRPC only deals in blobs. Even the operation ID -is just the first four bytes of the request blob, and as such is beyond the -kernel's interest. - - -Sockets of AF_RXRPC family are: - - (1) created as type SOCK_DGRAM; - - (2) provided with a protocol of the type of underlying transport they're going - to use - currently only PF_INET is supported. - - -The Andrew File System (AFS) is an example of an application that uses this and -that has both kernel (filesystem) and userspace (utility) components. - - -====================== -RXRPC PROTOCOL SUMMARY -====================== - -An overview of the RxRPC protocol: - - (*) RxRPC sits on top of another networking protocol (UDP is the only option - currently), and uses this to provide network transport. UDP ports, for - example, provide transport endpoints. - - (*) RxRPC supports multiple virtual "connections" from any given transport - endpoint, thus allowing the endpoints to be shared, even to the same - remote endpoint. - - (*) Each connection goes to a particular "service". A connection may not go - to multiple services. A service may be considered the RxRPC equivalent of - a port number. AF_RXRPC permits multiple services to share an endpoint. - - (*) Client-originating packets are marked, thus a transport endpoint can be - shared between client and server connections (connections have a - direction). - - (*) Up to a billion connections may be supported concurrently between one - local transport endpoint and one service on one remote endpoint. An RxRPC - connection is described by seven numbers: - - Local address } - Local port } Transport (UDP) address - Remote address } - Remote port } - Direction - Connection ID - Service ID - - (*) Each RxRPC operation is a "call". A connection may make up to four - billion calls, but only up to four calls may be in progress on a - connection at any one time. - - (*) Calls are two-phase and asymmetric: the client sends its request data, - which the service receives; then the service transmits the reply data - which the client receives. - - (*) The data blobs are of indefinite size, the end of a phase is marked with a - flag in the packet. The number of packets of data making up one blob may - not exceed 4 billion, however, as this would cause the sequence number to - wrap. - - (*) The first four bytes of the request data are the service operation ID. - - (*) Security is negotiated on a per-connection basis. The connection is - initiated by the first data packet on it arriving. If security is - requested, the server then issues a "challenge" and then the client - replies with a "response". If the response is successful, the security is - set for the lifetime of that connection, and all subsequent calls made - upon it use that same security. In the event that the server lets a - connection lapse before the client, the security will be renegotiated if - the client uses the connection again. - - (*) Calls use ACK packets to handle reliability. Data packets are also - explicitly sequenced per call. - - (*) There are two types of positive acknowledgment: hard-ACKs and soft-ACKs. - A hard-ACK indicates to the far side that all the data received to a point - has been received and processed; a soft-ACK indicates that the data has - been received but may yet be discarded and re-requested. The sender may - not discard any transmittable packets until they've been hard-ACK'd. - - (*) Reception of a reply data packet implicitly hard-ACK's all the data - packets that make up the request. - - (*) An call is complete when the request has been sent, the reply has been - received and the final hard-ACK on the last packet of the reply has - reached the server. - - (*) An call may be aborted by either end at any time up to its completion. - - -===================== -AF_RXRPC DRIVER MODEL -===================== - -About the AF_RXRPC driver: - - (*) The AF_RXRPC protocol transparently uses internal sockets of the transport - protocol to represent transport endpoints. - - (*) AF_RXRPC sockets map onto RxRPC connection bundles. Actual RxRPC - connections are handled transparently. One client socket may be used to - make multiple simultaneous calls to the same service. One server socket - may handle calls from many clients. - - (*) Additional parallel client connections will be initiated to support extra - concurrent calls, up to a tunable limit. - - (*) Each connection is retained for a certain amount of time [tunable] after - the last call currently using it has completed in case a new call is made - that could reuse it. - - (*) Each internal UDP socket is retained [tunable] for a certain amount of - time [tunable] after the last connection using it discarded, in case a new - connection is made that could use it. - - (*) A client-side connection is only shared between calls if they have have - the same key struct describing their security (and assuming the calls - would otherwise share the connection). Non-secured calls would also be - able to share connections with each other. - - (*) A server-side connection is shared if the client says it is. - - (*) ACK'ing is handled by the protocol driver automatically, including ping - replying. - - (*) SO_KEEPALIVE automatically pings the other side to keep the connection - alive [TODO]. - - (*) If an ICMP error is received, all calls affected by that error will be - aborted with an appropriate network error passed through recvmsg(). - - -Interaction with the user of the RxRPC socket: - - (*) A socket is made into a server socket by binding an address with a - non-zero service ID. - - (*) In the client, sending a request is achieved with one or more sendmsgs, - followed by the reply being received with one or more recvmsgs. - - (*) The first sendmsg for a request to be sent from a client contains a tag to - be used in all other sendmsgs or recvmsgs associated with that call. The - tag is carried in the control data. - - (*) connect() is used to supply a default destination address for a client - socket. This may be overridden by supplying an alternate address to the - first sendmsg() of a call (struct msghdr::msg_name). - - (*) If connect() is called on an unbound client, a random local port will - bound before the operation takes place. - - (*) A server socket may also be used to make client calls. To do this, the - first sendmsg() of the call must specify the target address. The server's - transport endpoint is used to send the packets. - - (*) Once the application has received the last message associated with a call, - the tag is guaranteed not to be seen again, and so it can be used to pin - client resources. A new call can then be initiated with the same tag - without fear of interference. - - (*) In the server, a request is received with one or more recvmsgs, then the - the reply is transmitted with one or more sendmsgs, and then the final ACK - is received with a last recvmsg. - - (*) When sending data for a call, sendmsg is given MSG_MORE if there's more - data to come on that call. - - (*) When receiving data for a call, recvmsg flags MSG_MORE if there's more - data to come for that call. - - (*) When receiving data or messages for a call, MSG_EOR is flagged by recvmsg - to indicate the terminal message for that call. - - (*) A call may be aborted by adding an abort control message to the control - data. Issuing an abort terminates the kernel's use of that call's tag. - Any messages waiting in the receive queue for that call will be discarded. - - (*) Aborts, busy notifications and challenge packets are delivered by recvmsg, - and control data messages will be set to indicate the context. Receiving - an abort or a busy message terminates the kernel's use of that call's tag. - - (*) The control data part of the msghdr struct is used for a number of things: - - (*) The tag of the intended or affected call. - - (*) Sending or receiving errors, aborts and busy notifications. - - (*) Notifications of incoming calls. - - (*) Sending debug requests and receiving debug replies [TODO]. - - (*) When the kernel has received and set up an incoming call, it sends a - message to server application to let it know there's a new call awaiting - its acceptance [recvmsg reports a special control message]. The server - application then uses sendmsg to assign a tag to the new call. Once that - is done, the first part of the request data will be delivered by recvmsg. - - (*) The server application has to provide the server socket with a keyring of - secret keys corresponding to the security types it permits. When a secure - connection is being set up, the kernel looks up the appropriate secret key - in the keyring and then sends a challenge packet to the client and - receives a response packet. The kernel then checks the authorisation of - the packet and either aborts the connection or sets up the security. - - (*) The name of the key a client will use to secure its communications is - nominated by a socket option. - - -Notes on sendmsg: - - (*) MSG_WAITALL can be set to tell sendmsg to ignore signals if the peer is - making progress at accepting packets within a reasonable time such that we - manage to queue up all the data for transmission. This requires the - client to accept at least one packet per 2*RTT time period. - - If this isn't set, sendmsg() will return immediately, either returning - EINTR/ERESTARTSYS if nothing was consumed or returning the amount of data - consumed. - - -Notes on recvmsg: - - (*) If there's a sequence of data messages belonging to a particular call on - the receive queue, then recvmsg will keep working through them until: - - (a) it meets the end of that call's received data, - - (b) it meets a non-data message, - - (c) it meets a message belonging to a different call, or - - (d) it fills the user buffer. - - If recvmsg is called in blocking mode, it will keep sleeping, awaiting the - reception of further data, until one of the above four conditions is met. - - (2) MSG_PEEK operates similarly, but will return immediately if it has put any - data in the buffer rather than sleeping until it can fill the buffer. - - (3) If a data message is only partially consumed in filling a user buffer, - then the remainder of that message will be left on the front of the queue - for the next taker. MSG_TRUNC will never be flagged. - - (4) If there is more data to be had on a call (it hasn't copied the last byte - of the last data message in that phase yet), then MSG_MORE will be - flagged. - - -================ -CONTROL MESSAGES -================ - -AF_RXRPC makes use of control messages in sendmsg() and recvmsg() to multiplex -calls, to invoke certain actions and to report certain conditions. These are: - - MESSAGE ID SRT DATA MEANING - ======================= === =========== =============================== - RXRPC_USER_CALL_ID sr- User ID App's call specifier - RXRPC_ABORT srt Abort code Abort code to issue/received - RXRPC_ACK -rt n/a Final ACK received - RXRPC_NET_ERROR -rt error num Network error on call - RXRPC_BUSY -rt n/a Call rejected (server busy) - RXRPC_LOCAL_ERROR -rt error num Local error encountered - RXRPC_NEW_CALL -r- n/a New call received - RXRPC_ACCEPT s-- n/a Accept new call - RXRPC_EXCLUSIVE_CALL s-- n/a Make an exclusive client call - RXRPC_UPGRADE_SERVICE s-- n/a Client call can be upgraded - RXRPC_TX_LENGTH s-- data len Total length of Tx data - - (SRT = usable in Sendmsg / delivered by Recvmsg / Terminal message) - - (*) RXRPC_USER_CALL_ID - - This is used to indicate the application's call ID. It's an unsigned long - that the app specifies in the client by attaching it to the first data - message or in the server by passing it in association with an RXRPC_ACCEPT - message. recvmsg() passes it in conjunction with all messages except - those of the RXRPC_NEW_CALL message. - - (*) RXRPC_ABORT - - This is can be used by an application to abort a call by passing it to - sendmsg, or it can be delivered by recvmsg to indicate a remote abort was - received. Either way, it must be associated with an RXRPC_USER_CALL_ID to - specify the call affected. If an abort is being sent, then error EBADSLT - will be returned if there is no call with that user ID. - - (*) RXRPC_ACK - - This is delivered to a server application to indicate that the final ACK - of a call was received from the client. It will be associated with an - RXRPC_USER_CALL_ID to indicate the call that's now complete. - - (*) RXRPC_NET_ERROR - - This is delivered to an application to indicate that an ICMP error message - was encountered in the process of trying to talk to the peer. An - errno-class integer value will be included in the control message data - indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call - affected. - - (*) RXRPC_BUSY - - This is delivered to a client application to indicate that a call was - rejected by the server due to the server being busy. It will be - associated with an RXRPC_USER_CALL_ID to indicate the rejected call. - - (*) RXRPC_LOCAL_ERROR - - This is delivered to an application to indicate that a local error was - encountered and that a call has been aborted because of it. An - errno-class integer value will be included in the control message data - indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call - affected. - - (*) RXRPC_NEW_CALL - - This is delivered to indicate to a server application that a new call has - arrived and is awaiting acceptance. No user ID is associated with this, - as a user ID must subsequently be assigned by doing an RXRPC_ACCEPT. - - (*) RXRPC_ACCEPT - - This is used by a server application to attempt to accept a call and - assign it a user ID. It should be associated with an RXRPC_USER_CALL_ID - to indicate the user ID to be assigned. If there is no call to be - accepted (it may have timed out, been aborted, etc.), then sendmsg will - return error ENODATA. If the user ID is already in use by another call, - then error EBADSLT will be returned. - - (*) RXRPC_EXCLUSIVE_CALL - - This is used to indicate that a client call should be made on a one-off - connection. The connection is discarded once the call has terminated. - - (*) RXRPC_UPGRADE_SERVICE - - This is used to make a client call to probe if the specified service ID - may be upgraded by the server. The caller must check msg_name returned to - recvmsg() for the service ID actually in use. The operation probed must - be one that takes the same arguments in both services. - - Once this has been used to establish the upgrade capability (or lack - thereof) of the server, the service ID returned should be used for all - future communication to that server and RXRPC_UPGRADE_SERVICE should no - longer be set. - - (*) RXRPC_TX_LENGTH - - This is used to inform the kernel of the total amount of data that is - going to be transmitted by a call (whether in a client request or a - service response). If given, it allows the kernel to encrypt from the - userspace buffer directly to the packet buffers, rather than copying into - the buffer and then encrypting in place. This may only be given with the - first sendmsg() providing data for a call. EMSGSIZE will be generated if - the amount of data actually given is different. - - This takes a parameter of __s64 type that indicates how much will be - transmitted. This may not be less than zero. - -The symbol RXRPC__SUPPORTED is defined as one more than the highest control -message type supported. At run time this can be queried by means of the -RXRPC_SUPPORTED_CMSG socket option (see below). - - -============== -SOCKET OPTIONS -============== - -AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: - - (*) RXRPC_SECURITY_KEY - - This is used to specify the description of the key to be used. The key is - extracted from the calling process's keyrings with request_key() and - should be of "rxrpc" type. - - The optval pointer points to the description string, and optlen indicates - how long the string is, without the NUL terminator. - - (*) RXRPC_SECURITY_KEYRING - - Similar to above but specifies a keyring of server secret keys to use (key - type "keyring"). See the "Security" section. - - (*) RXRPC_EXCLUSIVE_CONNECTION - - This is used to request that new connections should be used for each call - made subsequently on this socket. optval should be NULL and optlen 0. - - (*) RXRPC_MIN_SECURITY_LEVEL - - This is used to specify the minimum security level required for calls on - this socket. optval must point to an int containing one of the following - values: - - (a) RXRPC_SECURITY_PLAIN - - Encrypted checksum only. - - (b) RXRPC_SECURITY_AUTH - - Encrypted checksum plus packet padded and first eight bytes of packet - encrypted - which includes the actual packet length. - - (c) RXRPC_SECURITY_ENCRYPTED - - Encrypted checksum plus entire packet padded and encrypted, including - actual packet length. - - (*) RXRPC_UPGRADEABLE_SERVICE - - This is used to indicate that a service socket with two bindings may - upgrade one bound service to the other if requested by the client. optval - must point to an array of two unsigned short ints. The first is the - service ID to upgrade from and the second the service ID to upgrade to. - - (*) RXRPC_SUPPORTED_CMSG - - This is a read-only option that writes an int into the buffer indicating - the highest control message type supported. - - -======== -SECURITY -======== - -Currently, only the kerberos 4 equivalent protocol has been implemented -(security index 2 - rxkad). This requires the rxkad module to be loaded and, -on the client, tickets of the appropriate type to be obtained from the AFS -kaserver or the kerberos server and installed as "rxrpc" type keys. This is -normally done using the klog program. An example simple klog program can be -found at: - - http://people.redhat.com/~dhowells/rxrpc/klog.c - -The payload provided to add_key() on the client should be of the following -form: - - struct rxrpc_key_sec2_v1 { - uint16_t security_index; /* 2 */ - uint16_t ticket_length; /* length of ticket[] */ - uint32_t expiry; /* time at which expires */ - uint8_t kvno; /* key version number */ - uint8_t __pad[3]; - uint8_t session_key[8]; /* DES session key */ - uint8_t ticket[0]; /* the encrypted ticket */ - }; - -Where the ticket blob is just appended to the above structure. - - -For the server, keys of type "rxrpc_s" must be made available to the server. -They have a description of ":" (eg: "52:2" for an -rxkad key for the AFS VL service). When such a key is created, it should be -given the server's secret key as the instantiation data (see the example -below). - - add_key("rxrpc_s", "52:2", secret_key, 8, keyring); - -A keyring is passed to the server socket by naming it in a sockopt. The server -socket then looks the server secret keys up in this keyring when secure -incoming connections are made. This can be seen in an example program that can -be found at: - - http://people.redhat.com/~dhowells/rxrpc/listen.c - - -==================== -EXAMPLE CLIENT USAGE -==================== - -A client would issue an operation by: - - (1) An RxRPC socket is set up by: - - client = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); - - Where the third parameter indicates the protocol family of the transport - socket used - usually IPv4 but it can also be IPv6 [TODO]. - - (2) A local address can optionally be bound: - - struct sockaddr_rxrpc srx = { - .srx_family = AF_RXRPC, - .srx_service = 0, /* we're a client */ - .transport_type = SOCK_DGRAM, /* type of transport socket */ - .transport.sin_family = AF_INET, - .transport.sin_port = htons(7000), /* AFS callback */ - .transport.sin_address = 0, /* all local interfaces */ - }; - bind(client, &srx, sizeof(srx)); - - This specifies the local UDP port to be used. If not given, a random - non-privileged port will be used. A UDP port may be shared between - several unrelated RxRPC sockets. Security is handled on a basis of - per-RxRPC virtual connection. - - (3) The security is set: - - const char *key = "AFS:cambridge.redhat.com"; - setsockopt(client, SOL_RXRPC, RXRPC_SECURITY_KEY, key, strlen(key)); - - This issues a request_key() to get the key representing the security - context. The minimum security level can be set: - - unsigned int sec = RXRPC_SECURITY_ENCRYPTED; - setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, - &sec, sizeof(sec)); - - (4) The server to be contacted can then be specified (alternatively this can - be done through sendmsg): - - struct sockaddr_rxrpc srx = { - .srx_family = AF_RXRPC, - .srx_service = VL_SERVICE_ID, - .transport_type = SOCK_DGRAM, /* type of transport socket */ - .transport.sin_family = AF_INET, - .transport.sin_port = htons(7005), /* AFS volume manager */ - .transport.sin_address = ..., - }; - connect(client, &srx, sizeof(srx)); - - (5) The request data should then be posted to the server socket using a series - of sendmsg() calls, each with the following control message attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - MSG_MORE should be set in msghdr::msg_flags on all but the last part of - the request. Multiple requests may be made simultaneously. - - An RXRPC_TX_LENGTH control message can also be specified on the first - sendmsg() call. - - If a call is intended to go to a destination other than the default - specified through connect(), then msghdr::msg_name should be set on the - first request message of that call. - - (6) The reply data will then be posted to the server socket for recvmsg() to - pick up. MSG_MORE will be flagged by recvmsg() if there's more reply data - for a particular call to be read. MSG_EOR will be set on the terminal - read for a call. - - All data will be delivered with the following control message attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - If an abort or error occurred, this will be returned in the control data - buffer instead, and MSG_EOR will be flagged to indicate the end of that - call. - -A client may ask for a service ID it knows and ask that this be upgraded to a -better service if one is available by supplying RXRPC_UPGRADE_SERVICE on the -first sendmsg() of a call. The client should then check srx_service in the -msg_name filled in by recvmsg() when collecting the result. srx_service will -hold the same value as given to sendmsg() if the upgrade request was ignored by -the service - otherwise it will be altered to indicate the service ID the -server upgraded to. Note that the upgraded service ID is chosen by the server. -The caller has to wait until it sees the service ID in the reply before sending -any more calls (further calls to the same destination will be blocked until the -probe is concluded). - - -==================== -EXAMPLE SERVER USAGE -==================== - -A server would be set up to accept operations in the following manner: - - (1) An RxRPC socket is created by: - - server = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); - - Where the third parameter indicates the address type of the transport - socket used - usually IPv4. - - (2) Security is set up if desired by giving the socket a keyring with server - secret keys in it: - - keyring = add_key("keyring", "AFSkeys", NULL, 0, - KEY_SPEC_PROCESS_KEYRING); - - const char secret_key[8] = { - 0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 }; - add_key("rxrpc_s", "52:2", secret_key, 8, keyring); - - setsockopt(server, SOL_RXRPC, RXRPC_SECURITY_KEYRING, "AFSkeys", 7); - - The keyring can be manipulated after it has been given to the socket. This - permits the server to add more keys, replace keys, etc. while it is live. - - (3) A local address must then be bound: - - struct sockaddr_rxrpc srx = { - .srx_family = AF_RXRPC, - .srx_service = VL_SERVICE_ID, /* RxRPC service ID */ - .transport_type = SOCK_DGRAM, /* type of transport socket */ - .transport.sin_family = AF_INET, - .transport.sin_port = htons(7000), /* AFS callback */ - .transport.sin_address = 0, /* all local interfaces */ - }; - bind(server, &srx, sizeof(srx)); - - More than one service ID may be bound to a socket, provided the transport - parameters are the same. The limit is currently two. To do this, bind() - should be called twice. - - (4) If service upgrading is required, first two service IDs must have been - bound and then the following option must be set: - - unsigned short service_ids[2] = { from_ID, to_ID }; - setsockopt(server, SOL_RXRPC, RXRPC_UPGRADEABLE_SERVICE, - service_ids, sizeof(service_ids)); - - This will automatically upgrade connections on service from_ID to service - to_ID if they request it. This will be reflected in msg_name obtained - through recvmsg() when the request data is delivered to userspace. - - (5) The server is then set to listen out for incoming calls: - - listen(server, 100); - - (6) The kernel notifies the server of pending incoming connections by sending - it a message for each. This is received with recvmsg() on the server - socket. It has no data, and has a single dataless control message - attached: - - RXRPC_NEW_CALL - - The address that can be passed back by recvmsg() at this point should be - ignored since the call for which the message was posted may have gone by - the time it is accepted - in which case the first call still on the queue - will be accepted. - - (7) The server then accepts the new call by issuing a sendmsg() with two - pieces of control data and no actual data: - - RXRPC_ACCEPT - indicate connection acceptance - RXRPC_USER_CALL_ID - specify user ID for this call - - (8) The first request data packet will then be posted to the server socket for - recvmsg() to pick up. At that point, the RxRPC address for the call can - be read from the address fields in the msghdr struct. - - Subsequent request data will be posted to the server socket for recvmsg() - to collect as it arrives. All but the last piece of the request data will - be delivered with MSG_MORE flagged. - - All data will be delivered with the following control message attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - (9) The reply data should then be posted to the server socket using a series - of sendmsg() calls, each with the following control messages attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - MSG_MORE should be set in msghdr::msg_flags on all but the last message - for a particular call. - -(10) The final ACK from the client will be posted for retrieval by recvmsg() - when it is received. It will take the form of a dataless message with two - control messages attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - RXRPC_ACK - indicates final ACK (no data) - - MSG_EOR will be flagged to indicate that this is the final message for - this call. - -(11) Up to the point the final packet of reply data is sent, the call can be - aborted by calling sendmsg() with a dataless message with the following - control messages attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - RXRPC_ABORT - indicates abort code (4 byte data) - - Any packets waiting in the socket's receive queue will be discarded if - this is issued. - -Note that all the communications for a particular service take place through -the one server socket, using control messages on sendmsg() and recvmsg() to -determine the call affected. - - -========================= -AF_RXRPC KERNEL INTERFACE -========================= - -The AF_RXRPC module also provides an interface for use by in-kernel utilities -such as the AFS filesystem. This permits such a utility to: - - (1) Use different keys directly on individual client calls on one socket - rather than having to open a whole slew of sockets, one for each key it - might want to use. - - (2) Avoid having RxRPC call request_key() at the point of issue of a call or - opening of a socket. Instead the utility is responsible for requesting a - key at the appropriate point. AFS, for instance, would do this during VFS - operations such as open() or unlink(). The key is then handed through - when the call is initiated. - - (3) Request the use of something other than GFP_KERNEL to allocate memory. - - (4) Avoid the overhead of using the recvmsg() call. RxRPC messages can be - intercepted before they get put into the socket Rx queue and the socket - buffers manipulated directly. - -To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, -bind an address as appropriate and listen if it's to be a server socket, but -then it passes this to the kernel interface functions. - -The kernel interface functions are as follows: - - (*) Begin a new client call. - - struct rxrpc_call * - rxrpc_kernel_begin_call(struct socket *sock, - struct sockaddr_rxrpc *srx, - struct key *key, - unsigned long user_call_ID, - s64 tx_total_len, - gfp_t gfp, - rxrpc_notify_rx_t notify_rx, - bool upgrade, - bool intr, - unsigned int debug_id); - - This allocates the infrastructure to make a new RxRPC call and assigns - call and connection numbers. The call will be made on the UDP port that - the socket is bound to. The call will go to the destination address of a - connected client socket unless an alternative is supplied (srx is - non-NULL). - - If a key is supplied then this will be used to secure the call instead of - the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls - secured in this way will still share connections if at all possible. - - The user_call_ID is equivalent to that supplied to sendmsg() in the - control data buffer. It is entirely feasible to use this to point to a - kernel data structure. - - tx_total_len is the amount of data the caller is intending to transmit - with this call (or -1 if unknown at this point). Setting the data size - allows the kernel to encrypt directly to the packet buffers, thereby - saving a copy. The value may not be less than -1. - - notify_rx is a pointer to a function to be called when events such as - incoming data packets or remote aborts happen. - - upgrade should be set to true if a client operation should request that - the server upgrade the service to a better one. The resultant service ID - is returned by rxrpc_kernel_recv_data(). - - intr should be set to true if the call should be interruptible. If this - is not set, this function may not return until a channel has been - allocated; if it is set, the function may return -ERESTARTSYS. - - debug_id is the call debugging ID to be used for tracing. This can be - obtained by atomically incrementing rxrpc_debug_id. - - If this function is successful, an opaque reference to the RxRPC call is - returned. The caller now holds a reference on this and it must be - properly ended. - - (*) End a client call. - - void rxrpc_kernel_end_call(struct socket *sock, - struct rxrpc_call *call); - - This is used to end a previously begun call. The user_call_ID is expunged - from AF_RXRPC's knowledge and will not be seen again in association with - the specified call. - - (*) Send data through a call. - - typedef void (*rxrpc_notify_end_tx_t)(struct sock *sk, - unsigned long user_call_ID, - struct sk_buff *skb); - - int rxrpc_kernel_send_data(struct socket *sock, - struct rxrpc_call *call, - struct msghdr *msg, - size_t len, - rxrpc_notify_end_tx_t notify_end_rx); - - This is used to supply either the request part of a client call or the - reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the - data buffers to be used. msg_iov may not be NULL and must point - exclusively to in-kernel virtual addresses. msg.msg_flags may be given - MSG_MORE if there will be subsequent data sends for this call. - - The msg must not specify a destination address, control data or any flags - other than MSG_MORE. len is the total amount of data to transmit. - - notify_end_rx can be NULL or it can be used to specify a function to be - called when the call changes state to end the Tx phase. This function is - called with the call-state spinlock held to prevent any reply or final ACK - from being delivered first. - - (*) Receive data from a call. - - int rxrpc_kernel_recv_data(struct socket *sock, - struct rxrpc_call *call, - void *buf, - size_t size, - size_t *_offset, - bool want_more, - u32 *_abort, - u16 *_service) - - This is used to receive data from either the reply part of a client call - or the request part of a service call. buf and size specify how much - data is desired and where to store it. *_offset is added on to buf and - subtracted from size internally; the amount copied into the buffer is - added to *_offset before returning. - - want_more should be true if further data will be required after this is - satisfied and false if this is the last item of the receive phase. - - There are three normal returns: 0 if the buffer was filled and want_more - was true; 1 if the buffer was filled, the last DATA packet has been - emptied and want_more was false; and -EAGAIN if the function needs to be - called again. - - If the last DATA packet is processed but the buffer contains less than - the amount requested, EBADMSG is returned. If want_more wasn't set, but - more data was available, EMSGSIZE is returned. - - If a remote ABORT is detected, the abort code received will be stored in - *_abort and ECONNABORTED will be returned. - - The service ID that the call ended up with is returned into *_service. - This can be used to see if a call got a service upgrade. - - (*) Abort a call. - - void rxrpc_kernel_abort_call(struct socket *sock, - struct rxrpc_call *call, - u32 abort_code); - - This is used to abort a call if it's still in an abortable state. The - abort code specified will be placed in the ABORT message sent. - - (*) Intercept received RxRPC messages. - - typedef void (*rxrpc_interceptor_t)(struct sock *sk, - unsigned long user_call_ID, - struct sk_buff *skb); - - void - rxrpc_kernel_intercept_rx_messages(struct socket *sock, - rxrpc_interceptor_t interceptor); - - This installs an interceptor function on the specified AF_RXRPC socket. - All messages that would otherwise wind up in the socket's Rx queue are - then diverted to this function. Note that care must be taken to process - the messages in the right order to maintain DATA message sequentiality. - - The interceptor function itself is provided with the address of the socket - and handling the incoming message, the ID assigned by the kernel utility - to the call and the socket buffer containing the message. - - The skb->mark field indicates the type of message: - - MARK MEANING - =============================== ======================================= - RXRPC_SKB_MARK_DATA Data message - RXRPC_SKB_MARK_FINAL_ACK Final ACK received for an incoming call - RXRPC_SKB_MARK_BUSY Client call rejected as server busy - RXRPC_SKB_MARK_REMOTE_ABORT Call aborted by peer - RXRPC_SKB_MARK_NET_ERROR Network error detected - RXRPC_SKB_MARK_LOCAL_ERROR Local error encountered - RXRPC_SKB_MARK_NEW_CALL New incoming call awaiting acceptance - - The remote abort message can be probed with rxrpc_kernel_get_abort_code(). - The two error messages can be probed with rxrpc_kernel_get_error_number(). - A new call can be accepted with rxrpc_kernel_accept_call(). - - Data messages can have their contents extracted with the usual bunch of - socket buffer manipulation functions. A data message can be determined to - be the last one in a sequence with rxrpc_kernel_is_data_last(). When a - data message has been used up, rxrpc_kernel_data_consumed() should be - called on it. - - Messages should be handled to rxrpc_kernel_free_skb() to dispose of. It - is possible to get extra refs on all types of message for later freeing, - but this may pin the state of a call until the message is finally freed. - - (*) Accept an incoming call. - - struct rxrpc_call * - rxrpc_kernel_accept_call(struct socket *sock, - unsigned long user_call_ID); - - This is used to accept an incoming call and to assign it a call ID. This - function is similar to rxrpc_kernel_begin_call() and calls accepted must - be ended in the same way. - - If this function is successful, an opaque reference to the RxRPC call is - returned. The caller now holds a reference on this and it must be - properly ended. - - (*) Reject an incoming call. - - int rxrpc_kernel_reject_call(struct socket *sock); - - This is used to reject the first incoming call on the socket's queue with - a BUSY message. -ENODATA is returned if there were no incoming calls. - Other errors may be returned if the call had been aborted (-ECONNABORTED) - or had timed out (-ETIME). - - (*) Allocate a null key for doing anonymous security. - - struct key *rxrpc_get_null_key(const char *keyname); - - This is used to allocate a null RxRPC key that can be used to indicate - anonymous security for a particular domain. - - (*) Get the peer address of a call. - - void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, - struct sockaddr_rxrpc *_srx); - - This is used to find the remote peer address of a call. - - (*) Set the total transmit data size on a call. - - void rxrpc_kernel_set_tx_length(struct socket *sock, - struct rxrpc_call *call, - s64 tx_total_len); - - This sets the amount of data that the caller is intending to transmit on a - call. It's intended to be used for setting the reply size as the request - size should be set when the call is begun. tx_total_len may not be less - than zero. - - (*) Get call RTT. - - u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call); - - Get the RTT time to the peer in use by a call. The value returned is in - nanoseconds. - - (*) Check call still alive. - - bool rxrpc_kernel_check_life(struct socket *sock, - struct rxrpc_call *call, - u32 *_life); - void rxrpc_kernel_probe_life(struct socket *sock, - struct rxrpc_call *call); - - The first function passes back in *_life a number that is updated when - ACKs are received from the peer (notably including PING RESPONSE ACKs - which we can elicit by sending PING ACKs to see if the call still exists - on the server). The caller should compare the numbers of two calls to see - if the call is still alive after waiting for a suitable interval. It also - returns true as long as the call hasn't yet reached the completed state. - - This allows the caller to work out if the server is still contactable and - if the call is still alive on the server while waiting for the server to - process a client operation. - - The second function causes a ping ACK to be transmitted to try to provoke - the peer into responding, which would then cause the value returned by the - first function to change. Note that this must be called in TASK_RUNNING - state. - - (*) Get reply timestamp. - - bool rxrpc_kernel_get_reply_time(struct socket *sock, - struct rxrpc_call *call, - ktime_t *_ts) - - This allows the timestamp on the first DATA packet of the reply of a - client call to be queried, provided that it is still in the Rx ring. If - successful, the timestamp will be stored into *_ts and true will be - returned; false will be returned otherwise. - - (*) Get remote client epoch. - - u32 rxrpc_kernel_get_epoch(struct socket *sock, - struct rxrpc_call *call) - - This allows the epoch that's contained in packets of an incoming client - call to be queried. This value is returned. The function always - successful if the call is still in progress. It shouldn't be called once - the call has expired. Note that calling this on a local client call only - returns the local epoch. - - This value can be used to determine if the remote client has been - restarted as it shouldn't change otherwise. - - (*) Set the maxmimum lifespan on a call. - - void rxrpc_kernel_set_max_life(struct socket *sock, - struct rxrpc_call *call, - unsigned long hard_timeout) - - This sets the maximum lifespan on a call to hard_timeout (which is in - jiffies). In the event of the timeout occurring, the call will be - aborted and -ETIME or -ETIMEDOUT will be returned. - - -======================= -CONFIGURABLE PARAMETERS -======================= - -The RxRPC protocol driver has a number of configurable parameters that can be -adjusted through sysctls in /proc/net/rxrpc/: - - (*) req_ack_delay - - The amount of time in milliseconds after receiving a packet with the - request-ack flag set before we honour the flag and actually send the - requested ack. - - Usually the other side won't stop sending packets until the advertised - reception window is full (to a maximum of 255 packets), so delaying the - ACK permits several packets to be ACK'd in one go. - - (*) soft_ack_delay - - The amount of time in milliseconds after receiving a new packet before we - generate a soft-ACK to tell the sender that it doesn't need to resend. - - (*) idle_ack_delay - - The amount of time in milliseconds after all the packets currently in the - received queue have been consumed before we generate a hard-ACK to tell - the sender it can free its buffers, assuming no other reason occurs that - we would send an ACK. - - (*) resend_timeout - - The amount of time in milliseconds after transmitting a packet before we - transmit it again, assuming no ACK is received from the receiver telling - us they got it. - - (*) max_call_lifetime - - The maximum amount of time in seconds that a call may be in progress - before we preemptively kill it. - - (*) dead_call_expiry - - The amount of time in seconds before we remove a dead call from the call - list. Dead calls are kept around for a little while for the purpose of - repeating ACK and ABORT packets. - - (*) connection_expiry - - The amount of time in seconds after a connection was last used before we - remove it from the connection list. While a connection is in existence, - it serves as a placeholder for negotiated security; when it is deleted, - the security must be renegotiated. - - (*) transport_expiry - - The amount of time in seconds after a transport was last used before we - remove it from the transport list. While a transport is in existence, it - serves to anchor the peer data and keeps the connection ID counter. - - (*) rxrpc_rx_window_size - - The size of the receive window in packets. This is the maximum number of - unconsumed received packets we're willing to hold in memory for any - particular call. - - (*) rxrpc_rx_mtu - - The maximum packet MTU size that we're willing to receive in bytes. This - indicates to the peer whether we're willing to accept jumbo packets. - - (*) rxrpc_rx_jumbo_max - - The maximum number of packets that we're willing to accept in a jumbo - packet. Non-terminal packets in a jumbo packet must contain a four byte - header plus exactly 1412 bytes of data. The terminal packet must contain - a four byte header plus any amount of data. In any event, a jumbo packet - may not exceed rxrpc_rx_mtu in size. diff --git a/MAINTAINERS b/MAINTAINERS index b28823ab48c5..866a0dcd66ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14593,7 +14593,7 @@ M: David Howells L: linux-afs@lists.infradead.org S: Supported W: https://www.infradead.org/~dhowells/kafs/ -F: Documentation/networking/rxrpc.txt +F: Documentation/networking/rxrpc.rst F: include/keys/rxrpc-type.h F: include/net/af_rxrpc.h F: include/trace/events/rxrpc.h diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 57ebb29c26ad..d706bb408365 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -18,7 +18,7 @@ config AF_RXRPC This module at the moment only supports client operations and is currently incomplete. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. config AF_RXRPC_IPV6 bool "IPv6 support for RxRPC" @@ -41,7 +41,7 @@ config AF_RXRPC_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. config RXKAD @@ -56,4 +56,4 @@ config RXKAD Provide kerberos 4 and AFS kaserver security handling for AF_RXRPC through the use of the key retention service. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 2bbb38161851..174e903e18de 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -21,7 +21,7 @@ static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; /* * RxRPC operating parameters. * - * See Documentation/networking/rxrpc.txt and the variable definitions for more + * See Documentation/networking/rxrpc.rst and the variable definitions for more * information on the individual parameters. */ static struct ctl_table rxrpc_sysctl_table[] = { -- cgit v1.2.3-59-g8ed1b From 671d114d8cde3ba4390714b850c86d8b39d31009 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:22 +0200 Subject: docs: networking: convert sctp.txt to ReST - add SPDX header; - add a document title; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/sctp.rst | 42 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/sctp.txt | 35 ------------------------------- MAINTAINERS | 2 +- 4 files changed, 44 insertions(+), 36 deletions(-) create mode 100644 Documentation/networking/sctp.rst delete mode 100644 Documentation/networking/sctp.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index cd307b9601fa..1761eb715061 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -100,6 +100,7 @@ Contents: rds regulatory rxrpc + sctp .. only:: subproject and html diff --git a/Documentation/networking/sctp.rst b/Documentation/networking/sctp.rst new file mode 100644 index 000000000000..9f4d9c8a925b --- /dev/null +++ b/Documentation/networking/sctp.rst @@ -0,0 +1,42 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +Linux Kernel SCTP +================= + +This is the current BETA release of the Linux Kernel SCTP reference +implementation. + +SCTP (Stream Control Transmission Protocol) is a IP based, message oriented, +reliable transport protocol, with congestion control, support for +transparent multi-homing, and multiple ordered streams of messages. +RFC2960 defines the core protocol. The IETF SIGTRAN working group originally +developed the SCTP protocol and later handed the protocol over to the +Transport Area (TSVWG) working group for the continued evolvement of SCTP as a +general purpose transport. + +See the IETF website (http://www.ietf.org) for further documents on SCTP. +See http://www.ietf.org/rfc/rfc2960.txt + +The initial project goal is to create an Linux kernel reference implementation +of SCTP that is RFC 2960 compliant and provides an programming interface +referred to as the UDP-style API of the Sockets Extensions for SCTP, as +proposed in IETF Internet-Drafts. + +Caveats +======= + +- lksctp can be built as statically or as a module. However, be aware that + module removal of lksctp is not yet a safe activity. + +- There is tentative support for IPv6, but most work has gone towards + implementation and testing lksctp on IPv4. + + +For more information, please visit the lksctp project website: + + http://www.sf.net/projects/lksctp + +Or contact the lksctp developers through the mailing list: + + diff --git a/Documentation/networking/sctp.txt b/Documentation/networking/sctp.txt deleted file mode 100644 index 97b810ca9082..000000000000 --- a/Documentation/networking/sctp.txt +++ /dev/null @@ -1,35 +0,0 @@ -Linux Kernel SCTP - -This is the current BETA release of the Linux Kernel SCTP reference -implementation. - -SCTP (Stream Control Transmission Protocol) is a IP based, message oriented, -reliable transport protocol, with congestion control, support for -transparent multi-homing, and multiple ordered streams of messages. -RFC2960 defines the core protocol. The IETF SIGTRAN working group originally -developed the SCTP protocol and later handed the protocol over to the -Transport Area (TSVWG) working group for the continued evolvement of SCTP as a -general purpose transport. - -See the IETF website (http://www.ietf.org) for further documents on SCTP. -See http://www.ietf.org/rfc/rfc2960.txt - -The initial project goal is to create an Linux kernel reference implementation -of SCTP that is RFC 2960 compliant and provides an programming interface -referred to as the UDP-style API of the Sockets Extensions for SCTP, as -proposed in IETF Internet-Drafts. - -Caveats: - --lksctp can be built as statically or as a module. However, be aware that -module removal of lksctp is not yet a safe activity. - --There is tentative support for IPv6, but most work has gone towards -implementation and testing lksctp on IPv4. - - -For more information, please visit the lksctp project website: - http://www.sf.net/projects/lksctp - -Or contact the lksctp developers through the mailing list: - diff --git a/MAINTAINERS b/MAINTAINERS index 866a0dcd66ef..0ac9cec0bce6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14999,7 +14999,7 @@ M: Marcelo Ricardo Leitner L: linux-sctp@vger.kernel.org S: Maintained W: http://lksctp.sourceforge.net -F: Documentation/networking/sctp.txt +F: Documentation/networking/sctp.rst F: include/linux/sctp.h F: include/net/sctp/ F: include/uapi/linux/sctp.h -- cgit v1.2.3-59-g8ed1b From de1fd4a7b0f2351b775b673f092430dff64b221e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:23 +0200 Subject: docs: networking: convert secid.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/secid.rst | 20 ++++++++++++++++++++ Documentation/networking/secid.txt | 14 -------------- 3 files changed, 21 insertions(+), 14 deletions(-) create mode 100644 Documentation/networking/secid.rst delete mode 100644 Documentation/networking/secid.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 1761eb715061..8b672f252f67 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -101,6 +101,7 @@ Contents: regulatory rxrpc sctp + secid .. only:: subproject and html diff --git a/Documentation/networking/secid.rst b/Documentation/networking/secid.rst new file mode 100644 index 000000000000..b45141a98027 --- /dev/null +++ b/Documentation/networking/secid.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +LSM/SeLinux secid +================= + +flowi structure: + +The secid member in the flow structure is used in LSMs (e.g. SELinux) to indicate +the label of the flow. This label of the flow is currently used in selecting +matching labeled xfrm(s). + +If this is an outbound flow, the label is derived from the socket, if any, or +the incoming packet this flow is being generated as a response to (e.g. tcp +resets, timewait ack, etc.). It is also conceivable that the label could be +derived from other sources such as process context, device, etc., in special +cases, as may be appropriate. + +If this is an inbound flow, the label is derived from the IPSec security +associations, if any, used by the packet. diff --git a/Documentation/networking/secid.txt b/Documentation/networking/secid.txt deleted file mode 100644 index 95ea06784333..000000000000 --- a/Documentation/networking/secid.txt +++ /dev/null @@ -1,14 +0,0 @@ -flowi structure: - -The secid member in the flow structure is used in LSMs (e.g. SELinux) to indicate -the label of the flow. This label of the flow is currently used in selecting -matching labeled xfrm(s). - -If this is an outbound flow, the label is derived from the socket, if any, or -the incoming packet this flow is being generated as a response to (e.g. tcp -resets, timewait ack, etc.). It is also conceivable that the label could be -derived from other sources such as process context, device, etc., in special -cases, as may be appropriate. - -If this is an inbound flow, the label is derived from the IPSec security -associations, if any, used by the packet. -- cgit v1.2.3-59-g8ed1b From d6c48bc6f8da38590946d23a27138d5258aca261 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:24 +0200 Subject: docs: networking: convert seg6-sysctl.txt to ReST - add SPDX header; - mark code blocks and literals as such; - add a document title; - adjust chapters, adding proper markups; - mark lists as such; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/seg6-sysctl.rst | 26 ++++++++++++++++++++++++++ Documentation/networking/seg6-sysctl.txt | 18 ------------------ 3 files changed, 27 insertions(+), 18 deletions(-) create mode 100644 Documentation/networking/seg6-sysctl.rst delete mode 100644 Documentation/networking/seg6-sysctl.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 8b672f252f67..716744c568b7 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -102,6 +102,7 @@ Contents: rxrpc sctp secid + seg6-sysctl .. only:: subproject and html diff --git a/Documentation/networking/seg6-sysctl.rst b/Documentation/networking/seg6-sysctl.rst new file mode 100644 index 000000000000..ec73e1445030 --- /dev/null +++ b/Documentation/networking/seg6-sysctl.rst @@ -0,0 +1,26 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Seg6 Sysfs variables +==================== + + +/proc/sys/net/conf//seg6_* variables: +============================================ + +seg6_enabled - BOOL + Accept or drop SR-enabled IPv6 packets on this interface. + + Relevant packets are those with SRH present and DA = local. + + * 0 - disabled (default) + * not 0 - enabled + +seg6_require_hmac - INTEGER + Define HMAC policy for ingress SR-enabled packets on this interface. + + * -1 - Ignore HMAC field + * 0 - Accept SR packets without HMAC, validate SR packets with HMAC + * 1 - Drop SR packets without HMAC, validate SR packets with HMAC + + Default is 0. diff --git a/Documentation/networking/seg6-sysctl.txt b/Documentation/networking/seg6-sysctl.txt deleted file mode 100644 index bdbde23b19cb..000000000000 --- a/Documentation/networking/seg6-sysctl.txt +++ /dev/null @@ -1,18 +0,0 @@ -/proc/sys/net/conf//seg6_* variables: - -seg6_enabled - BOOL - Accept or drop SR-enabled IPv6 packets on this interface. - - Relevant packets are those with SRH present and DA = local. - - 0 - disabled (default) - not 0 - enabled - -seg6_require_hmac - INTEGER - Define HMAC policy for ingress SR-enabled packets on this interface. - - -1 - Ignore HMAC field - 0 - Accept SR packets without HMAC, validate SR packets with HMAC - 1 - Drop SR packets without HMAC, validate SR packets with HMAC - - Default is 0. -- cgit v1.2.3-59-g8ed1b From fe3dfe418cbbd1ee168d9294725d482029097694 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:25 +0200 Subject: docs: networking: convert skfp.txt to ReST - add SPDX header; - use copyright symbol; - add a document title; - adjust titles and chapters, adding proper markups; - comment out text-only TOC from html/pdf output; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/skfp.rst | 253 +++++++++++++++++++++++++++++++++++++ Documentation/networking/skfp.txt | 220 -------------------------------- drivers/net/fddi/Kconfig | 2 +- 4 files changed, 255 insertions(+), 221 deletions(-) create mode 100644 Documentation/networking/skfp.rst delete mode 100644 Documentation/networking/skfp.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 716744c568b7..d19ddcbe66e5 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -103,6 +103,7 @@ Contents: sctp secid seg6-sysctl + skfp .. only:: subproject and html diff --git a/Documentation/networking/skfp.rst b/Documentation/networking/skfp.rst new file mode 100644 index 000000000000..58f548105c1d --- /dev/null +++ b/Documentation/networking/skfp.rst @@ -0,0 +1,253 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: + +======================== +SysKonnect driver - SKFP +======================== + +|copy| Copyright 1998-2000 SysKonnect, + +skfp.txt created 11-May-2000 + +Readme File for skfp.o v2.06 + + +.. This file contains + + (1) OVERVIEW + (2) SUPPORTED ADAPTERS + (3) GENERAL INFORMATION + (4) INSTALLATION + (5) INCLUSION OF THE ADAPTER IN SYSTEM START + (6) TROUBLESHOOTING + (7) FUNCTION OF THE ADAPTER LEDS + (8) HISTORY + + +1. Overview +=========== + +This README explains how to use the driver 'skfp' for Linux with your +network adapter. + +Chapter 2: Contains a list of all network adapters that are supported by +this driver. + +Chapter 3: + Gives some general information. + +Chapter 4: Describes common problems and solutions. + +Chapter 5: Shows the changed functionality of the adapter LEDs. + +Chapter 6: History of development. + + +2. Supported adapters +===================== + +The network driver 'skfp' supports the following network adapters: +SysKonnect adapters: + + - SK-5521 (SK-NET FDDI-UP) + - SK-5522 (SK-NET FDDI-UP DAS) + - SK-5541 (SK-NET FDDI-FP) + - SK-5543 (SK-NET FDDI-LP) + - SK-5544 (SK-NET FDDI-LP DAS) + - SK-5821 (SK-NET FDDI-UP64) + - SK-5822 (SK-NET FDDI-UP64 DAS) + - SK-5841 (SK-NET FDDI-FP64) + - SK-5843 (SK-NET FDDI-LP64) + - SK-5844 (SK-NET FDDI-LP64 DAS) + +Compaq adapters (not tested): + + - Netelligent 100 FDDI DAS Fibre SC + - Netelligent 100 FDDI SAS Fibre SC + - Netelligent 100 FDDI DAS UTP + - Netelligent 100 FDDI SAS UTP + - Netelligent 100 FDDI SAS Fibre MIC + + +3. General Information +====================== + +From v2.01 on, the driver is integrated in the linux kernel sources. +Therefore, the installation is the same as for any other adapter +supported by the kernel. + +Refer to the manual of your distribution about the installation +of network adapters. + +Makes my life much easier :-) + +4. Troubleshooting +================== + +If you run into problems during installation, check those items: + +Problem: + The FDDI adapter cannot be found by the driver. + +Reason: + Look in /proc/pci for the following entry: + + 'FDDI network controller: SysKonnect SK-FDDI-PCI ...' + + If this entry exists, then the FDDI adapter has been + found by the system and should be able to be used. + + If this entry does not exist or if the file '/proc/pci' + is not there, then you may have a hardware problem or PCI + support may not be enabled in your kernel. + + The adapter can be checked using the diagnostic program + which is available from the SysKonnect web site: + + www.syskonnect.de + + Some COMPAQ machines have a problem with PCI under + Linux. This is described in the 'PCI howto' document + (included in some distributions or available from the + www, e.g. at 'www.linux.org') and no workaround is available. + +Problem: + You want to use your computer as a router between + multiple IP subnetworks (using multiple adapters), but + you cannot reach computers in other subnetworks. + +Reason: + Either the router's kernel is not configured for IP + forwarding or there is a problem with the routing table + and gateway configuration in at least one of the + computers. + +If your problem is not listed here, please contact our +technical support for help. + +You can send email to: linux@syskonnect.de + +When contacting our technical support, +please ensure that the following information is available: + +- System Manufacturer and Model +- Boards in your system +- Distribution +- Kernel version + + +5. Function of the Adapter LEDs +=============================== + + The functionality of the LED's on the FDDI network adapters was + changed in SMT version v2.82. With this new SMT version, the yellow + LED works as a ring operational indicator. An active yellow LED + indicates that the ring is down. The green LED on the adapter now + works as a link indicator where an active GREEN LED indicates that + the respective port has a physical connection. + + With versions of SMT prior to v2.82 a ring up was indicated if the + yellow LED was off while the green LED(s) showed the connection + status of the adapter. During a ring down the green LED was off and + the yellow LED was on. + + All implementations indicate that a driver is not loaded if + all LEDs are off. + + +6. History +========== + +v2.06 (20000511) (In-Kernel version) + New features: + + - 64 bit support + - new pci dma interface + - in kernel 2.3.99 + +v2.05 (20000217) (In-Kernel version) + New features: + + - Changes for 2.3.45 kernel + +v2.04 (20000207) (Standalone version) + New features: + + - Added rx/tx byte counter + +v2.03 (20000111) (Standalone version) + Problems fixed: + + - Fixed printk statements from v2.02 + +v2.02 (991215) (Standalone version) + Problems fixed: + + - Removed unnecessary output + - Fixed path for "printver.sh" in makefile + +v2.01 (991122) (In-Kernel version) + New features: + + - Integration in Linux kernel sources + - Support for memory mapped I/O. + +v2.00 (991112) + New features: + + - Full source released under GPL + +v1.05 (991023) + Problems fixed: + + - Compilation with kernel version 2.2.13 failed + +v1.04 (990427) + Changes: + + - New SMT module included, changing LED functionality + + Problems fixed: + + - Synchronization on SMP machines was buggy + +v1.03 (990325) + Problems fixed: + + - Interrupt routing on SMP machines could be incorrect + +v1.02 (990310) + New features: + + - Support for kernel versions 2.2.x added + - Kernel patch instead of private duplicate of kernel functions + +v1.01 (980812) + Problems fixed: + + Connection hangup with telnet + Slow telnet connection + +v1.00 beta 01 (980507) + New features: + + None. + + Problems fixed: + + None. + + Known limitations: + + - tar archive instead of standard package format (rpm). + - FDDI statistic is empty. + - not tested with 2.1.xx kernels + - integration in kernel not tested + - not tested simultaneously with FDDI adapters from other vendors. + - only X86 processors supported. + - SBA (Synchronous Bandwidth Allocator) parameters can + not be configured. + - does not work on some COMPAQ machines. See the PCI howto + document for details about this problem. + - data corruption with kernel versions below 2.0.33. diff --git a/Documentation/networking/skfp.txt b/Documentation/networking/skfp.txt deleted file mode 100644 index 203ec66c9fb4..000000000000 --- a/Documentation/networking/skfp.txt +++ /dev/null @@ -1,220 +0,0 @@ -(C)Copyright 1998-2000 SysKonnect, -=========================================================================== - -skfp.txt created 11-May-2000 - -Readme File for skfp.o v2.06 - - -This file contains -(1) OVERVIEW -(2) SUPPORTED ADAPTERS -(3) GENERAL INFORMATION -(4) INSTALLATION -(5) INCLUSION OF THE ADAPTER IN SYSTEM START -(6) TROUBLESHOOTING -(7) FUNCTION OF THE ADAPTER LEDS -(8) HISTORY - -=========================================================================== - - - -(1) OVERVIEW -============ - -This README explains how to use the driver 'skfp' for Linux with your -network adapter. - -Chapter 2: Contains a list of all network adapters that are supported by - this driver. - -Chapter 3: Gives some general information. - -Chapter 4: Describes common problems and solutions. - -Chapter 5: Shows the changed functionality of the adapter LEDs. - -Chapter 6: History of development. - -*** - - -(2) SUPPORTED ADAPTERS -====================== - -The network driver 'skfp' supports the following network adapters: -SysKonnect adapters: - - SK-5521 (SK-NET FDDI-UP) - - SK-5522 (SK-NET FDDI-UP DAS) - - SK-5541 (SK-NET FDDI-FP) - - SK-5543 (SK-NET FDDI-LP) - - SK-5544 (SK-NET FDDI-LP DAS) - - SK-5821 (SK-NET FDDI-UP64) - - SK-5822 (SK-NET FDDI-UP64 DAS) - - SK-5841 (SK-NET FDDI-FP64) - - SK-5843 (SK-NET FDDI-LP64) - - SK-5844 (SK-NET FDDI-LP64 DAS) -Compaq adapters (not tested): - - Netelligent 100 FDDI DAS Fibre SC - - Netelligent 100 FDDI SAS Fibre SC - - Netelligent 100 FDDI DAS UTP - - Netelligent 100 FDDI SAS UTP - - Netelligent 100 FDDI SAS Fibre MIC -*** - - -(3) GENERAL INFORMATION -======================= - -From v2.01 on, the driver is integrated in the linux kernel sources. -Therefore, the installation is the same as for any other adapter -supported by the kernel. -Refer to the manual of your distribution about the installation -of network adapters. -Makes my life much easier :-) -*** - - -(4) TROUBLESHOOTING -=================== - -If you run into problems during installation, check those items: - -Problem: The FDDI adapter cannot be found by the driver. -Reason: Look in /proc/pci for the following entry: - 'FDDI network controller: SysKonnect SK-FDDI-PCI ...' - If this entry exists, then the FDDI adapter has been - found by the system and should be able to be used. - If this entry does not exist or if the file '/proc/pci' - is not there, then you may have a hardware problem or PCI - support may not be enabled in your kernel. - The adapter can be checked using the diagnostic program - which is available from the SysKonnect web site: - www.syskonnect.de - Some COMPAQ machines have a problem with PCI under - Linux. This is described in the 'PCI howto' document - (included in some distributions or available from the - www, e.g. at 'www.linux.org') and no workaround is available. - -Problem: You want to use your computer as a router between - multiple IP subnetworks (using multiple adapters), but - you cannot reach computers in other subnetworks. -Reason: Either the router's kernel is not configured for IP - forwarding or there is a problem with the routing table - and gateway configuration in at least one of the - computers. - -If your problem is not listed here, please contact our -technical support for help. -You can send email to: - linux@syskonnect.de -When contacting our technical support, -please ensure that the following information is available: -- System Manufacturer and Model -- Boards in your system -- Distribution -- Kernel version - -*** - - -(5) FUNCTION OF THE ADAPTER LEDS -================================ - - The functionality of the LED's on the FDDI network adapters was - changed in SMT version v2.82. With this new SMT version, the yellow - LED works as a ring operational indicator. An active yellow LED - indicates that the ring is down. The green LED on the adapter now - works as a link indicator where an active GREEN LED indicates that - the respective port has a physical connection. - - With versions of SMT prior to v2.82 a ring up was indicated if the - yellow LED was off while the green LED(s) showed the connection - status of the adapter. During a ring down the green LED was off and - the yellow LED was on. - - All implementations indicate that a driver is not loaded if - all LEDs are off. - -*** - - -(6) HISTORY -=========== - -v2.06 (20000511) (In-Kernel version) - New features: - - 64 bit support - - new pci dma interface - - in kernel 2.3.99 - -v2.05 (20000217) (In-Kernel version) - New features: - - Changes for 2.3.45 kernel - -v2.04 (20000207) (Standalone version) - New features: - - Added rx/tx byte counter - -v2.03 (20000111) (Standalone version) - Problems fixed: - - Fixed printk statements from v2.02 - -v2.02 (991215) (Standalone version) - Problems fixed: - - Removed unnecessary output - - Fixed path for "printver.sh" in makefile - -v2.01 (991122) (In-Kernel version) - New features: - - Integration in Linux kernel sources - - Support for memory mapped I/O. - -v2.00 (991112) - New features: - - Full source released under GPL - -v1.05 (991023) - Problems fixed: - - Compilation with kernel version 2.2.13 failed - -v1.04 (990427) - Changes: - - New SMT module included, changing LED functionality - Problems fixed: - - Synchronization on SMP machines was buggy - -v1.03 (990325) - Problems fixed: - - Interrupt routing on SMP machines could be incorrect - -v1.02 (990310) - New features: - - Support for kernel versions 2.2.x added - - Kernel patch instead of private duplicate of kernel functions - -v1.01 (980812) - Problems fixed: - Connection hangup with telnet - Slow telnet connection - -v1.00 beta 01 (980507) - New features: - None. - Problems fixed: - None. - Known limitations: - - tar archive instead of standard package format (rpm). - - FDDI statistic is empty. - - not tested with 2.1.xx kernels - - integration in kernel not tested - - not tested simultaneously with FDDI adapters from other vendors. - - only X86 processors supported. - - SBA (Synchronous Bandwidth Allocator) parameters can - not be configured. - - does not work on some COMPAQ machines. See the PCI howto - document for details about this problem. - - data corruption with kernel versions below 2.0.33. - -*** End of information file *** diff --git a/drivers/net/fddi/Kconfig b/drivers/net/fddi/Kconfig index 3b412a56f2cb..da4f58eed08f 100644 --- a/drivers/net/fddi/Kconfig +++ b/drivers/net/fddi/Kconfig @@ -77,7 +77,7 @@ config SKFP - Netelligent 100 FDDI SAS UTP - Netelligent 100 FDDI SAS Fibre MIC - Read for information about + Read for information about the driver. Questions concerning this driver can be addressed to: -- cgit v1.2.3-59-g8ed1b From 060d9d3e1282e3ccf4bbb3cc4ea94bed3ce69310 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:26 +0200 Subject: docs: networking: convert strparser.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/strparser.rst | 240 +++++++++++++++++++++++++++++++++ Documentation/networking/strparser.txt | 207 ---------------------------- 3 files changed, 241 insertions(+), 207 deletions(-) create mode 100644 Documentation/networking/strparser.rst delete mode 100644 Documentation/networking/strparser.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index d19ddcbe66e5..e5a705024c6a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -104,6 +104,7 @@ Contents: secid seg6-sysctl skfp + strparser .. only:: subproject and html diff --git a/Documentation/networking/strparser.rst b/Documentation/networking/strparser.rst new file mode 100644 index 000000000000..6cab1f74ae05 --- /dev/null +++ b/Documentation/networking/strparser.rst @@ -0,0 +1,240 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +Stream Parser (strparser) +========================= + +Introduction +============ + +The stream parser (strparser) is a utility that parses messages of an +application layer protocol running over a data stream. The stream +parser works in conjunction with an upper layer in the kernel to provide +kernel support for application layer messages. For instance, Kernel +Connection Multiplexor (KCM) uses the Stream Parser to parse messages +using a BPF program. + +The strparser works in one of two modes: receive callback or general +mode. + +In receive callback mode, the strparser is called from the data_ready +callback of a TCP socket. Messages are parsed and delivered as they are +received on the socket. + +In general mode, a sequence of skbs are fed to strparser from an +outside source. Message are parsed and delivered as the sequence is +processed. This modes allows strparser to be applied to arbitrary +streams of data. + +Interface +========= + +The API includes a context structure, a set of callbacks, utility +functions, and a data_ready function for receive callback mode. The +callbacks include a parse_msg function that is called to perform +parsing (e.g. BPF parsing in case of KCM), and a rcv_msg function +that is called when a full message has been completed. + +Functions +========= + + :: + + strp_init(struct strparser *strp, struct sock *sk, + const struct strp_callbacks *cb) + + Called to initialize a stream parser. strp is a struct of type + strparser that is allocated by the upper layer. sk is the TCP + socket associated with the stream parser for use with receive + callback mode; in general mode this is set to NULL. Callbacks + are called by the stream parser (the callbacks are listed below). + + :: + + void strp_pause(struct strparser *strp) + + Temporarily pause a stream parser. Message parsing is suspended + and no new messages are delivered to the upper layer. + + :: + + void strp_unpause(struct strparser *strp) + + Unpause a paused stream parser. + + :: + + void strp_stop(struct strparser *strp); + + strp_stop is called to completely stop stream parser operations. + This is called internally when the stream parser encounters an + error, and it is called from the upper layer to stop parsing + operations. + + :: + + void strp_done(struct strparser *strp); + + strp_done is called to release any resources held by the stream + parser instance. This must be called after the stream processor + has been stopped. + + :: + + int strp_process(struct strparser *strp, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len, + size_t max_msg_size, long timeo) + + strp_process is called in general mode for a stream parser to + parse an sk_buff. The number of bytes processed or a negative + error number is returned. Note that strp_process does not + consume the sk_buff. max_msg_size is maximum size the stream + parser will parse. timeo is timeout for completing a message. + + :: + + void strp_data_ready(struct strparser *strp); + + The upper layer calls strp_tcp_data_ready when data is ready on + the lower socket for strparser to process. This should be called + from a data_ready callback that is set on the socket. Note that + maximum messages size is the limit of the receive socket + buffer and message timeout is the receive timeout for the socket. + + :: + + void strp_check_rcv(struct strparser *strp); + + strp_check_rcv is called to check for new messages on the socket. + This is normally called at initialization of a stream parser + instance or after strp_unpause. + +Callbacks +========= + +There are six callbacks: + + :: + + int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); + + parse_msg is called to determine the length of the next message + in the stream. The upper layer must implement this function. It + should parse the sk_buff as containing the headers for the + next application layer message in the stream. + + The skb->cb in the input skb is a struct strp_msg. Only + the offset field is relevant in parse_msg and gives the offset + where the message starts in the skb. + + The return values of this function are: + + ========= =========================================================== + >0 indicates length of successfully parsed message + 0 indicates more data must be received to parse the message + -ESTRPIPE current message should not be processed by the + kernel, return control of the socket to userspace which + can proceed to read the messages itself + other < 0 Error in parsing, give control back to userspace + assuming that synchronization is lost and the stream + is unrecoverable (application expected to close TCP socket) + ========= =========================================================== + + In the case that an error is returned (return value is less than + zero) and the parser is in receive callback mode, then it will set + the error on TCP socket and wake it up. If parse_msg returned + -ESTRPIPE and the stream parser had previously read some bytes for + the current message, then the error set on the attached socket is + ENODATA since the stream is unrecoverable in that case. + + :: + + void (*lock)(struct strparser *strp) + + The lock callback is called to lock the strp structure when + the strparser is performing an asynchronous operation (such as + processing a timeout). In receive callback mode the default + function is to lock_sock for the associated socket. In general + mode the callback must be set appropriately. + + :: + + void (*unlock)(struct strparser *strp) + + The unlock callback is called to release the lock obtained + by the lock callback. In receive callback mode the default + function is release_sock for the associated socket. In general + mode the callback must be set appropriately. + + :: + + void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); + + rcv_msg is called when a full message has been received and + is queued. The callee must consume the sk_buff; it can + call strp_pause to prevent any further messages from being + received in rcv_msg (see strp_pause above). This callback + must be set. + + The skb->cb in the input skb is a struct strp_msg. This + struct contains two fields: offset and full_len. Offset is + where the message starts in the skb, and full_len is the + the length of the message. skb->len - offset may be greater + then full_len since strparser does not trim the skb. + + :: + + int (*read_sock_done)(struct strparser *strp, int err); + + read_sock_done is called when the stream parser is done reading + the TCP socket in receive callback mode. The stream parser may + read multiple messages in a loop and this function allows cleanup + to occur when exiting the loop. If the callback is not set (NULL + in strp_init) a default function is used. + + :: + + void (*abort_parser)(struct strparser *strp, int err); + + This function is called when stream parser encounters an error + in parsing. The default function stops the stream parser and + sets the error in the socket if the parser is in receive callback + mode. The default function can be changed by setting the callback + to non-NULL in strp_init. + +Statistics +========== + +Various counters are kept for each stream parser instance. These are in +the strp_stats structure. strp_aggr_stats is a convenience structure for +accumulating statistics for multiple stream parser instances. +save_strp_stats and aggregate_strp_stats are helper functions to save +and aggregate statistics. + +Message assembly limits +======================= + +The stream parser provide mechanisms to limit the resources consumed by +message assembly. + +A timer is set when assembly starts for a new message. In receive +callback mode the message timeout is taken from rcvtime for the +associated TCP socket. In general mode, the timeout is passed as an +argument in strp_process. If the timer fires before assembly completes +the stream parser is aborted and the ETIMEDOUT error is set on the TCP +socket if in receive callback mode. + +In receive callback mode, message length is limited to the receive +buffer size of the associated TCP socket. If the length returned by +parse_msg is greater than the socket buffer size then the stream parser +is aborted with EMSGSIZE error set on the TCP socket. Note that this +makes the maximum size of receive skbuffs for a socket with a stream +parser to be 2*sk_rcvbuf of the TCP socket. + +In general mode the message length limit is passed in as an argument +to strp_process. + +Author +====== + +Tom Herbert (tom@quantonium.net) diff --git a/Documentation/networking/strparser.txt b/Documentation/networking/strparser.txt deleted file mode 100644 index a7d354ddda7b..000000000000 --- a/Documentation/networking/strparser.txt +++ /dev/null @@ -1,207 +0,0 @@ -Stream Parser (strparser) - -Introduction -============ - -The stream parser (strparser) is a utility that parses messages of an -application layer protocol running over a data stream. The stream -parser works in conjunction with an upper layer in the kernel to provide -kernel support for application layer messages. For instance, Kernel -Connection Multiplexor (KCM) uses the Stream Parser to parse messages -using a BPF program. - -The strparser works in one of two modes: receive callback or general -mode. - -In receive callback mode, the strparser is called from the data_ready -callback of a TCP socket. Messages are parsed and delivered as they are -received on the socket. - -In general mode, a sequence of skbs are fed to strparser from an -outside source. Message are parsed and delivered as the sequence is -processed. This modes allows strparser to be applied to arbitrary -streams of data. - -Interface -========= - -The API includes a context structure, a set of callbacks, utility -functions, and a data_ready function for receive callback mode. The -callbacks include a parse_msg function that is called to perform -parsing (e.g. BPF parsing in case of KCM), and a rcv_msg function -that is called when a full message has been completed. - -Functions -========= - -strp_init(struct strparser *strp, struct sock *sk, - const struct strp_callbacks *cb) - - Called to initialize a stream parser. strp is a struct of type - strparser that is allocated by the upper layer. sk is the TCP - socket associated with the stream parser for use with receive - callback mode; in general mode this is set to NULL. Callbacks - are called by the stream parser (the callbacks are listed below). - -void strp_pause(struct strparser *strp) - - Temporarily pause a stream parser. Message parsing is suspended - and no new messages are delivered to the upper layer. - -void strp_unpause(struct strparser *strp) - - Unpause a paused stream parser. - -void strp_stop(struct strparser *strp); - - strp_stop is called to completely stop stream parser operations. - This is called internally when the stream parser encounters an - error, and it is called from the upper layer to stop parsing - operations. - -void strp_done(struct strparser *strp); - - strp_done is called to release any resources held by the stream - parser instance. This must be called after the stream processor - has been stopped. - -int strp_process(struct strparser *strp, struct sk_buff *orig_skb, - unsigned int orig_offset, size_t orig_len, - size_t max_msg_size, long timeo) - - strp_process is called in general mode for a stream parser to - parse an sk_buff. The number of bytes processed or a negative - error number is returned. Note that strp_process does not - consume the sk_buff. max_msg_size is maximum size the stream - parser will parse. timeo is timeout for completing a message. - -void strp_data_ready(struct strparser *strp); - - The upper layer calls strp_tcp_data_ready when data is ready on - the lower socket for strparser to process. This should be called - from a data_ready callback that is set on the socket. Note that - maximum messages size is the limit of the receive socket - buffer and message timeout is the receive timeout for the socket. - -void strp_check_rcv(struct strparser *strp); - - strp_check_rcv is called to check for new messages on the socket. - This is normally called at initialization of a stream parser - instance or after strp_unpause. - -Callbacks -========= - -There are six callbacks: - -int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); - - parse_msg is called to determine the length of the next message - in the stream. The upper layer must implement this function. It - should parse the sk_buff as containing the headers for the - next application layer message in the stream. - - The skb->cb in the input skb is a struct strp_msg. Only - the offset field is relevant in parse_msg and gives the offset - where the message starts in the skb. - - The return values of this function are: - - >0 : indicates length of successfully parsed message - 0 : indicates more data must be received to parse the message - -ESTRPIPE : current message should not be processed by the - kernel, return control of the socket to userspace which - can proceed to read the messages itself - other < 0 : Error in parsing, give control back to userspace - assuming that synchronization is lost and the stream - is unrecoverable (application expected to close TCP socket) - - In the case that an error is returned (return value is less than - zero) and the parser is in receive callback mode, then it will set - the error on TCP socket and wake it up. If parse_msg returned - -ESTRPIPE and the stream parser had previously read some bytes for - the current message, then the error set on the attached socket is - ENODATA since the stream is unrecoverable in that case. - -void (*lock)(struct strparser *strp) - - The lock callback is called to lock the strp structure when - the strparser is performing an asynchronous operation (such as - processing a timeout). In receive callback mode the default - function is to lock_sock for the associated socket. In general - mode the callback must be set appropriately. - -void (*unlock)(struct strparser *strp) - - The unlock callback is called to release the lock obtained - by the lock callback. In receive callback mode the default - function is release_sock for the associated socket. In general - mode the callback must be set appropriately. - -void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); - - rcv_msg is called when a full message has been received and - is queued. The callee must consume the sk_buff; it can - call strp_pause to prevent any further messages from being - received in rcv_msg (see strp_pause above). This callback - must be set. - - The skb->cb in the input skb is a struct strp_msg. This - struct contains two fields: offset and full_len. Offset is - where the message starts in the skb, and full_len is the - the length of the message. skb->len - offset may be greater - then full_len since strparser does not trim the skb. - -int (*read_sock_done)(struct strparser *strp, int err); - - read_sock_done is called when the stream parser is done reading - the TCP socket in receive callback mode. The stream parser may - read multiple messages in a loop and this function allows cleanup - to occur when exiting the loop. If the callback is not set (NULL - in strp_init) a default function is used. - -void (*abort_parser)(struct strparser *strp, int err); - - This function is called when stream parser encounters an error - in parsing. The default function stops the stream parser and - sets the error in the socket if the parser is in receive callback - mode. The default function can be changed by setting the callback - to non-NULL in strp_init. - -Statistics -========== - -Various counters are kept for each stream parser instance. These are in -the strp_stats structure. strp_aggr_stats is a convenience structure for -accumulating statistics for multiple stream parser instances. -save_strp_stats and aggregate_strp_stats are helper functions to save -and aggregate statistics. - -Message assembly limits -======================= - -The stream parser provide mechanisms to limit the resources consumed by -message assembly. - -A timer is set when assembly starts for a new message. In receive -callback mode the message timeout is taken from rcvtime for the -associated TCP socket. In general mode, the timeout is passed as an -argument in strp_process. If the timer fires before assembly completes -the stream parser is aborted and the ETIMEDOUT error is set on the TCP -socket if in receive callback mode. - -In receive callback mode, message length is limited to the receive -buffer size of the associated TCP socket. If the length returned by -parse_msg is greater than the socket buffer size then the stream parser -is aborted with EMSGSIZE error set on the TCP socket. Note that this -makes the maximum size of receive skbuffs for a socket with a stream -parser to be 2*sk_rcvbuf of the TCP socket. - -In general mode the message length limit is passed in as an argument -to strp_process. - -Author -====== - -Tom Herbert (tom@quantonium.net) - -- cgit v1.2.3-59-g8ed1b From 32c0f0bed5bb08625083ed7f5b661c842d63ebd1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:27 +0200 Subject: docs: networking: convert switchdev.txt to ReST - add SPDX header; - use copyright symbol; - adjust title markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/switchdev.rst | 387 +++++++++++++++++++++++++++++++++ Documentation/networking/switchdev.txt | 373 ------------------------------- drivers/staging/fsl-dpaa2/ethsw/README | 2 +- 4 files changed, 389 insertions(+), 374 deletions(-) create mode 100644 Documentation/networking/switchdev.rst delete mode 100644 Documentation/networking/switchdev.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e5a705024c6a..5e495804f96f 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -105,6 +105,7 @@ Contents: seg6-sysctl skfp strparser + switchdev .. only:: subproject and html diff --git a/Documentation/networking/switchdev.rst b/Documentation/networking/switchdev.rst new file mode 100644 index 000000000000..ddc3f35775dc --- /dev/null +++ b/Documentation/networking/switchdev.rst @@ -0,0 +1,387 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=============================================== +Ethernet switch device driver model (switchdev) +=============================================== + +Copyright |copy| 2014 Jiri Pirko + +Copyright |copy| 2014-2015 Scott Feldman + + +The Ethernet switch device driver model (switchdev) is an in-kernel driver +model for switch devices which offload the forwarding (data) plane from the +kernel. + +Figure 1 is a block diagram showing the components of the switchdev model for +an example setup using a data-center-class switch ASIC chip. Other setups +with SR-IOV or soft switches, such as OVS, are possible. + +:: + + + User-space tools + + user space | + +-------------------------------------------------------------------+ + kernel | Netlink + | + +--------------+-------------------------------+ + | Network stack | + | (Linux) | + | | + +----------------------------------------------+ + + sw1p2 sw1p4 sw1p6 + sw1p1 + sw1p3 + sw1p5 + eth1 + + | + | + | + + | | | | | | | + +--+----+----+----+----+----+---+ +-----+-----+ + | Switch driver | | mgmt | + | (this document) | | driver | + | | | | + +--------------+----------------+ +-----------+ + | + kernel | HW bus (eg PCI) + +-------------------------------------------------------------------+ + hardware | + +--------------+----------------+ + | Switch device (sw1) | + | +----+ +--------+ + | | v offloaded data path | mgmt port + | | | | + +--|----|----+----+----+----+---+ + | | | | | | + + + + + + + + p1 p2 p3 p4 p5 p6 + + front-panel ports + + + Fig 1. + + +Include Files +------------- + +:: + + #include + #include + + +Configuration +------------- + +Use "depends NET_SWITCHDEV" in driver's Kconfig to ensure switchdev model +support is built for driver. + + +Switch Ports +------------ + +On switchdev driver initialization, the driver will allocate and register a +struct net_device (using register_netdev()) for each enumerated physical switch +port, called the port netdev. A port netdev is the software representation of +the physical port and provides a conduit for control traffic to/from the +controller (the kernel) and the network, as well as an anchor point for higher +level constructs such as bridges, bonds, VLANs, tunnels, and L3 routers. Using +standard netdev tools (iproute2, ethtool, etc), the port netdev can also +provide to the user access to the physical properties of the switch port such +as PHY link state and I/O statistics. + +There is (currently) no higher-level kernel object for the switch beyond the +port netdevs. All of the switchdev driver ops are netdev ops or switchdev ops. + +A switch management port is outside the scope of the switchdev driver model. +Typically, the management port is not participating in offloaded data plane and +is loaded with a different driver, such as a NIC driver, on the management port +device. + +Switch ID +^^^^^^^^^ + +The switchdev driver must implement the net_device operation +ndo_get_port_parent_id for each port netdev, returning the same physical ID for +each port of a switch. The ID must be unique between switches on the same +system. The ID does not need to be unique between switches on different +systems. + +The switch ID is used to locate ports on a switch and to know if aggregated +ports belong to the same switch. + +Port Netdev Naming +^^^^^^^^^^^^^^^^^^ + +Udev rules should be used for port netdev naming, using some unique attribute +of the port as a key, for example the port MAC address or the port PHYS name. +Hard-coding of kernel netdev names within the driver is discouraged; let the +kernel pick the default netdev name, and let udev set the final name based on a +port attribute. + +Using port PHYS name (ndo_get_phys_port_name) for the key is particularly +useful for dynamically-named ports where the device names its ports based on +external configuration. For example, if a physical 40G port is split logically +into 4 10G ports, resulting in 4 port netdevs, the device can give a unique +name for each port using port PHYS name. The udev rule would be:: + + SUBSYSTEM=="net", ACTION=="add", ATTR{phys_switch_id}=="", \ + ATTR{phys_port_name}!="", NAME="swX$attr{phys_port_name}" + +Suggested naming convention is "swXpYsZ", where X is the switch name or ID, Y +is the port name or ID, and Z is the sub-port name or ID. For example, sw1p1s0 +would be sub-port 0 on port 1 on switch 1. + +Port Features +^^^^^^^^^^^^^ + +NETIF_F_NETNS_LOCAL + +If the switchdev driver (and device) only supports offloading of the default +network namespace (netns), the driver should set this feature flag to prevent +the port netdev from being moved out of the default netns. A netns-aware +driver/device would not set this flag and be responsible for partitioning +hardware to preserve netns containment. This means hardware cannot forward +traffic from a port in one namespace to another port in another namespace. + +Port Topology +^^^^^^^^^^^^^ + +The port netdevs representing the physical switch ports can be organized into +higher-level switching constructs. The default construct is a standalone +router port, used to offload L3 forwarding. Two or more ports can be bonded +together to form a LAG. Two or more ports (or LAGs) can be bridged to bridge +L2 networks. VLANs can be applied to sub-divide L2 networks. L2-over-L3 +tunnels can be built on ports. These constructs are built using standard Linux +tools such as the bridge driver, the bonding/team drivers, and netlink-based +tools such as iproute2. + +The switchdev driver can know a particular port's position in the topology by +monitoring NETDEV_CHANGEUPPER notifications. For example, a port moved into a +bond will see it's upper master change. If that bond is moved into a bridge, +the bond's upper master will change. And so on. The driver will track such +movements to know what position a port is in in the overall topology by +registering for netdevice events and acting on NETDEV_CHANGEUPPER. + +L2 Forwarding Offload +--------------------- + +The idea is to offload the L2 data forwarding (switching) path from the kernel +to the switchdev device by mirroring bridge FDB entries down to the device. An +FDB entry is the {port, MAC, VLAN} tuple forwarding destination. + +To offloading L2 bridging, the switchdev driver/device should support: + + - Static FDB entries installed on a bridge port + - Notification of learned/forgotten src mac/vlans from device + - STP state changes on the port + - VLAN flooding of multicast/broadcast and unknown unicast packets + +Static FDB Entries +^^^^^^^^^^^^^^^^^^ + +The switchdev driver should implement ndo_fdb_add, ndo_fdb_del and ndo_fdb_dump +to support static FDB entries installed to the device. Static bridge FDB +entries are installed, for example, using iproute2 bridge cmd:: + + bridge fdb add ADDR dev DEV [vlan VID] [self] + +The driver should use the helper switchdev_port_fdb_xxx ops for ndo_fdb_xxx +ops, and handle add/delete/dump of SWITCHDEV_OBJ_ID_PORT_FDB object using +switchdev_port_obj_xxx ops. + +XXX: what should be done if offloading this rule to hardware fails (for +example, due to full capacity in hardware tables) ? + +Note: by default, the bridge does not filter on VLAN and only bridges untagged +traffic. To enable VLAN support, turn on VLAN filtering:: + + echo 1 >/sys/class/net//bridge/vlan_filtering + +Notification of Learned/Forgotten Source MAC/VLANs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The switch device will learn/forget source MAC address/VLAN on ingress packets +and notify the switch driver of the mac/vlan/port tuples. The switch driver, +in turn, will notify the bridge driver using the switchdev notifier call:: + + err = call_switchdev_notifiers(val, dev, info, extack); + +Where val is SWITCHDEV_FDB_ADD when learning and SWITCHDEV_FDB_DEL when +forgetting, and info points to a struct switchdev_notifier_fdb_info. On +SWITCHDEV_FDB_ADD, the bridge driver will install the FDB entry into the +bridge's FDB and mark the entry as NTF_EXT_LEARNED. The iproute2 bridge +command will label these entries "offload":: + + $ bridge fdb + 52:54:00:12:35:01 dev sw1p1 master br0 permanent + 00:02:00:00:02:00 dev sw1p1 master br0 offload + 00:02:00:00:02:00 dev sw1p1 self + 52:54:00:12:35:02 dev sw1p2 master br0 permanent + 00:02:00:00:03:00 dev sw1p2 master br0 offload + 00:02:00:00:03:00 dev sw1p2 self + 33:33:00:00:00:01 dev eth0 self permanent + 01:00:5e:00:00:01 dev eth0 self permanent + 33:33:ff:00:00:00 dev eth0 self permanent + 01:80:c2:00:00:0e dev eth0 self permanent + 33:33:00:00:00:01 dev br0 self permanent + 01:00:5e:00:00:01 dev br0 self permanent + 33:33:ff:12:35:01 dev br0 self permanent + +Learning on the port should be disabled on the bridge using the bridge command:: + + bridge link set dev DEV learning off + +Learning on the device port should be enabled, as well as learning_sync:: + + bridge link set dev DEV learning on self + bridge link set dev DEV learning_sync on self + +Learning_sync attribute enables syncing of the learned/forgotten FDB entry to +the bridge's FDB. It's possible, but not optimal, to enable learning on the +device port and on the bridge port, and disable learning_sync. + +To support learning, the driver implements switchdev op +switchdev_port_attr_set for SWITCHDEV_ATTR_PORT_ID_{PRE}_BRIDGE_FLAGS. + +FDB Ageing +^^^^^^^^^^ + +The bridge will skip ageing FDB entries marked with NTF_EXT_LEARNED and it is +the responsibility of the port driver/device to age out these entries. If the +port device supports ageing, when the FDB entry expires, it will notify the +driver which in turn will notify the bridge with SWITCHDEV_FDB_DEL. If the +device does not support ageing, the driver can simulate ageing using a +garbage collection timer to monitor FDB entries. Expired entries will be +notified to the bridge using SWITCHDEV_FDB_DEL. See rocker driver for +example of driver running ageing timer. + +To keep an NTF_EXT_LEARNED entry "alive", the driver should refresh the FDB +entry by calling call_switchdev_notifiers(SWITCHDEV_FDB_ADD, ...). The +notification will reset the FDB entry's last-used time to now. The driver +should rate limit refresh notifications, for example, no more than once a +second. (The last-used time is visible using the bridge -s fdb option). + +STP State Change on Port +^^^^^^^^^^^^^^^^^^^^^^^^ + +Internally or with a third-party STP protocol implementation (e.g. mstpd), the +bridge driver maintains the STP state for ports, and will notify the switch +driver of STP state change on a port using the switchdev op +switchdev_attr_port_set for SWITCHDEV_ATTR_PORT_ID_STP_UPDATE. + +State is one of BR_STATE_*. The switch driver can use STP state updates to +update ingress packet filter list for the port. For example, if port is +DISABLED, no packets should pass, but if port moves to BLOCKED, then STP BPDUs +and other IEEE 01:80:c2:xx:xx:xx link-local multicast packets can pass. + +Note that STP BDPUs are untagged and STP state applies to all VLANs on the port +so packet filters should be applied consistently across untagged and tagged +VLANs on the port. + +Flooding L2 domain +^^^^^^^^^^^^^^^^^^ + +For a given L2 VLAN domain, the switch device should flood multicast/broadcast +and unknown unicast packets to all ports in domain, if allowed by port's +current STP state. The switch driver, knowing which ports are within which +vlan L2 domain, can program the switch device for flooding. The packet may +be sent to the port netdev for processing by the bridge driver. The +bridge should not reflood the packet to the same ports the device flooded, +otherwise there will be duplicate packets on the wire. + +To avoid duplicate packets, the switch driver should mark a packet as already +forwarded by setting the skb->offload_fwd_mark bit. The bridge driver will mark +the skb using the ingress bridge port's mark and prevent it from being forwarded +through any bridge port with the same mark. + +It is possible for the switch device to not handle flooding and push the +packets up to the bridge driver for flooding. This is not ideal as the number +of ports scale in the L2 domain as the device is much more efficient at +flooding packets that software. + +If supported by the device, flood control can be offloaded to it, preventing +certain netdevs from flooding unicast traffic for which there is no FDB entry. + +IGMP Snooping +^^^^^^^^^^^^^ + +In order to support IGMP snooping, the port netdevs should trap to the bridge +driver all IGMP join and leave messages. +The bridge multicast module will notify port netdevs on every multicast group +changed whether it is static configured or dynamically joined/leave. +The hardware implementation should be forwarding all registered multicast +traffic groups only to the configured ports. + +L3 Routing Offload +------------------ + +Offloading L3 routing requires that device be programmed with FIB entries from +the kernel, with the device doing the FIB lookup and forwarding. The device +does a longest prefix match (LPM) on FIB entries matching route prefix and +forwards the packet to the matching FIB entry's nexthop(s) egress ports. + +To program the device, the driver has to register a FIB notifier handler +using register_fib_notifier. The following events are available: + +=================== =================================================== +FIB_EVENT_ENTRY_ADD used for both adding a new FIB entry to the device, + or modifying an existing entry on the device. +FIB_EVENT_ENTRY_DEL used for removing a FIB entry +FIB_EVENT_RULE_ADD, +FIB_EVENT_RULE_DEL used to propagate FIB rule changes +=================== =================================================== + +FIB_EVENT_ENTRY_ADD and FIB_EVENT_ENTRY_DEL events pass:: + + struct fib_entry_notifier_info { + struct fib_notifier_info info; /* must be first */ + u32 dst; + int dst_len; + struct fib_info *fi; + u8 tos; + u8 type; + u32 tb_id; + u32 nlflags; + }; + +to add/modify/delete IPv4 dst/dest_len prefix on table tb_id. The ``*fi`` +structure holds details on the route and route's nexthops. ``*dev`` is one +of the port netdevs mentioned in the route's next hop list. + +Routes offloaded to the device are labeled with "offload" in the ip route +listing:: + + $ ip route show + default via 192.168.0.2 dev eth0 + 11.0.0.0/30 dev sw1p1 proto kernel scope link src 11.0.0.2 offload + 11.0.0.4/30 via 11.0.0.1 dev sw1p1 proto zebra metric 20 offload + 11.0.0.8/30 dev sw1p2 proto kernel scope link src 11.0.0.10 offload + 11.0.0.12/30 via 11.0.0.9 dev sw1p2 proto zebra metric 20 offload + 12.0.0.2 proto zebra metric 30 offload + nexthop via 11.0.0.1 dev sw1p1 weight 1 + nexthop via 11.0.0.9 dev sw1p2 weight 1 + 12.0.0.3 via 11.0.0.1 dev sw1p1 proto zebra metric 20 offload + 12.0.0.4 via 11.0.0.9 dev sw1p2 proto zebra metric 20 offload + 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.15 + +The "offload" flag is set in case at least one device offloads the FIB entry. + +XXX: add/mod/del IPv6 FIB API + +Nexthop Resolution +^^^^^^^^^^^^^^^^^^ + +The FIB entry's nexthop list contains the nexthop tuple (gateway, dev), but for +the switch device to forward the packet with the correct dst mac address, the +nexthop gateways must be resolved to the neighbor's mac address. Neighbor mac +address discovery comes via the ARP (or ND) process and is available via the +arp_tbl neighbor table. To resolve the routes nexthop gateways, the driver +should trigger the kernel's neighbor resolution process. See the rocker +driver's rocker_port_ipv4_resolve() for an example. + +The driver can monitor for updates to arp_tbl using the netevent notifier +NETEVENT_NEIGH_UPDATE. The device can be programmed with resolved nexthops +for the routes as arp_tbl updates. The driver implements ndo_neigh_destroy +to know when arp_tbl neighbor entries are purged from the port. diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt deleted file mode 100644 index 86174ce8cd13..000000000000 --- a/Documentation/networking/switchdev.txt +++ /dev/null @@ -1,373 +0,0 @@ -Ethernet switch device driver model (switchdev) -=============================================== -Copyright (c) 2014 Jiri Pirko -Copyright (c) 2014-2015 Scott Feldman - - -The Ethernet switch device driver model (switchdev) is an in-kernel driver -model for switch devices which offload the forwarding (data) plane from the -kernel. - -Figure 1 is a block diagram showing the components of the switchdev model for -an example setup using a data-center-class switch ASIC chip. Other setups -with SR-IOV or soft switches, such as OVS, are possible. - - - User-space tools - - user space | - +-------------------------------------------------------------------+ - kernel | Netlink - | - +--------------+-------------------------------+ - | Network stack | - | (Linux) | - | | - +----------------------------------------------+ - - sw1p2 sw1p4 sw1p6 - sw1p1 + sw1p3 + sw1p5 + eth1 - + | + | + | + - | | | | | | | - +--+----+----+----+----+----+---+ +-----+-----+ - | Switch driver | | mgmt | - | (this document) | | driver | - | | | | - +--------------+----------------+ +-----------+ - | - kernel | HW bus (eg PCI) - +-------------------------------------------------------------------+ - hardware | - +--------------+----------------+ - | Switch device (sw1) | - | +----+ +--------+ - | | v offloaded data path | mgmt port - | | | | - +--|----|----+----+----+----+---+ - | | | | | | - + + + + + + - p1 p2 p3 p4 p5 p6 - - front-panel ports - - - Fig 1. - - -Include Files -------------- - -#include -#include - - -Configuration -------------- - -Use "depends NET_SWITCHDEV" in driver's Kconfig to ensure switchdev model -support is built for driver. - - -Switch Ports ------------- - -On switchdev driver initialization, the driver will allocate and register a -struct net_device (using register_netdev()) for each enumerated physical switch -port, called the port netdev. A port netdev is the software representation of -the physical port and provides a conduit for control traffic to/from the -controller (the kernel) and the network, as well as an anchor point for higher -level constructs such as bridges, bonds, VLANs, tunnels, and L3 routers. Using -standard netdev tools (iproute2, ethtool, etc), the port netdev can also -provide to the user access to the physical properties of the switch port such -as PHY link state and I/O statistics. - -There is (currently) no higher-level kernel object for the switch beyond the -port netdevs. All of the switchdev driver ops are netdev ops or switchdev ops. - -A switch management port is outside the scope of the switchdev driver model. -Typically, the management port is not participating in offloaded data plane and -is loaded with a different driver, such as a NIC driver, on the management port -device. - -Switch ID -^^^^^^^^^ - -The switchdev driver must implement the net_device operation -ndo_get_port_parent_id for each port netdev, returning the same physical ID for -each port of a switch. The ID must be unique between switches on the same -system. The ID does not need to be unique between switches on different -systems. - -The switch ID is used to locate ports on a switch and to know if aggregated -ports belong to the same switch. - -Port Netdev Naming -^^^^^^^^^^^^^^^^^^ - -Udev rules should be used for port netdev naming, using some unique attribute -of the port as a key, for example the port MAC address or the port PHYS name. -Hard-coding of kernel netdev names within the driver is discouraged; let the -kernel pick the default netdev name, and let udev set the final name based on a -port attribute. - -Using port PHYS name (ndo_get_phys_port_name) for the key is particularly -useful for dynamically-named ports where the device names its ports based on -external configuration. For example, if a physical 40G port is split logically -into 4 10G ports, resulting in 4 port netdevs, the device can give a unique -name for each port using port PHYS name. The udev rule would be: - -SUBSYSTEM=="net", ACTION=="add", ATTR{phys_switch_id}=="", \ - ATTR{phys_port_name}!="", NAME="swX$attr{phys_port_name}" - -Suggested naming convention is "swXpYsZ", where X is the switch name or ID, Y -is the port name or ID, and Z is the sub-port name or ID. For example, sw1p1s0 -would be sub-port 0 on port 1 on switch 1. - -Port Features -^^^^^^^^^^^^^ - -NETIF_F_NETNS_LOCAL - -If the switchdev driver (and device) only supports offloading of the default -network namespace (netns), the driver should set this feature flag to prevent -the port netdev from being moved out of the default netns. A netns-aware -driver/device would not set this flag and be responsible for partitioning -hardware to preserve netns containment. This means hardware cannot forward -traffic from a port in one namespace to another port in another namespace. - -Port Topology -^^^^^^^^^^^^^ - -The port netdevs representing the physical switch ports can be organized into -higher-level switching constructs. The default construct is a standalone -router port, used to offload L3 forwarding. Two or more ports can be bonded -together to form a LAG. Two or more ports (or LAGs) can be bridged to bridge -L2 networks. VLANs can be applied to sub-divide L2 networks. L2-over-L3 -tunnels can be built on ports. These constructs are built using standard Linux -tools such as the bridge driver, the bonding/team drivers, and netlink-based -tools such as iproute2. - -The switchdev driver can know a particular port's position in the topology by -monitoring NETDEV_CHANGEUPPER notifications. For example, a port moved into a -bond will see it's upper master change. If that bond is moved into a bridge, -the bond's upper master will change. And so on. The driver will track such -movements to know what position a port is in in the overall topology by -registering for netdevice events and acting on NETDEV_CHANGEUPPER. - -L2 Forwarding Offload ---------------------- - -The idea is to offload the L2 data forwarding (switching) path from the kernel -to the switchdev device by mirroring bridge FDB entries down to the device. An -FDB entry is the {port, MAC, VLAN} tuple forwarding destination. - -To offloading L2 bridging, the switchdev driver/device should support: - - - Static FDB entries installed on a bridge port - - Notification of learned/forgotten src mac/vlans from device - - STP state changes on the port - - VLAN flooding of multicast/broadcast and unknown unicast packets - -Static FDB Entries -^^^^^^^^^^^^^^^^^^ - -The switchdev driver should implement ndo_fdb_add, ndo_fdb_del and ndo_fdb_dump -to support static FDB entries installed to the device. Static bridge FDB -entries are installed, for example, using iproute2 bridge cmd: - - bridge fdb add ADDR dev DEV [vlan VID] [self] - -The driver should use the helper switchdev_port_fdb_xxx ops for ndo_fdb_xxx -ops, and handle add/delete/dump of SWITCHDEV_OBJ_ID_PORT_FDB object using -switchdev_port_obj_xxx ops. - -XXX: what should be done if offloading this rule to hardware fails (for -example, due to full capacity in hardware tables) ? - -Note: by default, the bridge does not filter on VLAN and only bridges untagged -traffic. To enable VLAN support, turn on VLAN filtering: - - echo 1 >/sys/class/net//bridge/vlan_filtering - -Notification of Learned/Forgotten Source MAC/VLANs -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The switch device will learn/forget source MAC address/VLAN on ingress packets -and notify the switch driver of the mac/vlan/port tuples. The switch driver, -in turn, will notify the bridge driver using the switchdev notifier call: - - err = call_switchdev_notifiers(val, dev, info, extack); - -Where val is SWITCHDEV_FDB_ADD when learning and SWITCHDEV_FDB_DEL when -forgetting, and info points to a struct switchdev_notifier_fdb_info. On -SWITCHDEV_FDB_ADD, the bridge driver will install the FDB entry into the -bridge's FDB and mark the entry as NTF_EXT_LEARNED. The iproute2 bridge -command will label these entries "offload": - - $ bridge fdb - 52:54:00:12:35:01 dev sw1p1 master br0 permanent - 00:02:00:00:02:00 dev sw1p1 master br0 offload - 00:02:00:00:02:00 dev sw1p1 self - 52:54:00:12:35:02 dev sw1p2 master br0 permanent - 00:02:00:00:03:00 dev sw1p2 master br0 offload - 00:02:00:00:03:00 dev sw1p2 self - 33:33:00:00:00:01 dev eth0 self permanent - 01:00:5e:00:00:01 dev eth0 self permanent - 33:33:ff:00:00:00 dev eth0 self permanent - 01:80:c2:00:00:0e dev eth0 self permanent - 33:33:00:00:00:01 dev br0 self permanent - 01:00:5e:00:00:01 dev br0 self permanent - 33:33:ff:12:35:01 dev br0 self permanent - -Learning on the port should be disabled on the bridge using the bridge command: - - bridge link set dev DEV learning off - -Learning on the device port should be enabled, as well as learning_sync: - - bridge link set dev DEV learning on self - bridge link set dev DEV learning_sync on self - -Learning_sync attribute enables syncing of the learned/forgotten FDB entry to -the bridge's FDB. It's possible, but not optimal, to enable learning on the -device port and on the bridge port, and disable learning_sync. - -To support learning, the driver implements switchdev op -switchdev_port_attr_set for SWITCHDEV_ATTR_PORT_ID_{PRE}_BRIDGE_FLAGS. - -FDB Ageing -^^^^^^^^^^ - -The bridge will skip ageing FDB entries marked with NTF_EXT_LEARNED and it is -the responsibility of the port driver/device to age out these entries. If the -port device supports ageing, when the FDB entry expires, it will notify the -driver which in turn will notify the bridge with SWITCHDEV_FDB_DEL. If the -device does not support ageing, the driver can simulate ageing using a -garbage collection timer to monitor FDB entries. Expired entries will be -notified to the bridge using SWITCHDEV_FDB_DEL. See rocker driver for -example of driver running ageing timer. - -To keep an NTF_EXT_LEARNED entry "alive", the driver should refresh the FDB -entry by calling call_switchdev_notifiers(SWITCHDEV_FDB_ADD, ...). The -notification will reset the FDB entry's last-used time to now. The driver -should rate limit refresh notifications, for example, no more than once a -second. (The last-used time is visible using the bridge -s fdb option). - -STP State Change on Port -^^^^^^^^^^^^^^^^^^^^^^^^ - -Internally or with a third-party STP protocol implementation (e.g. mstpd), the -bridge driver maintains the STP state for ports, and will notify the switch -driver of STP state change on a port using the switchdev op -switchdev_attr_port_set for SWITCHDEV_ATTR_PORT_ID_STP_UPDATE. - -State is one of BR_STATE_*. The switch driver can use STP state updates to -update ingress packet filter list for the port. For example, if port is -DISABLED, no packets should pass, but if port moves to BLOCKED, then STP BPDUs -and other IEEE 01:80:c2:xx:xx:xx link-local multicast packets can pass. - -Note that STP BDPUs are untagged and STP state applies to all VLANs on the port -so packet filters should be applied consistently across untagged and tagged -VLANs on the port. - -Flooding L2 domain -^^^^^^^^^^^^^^^^^^ - -For a given L2 VLAN domain, the switch device should flood multicast/broadcast -and unknown unicast packets to all ports in domain, if allowed by port's -current STP state. The switch driver, knowing which ports are within which -vlan L2 domain, can program the switch device for flooding. The packet may -be sent to the port netdev for processing by the bridge driver. The -bridge should not reflood the packet to the same ports the device flooded, -otherwise there will be duplicate packets on the wire. - -To avoid duplicate packets, the switch driver should mark a packet as already -forwarded by setting the skb->offload_fwd_mark bit. The bridge driver will mark -the skb using the ingress bridge port's mark and prevent it from being forwarded -through any bridge port with the same mark. - -It is possible for the switch device to not handle flooding and push the -packets up to the bridge driver for flooding. This is not ideal as the number -of ports scale in the L2 domain as the device is much more efficient at -flooding packets that software. - -If supported by the device, flood control can be offloaded to it, preventing -certain netdevs from flooding unicast traffic for which there is no FDB entry. - -IGMP Snooping -^^^^^^^^^^^^^ - -In order to support IGMP snooping, the port netdevs should trap to the bridge -driver all IGMP join and leave messages. -The bridge multicast module will notify port netdevs on every multicast group -changed whether it is static configured or dynamically joined/leave. -The hardware implementation should be forwarding all registered multicast -traffic groups only to the configured ports. - -L3 Routing Offload ------------------- - -Offloading L3 routing requires that device be programmed with FIB entries from -the kernel, with the device doing the FIB lookup and forwarding. The device -does a longest prefix match (LPM) on FIB entries matching route prefix and -forwards the packet to the matching FIB entry's nexthop(s) egress ports. - -To program the device, the driver has to register a FIB notifier handler -using register_fib_notifier. The following events are available: -FIB_EVENT_ENTRY_ADD: used for both adding a new FIB entry to the device, - or modifying an existing entry on the device. -FIB_EVENT_ENTRY_DEL: used for removing a FIB entry -FIB_EVENT_RULE_ADD, FIB_EVENT_RULE_DEL: used to propagate FIB rule changes - -FIB_EVENT_ENTRY_ADD and FIB_EVENT_ENTRY_DEL events pass: - - struct fib_entry_notifier_info { - struct fib_notifier_info info; /* must be first */ - u32 dst; - int dst_len; - struct fib_info *fi; - u8 tos; - u8 type; - u32 tb_id; - u32 nlflags; - }; - -to add/modify/delete IPv4 dst/dest_len prefix on table tb_id. The *fi -structure holds details on the route and route's nexthops. *dev is one of the -port netdevs mentioned in the route's next hop list. - -Routes offloaded to the device are labeled with "offload" in the ip route -listing: - - $ ip route show - default via 192.168.0.2 dev eth0 - 11.0.0.0/30 dev sw1p1 proto kernel scope link src 11.0.0.2 offload - 11.0.0.4/30 via 11.0.0.1 dev sw1p1 proto zebra metric 20 offload - 11.0.0.8/30 dev sw1p2 proto kernel scope link src 11.0.0.10 offload - 11.0.0.12/30 via 11.0.0.9 dev sw1p2 proto zebra metric 20 offload - 12.0.0.2 proto zebra metric 30 offload - nexthop via 11.0.0.1 dev sw1p1 weight 1 - nexthop via 11.0.0.9 dev sw1p2 weight 1 - 12.0.0.3 via 11.0.0.1 dev sw1p1 proto zebra metric 20 offload - 12.0.0.4 via 11.0.0.9 dev sw1p2 proto zebra metric 20 offload - 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.15 - -The "offload" flag is set in case at least one device offloads the FIB entry. - -XXX: add/mod/del IPv6 FIB API - -Nexthop Resolution -^^^^^^^^^^^^^^^^^^ - -The FIB entry's nexthop list contains the nexthop tuple (gateway, dev), but for -the switch device to forward the packet with the correct dst mac address, the -nexthop gateways must be resolved to the neighbor's mac address. Neighbor mac -address discovery comes via the ARP (or ND) process and is available via the -arp_tbl neighbor table. To resolve the routes nexthop gateways, the driver -should trigger the kernel's neighbor resolution process. See the rocker -driver's rocker_port_ipv4_resolve() for an example. - -The driver can monitor for updates to arp_tbl using the netevent notifier -NETEVENT_NEIGH_UPDATE. The device can be programmed with resolved nexthops -for the routes as arp_tbl updates. The driver implements ndo_neigh_destroy -to know when arp_tbl neighbor entries are purged from the port. diff --git a/drivers/staging/fsl-dpaa2/ethsw/README b/drivers/staging/fsl-dpaa2/ethsw/README index f6fc07f780d1..b48dcbf7c5fb 100644 --- a/drivers/staging/fsl-dpaa2/ethsw/README +++ b/drivers/staging/fsl-dpaa2/ethsw/README @@ -79,7 +79,7 @@ The DPSW can have ports connected to DPNIs or to PHYs via DPMACs. For a more detailed description of the Ethernet switch device driver model see: - Documentation/networking/switchdev.txt + Documentation/networking/switchdev.rst Creating an Ethernet Switch =========================== -- cgit v1.2.3-59-g8ed1b From d2461edde7d15121a2cd39d48cd39edbd3fa019a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:28 +0200 Subject: docs: networking: convert tc-actions-env-rules.txt to ReST - add SPDX header; - add a document title; - use the right numbered list markup; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/tc-actions-env-rules.rst | 29 +++++++++++++++++++++++ Documentation/networking/tc-actions-env-rules.txt | 24 ------------------- 3 files changed, 30 insertions(+), 24 deletions(-) create mode 100644 Documentation/networking/tc-actions-env-rules.rst delete mode 100644 Documentation/networking/tc-actions-env-rules.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5e495804f96f..f53d89b5679a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -106,6 +106,7 @@ Contents: skfp strparser switchdev + tc-actions-env-rules .. only:: subproject and html diff --git a/Documentation/networking/tc-actions-env-rules.rst b/Documentation/networking/tc-actions-env-rules.rst new file mode 100644 index 000000000000..86884b8fb4e0 --- /dev/null +++ b/Documentation/networking/tc-actions-env-rules.rst @@ -0,0 +1,29 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================ +TC Actions - Environmental Rules +================================ + + +The "environmental" rules for authors of any new tc actions are: + +1) If you stealeth or borroweth any packet thou shalt be branching + from the righteous path and thou shalt cloneth. + + For example if your action queues a packet to be processed later, + or intentionally branches by redirecting a packet, then you need to + clone the packet. + +2) If you munge any packet thou shalt call pskb_expand_head in the case + someone else is referencing the skb. After that you "own" the skb. + +3) Dropping packets you don't own is a no-no. You simply return + TC_ACT_SHOT to the caller and they will drop it. + +The "environmental" rules for callers of actions (qdiscs etc) are: + +#) Thou art responsible for freeing anything returned as being + TC_ACT_SHOT/STOLEN/QUEUED. If none of TC_ACT_SHOT/STOLEN/QUEUED is + returned, then all is great and you don't need to do anything. + +Post on netdev if something is unclear. diff --git a/Documentation/networking/tc-actions-env-rules.txt b/Documentation/networking/tc-actions-env-rules.txt deleted file mode 100644 index f37814693ad3..000000000000 --- a/Documentation/networking/tc-actions-env-rules.txt +++ /dev/null @@ -1,24 +0,0 @@ - -The "environmental" rules for authors of any new tc actions are: - -1) If you stealeth or borroweth any packet thou shalt be branching -from the righteous path and thou shalt cloneth. - -For example if your action queues a packet to be processed later, -or intentionally branches by redirecting a packet, then you need to -clone the packet. - -2) If you munge any packet thou shalt call pskb_expand_head in the case -someone else is referencing the skb. After that you "own" the skb. - -3) Dropping packets you don't own is a no-no. You simply return -TC_ACT_SHOT to the caller and they will drop it. - -The "environmental" rules for callers of actions (qdiscs etc) are: - -*) Thou art responsible for freeing anything returned as being -TC_ACT_SHOT/STOLEN/QUEUED. If none of TC_ACT_SHOT/STOLEN/QUEUED is -returned, then all is great and you don't need to do anything. - -Post on netdev if something is unclear. - -- cgit v1.2.3-59-g8ed1b From ff159f4f1152b7012d378c69ffd253ed53227865 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:29 +0200 Subject: docs: networking: convert tcp-thin.txt to ReST Not much to be done here: - add SPDX header; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/ip-sysctl.rst | 2 +- Documentation/networking/tcp-thin.rst | 52 ++++++++++++++++++++++++++++++++++ Documentation/networking/tcp-thin.txt | 47 ------------------------------ 4 files changed, 54 insertions(+), 48 deletions(-) create mode 100644 Documentation/networking/tcp-thin.rst delete mode 100644 Documentation/networking/tcp-thin.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index f53d89b5679a..89b02fbfc2eb 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -107,6 +107,7 @@ Contents: strparser switchdev tc-actions-env-rules + tcp-thin .. only:: subproject and html diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 38f811d4b2f0..3266aee9e052 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -886,7 +886,7 @@ tcp_thin_linear_timeouts - BOOLEAN initiated. This improves retransmission latency for non-aggressive thin streams, often found to be time-dependent. For more information on thin streams, see - Documentation/networking/tcp-thin.txt + Documentation/networking/tcp-thin.rst Default: 0 diff --git a/Documentation/networking/tcp-thin.rst b/Documentation/networking/tcp-thin.rst new file mode 100644 index 000000000000..b06765c96ea1 --- /dev/null +++ b/Documentation/networking/tcp-thin.rst @@ -0,0 +1,52 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Thin-streams and TCP +==================== + +A wide range of Internet-based services that use reliable transport +protocols display what we call thin-stream properties. This means +that the application sends data with such a low rate that the +retransmission mechanisms of the transport protocol are not fully +effective. In time-dependent scenarios (like online games, control +systems, stock trading etc.) where the user experience depends +on the data delivery latency, packet loss can be devastating for +the service quality. Extreme latencies are caused by TCP's +dependency on the arrival of new data from the application to trigger +retransmissions effectively through fast retransmit instead of +waiting for long timeouts. + +After analysing a large number of time-dependent interactive +applications, we have seen that they often produce thin streams +and also stay with this traffic pattern throughout its entire +lifespan. The combination of time-dependency and the fact that the +streams provoke high latencies when using TCP is unfortunate. + +In order to reduce application-layer latency when packets are lost, +a set of mechanisms has been made, which address these latency issues +for thin streams. In short, if the kernel detects a thin stream, +the retransmission mechanisms are modified in the following manner: + +1) If the stream is thin, fast retransmit on the first dupACK. +2) If the stream is thin, do not apply exponential backoff. + +These enhancements are applied only if the stream is detected as +thin. This is accomplished by defining a threshold for the number +of packets in flight. If there are less than 4 packets in flight, +fast retransmissions can not be triggered, and the stream is prone +to experience high retransmission latencies. + +Since these mechanisms are targeted at time-dependent applications, +they must be specifically activated by the application using the +TCP_THIN_LINEAR_TIMEOUTS and TCP_THIN_DUPACK IOCTLS or the +tcp_thin_linear_timeouts and tcp_thin_dupack sysctls. Both +modifications are turned off by default. + +References +========== +More information on the modifications, as well as a wide range of +experimental data can be found here: + +"Improving latency for interactive, thin-stream applications over +reliable transport" +http://simula.no/research/nd/publications/Simula.nd.477/simula_pdf_file diff --git a/Documentation/networking/tcp-thin.txt b/Documentation/networking/tcp-thin.txt deleted file mode 100644 index 151e229980f1..000000000000 --- a/Documentation/networking/tcp-thin.txt +++ /dev/null @@ -1,47 +0,0 @@ -Thin-streams and TCP -==================== -A wide range of Internet-based services that use reliable transport -protocols display what we call thin-stream properties. This means -that the application sends data with such a low rate that the -retransmission mechanisms of the transport protocol are not fully -effective. In time-dependent scenarios (like online games, control -systems, stock trading etc.) where the user experience depends -on the data delivery latency, packet loss can be devastating for -the service quality. Extreme latencies are caused by TCP's -dependency on the arrival of new data from the application to trigger -retransmissions effectively through fast retransmit instead of -waiting for long timeouts. - -After analysing a large number of time-dependent interactive -applications, we have seen that they often produce thin streams -and also stay with this traffic pattern throughout its entire -lifespan. The combination of time-dependency and the fact that the -streams provoke high latencies when using TCP is unfortunate. - -In order to reduce application-layer latency when packets are lost, -a set of mechanisms has been made, which address these latency issues -for thin streams. In short, if the kernel detects a thin stream, -the retransmission mechanisms are modified in the following manner: - -1) If the stream is thin, fast retransmit on the first dupACK. -2) If the stream is thin, do not apply exponential backoff. - -These enhancements are applied only if the stream is detected as -thin. This is accomplished by defining a threshold for the number -of packets in flight. If there are less than 4 packets in flight, -fast retransmissions can not be triggered, and the stream is prone -to experience high retransmission latencies. - -Since these mechanisms are targeted at time-dependent applications, -they must be specifically activated by the application using the -TCP_THIN_LINEAR_TIMEOUTS and TCP_THIN_DUPACK IOCTLS or the -tcp_thin_linear_timeouts and tcp_thin_dupack sysctls. Both -modifications are turned off by default. - -References -========== -More information on the modifications, as well as a wide range of -experimental data can be found here: -"Improving latency for interactive, thin-stream applications over -reliable transport" -http://simula.no/research/nd/publications/Simula.nd.477/simula_pdf_file -- cgit v1.2.3-59-g8ed1b From aa8a6ee3e3fc4001e952de37660fe71826da8189 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:30 +0200 Subject: docs: networking: convert team.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/team.rst | 8 ++++++++ Documentation/networking/team.txt | 2 -- 3 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 Documentation/networking/team.rst delete mode 100644 Documentation/networking/team.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 89b02fbfc2eb..be65ee509669 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -108,6 +108,7 @@ Contents: switchdev tc-actions-env-rules tcp-thin + team .. only:: subproject and html diff --git a/Documentation/networking/team.rst b/Documentation/networking/team.rst new file mode 100644 index 000000000000..0a7f3a059586 --- /dev/null +++ b/Documentation/networking/team.rst @@ -0,0 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +Team +==== + +Team devices are driven from userspace via libteam library which is here: + https://github.com/jpirko/libteam diff --git a/Documentation/networking/team.txt b/Documentation/networking/team.txt deleted file mode 100644 index 5a013686b9ea..000000000000 --- a/Documentation/networking/team.txt +++ /dev/null @@ -1,2 +0,0 @@ -Team devices are driven from userspace via libteam library which is here: - https://github.com/jpirko/libteam -- cgit v1.2.3-59-g8ed1b From 06bfa47e72c83550fefc93c62a1ace5fff72e212 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:31 +0200 Subject: docs: networking: convert timestamping.txt to ReST - add SPDX header; - add a document title; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/packet_mmap.rst | 4 +- Documentation/networking/timestamping.rst | 591 ++++++++++++++++++++++++++++++ Documentation/networking/timestamping.txt | 571 ----------------------------- include/uapi/linux/errqueue.h | 2 +- 5 files changed, 595 insertions(+), 574 deletions(-) create mode 100644 Documentation/networking/timestamping.rst delete mode 100644 Documentation/networking/timestamping.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index be65ee509669..8f9a84b8e3f2 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -109,6 +109,7 @@ Contents: tc-actions-env-rules tcp-thin team + timestamping .. only:: subproject and html diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst index 884c7222b9e9..6c009ceb1183 100644 --- a/Documentation/networking/packet_mmap.rst +++ b/Documentation/networking/packet_mmap.rst @@ -1030,7 +1030,7 @@ the packet meta information for mmap(2)ed RX_RING and TX_RINGs. If your NIC is capable of timestamping packets in hardware, you can request those hardware timestamps to be used. Note: you may need to enable the generation of hardware timestamps with SIOCSHWTSTAMP (see related information from -Documentation/networking/timestamping.txt). +Documentation/networking/timestamping.rst). PACKET_TIMESTAMP accepts the same integer bit field as SO_TIMESTAMPING:: @@ -1069,7 +1069,7 @@ TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec members do not contain a valid value. For TX_RINGs, by default no timestamp is generated! -See include/linux/net_tstamp.h and Documentation/networking/timestamping.txt +See include/linux/net_tstamp.h and Documentation/networking/timestamping.rst for more information on hardware timestamps. Miscellaneous bits diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst new file mode 100644 index 000000000000..1adead6a4527 --- /dev/null +++ b/Documentation/networking/timestamping.rst @@ -0,0 +1,591 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +Timestamping +============ + + +1. Control Interfaces +===================== + +The interfaces for receiving network packages timestamps are: + +SO_TIMESTAMP + Generates a timestamp for each incoming packet in (not necessarily + monotonic) system time. Reports the timestamp via recvmsg() in a + control message in usec resolution. + SO_TIMESTAMP is defined as SO_TIMESTAMP_NEW or SO_TIMESTAMP_OLD + based on the architecture type and time_t representation of libc. + Control message format is in struct __kernel_old_timeval for + SO_TIMESTAMP_OLD and in struct __kernel_sock_timeval for + SO_TIMESTAMP_NEW options respectively. + +SO_TIMESTAMPNS + Same timestamping mechanism as SO_TIMESTAMP, but reports the + timestamp as struct timespec in nsec resolution. + SO_TIMESTAMPNS is defined as SO_TIMESTAMPNS_NEW or SO_TIMESTAMPNS_OLD + based on the architecture type and time_t representation of libc. + Control message format is in struct timespec for SO_TIMESTAMPNS_OLD + and in struct __kernel_timespec for SO_TIMESTAMPNS_NEW options + respectively. + +IP_MULTICAST_LOOP + SO_TIMESTAMP[NS] + Only for multicast:approximate transmit timestamp obtained by + reading the looped packet receive timestamp. + +SO_TIMESTAMPING + Generates timestamps on reception, transmission or both. Supports + multiple timestamp sources, including hardware. Supports generating + timestamps for stream sockets. + + +1.1 SO_TIMESTAMP (also SO_TIMESTAMP_OLD and SO_TIMESTAMP_NEW) +------------------------------------------------------------- + +This socket option enables timestamping of datagrams on the reception +path. Because the destination socket, if any, is not known early in +the network stack, the feature has to be enabled for all packets. The +same is true for all early receive timestamp options. + +For interface details, see `man 7 socket`. + +Always use SO_TIMESTAMP_NEW timestamp to always get timestamp in +struct __kernel_sock_timeval format. + +SO_TIMESTAMP_OLD returns incorrect timestamps after the year 2038 +on 32 bit machines. + +1.2 SO_TIMESTAMPNS (also SO_TIMESTAMPNS_OLD and SO_TIMESTAMPNS_NEW): + +This option is identical to SO_TIMESTAMP except for the returned data type. +Its struct timespec allows for higher resolution (ns) timestamps than the +timeval of SO_TIMESTAMP (ms). + +Always use SO_TIMESTAMPNS_NEW timestamp to always get timestamp in +struct __kernel_timespec format. + +SO_TIMESTAMPNS_OLD returns incorrect timestamps after the year 2038 +on 32 bit machines. + +1.3 SO_TIMESTAMPING (also SO_TIMESTAMPING_OLD and SO_TIMESTAMPING_NEW) +---------------------------------------------------------------------- + +Supports multiple types of timestamp requests. As a result, this +socket option takes a bitmap of flags, not a boolean. In:: + + err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); + +val is an integer with any of the following bits set. Setting other +bit returns EINVAL and does not change the current state. + +The socket option configures timestamp generation for individual +sk_buffs (1.3.1), timestamp reporting to the socket's error +queue (1.3.2) and options (1.3.3). Timestamp generation can also +be enabled for individual sendmsg calls using cmsg (1.3.4). + + +1.3.1 Timestamp Generation +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some bits are requests to the stack to try to generate timestamps. Any +combination of them is valid. Changes to these bits apply to newly +created packets, not to packets already in the stack. As a result, it +is possible to selectively request timestamps for a subset of packets +(e.g., for sampling) by embedding an send() call within two setsockopt +calls, one to enable timestamp generation and one to disable it. +Timestamps may also be generated for reasons other than being +requested by a particular socket, such as when receive timestamping is +enabled system wide, as explained earlier. + +SOF_TIMESTAMPING_RX_HARDWARE: + Request rx timestamps generated by the network adapter. + +SOF_TIMESTAMPING_RX_SOFTWARE: + Request rx timestamps when data enters the kernel. These timestamps + are generated just after a device driver hands a packet to the + kernel receive stack. + +SOF_TIMESTAMPING_TX_HARDWARE: + Request tx timestamps generated by the network adapter. This flag + can be enabled via both socket options and control messages. + +SOF_TIMESTAMPING_TX_SOFTWARE: + Request tx timestamps when data leaves the kernel. These timestamps + are generated in the device driver as close as possible, but always + prior to, passing the packet to the network interface. Hence, they + require driver support and may not be available for all devices. + This flag can be enabled via both socket options and control messages. + +SOF_TIMESTAMPING_TX_SCHED: + Request tx timestamps prior to entering the packet scheduler. Kernel + transmit latency is, if long, often dominated by queuing delay. The + difference between this timestamp and one taken at + SOF_TIMESTAMPING_TX_SOFTWARE will expose this latency independent + of protocol processing. The latency incurred in protocol + processing, if any, can be computed by subtracting a userspace + timestamp taken immediately before send() from this timestamp. On + machines with virtual devices where a transmitted packet travels + through multiple devices and, hence, multiple packet schedulers, + a timestamp is generated at each layer. This allows for fine + grained measurement of queuing delay. This flag can be enabled + via both socket options and control messages. + +SOF_TIMESTAMPING_TX_ACK: + Request tx timestamps when all data in the send buffer has been + acknowledged. This only makes sense for reliable protocols. It is + currently only implemented for TCP. For that protocol, it may + over-report measurement, because the timestamp is generated when all + data up to and including the buffer at send() was acknowledged: the + cumulative acknowledgment. The mechanism ignores SACK and FACK. + This flag can be enabled via both socket options and control messages. + + +1.3.2 Timestamp Reporting +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The other three bits control which timestamps will be reported in a +generated control message. Changes to the bits take immediate +effect at the timestamp reporting locations in the stack. Timestamps +are only reported for packets that also have the relevant timestamp +generation request set. + +SOF_TIMESTAMPING_SOFTWARE: + Report any software timestamps when available. + +SOF_TIMESTAMPING_SYS_HARDWARE: + This option is deprecated and ignored. + +SOF_TIMESTAMPING_RAW_HARDWARE: + Report hardware timestamps as generated by + SOF_TIMESTAMPING_TX_HARDWARE when available. + + +1.3.3 Timestamp Options +^^^^^^^^^^^^^^^^^^^^^^^ + +The interface supports the options + +SOF_TIMESTAMPING_OPT_ID: + Generate a unique identifier along with each packet. A process can + have multiple concurrent timestamping requests outstanding. Packets + can be reordered in the transmit path, for instance in the packet + scheduler. In that case timestamps will be queued onto the error + queue out of order from the original send() calls. It is not always + possible to uniquely match timestamps to the original send() calls + based on timestamp order or payload inspection alone, then. + + This option associates each packet at send() with a unique + identifier and returns that along with the timestamp. The identifier + is derived from a per-socket u32 counter (that wraps). For datagram + sockets, the counter increments with each sent packet. For stream + sockets, it increments with every byte. + + The counter starts at zero. It is initialized the first time that + the socket option is enabled. It is reset each time the option is + enabled after having been disabled. Resetting the counter does not + change the identifiers of existing packets in the system. + + This option is implemented only for transmit timestamps. There, the + timestamp is always looped along with a struct sock_extended_err. + The option modifies field ee_data to pass an id that is unique + among all possibly concurrently outstanding timestamp requests for + that socket. + + +SOF_TIMESTAMPING_OPT_CMSG: + Support recv() cmsg for all timestamped packets. Control messages + are already supported unconditionally on all packets with receive + timestamps and on IPv6 packets with transmit timestamp. This option + extends them to IPv4 packets with transmit timestamp. One use case + is to correlate packets with their egress device, by enabling socket + option IP_PKTINFO simultaneously. + + +SOF_TIMESTAMPING_OPT_TSONLY: + Applies to transmit timestamps only. Makes the kernel return the + timestamp as a cmsg alongside an empty packet, as opposed to + alongside the original packet. This reduces the amount of memory + charged to the socket's receive budget (SO_RCVBUF) and delivers + the timestamp even if sysctl net.core.tstamp_allow_data is 0. + This option disables SOF_TIMESTAMPING_OPT_CMSG. + +SOF_TIMESTAMPING_OPT_STATS: + Optional stats that are obtained along with the transmit timestamps. + It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the + transmit timestamp is available, the stats are available in a + separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a + list of TLVs (struct nlattr) of types. These stats allow the + application to associate various transport layer stats with + the transmit timestamps, such as how long a certain block of + data was limited by peer's receiver window. + +SOF_TIMESTAMPING_OPT_PKTINFO: + Enable the SCM_TIMESTAMPING_PKTINFO control message for incoming + packets with hardware timestamps. The message contains struct + scm_ts_pktinfo, which supplies the index of the real interface which + received the packet and its length at layer 2. A valid (non-zero) + interface index will be returned only if CONFIG_NET_RX_BUSY_POLL is + enabled and the driver is using NAPI. The struct contains also two + other fields, but they are reserved and undefined. + +SOF_TIMESTAMPING_OPT_TX_SWHW: + Request both hardware and software timestamps for outgoing packets + when SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE + are enabled at the same time. If both timestamps are generated, + two separate messages will be looped to the socket's error queue, + each containing just one timestamp. + +New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to +disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate +regardless of the setting of sysctl net.core.tstamp_allow_data. + +An exception is when a process needs additional cmsg data, for +instance SOL_IP/IP_PKTINFO to detect the egress network interface. +Then pass option SOF_TIMESTAMPING_OPT_CMSG. This option depends on +having access to the contents of the original packet, so cannot be +combined with SOF_TIMESTAMPING_OPT_TSONLY. + + +1.3.4. Enabling timestamps via control messages +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In addition to socket options, timestamp generation can be requested +per write via cmsg, only for SOF_TIMESTAMPING_TX_* (see Section 1.3.1). +Using this feature, applications can sample timestamps per sendmsg() +without paying the overhead of enabling and disabling timestamps via +setsockopt:: + + struct msghdr *msg; + ... + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SO_TIMESTAMPING; + cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); + *((__u32 *) CMSG_DATA(cmsg)) = SOF_TIMESTAMPING_TX_SCHED | + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK; + err = sendmsg(fd, msg, 0); + +The SOF_TIMESTAMPING_TX_* flags set via cmsg will override +the SOF_TIMESTAMPING_TX_* flags set via setsockopt. + +Moreover, applications must still enable timestamp reporting via +setsockopt to receive timestamps:: + + __u32 val = SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID /* or any other flag */; + err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); + + +1.4 Bytestream Timestamps +------------------------- + +The SO_TIMESTAMPING interface supports timestamping of bytes in a +bytestream. Each request is interpreted as a request for when the +entire contents of the buffer has passed a timestamping point. That +is, for streams option SOF_TIMESTAMPING_TX_SOFTWARE will record +when all bytes have reached the device driver, regardless of how +many packets the data has been converted into. + +In general, bytestreams have no natural delimiters and therefore +correlating a timestamp with data is non-trivial. A range of bytes +may be split across segments, any segments may be merged (possibly +coalescing sections of previously segmented buffers associated with +independent send() calls). Segments can be reordered and the same +byte range can coexist in multiple segments for protocols that +implement retransmissions. + +It is essential that all timestamps implement the same semantics, +regardless of these possible transformations, as otherwise they are +incomparable. Handling "rare" corner cases differently from the +simple case (a 1:1 mapping from buffer to skb) is insufficient +because performance debugging often needs to focus on such outliers. + +In practice, timestamps can be correlated with segments of a +bytestream consistently, if both semantics of the timestamp and the +timing of measurement are chosen correctly. This challenge is no +different from deciding on a strategy for IP fragmentation. There, the +definition is that only the first fragment is timestamped. For +bytestreams, we chose that a timestamp is generated only when all +bytes have passed a point. SOF_TIMESTAMPING_TX_ACK as defined is easy to +implement and reason about. An implementation that has to take into +account SACK would be more complex due to possible transmission holes +and out of order arrival. + +On the host, TCP can also break the simple 1:1 mapping from buffer to +skbuff as a result of Nagle, cork, autocork, segmentation and GSO. The +implementation ensures correctness in all cases by tracking the +individual last byte passed to send(), even if it is no longer the +last byte after an skbuff extend or merge operation. It stores the +relevant sequence number in skb_shinfo(skb)->tskey. Because an skbuff +has only one such field, only one timestamp can be generated. + +In rare cases, a timestamp request can be missed if two requests are +collapsed onto the same skb. A process can detect this situation by +enabling SOF_TIMESTAMPING_OPT_ID and comparing the byte offset at +send time with the value returned for each timestamp. It can prevent +the situation by always flushing the TCP stack in between requests, +for instance by enabling TCP_NODELAY and disabling TCP_CORK and +autocork. + +These precautions ensure that the timestamp is generated only when all +bytes have passed a timestamp point, assuming that the network stack +itself does not reorder the segments. The stack indeed tries to avoid +reordering. The one exception is under administrator control: it is +possible to construct a packet scheduler configuration that delays +segments from the same stream differently. Such a setup would be +unusual. + + +2 Data Interfaces +================== + +Timestamps are read using the ancillary data feature of recvmsg(). +See `man 3 cmsg` for details of this interface. The socket manual +page (`man 7 socket`) describes how timestamps generated with +SO_TIMESTAMP and SO_TIMESTAMPNS records can be retrieved. + + +2.1 SCM_TIMESTAMPING records +---------------------------- + +These timestamps are returned in a control message with cmsg_level +SOL_SOCKET, cmsg_type SCM_TIMESTAMPING, and payload of type + +For SO_TIMESTAMPING_OLD:: + + struct scm_timestamping { + struct timespec ts[3]; + }; + +For SO_TIMESTAMPING_NEW:: + + struct scm_timestamping64 { + struct __kernel_timespec ts[3]; + +Always use SO_TIMESTAMPING_NEW timestamp to always get timestamp in +struct scm_timestamping64 format. + +SO_TIMESTAMPING_OLD returns incorrect timestamps after the year 2038 +on 32 bit machines. + +The structure can return up to three timestamps. This is a legacy +feature. At least one field is non-zero at any time. Most timestamps +are passed in ts[0]. Hardware timestamps are passed in ts[2]. + +ts[1] used to hold hardware timestamps converted to system time. +Instead, expose the hardware clock device on the NIC directly as +a HW PTP clock source, to allow time conversion in userspace and +optionally synchronize system time with a userspace PTP stack such +as linuxptp. For the PTP clock API, see Documentation/driver-api/ptp.rst. + +Note that if the SO_TIMESTAMP or SO_TIMESTAMPNS option is enabled +together with SO_TIMESTAMPING using SOF_TIMESTAMPING_SOFTWARE, a false +software timestamp will be generated in the recvmsg() call and passed +in ts[0] when a real software timestamp is missing. This happens also +on hardware transmit timestamps. + +2.1.1 Transmit timestamps with MSG_ERRQUEUE +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For transmit timestamps the outgoing packet is looped back to the +socket's error queue with the send timestamp(s) attached. A process +receives the timestamps by calling recvmsg() with flag MSG_ERRQUEUE +set and with a msg_control buffer sufficiently large to receive the +relevant metadata structures. The recvmsg call returns the original +outgoing data packet with two ancillary messages attached. + +A message of cm_level SOL_IP(V6) and cm_type IP(V6)_RECVERR +embeds a struct sock_extended_err. This defines the error type. For +timestamps, the ee_errno field is ENOMSG. The other ancillary message +will have cm_level SOL_SOCKET and cm_type SCM_TIMESTAMPING. This +embeds the struct scm_timestamping. + + +2.1.1.2 Timestamp types +~~~~~~~~~~~~~~~~~~~~~~~ + +The semantics of the three struct timespec are defined by field +ee_info in the extended error structure. It contains a value of +type SCM_TSTAMP_* to define the actual timestamp passed in +scm_timestamping. + +The SCM_TSTAMP_* types are 1:1 matches to the SOF_TIMESTAMPING_* +control fields discussed previously, with one exception. For legacy +reasons, SCM_TSTAMP_SND is equal to zero and can be set for both +SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE. It +is the first if ts[2] is non-zero, the second otherwise, in which +case the timestamp is stored in ts[0]. + + +2.1.1.3 Fragmentation +~~~~~~~~~~~~~~~~~~~~~ + +Fragmentation of outgoing datagrams is rare, but is possible, e.g., by +explicitly disabling PMTU discovery. If an outgoing packet is fragmented, +then only the first fragment is timestamped and returned to the sending +socket. + + +2.1.1.4 Packet Payload +~~~~~~~~~~~~~~~~~~~~~~ + +The calling application is often not interested in receiving the whole +packet payload that it passed to the stack originally: the socket +error queue mechanism is just a method to piggyback the timestamp on. +In this case, the application can choose to read datagrams with a +smaller buffer, possibly even of length 0. The payload is truncated +accordingly. Until the process calls recvmsg() on the error queue, +however, the full packet is queued, taking up budget from SO_RCVBUF. + + +2.1.1.5 Blocking Read +~~~~~~~~~~~~~~~~~~~~~ + +Reading from the error queue is always a non-blocking operation. To +block waiting on a timestamp, use poll or select. poll() will return +POLLERR in pollfd.revents if any data is ready on the error queue. +There is no need to pass this flag in pollfd.events. This flag is +ignored on request. See also `man 2 poll`. + + +2.1.2 Receive timestamps +^^^^^^^^^^^^^^^^^^^^^^^^ + +On reception, there is no reason to read from the socket error queue. +The SCM_TIMESTAMPING ancillary data is sent along with the packet data +on a normal recvmsg(). Since this is not a socket error, it is not +accompanied by a message SOL_IP(V6)/IP(V6)_RECVERROR. In this case, +the meaning of the three fields in struct scm_timestamping is +implicitly defined. ts[0] holds a software timestamp if set, ts[1] +is again deprecated and ts[2] holds a hardware timestamp if set. + + +3. Hardware Timestamping configuration: SIOCSHWTSTAMP and SIOCGHWTSTAMP +======================================================================= + +Hardware time stamping must also be initialized for each device driver +that is expected to do hardware time stamping. The parameter is defined in +include/uapi/linux/net_tstamp.h as:: + + struct hwtstamp_config { + int flags; /* no flags defined right now, must be zero */ + int tx_type; /* HWTSTAMP_TX_* */ + int rx_filter; /* HWTSTAMP_FILTER_* */ + }; + +Desired behavior is passed into the kernel and to a specific device by +calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose +ifr_data points to a struct hwtstamp_config. The tx_type and +rx_filter are hints to the driver what it is expected to do. If +the requested fine-grained filtering for incoming packets is not +supported, the driver may time stamp more than just the requested types +of packets. + +Drivers are free to use a more permissive configuration than the requested +configuration. It is expected that drivers should only implement directly the +most generic mode that can be supported. For example if the hardware can +support HWTSTAMP_FILTER_V2_EVENT, then it should generally always upscale +HWTSTAMP_FILTER_V2_L2_SYNC_MESSAGE, and so forth, as HWTSTAMP_FILTER_V2_EVENT +is more generic (and more useful to applications). + +A driver which supports hardware time stamping shall update the struct +with the actual, possibly more permissive configuration. If the +requested packets cannot be time stamped, then nothing should be +changed and ERANGE shall be returned (in contrast to EINVAL, which +indicates that SIOCSHWTSTAMP is not supported at all). + +Only a processes with admin rights may change the configuration. User +space is responsible to ensure that multiple processes don't interfere +with each other and that the settings are reset. + +Any process can read the actual configuration by passing this +structure to ioctl(SIOCGHWTSTAMP) in the same way. However, this has +not been implemented in all drivers. + +:: + + /* possible values for hwtstamp_config->tx_type */ + enum { + /* + * no outgoing packet will need hardware time stamping; + * should a packet arrive which asks for it, no hardware + * time stamping will be done + */ + HWTSTAMP_TX_OFF, + + /* + * enables hardware time stamping for outgoing packets; + * the sender of the packet decides which are to be + * time stamped by setting SOF_TIMESTAMPING_TX_SOFTWARE + * before sending the packet + */ + HWTSTAMP_TX_ON, + }; + + /* possible values for hwtstamp_config->rx_filter */ + enum { + /* time stamp no incoming packet at all */ + HWTSTAMP_FILTER_NONE, + + /* time stamp any incoming packet */ + HWTSTAMP_FILTER_ALL, + + /* return value: time stamp all packets requested plus some others */ + HWTSTAMP_FILTER_SOME, + + /* PTP v1, UDP, any kind of event packet */ + HWTSTAMP_FILTER_PTP_V1_L4_EVENT, + + /* for the complete list of values, please check + * the include file include/uapi/linux/net_tstamp.h + */ + }; + +3.1 Hardware Timestamping Implementation: Device Drivers +-------------------------------------------------------- + +A driver which supports hardware time stamping must support the +SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with +the actual values as described in the section on SIOCSHWTSTAMP. It +should also support SIOCGHWTSTAMP. + +Time stamps for received packets must be stored in the skb. To get a pointer +to the shared time stamp structure of the skb call skb_hwtstamps(). Then +set the time stamps in the structure:: + + struct skb_shared_hwtstamps { + /* hardware time stamp transformed into duration + * since arbitrary point in time + */ + ktime_t hwtstamp; + }; + +Time stamps for outgoing packets are to be generated as follows: + +- In hard_start_xmit(), check if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) + is set no-zero. If yes, then the driver is expected to do hardware time + stamping. +- If this is possible for the skb and requested, then declare + that the driver is doing the time stamping by setting the flag + SKBTX_IN_PROGRESS in skb_shinfo(skb)->tx_flags , e.g. with:: + + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + + You might want to keep a pointer to the associated skb for the next step + and not free the skb. A driver not supporting hardware time stamping doesn't + do that. A driver must never touch sk_buff::tstamp! It is used to store + software generated time stamps by the network subsystem. +- Driver should call skb_tx_timestamp() as close to passing sk_buff to hardware + as possible. skb_tx_timestamp() provides a software time stamp if requested + and hardware timestamping is not possible (SKBTX_IN_PROGRESS not set). +- As soon as the driver has sent the packet and/or obtained a + hardware time stamp for it, it passes the time stamp back by + calling skb_hwtstamp_tx() with the original skb, the raw + hardware time stamp. skb_hwtstamp_tx() clones the original skb and + adds the timestamps, therefore the original skb has to be freed now. + If obtaining the hardware time stamp somehow fails, then the driver + should not fall back to software time stamping. The rationale is that + this would occur at a later time in the processing pipeline than other + software time stamping and therefore could lead to unexpected deltas + between time stamps. diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt deleted file mode 100644 index 8dd6333c3270..000000000000 --- a/Documentation/networking/timestamping.txt +++ /dev/null @@ -1,571 +0,0 @@ - -1. Control Interfaces - -The interfaces for receiving network packages timestamps are: - -* SO_TIMESTAMP - Generates a timestamp for each incoming packet in (not necessarily - monotonic) system time. Reports the timestamp via recvmsg() in a - control message in usec resolution. - SO_TIMESTAMP is defined as SO_TIMESTAMP_NEW or SO_TIMESTAMP_OLD - based on the architecture type and time_t representation of libc. - Control message format is in struct __kernel_old_timeval for - SO_TIMESTAMP_OLD and in struct __kernel_sock_timeval for - SO_TIMESTAMP_NEW options respectively. - -* SO_TIMESTAMPNS - Same timestamping mechanism as SO_TIMESTAMP, but reports the - timestamp as struct timespec in nsec resolution. - SO_TIMESTAMPNS is defined as SO_TIMESTAMPNS_NEW or SO_TIMESTAMPNS_OLD - based on the architecture type and time_t representation of libc. - Control message format is in struct timespec for SO_TIMESTAMPNS_OLD - and in struct __kernel_timespec for SO_TIMESTAMPNS_NEW options - respectively. - -* IP_MULTICAST_LOOP + SO_TIMESTAMP[NS] - Only for multicast:approximate transmit timestamp obtained by - reading the looped packet receive timestamp. - -* SO_TIMESTAMPING - Generates timestamps on reception, transmission or both. Supports - multiple timestamp sources, including hardware. Supports generating - timestamps for stream sockets. - - -1.1 SO_TIMESTAMP (also SO_TIMESTAMP_OLD and SO_TIMESTAMP_NEW): - -This socket option enables timestamping of datagrams on the reception -path. Because the destination socket, if any, is not known early in -the network stack, the feature has to be enabled for all packets. The -same is true for all early receive timestamp options. - -For interface details, see `man 7 socket`. - -Always use SO_TIMESTAMP_NEW timestamp to always get timestamp in -struct __kernel_sock_timeval format. - -SO_TIMESTAMP_OLD returns incorrect timestamps after the year 2038 -on 32 bit machines. - -1.2 SO_TIMESTAMPNS (also SO_TIMESTAMPNS_OLD and SO_TIMESTAMPNS_NEW): - -This option is identical to SO_TIMESTAMP except for the returned data type. -Its struct timespec allows for higher resolution (ns) timestamps than the -timeval of SO_TIMESTAMP (ms). - -Always use SO_TIMESTAMPNS_NEW timestamp to always get timestamp in -struct __kernel_timespec format. - -SO_TIMESTAMPNS_OLD returns incorrect timestamps after the year 2038 -on 32 bit machines. - -1.3 SO_TIMESTAMPING (also SO_TIMESTAMPING_OLD and SO_TIMESTAMPING_NEW): - -Supports multiple types of timestamp requests. As a result, this -socket option takes a bitmap of flags, not a boolean. In - - err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); - -val is an integer with any of the following bits set. Setting other -bit returns EINVAL and does not change the current state. - -The socket option configures timestamp generation for individual -sk_buffs (1.3.1), timestamp reporting to the socket's error -queue (1.3.2) and options (1.3.3). Timestamp generation can also -be enabled for individual sendmsg calls using cmsg (1.3.4). - - -1.3.1 Timestamp Generation - -Some bits are requests to the stack to try to generate timestamps. Any -combination of them is valid. Changes to these bits apply to newly -created packets, not to packets already in the stack. As a result, it -is possible to selectively request timestamps for a subset of packets -(e.g., for sampling) by embedding an send() call within two setsockopt -calls, one to enable timestamp generation and one to disable it. -Timestamps may also be generated for reasons other than being -requested by a particular socket, such as when receive timestamping is -enabled system wide, as explained earlier. - -SOF_TIMESTAMPING_RX_HARDWARE: - Request rx timestamps generated by the network adapter. - -SOF_TIMESTAMPING_RX_SOFTWARE: - Request rx timestamps when data enters the kernel. These timestamps - are generated just after a device driver hands a packet to the - kernel receive stack. - -SOF_TIMESTAMPING_TX_HARDWARE: - Request tx timestamps generated by the network adapter. This flag - can be enabled via both socket options and control messages. - -SOF_TIMESTAMPING_TX_SOFTWARE: - Request tx timestamps when data leaves the kernel. These timestamps - are generated in the device driver as close as possible, but always - prior to, passing the packet to the network interface. Hence, they - require driver support and may not be available for all devices. - This flag can be enabled via both socket options and control messages. - - -SOF_TIMESTAMPING_TX_SCHED: - Request tx timestamps prior to entering the packet scheduler. Kernel - transmit latency is, if long, often dominated by queuing delay. The - difference between this timestamp and one taken at - SOF_TIMESTAMPING_TX_SOFTWARE will expose this latency independent - of protocol processing. The latency incurred in protocol - processing, if any, can be computed by subtracting a userspace - timestamp taken immediately before send() from this timestamp. On - machines with virtual devices where a transmitted packet travels - through multiple devices and, hence, multiple packet schedulers, - a timestamp is generated at each layer. This allows for fine - grained measurement of queuing delay. This flag can be enabled - via both socket options and control messages. - -SOF_TIMESTAMPING_TX_ACK: - Request tx timestamps when all data in the send buffer has been - acknowledged. This only makes sense for reliable protocols. It is - currently only implemented for TCP. For that protocol, it may - over-report measurement, because the timestamp is generated when all - data up to and including the buffer at send() was acknowledged: the - cumulative acknowledgment. The mechanism ignores SACK and FACK. - This flag can be enabled via both socket options and control messages. - - -1.3.2 Timestamp Reporting - -The other three bits control which timestamps will be reported in a -generated control message. Changes to the bits take immediate -effect at the timestamp reporting locations in the stack. Timestamps -are only reported for packets that also have the relevant timestamp -generation request set. - -SOF_TIMESTAMPING_SOFTWARE: - Report any software timestamps when available. - -SOF_TIMESTAMPING_SYS_HARDWARE: - This option is deprecated and ignored. - -SOF_TIMESTAMPING_RAW_HARDWARE: - Report hardware timestamps as generated by - SOF_TIMESTAMPING_TX_HARDWARE when available. - - -1.3.3 Timestamp Options - -The interface supports the options - -SOF_TIMESTAMPING_OPT_ID: - - Generate a unique identifier along with each packet. A process can - have multiple concurrent timestamping requests outstanding. Packets - can be reordered in the transmit path, for instance in the packet - scheduler. In that case timestamps will be queued onto the error - queue out of order from the original send() calls. It is not always - possible to uniquely match timestamps to the original send() calls - based on timestamp order or payload inspection alone, then. - - This option associates each packet at send() with a unique - identifier and returns that along with the timestamp. The identifier - is derived from a per-socket u32 counter (that wraps). For datagram - sockets, the counter increments with each sent packet. For stream - sockets, it increments with every byte. - - The counter starts at zero. It is initialized the first time that - the socket option is enabled. It is reset each time the option is - enabled after having been disabled. Resetting the counter does not - change the identifiers of existing packets in the system. - - This option is implemented only for transmit timestamps. There, the - timestamp is always looped along with a struct sock_extended_err. - The option modifies field ee_data to pass an id that is unique - among all possibly concurrently outstanding timestamp requests for - that socket. - - -SOF_TIMESTAMPING_OPT_CMSG: - - Support recv() cmsg for all timestamped packets. Control messages - are already supported unconditionally on all packets with receive - timestamps and on IPv6 packets with transmit timestamp. This option - extends them to IPv4 packets with transmit timestamp. One use case - is to correlate packets with their egress device, by enabling socket - option IP_PKTINFO simultaneously. - - -SOF_TIMESTAMPING_OPT_TSONLY: - - Applies to transmit timestamps only. Makes the kernel return the - timestamp as a cmsg alongside an empty packet, as opposed to - alongside the original packet. This reduces the amount of memory - charged to the socket's receive budget (SO_RCVBUF) and delivers - the timestamp even if sysctl net.core.tstamp_allow_data is 0. - This option disables SOF_TIMESTAMPING_OPT_CMSG. - -SOF_TIMESTAMPING_OPT_STATS: - - Optional stats that are obtained along with the transmit timestamps. - It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the - transmit timestamp is available, the stats are available in a - separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a - list of TLVs (struct nlattr) of types. These stats allow the - application to associate various transport layer stats with - the transmit timestamps, such as how long a certain block of - data was limited by peer's receiver window. - -SOF_TIMESTAMPING_OPT_PKTINFO: - - Enable the SCM_TIMESTAMPING_PKTINFO control message for incoming - packets with hardware timestamps. The message contains struct - scm_ts_pktinfo, which supplies the index of the real interface which - received the packet and its length at layer 2. A valid (non-zero) - interface index will be returned only if CONFIG_NET_RX_BUSY_POLL is - enabled and the driver is using NAPI. The struct contains also two - other fields, but they are reserved and undefined. - -SOF_TIMESTAMPING_OPT_TX_SWHW: - - Request both hardware and software timestamps for outgoing packets - when SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE - are enabled at the same time. If both timestamps are generated, - two separate messages will be looped to the socket's error queue, - each containing just one timestamp. - -New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to -disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate -regardless of the setting of sysctl net.core.tstamp_allow_data. - -An exception is when a process needs additional cmsg data, for -instance SOL_IP/IP_PKTINFO to detect the egress network interface. -Then pass option SOF_TIMESTAMPING_OPT_CMSG. This option depends on -having access to the contents of the original packet, so cannot be -combined with SOF_TIMESTAMPING_OPT_TSONLY. - - -1.3.4. Enabling timestamps via control messages - -In addition to socket options, timestamp generation can be requested -per write via cmsg, only for SOF_TIMESTAMPING_TX_* (see Section 1.3.1). -Using this feature, applications can sample timestamps per sendmsg() -without paying the overhead of enabling and disabling timestamps via -setsockopt: - - struct msghdr *msg; - ... - cmsg = CMSG_FIRSTHDR(msg); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SO_TIMESTAMPING; - cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); - *((__u32 *) CMSG_DATA(cmsg)) = SOF_TIMESTAMPING_TX_SCHED | - SOF_TIMESTAMPING_TX_SOFTWARE | - SOF_TIMESTAMPING_TX_ACK; - err = sendmsg(fd, msg, 0); - -The SOF_TIMESTAMPING_TX_* flags set via cmsg will override -the SOF_TIMESTAMPING_TX_* flags set via setsockopt. - -Moreover, applications must still enable timestamp reporting via -setsockopt to receive timestamps: - - __u32 val = SOF_TIMESTAMPING_SOFTWARE | - SOF_TIMESTAMPING_OPT_ID /* or any other flag */; - err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); - - -1.4 Bytestream Timestamps - -The SO_TIMESTAMPING interface supports timestamping of bytes in a -bytestream. Each request is interpreted as a request for when the -entire contents of the buffer has passed a timestamping point. That -is, for streams option SOF_TIMESTAMPING_TX_SOFTWARE will record -when all bytes have reached the device driver, regardless of how -many packets the data has been converted into. - -In general, bytestreams have no natural delimiters and therefore -correlating a timestamp with data is non-trivial. A range of bytes -may be split across segments, any segments may be merged (possibly -coalescing sections of previously segmented buffers associated with -independent send() calls). Segments can be reordered and the same -byte range can coexist in multiple segments for protocols that -implement retransmissions. - -It is essential that all timestamps implement the same semantics, -regardless of these possible transformations, as otherwise they are -incomparable. Handling "rare" corner cases differently from the -simple case (a 1:1 mapping from buffer to skb) is insufficient -because performance debugging often needs to focus on such outliers. - -In practice, timestamps can be correlated with segments of a -bytestream consistently, if both semantics of the timestamp and the -timing of measurement are chosen correctly. This challenge is no -different from deciding on a strategy for IP fragmentation. There, the -definition is that only the first fragment is timestamped. For -bytestreams, we chose that a timestamp is generated only when all -bytes have passed a point. SOF_TIMESTAMPING_TX_ACK as defined is easy to -implement and reason about. An implementation that has to take into -account SACK would be more complex due to possible transmission holes -and out of order arrival. - -On the host, TCP can also break the simple 1:1 mapping from buffer to -skbuff as a result of Nagle, cork, autocork, segmentation and GSO. The -implementation ensures correctness in all cases by tracking the -individual last byte passed to send(), even if it is no longer the -last byte after an skbuff extend or merge operation. It stores the -relevant sequence number in skb_shinfo(skb)->tskey. Because an skbuff -has only one such field, only one timestamp can be generated. - -In rare cases, a timestamp request can be missed if two requests are -collapsed onto the same skb. A process can detect this situation by -enabling SOF_TIMESTAMPING_OPT_ID and comparing the byte offset at -send time with the value returned for each timestamp. It can prevent -the situation by always flushing the TCP stack in between requests, -for instance by enabling TCP_NODELAY and disabling TCP_CORK and -autocork. - -These precautions ensure that the timestamp is generated only when all -bytes have passed a timestamp point, assuming that the network stack -itself does not reorder the segments. The stack indeed tries to avoid -reordering. The one exception is under administrator control: it is -possible to construct a packet scheduler configuration that delays -segments from the same stream differently. Such a setup would be -unusual. - - -2 Data Interfaces - -Timestamps are read using the ancillary data feature of recvmsg(). -See `man 3 cmsg` for details of this interface. The socket manual -page (`man 7 socket`) describes how timestamps generated with -SO_TIMESTAMP and SO_TIMESTAMPNS records can be retrieved. - - -2.1 SCM_TIMESTAMPING records - -These timestamps are returned in a control message with cmsg_level -SOL_SOCKET, cmsg_type SCM_TIMESTAMPING, and payload of type - -For SO_TIMESTAMPING_OLD: - -struct scm_timestamping { - struct timespec ts[3]; -}; - -For SO_TIMESTAMPING_NEW: - -struct scm_timestamping64 { - struct __kernel_timespec ts[3]; - -Always use SO_TIMESTAMPING_NEW timestamp to always get timestamp in -struct scm_timestamping64 format. - -SO_TIMESTAMPING_OLD returns incorrect timestamps after the year 2038 -on 32 bit machines. - -The structure can return up to three timestamps. This is a legacy -feature. At least one field is non-zero at any time. Most timestamps -are passed in ts[0]. Hardware timestamps are passed in ts[2]. - -ts[1] used to hold hardware timestamps converted to system time. -Instead, expose the hardware clock device on the NIC directly as -a HW PTP clock source, to allow time conversion in userspace and -optionally synchronize system time with a userspace PTP stack such -as linuxptp. For the PTP clock API, see Documentation/driver-api/ptp.rst. - -Note that if the SO_TIMESTAMP or SO_TIMESTAMPNS option is enabled -together with SO_TIMESTAMPING using SOF_TIMESTAMPING_SOFTWARE, a false -software timestamp will be generated in the recvmsg() call and passed -in ts[0] when a real software timestamp is missing. This happens also -on hardware transmit timestamps. - -2.1.1 Transmit timestamps with MSG_ERRQUEUE - -For transmit timestamps the outgoing packet is looped back to the -socket's error queue with the send timestamp(s) attached. A process -receives the timestamps by calling recvmsg() with flag MSG_ERRQUEUE -set and with a msg_control buffer sufficiently large to receive the -relevant metadata structures. The recvmsg call returns the original -outgoing data packet with two ancillary messages attached. - -A message of cm_level SOL_IP(V6) and cm_type IP(V6)_RECVERR -embeds a struct sock_extended_err. This defines the error type. For -timestamps, the ee_errno field is ENOMSG. The other ancillary message -will have cm_level SOL_SOCKET and cm_type SCM_TIMESTAMPING. This -embeds the struct scm_timestamping. - - -2.1.1.2 Timestamp types - -The semantics of the three struct timespec are defined by field -ee_info in the extended error structure. It contains a value of -type SCM_TSTAMP_* to define the actual timestamp passed in -scm_timestamping. - -The SCM_TSTAMP_* types are 1:1 matches to the SOF_TIMESTAMPING_* -control fields discussed previously, with one exception. For legacy -reasons, SCM_TSTAMP_SND is equal to zero and can be set for both -SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE. It -is the first if ts[2] is non-zero, the second otherwise, in which -case the timestamp is stored in ts[0]. - - -2.1.1.3 Fragmentation - -Fragmentation of outgoing datagrams is rare, but is possible, e.g., by -explicitly disabling PMTU discovery. If an outgoing packet is fragmented, -then only the first fragment is timestamped and returned to the sending -socket. - - -2.1.1.4 Packet Payload - -The calling application is often not interested in receiving the whole -packet payload that it passed to the stack originally: the socket -error queue mechanism is just a method to piggyback the timestamp on. -In this case, the application can choose to read datagrams with a -smaller buffer, possibly even of length 0. The payload is truncated -accordingly. Until the process calls recvmsg() on the error queue, -however, the full packet is queued, taking up budget from SO_RCVBUF. - - -2.1.1.5 Blocking Read - -Reading from the error queue is always a non-blocking operation. To -block waiting on a timestamp, use poll or select. poll() will return -POLLERR in pollfd.revents if any data is ready on the error queue. -There is no need to pass this flag in pollfd.events. This flag is -ignored on request. See also `man 2 poll`. - - -2.1.2 Receive timestamps - -On reception, there is no reason to read from the socket error queue. -The SCM_TIMESTAMPING ancillary data is sent along with the packet data -on a normal recvmsg(). Since this is not a socket error, it is not -accompanied by a message SOL_IP(V6)/IP(V6)_RECVERROR. In this case, -the meaning of the three fields in struct scm_timestamping is -implicitly defined. ts[0] holds a software timestamp if set, ts[1] -is again deprecated and ts[2] holds a hardware timestamp if set. - - -3. Hardware Timestamping configuration: SIOCSHWTSTAMP and SIOCGHWTSTAMP - -Hardware time stamping must also be initialized for each device driver -that is expected to do hardware time stamping. The parameter is defined in -include/uapi/linux/net_tstamp.h as: - -struct hwtstamp_config { - int flags; /* no flags defined right now, must be zero */ - int tx_type; /* HWTSTAMP_TX_* */ - int rx_filter; /* HWTSTAMP_FILTER_* */ -}; - -Desired behavior is passed into the kernel and to a specific device by -calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose -ifr_data points to a struct hwtstamp_config. The tx_type and -rx_filter are hints to the driver what it is expected to do. If -the requested fine-grained filtering for incoming packets is not -supported, the driver may time stamp more than just the requested types -of packets. - -Drivers are free to use a more permissive configuration than the requested -configuration. It is expected that drivers should only implement directly the -most generic mode that can be supported. For example if the hardware can -support HWTSTAMP_FILTER_V2_EVENT, then it should generally always upscale -HWTSTAMP_FILTER_V2_L2_SYNC_MESSAGE, and so forth, as HWTSTAMP_FILTER_V2_EVENT -is more generic (and more useful to applications). - -A driver which supports hardware time stamping shall update the struct -with the actual, possibly more permissive configuration. If the -requested packets cannot be time stamped, then nothing should be -changed and ERANGE shall be returned (in contrast to EINVAL, which -indicates that SIOCSHWTSTAMP is not supported at all). - -Only a processes with admin rights may change the configuration. User -space is responsible to ensure that multiple processes don't interfere -with each other and that the settings are reset. - -Any process can read the actual configuration by passing this -structure to ioctl(SIOCGHWTSTAMP) in the same way. However, this has -not been implemented in all drivers. - -/* possible values for hwtstamp_config->tx_type */ -enum { - /* - * no outgoing packet will need hardware time stamping; - * should a packet arrive which asks for it, no hardware - * time stamping will be done - */ - HWTSTAMP_TX_OFF, - - /* - * enables hardware time stamping for outgoing packets; - * the sender of the packet decides which are to be - * time stamped by setting SOF_TIMESTAMPING_TX_SOFTWARE - * before sending the packet - */ - HWTSTAMP_TX_ON, -}; - -/* possible values for hwtstamp_config->rx_filter */ -enum { - /* time stamp no incoming packet at all */ - HWTSTAMP_FILTER_NONE, - - /* time stamp any incoming packet */ - HWTSTAMP_FILTER_ALL, - - /* return value: time stamp all packets requested plus some others */ - HWTSTAMP_FILTER_SOME, - - /* PTP v1, UDP, any kind of event packet */ - HWTSTAMP_FILTER_PTP_V1_L4_EVENT, - - /* for the complete list of values, please check - * the include file include/uapi/linux/net_tstamp.h - */ -}; - -3.1 Hardware Timestamping Implementation: Device Drivers - -A driver which supports hardware time stamping must support the -SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with -the actual values as described in the section on SIOCSHWTSTAMP. It -should also support SIOCGHWTSTAMP. - -Time stamps for received packets must be stored in the skb. To get a pointer -to the shared time stamp structure of the skb call skb_hwtstamps(). Then -set the time stamps in the structure: - -struct skb_shared_hwtstamps { - /* hardware time stamp transformed into duration - * since arbitrary point in time - */ - ktime_t hwtstamp; -}; - -Time stamps for outgoing packets are to be generated as follows: -- In hard_start_xmit(), check if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) - is set no-zero. If yes, then the driver is expected to do hardware time - stamping. -- If this is possible for the skb and requested, then declare - that the driver is doing the time stamping by setting the flag - SKBTX_IN_PROGRESS in skb_shinfo(skb)->tx_flags , e.g. with - - skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; - - You might want to keep a pointer to the associated skb for the next step - and not free the skb. A driver not supporting hardware time stamping doesn't - do that. A driver must never touch sk_buff::tstamp! It is used to store - software generated time stamps by the network subsystem. -- Driver should call skb_tx_timestamp() as close to passing sk_buff to hardware - as possible. skb_tx_timestamp() provides a software time stamp if requested - and hardware timestamping is not possible (SKBTX_IN_PROGRESS not set). -- As soon as the driver has sent the packet and/or obtained a - hardware time stamp for it, it passes the time stamp back by - calling skb_hwtstamp_tx() with the original skb, the raw - hardware time stamp. skb_hwtstamp_tx() clones the original skb and - adds the timestamps, therefore the original skb has to be freed now. - If obtaining the hardware time stamp somehow fails, then the driver - should not fall back to software time stamping. The rationale is that - this would occur at a later time in the processing pipeline than other - software time stamping and therefore could lead to unexpected deltas - between time stamps. diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 0cca19670fd2..ca5cb3e3c6df 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -36,7 +36,7 @@ struct sock_extended_err { * * The timestamping interfaces SO_TIMESTAMPING, MSG_TSTAMP_* * communicate network timestamps by passing this struct in a cmsg with - * recvmsg(). See Documentation/networking/timestamping.txt for details. + * recvmsg(). See Documentation/networking/timestamping.rst for details. * User space sees a timespec definition that matches either * __kernel_timespec or __kernel_old_timespec, in the kernel we * require two structure definitions to provide both. -- cgit v1.2.3-59-g8ed1b From 4ac0b122ee63d89b5aaf2e3e376092d8ac02a567 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:32 +0200 Subject: docs: networking: convert tproxy.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/tproxy.rst | 109 ++++++++++++++++++++++++++++++++++++ Documentation/networking/tproxy.txt | 104 ---------------------------------- net/netfilter/Kconfig | 2 +- 4 files changed, 111 insertions(+), 105 deletions(-) create mode 100644 Documentation/networking/tproxy.rst delete mode 100644 Documentation/networking/tproxy.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 8f9a84b8e3f2..b423b2db5f96 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -110,6 +110,7 @@ Contents: tcp-thin team timestamping + tproxy .. only:: subproject and html diff --git a/Documentation/networking/tproxy.rst b/Documentation/networking/tproxy.rst new file mode 100644 index 000000000000..00dc3a1a66b4 --- /dev/null +++ b/Documentation/networking/tproxy.rst @@ -0,0 +1,109 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +Transparent proxy support +========================= + +This feature adds Linux 2.2-like transparent proxy support to current kernels. +To use it, enable the socket match and the TPROXY target in your kernel config. +You will need policy routing too, so be sure to enable that as well. + +From Linux 4.18 transparent proxy support is also available in nf_tables. + +1. Making non-local sockets work +================================ + +The idea is that you identify packets with destination address matching a local +socket on your box, set the packet mark to a certain value:: + + # iptables -t mangle -N DIVERT + # iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT + # iptables -t mangle -A DIVERT -j MARK --set-mark 1 + # iptables -t mangle -A DIVERT -j ACCEPT + +Alternatively you can do this in nft with the following commands:: + + # nft add table filter + # nft add chain filter divert "{ type filter hook prerouting priority -150; }" + # nft add rule filter divert meta l4proto tcp socket transparent 1 meta mark set 1 accept + +And then match on that value using policy routing to have those packets +delivered locally:: + + # ip rule add fwmark 1 lookup 100 + # ip route add local 0.0.0.0/0 dev lo table 100 + +Because of certain restrictions in the IPv4 routing output code you'll have to +modify your application to allow it to send datagrams _from_ non-local IP +addresses. All you have to do is enable the (SOL_IP, IP_TRANSPARENT) socket +option before calling bind:: + + fd = socket(AF_INET, SOCK_STREAM, 0); + /* - 8< -*/ + int value = 1; + setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value)); + /* - 8< -*/ + name.sin_family = AF_INET; + name.sin_port = htons(0xCAFE); + name.sin_addr.s_addr = htonl(0xDEADBEEF); + bind(fd, &name, sizeof(name)); + +A trivial patch for netcat is available here: +http://people.netfilter.org/hidden/tproxy/netcat-ip_transparent-support.patch + + +2. Redirecting traffic +====================== + +Transparent proxying often involves "intercepting" traffic on a router. This is +usually done with the iptables REDIRECT target; however, there are serious +limitations of that method. One of the major issues is that it actually +modifies the packets to change the destination address -- which might not be +acceptable in certain situations. (Think of proxying UDP for example: you won't +be able to find out the original destination address. Even in case of TCP +getting the original destination address is racy.) + +The 'TPROXY' target provides similar functionality without relying on NAT. Simply +add rules like this to the iptables ruleset above:: + + # iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \ + --tproxy-mark 0x1/0x1 --on-port 50080 + +Or the following rule to nft: + +# nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept + +Note that for this to work you'll have to modify the proxy to enable (SOL_IP, +IP_TRANSPARENT) for the listening socket. + +As an example implementation, tcprdr is available here: +https://git.breakpoint.cc/cgit/fw/tcprdr.git/ +This tool is written by Florian Westphal and it was used for testing during the +nf_tables implementation. + +3. Iptables and nf_tables extensions +==================================== + +To use tproxy you'll need to have the following modules compiled for iptables: + + - NETFILTER_XT_MATCH_SOCKET + - NETFILTER_XT_TARGET_TPROXY + +Or the floowing modules for nf_tables: + + - NFT_SOCKET + - NFT_TPROXY + +4. Application support +====================== + +4.1. Squid +---------- + +Squid 3.HEAD has support built-in. To use it, pass +'--enable-linux-netfilter' to configure and set the 'tproxy' option on +the HTTP listener you redirect traffic to with the TPROXY iptables +target. + +For more information please consult the following page on the Squid +wiki: http://wiki.squid-cache.org/Features/Tproxy4 diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt deleted file mode 100644 index b9a188823d9f..000000000000 --- a/Documentation/networking/tproxy.txt +++ /dev/null @@ -1,104 +0,0 @@ -Transparent proxy support -========================= - -This feature adds Linux 2.2-like transparent proxy support to current kernels. -To use it, enable the socket match and the TPROXY target in your kernel config. -You will need policy routing too, so be sure to enable that as well. - -From Linux 4.18 transparent proxy support is also available in nf_tables. - -1. Making non-local sockets work -================================ - -The idea is that you identify packets with destination address matching a local -socket on your box, set the packet mark to a certain value: - -# iptables -t mangle -N DIVERT -# iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT -# iptables -t mangle -A DIVERT -j MARK --set-mark 1 -# iptables -t mangle -A DIVERT -j ACCEPT - -Alternatively you can do this in nft with the following commands: - -# nft add table filter -# nft add chain filter divert "{ type filter hook prerouting priority -150; }" -# nft add rule filter divert meta l4proto tcp socket transparent 1 meta mark set 1 accept - -And then match on that value using policy routing to have those packets -delivered locally: - -# ip rule add fwmark 1 lookup 100 -# ip route add local 0.0.0.0/0 dev lo table 100 - -Because of certain restrictions in the IPv4 routing output code you'll have to -modify your application to allow it to send datagrams _from_ non-local IP -addresses. All you have to do is enable the (SOL_IP, IP_TRANSPARENT) socket -option before calling bind: - -fd = socket(AF_INET, SOCK_STREAM, 0); -/* - 8< -*/ -int value = 1; -setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value)); -/* - 8< -*/ -name.sin_family = AF_INET; -name.sin_port = htons(0xCAFE); -name.sin_addr.s_addr = htonl(0xDEADBEEF); -bind(fd, &name, sizeof(name)); - -A trivial patch for netcat is available here: -http://people.netfilter.org/hidden/tproxy/netcat-ip_transparent-support.patch - - -2. Redirecting traffic -====================== - -Transparent proxying often involves "intercepting" traffic on a router. This is -usually done with the iptables REDIRECT target; however, there are serious -limitations of that method. One of the major issues is that it actually -modifies the packets to change the destination address -- which might not be -acceptable in certain situations. (Think of proxying UDP for example: you won't -be able to find out the original destination address. Even in case of TCP -getting the original destination address is racy.) - -The 'TPROXY' target provides similar functionality without relying on NAT. Simply -add rules like this to the iptables ruleset above: - -# iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \ - --tproxy-mark 0x1/0x1 --on-port 50080 - -Or the following rule to nft: - -# nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept - -Note that for this to work you'll have to modify the proxy to enable (SOL_IP, -IP_TRANSPARENT) for the listening socket. - -As an example implementation, tcprdr is available here: -https://git.breakpoint.cc/cgit/fw/tcprdr.git/ -This tool is written by Florian Westphal and it was used for testing during the -nf_tables implementation. - -3. Iptables and nf_tables extensions -==================================== - -To use tproxy you'll need to have the following modules compiled for iptables: - - NETFILTER_XT_MATCH_SOCKET - - NETFILTER_XT_TARGET_TPROXY - -Or the floowing modules for nf_tables: - - NFT_SOCKET - - NFT_TPROXY - -4. Application support -====================== - -4.1. Squid ----------- - -Squid 3.HEAD has support built-in. To use it, pass -'--enable-linux-netfilter' to configure and set the 'tproxy' option on -the HTTP listener you redirect traffic to with the TPROXY iptables -target. - -For more information please consult the following page on the Squid -wiki: http://wiki.squid-cache.org/Features/Tproxy4 diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 468fea1aebba..3a3915d2e1ea 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1043,7 +1043,7 @@ config NETFILTER_XT_TARGET_TPROXY on Netfilter connection tracking and NAT, unlike REDIRECT. For it to work you will have to configure certain iptables rules and use policy routing. For more information on how to set it up - see Documentation/networking/tproxy.txt. + see Documentation/networking/tproxy.rst. To compile it as a module, choose M here. If unsure, say N. -- cgit v1.2.3-59-g8ed1b From 466010342e89240c45746f65767c7290b96a4b36 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 20:01:08 +0300 Subject: mlxsw: spectrum_span: Add APIs to get / put a SPAN agent Given a netdev that packets should be mirrored to, create a SPAN agent and return its identifier to the caller. The SPAN agent is reference counted, as multiple tc-mirred actions can point to the same destination netdev. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_span.c | 43 ++++++++++++++++++++++ .../net/ethernet/mellanox/mlxsw/spectrum_span.h | 4 ++ 2 files changed, 47 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index ae3c8a1e9a43..c4159f4a66e2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -1039,3 +1039,46 @@ void mlxsw_sp_span_respin(struct mlxsw_sp *mlxsw_sp) return; mlxsw_core_schedule_work(&mlxsw_sp->span->work); } + +int mlxsw_sp_span_agent_get(struct mlxsw_sp *mlxsw_sp, + const struct net_device *to_dev, int *p_span_id) +{ + const struct mlxsw_sp_span_entry_ops *ops; + struct mlxsw_sp_span_entry *span_entry; + struct mlxsw_sp_span_parms sparms; + int err; + + ASSERT_RTNL(); + + ops = mlxsw_sp_span_entry_ops(mlxsw_sp, to_dev); + if (!ops) { + dev_err(mlxsw_sp->bus_info->dev, "Cannot mirror to requested destination\n"); + return -EOPNOTSUPP; + } + + memset(&sparms, 0, sizeof(sparms)); + err = ops->parms_set(to_dev, &sparms); + if (err) + return err; + + span_entry = mlxsw_sp_span_entry_get(mlxsw_sp, to_dev, ops, sparms); + if (!span_entry) + return -ENOBUFS; + + *p_span_id = span_entry->id; + + return 0; +} + +void mlxsw_sp_span_agent_put(struct mlxsw_sp *mlxsw_sp, int span_id) +{ + struct mlxsw_sp_span_entry *span_entry; + + ASSERT_RTNL(); + + span_entry = mlxsw_sp_span_entry_find_by_id(mlxsw_sp, span_id); + if (WARN_ON_ONCE(!span_entry)) + return; + + mlxsw_sp_span_entry_put(mlxsw_sp, span_entry); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h index d23abdf957fa..b79de9a125bb 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h @@ -77,4 +77,8 @@ void mlxsw_sp_span_entry_invalidate(struct mlxsw_sp *mlxsw_sp, int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu); void mlxsw_sp_span_speed_update_work(struct work_struct *work); +int mlxsw_sp_span_agent_get(struct mlxsw_sp *mlxsw_sp, + const struct net_device *to_dev, int *p_span_id); +void mlxsw_sp_span_agent_put(struct mlxsw_sp *mlxsw_sp, int span_id); + #endif -- cgit v1.2.3-59-g8ed1b From ed04458d4a900005d96b991fc0eb938875091047 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 20:01:09 +0300 Subject: mlxsw: spectrum_span: Add APIs to get / put an analyzed port An analyzed port is a port whose incoming / outgoing traffic is mirrored to a SPAN agent and analyzed on a remote server. A port can be analyzed by multiple tc filters and therefore the corresponding analyzed port entry needs to be reference counted. This is significant because ports whose outgoing traffic is analyzed need to have an egress mirror buffer. Add APIs to get / put an analyzed port. Allocate an egress mirror buffer on a port when it is first inspected at egress and free the buffer when it is no longer inspected at egress. Protect the list of analyzed ports with a mutex, as a later patch will traverse it from a context in which RTNL lock is not held. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_span.c | 136 +++++++++++++++++++++ .../net/ethernet/mellanox/mlxsw/spectrum_span.h | 4 + 2 files changed, 140 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index c4159f4a66e2..5edf9d1bf937 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -20,11 +21,20 @@ struct mlxsw_sp_span { struct work_struct work; struct mlxsw_sp *mlxsw_sp; + struct list_head analyzed_ports_list; + struct mutex analyzed_ports_lock; /* Protects analyzed_ports_list */ atomic_t active_entries_count; int entries_count; struct mlxsw_sp_span_entry entries[]; }; +struct mlxsw_sp_span_analyzed_port { + struct list_head list; /* Member of analyzed_ports_list */ + refcount_t ref_count; + u8 local_port; + bool ingress; +}; + static void mlxsw_sp_span_respin_work(struct work_struct *work); static u64 mlxsw_sp_span_occ_get(void *priv) @@ -49,6 +59,8 @@ int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp) return -ENOMEM; span->entries_count = entries_count; atomic_set(&span->active_entries_count, 0); + mutex_init(&span->analyzed_ports_lock); + INIT_LIST_HEAD(&span->analyzed_ports_list); span->mlxsw_sp = mlxsw_sp; mlxsw_sp->span = span; @@ -79,6 +91,8 @@ void mlxsw_sp_span_fini(struct mlxsw_sp *mlxsw_sp) WARN_ON_ONCE(!list_empty(&curr->bound_ports_list)); } + WARN_ON_ONCE(!list_empty(&mlxsw_sp->span->analyzed_ports_list)); + mutex_destroy(&mlxsw_sp->span->analyzed_ports_lock); kfree(mlxsw_sp->span); } @@ -1082,3 +1096,125 @@ void mlxsw_sp_span_agent_put(struct mlxsw_sp *mlxsw_sp, int span_id) mlxsw_sp_span_entry_put(mlxsw_sp, span_entry); } + +static struct mlxsw_sp_span_analyzed_port * +mlxsw_sp_span_analyzed_port_find(struct mlxsw_sp_span *span, u8 local_port, + bool ingress) +{ + struct mlxsw_sp_span_analyzed_port *analyzed_port; + + list_for_each_entry(analyzed_port, &span->analyzed_ports_list, list) { + if (analyzed_port->local_port == local_port && + analyzed_port->ingress == ingress) + return analyzed_port; + } + + return NULL; +} + +static struct mlxsw_sp_span_analyzed_port * +mlxsw_sp_span_analyzed_port_create(struct mlxsw_sp_span *span, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + struct mlxsw_sp_span_analyzed_port *analyzed_port; + int err; + + analyzed_port = kzalloc(sizeof(*analyzed_port), GFP_KERNEL); + if (!analyzed_port) + return ERR_PTR(-ENOMEM); + + refcount_set(&analyzed_port->ref_count, 1); + analyzed_port->local_port = mlxsw_sp_port->local_port; + analyzed_port->ingress = ingress; + list_add_tail(&analyzed_port->list, &span->analyzed_ports_list); + + /* An egress mirror buffer should be allocated on the egress port which + * does the mirroring. + */ + if (!ingress) { + u16 mtu = mlxsw_sp_port->dev->mtu; + + err = mlxsw_sp_span_port_buffsize_update(mlxsw_sp_port, mtu); + if (err) + goto err_buffsize_update; + } + + return analyzed_port; + +err_buffsize_update: + list_del(&analyzed_port->list); + kfree(analyzed_port); + return ERR_PTR(err); +} + +static void +mlxsw_sp_span_analyzed_port_destroy(struct mlxsw_sp_span *span, + struct mlxsw_sp_span_analyzed_port * + analyzed_port) +{ + struct mlxsw_sp *mlxsw_sp = span->mlxsw_sp; + char sbib_pl[MLXSW_REG_SBIB_LEN]; + + /* Remove egress mirror buffer now that port is no longer analyzed + * at egress. + */ + if (!analyzed_port->ingress) { + mlxsw_reg_sbib_pack(sbib_pl, analyzed_port->local_port, 0); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); + } + + list_del(&analyzed_port->list); + kfree(analyzed_port); +} + +int mlxsw_sp_span_analyzed_port_get(struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_span_analyzed_port *analyzed_port; + u8 local_port = mlxsw_sp_port->local_port; + int err = 0; + + mutex_lock(&mlxsw_sp->span->analyzed_ports_lock); + + analyzed_port = mlxsw_sp_span_analyzed_port_find(mlxsw_sp->span, + local_port, ingress); + if (analyzed_port) { + refcount_inc(&analyzed_port->ref_count); + goto out_unlock; + } + + analyzed_port = mlxsw_sp_span_analyzed_port_create(mlxsw_sp->span, + mlxsw_sp_port, + ingress); + if (IS_ERR(analyzed_port)) + err = PTR_ERR(analyzed_port); + +out_unlock: + mutex_unlock(&mlxsw_sp->span->analyzed_ports_lock); + return err; +} + +void mlxsw_sp_span_analyzed_port_put(struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_span_analyzed_port *analyzed_port; + u8 local_port = mlxsw_sp_port->local_port; + + mutex_lock(&mlxsw_sp->span->analyzed_ports_lock); + + analyzed_port = mlxsw_sp_span_analyzed_port_find(mlxsw_sp->span, + local_port, ingress); + if (WARN_ON_ONCE(!analyzed_port)) + goto out_unlock; + + if (!refcount_dec_and_test(&analyzed_port->ref_count)) + goto out_unlock; + + mlxsw_sp_span_analyzed_port_destroy(mlxsw_sp->span, analyzed_port); + +out_unlock: + mutex_unlock(&mlxsw_sp->span->analyzed_ports_lock); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h index b79de9a125bb..1345eda5cc34 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h @@ -80,5 +80,9 @@ void mlxsw_sp_span_speed_update_work(struct work_struct *work); int mlxsw_sp_span_agent_get(struct mlxsw_sp *mlxsw_sp, const struct net_device *to_dev, int *p_span_id); void mlxsw_sp_span_agent_put(struct mlxsw_sp *mlxsw_sp, int span_id); +int mlxsw_sp_span_analyzed_port_get(struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress); +void mlxsw_sp_span_analyzed_port_put(struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress); #endif -- cgit v1.2.3-59-g8ed1b From eb773c3a2d98e1e1cb9076419f14ebb0a2f40951 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 20:01:10 +0300 Subject: mlxsw: spectrum_span: Rename function Next patch will introduce mlxsw_sp_span_port_buffer_disable() function that disables the egress buffer on an analyzed port. Rename the opposite function that updates the buffer on an analyzed port accordingly. Signed-off-by: Ido Schimmel Suggested-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 5edf9d1bf937..c52f79a97f36 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -784,7 +784,7 @@ static bool mlxsw_sp_span_is_egress_mirror(struct mlxsw_sp_port *port) } static int -mlxsw_sp_span_port_buffsize_update(struct mlxsw_sp_port *mlxsw_sp_port, u16 mtu) +mlxsw_sp_span_port_buffer_update(struct mlxsw_sp_port *mlxsw_sp_port, u16 mtu) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; char sbib_pl[MLXSW_REG_SBIB_LEN]; @@ -809,7 +809,7 @@ int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu) * updated according to the mtu value */ if (mlxsw_sp_span_is_egress_mirror(port)) - return mlxsw_sp_span_port_buffsize_update(port, mtu); + return mlxsw_sp_span_port_buffer_update(port, mtu); return 0; } @@ -825,8 +825,8 @@ void mlxsw_sp_span_speed_update_work(struct work_struct *work) * updated according to the speed value. */ if (mlxsw_sp_span_is_egress_mirror(mlxsw_sp_port)) - mlxsw_sp_span_port_buffsize_update(mlxsw_sp_port, - mlxsw_sp_port->dev->mtu); + mlxsw_sp_span_port_buffer_update(mlxsw_sp_port, + mlxsw_sp_port->dev->mtu); } static struct mlxsw_sp_span_inspected_port * @@ -888,7 +888,7 @@ mlxsw_sp_span_inspected_port_add(struct mlxsw_sp_port *port, /* if it is an egress SPAN, bind a shared buffer to it */ if (type == MLXSW_SP_SPAN_EGRESS) { - err = mlxsw_sp_span_port_buffsize_update(port, port->dev->mtu); + err = mlxsw_sp_span_port_buffer_update(port, port->dev->mtu); if (err) return err; } @@ -1135,14 +1135,14 @@ mlxsw_sp_span_analyzed_port_create(struct mlxsw_sp_span *span, if (!ingress) { u16 mtu = mlxsw_sp_port->dev->mtu; - err = mlxsw_sp_span_port_buffsize_update(mlxsw_sp_port, mtu); + err = mlxsw_sp_span_port_buffer_update(mlxsw_sp_port, mtu); if (err) - goto err_buffsize_update; + goto err_buffer_update; } return analyzed_port; -err_buffsize_update: +err_buffer_update: list_del(&analyzed_port->list); kfree(analyzed_port); return ERR_PTR(err); -- cgit v1.2.3-59-g8ed1b From 14366da6b59203ef5fc3cf21660113b60c2f1421 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 20:01:11 +0300 Subject: mlxsw: spectrum_span: Wrap buffer change in a function The code that adjusts the egress buffer size is not symmetric at the moment. The update is done via a call to mlxsw_sp_span_port_buffer_update(), but the disablement is done inline by invoking the write to SBIB register directly. Wrap the disablement code in mlxsw_sp_span_port_buffer_disable(). Signed-off-by: Ido Schimmel Suggested-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index c52f79a97f36..2b9d8ce93b13 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -803,6 +803,15 @@ mlxsw_sp_span_port_buffer_update(struct mlxsw_sp_port *mlxsw_sp_port, u16 mtu) return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); } +static void mlxsw_sp_span_port_buffer_disable(struct mlxsw_sp *mlxsw_sp, + u8 local_port) +{ + char sbib_pl[MLXSW_REG_SBIB_LEN]; + + mlxsw_reg_sbib_pack(sbib_pl, local_port, 0); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); +} + int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu) { /* If port is egress mirrored, the shared buffer size should be @@ -1154,15 +1163,13 @@ mlxsw_sp_span_analyzed_port_destroy(struct mlxsw_sp_span *span, analyzed_port) { struct mlxsw_sp *mlxsw_sp = span->mlxsw_sp; - char sbib_pl[MLXSW_REG_SBIB_LEN]; /* Remove egress mirror buffer now that port is no longer analyzed * at egress. */ - if (!analyzed_port->ingress) { - mlxsw_reg_sbib_pack(sbib_pl, analyzed_port->local_port, 0); - mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); - } + if (!analyzed_port->ingress) + mlxsw_sp_span_port_buffer_disable(mlxsw_sp, + analyzed_port->local_port); list_del(&analyzed_port->list); kfree(analyzed_port); -- cgit v1.2.3-59-g8ed1b From c056618c53a771bacf4e077e8be01de4405439ae Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 20:01:12 +0300 Subject: mlxsw: spectrum_span: Add APIs to bind / unbind a SPAN agent Currently, a SPAN agent can only be bound to a per-port trigger where the trigger is either an incoming packet (INGRESS) or an outgoing packet (EGRESS) to / from the port. A follow-up patch set will introduce the concept of global triggers and per-{port, TC} enablement. With global triggers, the trigger entry is only keyed by a trigger and not by a port and a trigger. The trigger can be, for example, a packet that was early dropped. While the binding between the SPAN agent and the trigger is performed only once, the trigger entry needs to be reference counted, as the trigger can be enabled on multiple ports. Add APIs to bind / unbind a SPAN agent to a trigger and reference count the trigger entry in preparation for global triggers. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_span.c | 169 +++++++++++++++++++++ .../net/ethernet/mellanox/mlxsw/spectrum_span.h | 18 +++ 2 files changed, 187 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 2b9d8ce93b13..de9012ab94d5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -23,6 +23,7 @@ struct mlxsw_sp_span { struct mlxsw_sp *mlxsw_sp; struct list_head analyzed_ports_list; struct mutex analyzed_ports_lock; /* Protects analyzed_ports_list */ + struct list_head trigger_entries_list; atomic_t active_entries_count; int entries_count; struct mlxsw_sp_span_entry entries[]; @@ -35,6 +36,14 @@ struct mlxsw_sp_span_analyzed_port { bool ingress; }; +struct mlxsw_sp_span_trigger_entry { + struct list_head list; /* Member of trigger_entries_list */ + refcount_t ref_count; + u8 local_port; + enum mlxsw_sp_span_trigger trigger; + struct mlxsw_sp_span_trigger_parms parms; +}; + static void mlxsw_sp_span_respin_work(struct work_struct *work); static u64 mlxsw_sp_span_occ_get(void *priv) @@ -61,6 +70,7 @@ int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp) atomic_set(&span->active_entries_count, 0); mutex_init(&span->analyzed_ports_lock); INIT_LIST_HEAD(&span->analyzed_ports_list); + INIT_LIST_HEAD(&span->trigger_entries_list); span->mlxsw_sp = mlxsw_sp; mlxsw_sp->span = span; @@ -91,6 +101,7 @@ void mlxsw_sp_span_fini(struct mlxsw_sp *mlxsw_sp) WARN_ON_ONCE(!list_empty(&curr->bound_ports_list)); } + WARN_ON_ONCE(!list_empty(&mlxsw_sp->span->trigger_entries_list)); WARN_ON_ONCE(!list_empty(&mlxsw_sp->span->analyzed_ports_list)); mutex_destroy(&mlxsw_sp->span->analyzed_ports_lock); kfree(mlxsw_sp->span); @@ -1225,3 +1236,161 @@ void mlxsw_sp_span_analyzed_port_put(struct mlxsw_sp_port *mlxsw_sp_port, out_unlock: mutex_unlock(&mlxsw_sp->span->analyzed_ports_lock); } + +static int +__mlxsw_sp_span_trigger_entry_bind(struct mlxsw_sp_span *span, + struct mlxsw_sp_span_trigger_entry * + trigger_entry, bool enable) +{ + char mpar_pl[MLXSW_REG_MPAR_LEN]; + enum mlxsw_reg_mpar_i_e i_e; + + switch (trigger_entry->trigger) { + case MLXSW_SP_SPAN_TRIGGER_INGRESS: + i_e = MLXSW_REG_MPAR_TYPE_INGRESS; + break; + case MLXSW_SP_SPAN_TRIGGER_EGRESS: + i_e = MLXSW_REG_MPAR_TYPE_EGRESS; + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + mlxsw_reg_mpar_pack(mpar_pl, trigger_entry->local_port, i_e, enable, + trigger_entry->parms.span_id); + return mlxsw_reg_write(span->mlxsw_sp->core, MLXSW_REG(mpar), mpar_pl); +} + +static int +mlxsw_sp_span_trigger_entry_bind(struct mlxsw_sp_span *span, + struct mlxsw_sp_span_trigger_entry * + trigger_entry) +{ + return __mlxsw_sp_span_trigger_entry_bind(span, trigger_entry, true); +} + +static void +mlxsw_sp_span_trigger_entry_unbind(struct mlxsw_sp_span *span, + struct mlxsw_sp_span_trigger_entry * + trigger_entry) +{ + __mlxsw_sp_span_trigger_entry_bind(span, trigger_entry, false); +} + +static struct mlxsw_sp_span_trigger_entry * +mlxsw_sp_span_trigger_entry_create(struct mlxsw_sp_span *span, + enum mlxsw_sp_span_trigger trigger, + struct mlxsw_sp_port *mlxsw_sp_port, + const struct mlxsw_sp_span_trigger_parms + *parms) +{ + struct mlxsw_sp_span_trigger_entry *trigger_entry; + int err; + + trigger_entry = kzalloc(sizeof(*trigger_entry), GFP_KERNEL); + if (!trigger_entry) + return ERR_PTR(-ENOMEM); + + refcount_set(&trigger_entry->ref_count, 1); + trigger_entry->local_port = mlxsw_sp_port->local_port; + trigger_entry->trigger = trigger; + memcpy(&trigger_entry->parms, parms, sizeof(trigger_entry->parms)); + list_add_tail(&trigger_entry->list, &span->trigger_entries_list); + + err = mlxsw_sp_span_trigger_entry_bind(span, trigger_entry); + if (err) + goto err_trigger_entry_bind; + + return trigger_entry; + +err_trigger_entry_bind: + list_del(&trigger_entry->list); + kfree(trigger_entry); + return ERR_PTR(err); +} + +static void +mlxsw_sp_span_trigger_entry_destroy(struct mlxsw_sp_span *span, + struct mlxsw_sp_span_trigger_entry * + trigger_entry) +{ + mlxsw_sp_span_trigger_entry_unbind(span, trigger_entry); + list_del(&trigger_entry->list); + kfree(trigger_entry); +} + +static struct mlxsw_sp_span_trigger_entry * +mlxsw_sp_span_trigger_entry_find(struct mlxsw_sp_span *span, + enum mlxsw_sp_span_trigger trigger, + struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp_span_trigger_entry *trigger_entry; + + list_for_each_entry(trigger_entry, &span->trigger_entries_list, list) { + if (trigger_entry->trigger == trigger && + trigger_entry->local_port == mlxsw_sp_port->local_port) + return trigger_entry; + } + + return NULL; +} + +int mlxsw_sp_span_agent_bind(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_span_trigger trigger, + struct mlxsw_sp_port *mlxsw_sp_port, + const struct mlxsw_sp_span_trigger_parms *parms) +{ + struct mlxsw_sp_span_trigger_entry *trigger_entry; + int err = 0; + + ASSERT_RTNL(); + + if (!mlxsw_sp_span_entry_find_by_id(mlxsw_sp, parms->span_id)) + return -EINVAL; + + trigger_entry = mlxsw_sp_span_trigger_entry_find(mlxsw_sp->span, + trigger, + mlxsw_sp_port); + if (trigger_entry) { + if (trigger_entry->parms.span_id != parms->span_id) + return -EINVAL; + refcount_inc(&trigger_entry->ref_count); + goto out; + } + + trigger_entry = mlxsw_sp_span_trigger_entry_create(mlxsw_sp->span, + trigger, + mlxsw_sp_port, + parms); + if (IS_ERR(trigger_entry)) + err = PTR_ERR(trigger_entry); + +out: + return err; +} + +void mlxsw_sp_span_agent_unbind(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_span_trigger trigger, + struct mlxsw_sp_port *mlxsw_sp_port, + const struct mlxsw_sp_span_trigger_parms *parms) +{ + struct mlxsw_sp_span_trigger_entry *trigger_entry; + + ASSERT_RTNL(); + + if (WARN_ON_ONCE(!mlxsw_sp_span_entry_find_by_id(mlxsw_sp, + parms->span_id))) + return; + + trigger_entry = mlxsw_sp_span_trigger_entry_find(mlxsw_sp->span, + trigger, + mlxsw_sp_port); + if (WARN_ON_ONCE(!trigger_entry)) + return; + + if (!refcount_dec_and_test(&trigger_entry->ref_count)) + return; + + mlxsw_sp_span_trigger_entry_destroy(mlxsw_sp->span, trigger_entry); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h index 1345eda5cc34..6821eeb3906b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h @@ -37,6 +37,15 @@ struct mlxsw_sp_span_parms { u16 vid; }; +enum mlxsw_sp_span_trigger { + MLXSW_SP_SPAN_TRIGGER_INGRESS, + MLXSW_SP_SPAN_TRIGGER_EGRESS, +}; + +struct mlxsw_sp_span_trigger_parms { + int span_id; +}; + struct mlxsw_sp_span_entry_ops; struct mlxsw_sp_span_entry { @@ -84,5 +93,14 @@ int mlxsw_sp_span_analyzed_port_get(struct mlxsw_sp_port *mlxsw_sp_port, bool ingress); void mlxsw_sp_span_analyzed_port_put(struct mlxsw_sp_port *mlxsw_sp_port, bool ingress); +int mlxsw_sp_span_agent_bind(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_span_trigger trigger, + struct mlxsw_sp_port *mlxsw_sp_port, + const struct mlxsw_sp_span_trigger_parms *parms); +void +mlxsw_sp_span_agent_unbind(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_span_trigger trigger, + struct mlxsw_sp_port *mlxsw_sp_port, + const struct mlxsw_sp_span_trigger_parms *parms); #endif -- cgit v1.2.3-59-g8ed1b From c1d7845dfbd3dfeed2cd7bf92ec10ea97ef5d7fd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 20:01:13 +0300 Subject: mlxsw: spectrum: Convert matchall-based mirroring to new SPAN API In matchall-based mirroring, mirroring is not done with ACLs, but a SPAN agent is bound to the ingress / egress of a port and all incoming / outgoing traffic is mirrored. Convert this type of mirroring to use the new API. First the SPAN agent is resolved, then the port is marked as analyzed and its egress mirror buffer is potentially allocated. Lastly, the binding is performed. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_matchall.c | 52 ++++++++++++++++------ 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c index 889da63072be..da1c05f44cec 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c @@ -48,31 +48,57 @@ static int mlxsw_sp_mall_port_mirror_add(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_mall_entry *mall_entry) { - enum mlxsw_sp_span_type span_type; + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_span_trigger_parms parms; + enum mlxsw_sp_span_trigger trigger; + int err; if (!mall_entry->mirror.to_dev) { netdev_err(mlxsw_sp_port->dev, "Could not find requested device\n"); return -EINVAL; } - span_type = mall_entry->ingress ? MLXSW_SP_SPAN_INGRESS : - MLXSW_SP_SPAN_EGRESS; - return mlxsw_sp_span_mirror_add(mlxsw_sp_port, - mall_entry->mirror.to_dev, - span_type, true, - &mall_entry->mirror.span_id); + err = mlxsw_sp_span_agent_get(mlxsw_sp, mall_entry->mirror.to_dev, + &mall_entry->mirror.span_id); + if (err) + return err; + + err = mlxsw_sp_span_analyzed_port_get(mlxsw_sp_port, + mall_entry->ingress); + if (err) + goto err_analyzed_port_get; + + trigger = mall_entry->ingress ? MLXSW_SP_SPAN_TRIGGER_INGRESS : + MLXSW_SP_SPAN_TRIGGER_EGRESS; + parms.span_id = mall_entry->mirror.span_id; + err = mlxsw_sp_span_agent_bind(mlxsw_sp, trigger, mlxsw_sp_port, + &parms); + if (err) + goto err_agent_bind; + + return 0; + +err_agent_bind: + mlxsw_sp_span_analyzed_port_put(mlxsw_sp_port, mall_entry->ingress); +err_analyzed_port_get: + mlxsw_sp_span_agent_put(mlxsw_sp, mall_entry->mirror.span_id); + return err; } static void mlxsw_sp_mall_port_mirror_del(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_mall_entry *mall_entry) { - enum mlxsw_sp_span_type span_type; - - span_type = mall_entry->ingress ? MLXSW_SP_SPAN_INGRESS : - MLXSW_SP_SPAN_EGRESS; - mlxsw_sp_span_mirror_del(mlxsw_sp_port, mall_entry->mirror.span_id, - span_type, true); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_span_trigger_parms parms; + enum mlxsw_sp_span_trigger trigger; + + trigger = mall_entry->ingress ? MLXSW_SP_SPAN_TRIGGER_INGRESS : + MLXSW_SP_SPAN_TRIGGER_EGRESS; + parms.span_id = mall_entry->mirror.span_id; + mlxsw_sp_span_agent_unbind(mlxsw_sp, trigger, mlxsw_sp_port, &parms); + mlxsw_sp_span_analyzed_port_put(mlxsw_sp_port, mall_entry->ingress); + mlxsw_sp_span_agent_put(mlxsw_sp, mall_entry->mirror.span_id); } static int mlxsw_sp_mall_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port, -- cgit v1.2.3-59-g8ed1b From 7240db69c332f6625da044b048d06839ab0c2649 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 20:01:14 +0300 Subject: mlxsw: spectrum_acl: Convert flower-based mirroring to new SPAN API In flower-based mirroring, mirroring is done with ACLs and the SPAN agent is not bound to a port. Instead its identifier is specified in an ACL action. Convert this type of mirroring to use the new API. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../mellanox/mlxsw/spectrum_acl_flex_actions.c | 31 +++++++++++++--------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c index e47d1d286e93..73d56012654b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c @@ -136,28 +136,35 @@ mlxsw_sp_act_mirror_add(void *priv, u8 local_in_port, const struct net_device *out_dev, bool ingress, int *p_span_id) { - struct mlxsw_sp_port *in_port; + struct mlxsw_sp_port *mlxsw_sp_port; struct mlxsw_sp *mlxsw_sp = priv; - enum mlxsw_sp_span_type type; + int err; - type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; - in_port = mlxsw_sp->ports[local_in_port]; + err = mlxsw_sp_span_agent_get(mlxsw_sp, out_dev, p_span_id); + if (err) + return err; + + mlxsw_sp_port = mlxsw_sp->ports[local_in_port]; + err = mlxsw_sp_span_analyzed_port_get(mlxsw_sp_port, ingress); + if (err) + goto err_analyzed_port_get; - return mlxsw_sp_span_mirror_add(in_port, out_dev, type, - false, p_span_id); + return 0; + +err_analyzed_port_get: + mlxsw_sp_span_agent_put(mlxsw_sp, *p_span_id); + return err; } static void mlxsw_sp_act_mirror_del(void *priv, u8 local_in_port, int span_id, bool ingress) { + struct mlxsw_sp_port *mlxsw_sp_port; struct mlxsw_sp *mlxsw_sp = priv; - struct mlxsw_sp_port *in_port; - enum mlxsw_sp_span_type type; - - type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; - in_port = mlxsw_sp->ports[local_in_port]; - mlxsw_sp_span_mirror_del(in_port, span_id, type, false); + mlxsw_sp_port = mlxsw_sp->ports[local_in_port]; + mlxsw_sp_span_analyzed_port_put(mlxsw_sp_port, ingress); + mlxsw_sp_span_agent_put(mlxsw_sp, span_id); } const struct mlxsw_afa_ops mlxsw_sp1_act_afa_ops = { -- cgit v1.2.3-59-g8ed1b From 835d6b8c1a35eafba3faaf4b809f6d84517467f1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 20:01:15 +0300 Subject: mlxsw: spectrum_span: Use new analyzed ports list during speed / MTU change As previously explained, each port whose outgoing traffic is analyzed needs to have an egress mirror buffer. The size of the egress mirror buffer is calculated based on various parameters, two of which are the speed and the MTU of the port. Therefore, when the MTU or the speed of a port change, the SPAN code is called to see if the egress mirror buffer of the port needs to be adjusted. Currently, this is done by traversing all the SPAN agents and for each SPAN agent the list of bound ports is traversed. Instead of the above, traverse the recently added list of analyzed ports. This will later allow us to remove the old SPAN API. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_span.c | 72 +++++++++++----------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index de9012ab94d5..9cb8b509b849 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -776,24 +776,6 @@ static int mlxsw_sp_span_entry_put(struct mlxsw_sp *mlxsw_sp, return 0; } -static bool mlxsw_sp_span_is_egress_mirror(struct mlxsw_sp_port *port) -{ - struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; - struct mlxsw_sp_span_inspected_port *p; - int i; - - for (i = 0; i < mlxsw_sp->span->entries_count; i++) { - struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span->entries[i]; - - list_for_each_entry(p, &curr->bound_ports_list, list) - if (p->local_port == port->local_port && - p->type == MLXSW_SP_SPAN_EGRESS) - return true; - } - - return false; -} - static int mlxsw_sp_span_port_buffer_update(struct mlxsw_sp_port *mlxsw_sp_port, u16 mtu) { @@ -823,20 +805,45 @@ static void mlxsw_sp_span_port_buffer_disable(struct mlxsw_sp *mlxsw_sp, mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); } +static struct mlxsw_sp_span_analyzed_port * +mlxsw_sp_span_analyzed_port_find(struct mlxsw_sp_span *span, u8 local_port, + bool ingress) +{ + struct mlxsw_sp_span_analyzed_port *analyzed_port; + + list_for_each_entry(analyzed_port, &span->analyzed_ports_list, list) { + if (analyzed_port->local_port == local_port && + analyzed_port->ingress == ingress) + return analyzed_port; + } + + return NULL; +} + int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu) { + struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; + int err = 0; + /* If port is egress mirrored, the shared buffer size should be * updated according to the mtu value */ - if (mlxsw_sp_span_is_egress_mirror(port)) - return mlxsw_sp_span_port_buffer_update(port, mtu); - return 0; + mutex_lock(&mlxsw_sp->span->analyzed_ports_lock); + + if (mlxsw_sp_span_analyzed_port_find(mlxsw_sp->span, port->local_port, + false)) + err = mlxsw_sp_span_port_buffer_update(port, mtu); + + mutex_unlock(&mlxsw_sp->span->analyzed_ports_lock); + + return err; } void mlxsw_sp_span_speed_update_work(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct mlxsw_sp_port *mlxsw_sp_port; + struct mlxsw_sp *mlxsw_sp; mlxsw_sp_port = container_of(dwork, struct mlxsw_sp_port, span.speed_update_dw); @@ -844,9 +851,15 @@ void mlxsw_sp_span_speed_update_work(struct work_struct *work) /* If port is egress mirrored, the shared buffer size should be * updated according to the speed value. */ - if (mlxsw_sp_span_is_egress_mirror(mlxsw_sp_port)) + mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + mutex_lock(&mlxsw_sp->span->analyzed_ports_lock); + + if (mlxsw_sp_span_analyzed_port_find(mlxsw_sp->span, + mlxsw_sp_port->local_port, false)) mlxsw_sp_span_port_buffer_update(mlxsw_sp_port, mlxsw_sp_port->dev->mtu); + + mutex_unlock(&mlxsw_sp->span->analyzed_ports_lock); } static struct mlxsw_sp_span_inspected_port * @@ -1117,21 +1130,6 @@ void mlxsw_sp_span_agent_put(struct mlxsw_sp *mlxsw_sp, int span_id) mlxsw_sp_span_entry_put(mlxsw_sp, span_entry); } -static struct mlxsw_sp_span_analyzed_port * -mlxsw_sp_span_analyzed_port_find(struct mlxsw_sp_span *span, u8 local_port, - bool ingress) -{ - struct mlxsw_sp_span_analyzed_port *analyzed_port; - - list_for_each_entry(analyzed_port, &span->analyzed_ports_list, list) { - if (analyzed_port->local_port == local_port && - analyzed_port->ingress == ingress) - return analyzed_port; - } - - return NULL; -} - static struct mlxsw_sp_span_analyzed_port * mlxsw_sp_span_analyzed_port_create(struct mlxsw_sp_span *span, struct mlxsw_sp_port *mlxsw_sp_port, -- cgit v1.2.3-59-g8ed1b From ca0892235ae68b097f42cd39e03b17cfead7c7c7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 20:01:16 +0300 Subject: mlxsw: spectrum_span: Remove old SPAN API Remove the old SPAN API now that matchall-based and flower-based mirroring were converted to use the new API. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_span.c | 190 +-------------------- .../net/ethernet/mellanox/mlxsw/spectrum_span.h | 21 --- 2 files changed, 2 insertions(+), 209 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 9cb8b509b849..304eb8c3d8bd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -74,12 +74,8 @@ int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp) span->mlxsw_sp = mlxsw_sp; mlxsw_sp->span = span; - for (i = 0; i < mlxsw_sp->span->entries_count; i++) { - struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span->entries[i]; - - INIT_LIST_HEAD(&curr->bound_ports_list); - curr->id = i; - } + for (i = 0; i < mlxsw_sp->span->entries_count; i++) + mlxsw_sp->span->entries[i].id = i; devlink_resource_occ_get_register(devlink, MLXSW_SP_RESOURCE_SPAN, mlxsw_sp_span_occ_get, mlxsw_sp); @@ -91,16 +87,10 @@ int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp) void mlxsw_sp_span_fini(struct mlxsw_sp *mlxsw_sp) { struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); - int i; cancel_work_sync(&mlxsw_sp->span->work); devlink_resource_occ_get_unregister(devlink, MLXSW_SP_RESOURCE_SPAN); - for (i = 0; i < mlxsw_sp->span->entries_count; i++) { - struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span->entries[i]; - - WARN_ON_ONCE(!list_empty(&curr->bound_ports_list)); - } WARN_ON_ONCE(!list_empty(&mlxsw_sp->span->trigger_entries_list)); WARN_ON_ONCE(!list_empty(&mlxsw_sp->span->analyzed_ports_list)); mutex_destroy(&mlxsw_sp->span->analyzed_ports_lock); @@ -862,131 +852,6 @@ void mlxsw_sp_span_speed_update_work(struct work_struct *work) mutex_unlock(&mlxsw_sp->span->analyzed_ports_lock); } -static struct mlxsw_sp_span_inspected_port * -mlxsw_sp_span_entry_bound_port_find(struct mlxsw_sp_span_entry *span_entry, - enum mlxsw_sp_span_type type, - struct mlxsw_sp_port *port, - bool bind) -{ - struct mlxsw_sp_span_inspected_port *p; - - list_for_each_entry(p, &span_entry->bound_ports_list, list) - if (type == p->type && - port->local_port == p->local_port && - bind == p->bound) - return p; - return NULL; -} - -static int -mlxsw_sp_span_inspected_port_bind(struct mlxsw_sp_port *port, - struct mlxsw_sp_span_entry *span_entry, - enum mlxsw_sp_span_type type, - bool bind) -{ - struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; - char mpar_pl[MLXSW_REG_MPAR_LEN]; - int pa_id = span_entry->id; - - /* bind the port to the SPAN entry */ - mlxsw_reg_mpar_pack(mpar_pl, port->local_port, - (enum mlxsw_reg_mpar_i_e)type, bind, pa_id); - return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpar), mpar_pl); -} - -static int -mlxsw_sp_span_inspected_port_add(struct mlxsw_sp_port *port, - struct mlxsw_sp_span_entry *span_entry, - enum mlxsw_sp_span_type type, - bool bind) -{ - struct mlxsw_sp_span_inspected_port *inspected_port; - struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; - char sbib_pl[MLXSW_REG_SBIB_LEN]; - int i; - int err; - - /* A given (source port, direction) can only be bound to one analyzer, - * so if a binding is requested, check for conflicts. - */ - if (bind) - for (i = 0; i < mlxsw_sp->span->entries_count; i++) { - struct mlxsw_sp_span_entry *curr = - &mlxsw_sp->span->entries[i]; - - if (mlxsw_sp_span_entry_bound_port_find(curr, type, - port, bind)) - return -EEXIST; - } - - /* if it is an egress SPAN, bind a shared buffer to it */ - if (type == MLXSW_SP_SPAN_EGRESS) { - err = mlxsw_sp_span_port_buffer_update(port, port->dev->mtu); - if (err) - return err; - } - - if (bind) { - err = mlxsw_sp_span_inspected_port_bind(port, span_entry, type, - true); - if (err) - goto err_port_bind; - } - - inspected_port = kzalloc(sizeof(*inspected_port), GFP_KERNEL); - if (!inspected_port) { - err = -ENOMEM; - goto err_inspected_port_alloc; - } - inspected_port->local_port = port->local_port; - inspected_port->type = type; - inspected_port->bound = bind; - list_add_tail(&inspected_port->list, &span_entry->bound_ports_list); - - return 0; - -err_inspected_port_alloc: - if (bind) - mlxsw_sp_span_inspected_port_bind(port, span_entry, type, - false); -err_port_bind: - if (type == MLXSW_SP_SPAN_EGRESS) { - mlxsw_reg_sbib_pack(sbib_pl, port->local_port, 0); - mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); - } - return err; -} - -static void -mlxsw_sp_span_inspected_port_del(struct mlxsw_sp_port *port, - struct mlxsw_sp_span_entry *span_entry, - enum mlxsw_sp_span_type type, - bool bind) -{ - struct mlxsw_sp_span_inspected_port *inspected_port; - struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; - char sbib_pl[MLXSW_REG_SBIB_LEN]; - - inspected_port = mlxsw_sp_span_entry_bound_port_find(span_entry, type, - port, bind); - if (!inspected_port) - return; - - if (bind) - mlxsw_sp_span_inspected_port_bind(port, span_entry, type, - false); - /* remove the SBIB buffer if it was egress SPAN */ - if (type == MLXSW_SP_SPAN_EGRESS) { - mlxsw_reg_sbib_pack(sbib_pl, port->local_port, 0); - mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); - } - - mlxsw_sp_span_entry_put(mlxsw_sp, span_entry); - - list_del(&inspected_port->list); - kfree(inspected_port); -} - static const struct mlxsw_sp_span_entry_ops * mlxsw_sp_span_entry_ops(struct mlxsw_sp *mlxsw_sp, const struct net_device *to_dev) @@ -1000,57 +865,6 @@ mlxsw_sp_span_entry_ops(struct mlxsw_sp *mlxsw_sp, return NULL; } -int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from, - const struct net_device *to_dev, - enum mlxsw_sp_span_type type, bool bind, - int *p_span_id) -{ - struct mlxsw_sp *mlxsw_sp = from->mlxsw_sp; - const struct mlxsw_sp_span_entry_ops *ops; - struct mlxsw_sp_span_parms sparms = {NULL}; - struct mlxsw_sp_span_entry *span_entry; - int err; - - ops = mlxsw_sp_span_entry_ops(mlxsw_sp, to_dev); - if (!ops) { - netdev_err(to_dev, "Cannot mirror to %s", to_dev->name); - return -EOPNOTSUPP; - } - - err = ops->parms_set(to_dev, &sparms); - if (err) - return err; - - span_entry = mlxsw_sp_span_entry_get(mlxsw_sp, to_dev, ops, sparms); - if (!span_entry) - return -ENOBUFS; - - err = mlxsw_sp_span_inspected_port_add(from, span_entry, type, bind); - if (err) - goto err_port_bind; - - *p_span_id = span_entry->id; - return 0; - -err_port_bind: - mlxsw_sp_span_entry_put(mlxsw_sp, span_entry); - return err; -} - -void mlxsw_sp_span_mirror_del(struct mlxsw_sp_port *from, int span_id, - enum mlxsw_sp_span_type type, bool bind) -{ - struct mlxsw_sp_span_entry *span_entry; - - span_entry = mlxsw_sp_span_entry_find_by_id(from->mlxsw_sp, span_id); - if (!span_entry) { - netdev_err(from->dev, "no span entry found\n"); - return; - } - - mlxsw_sp_span_inspected_port_del(from, span_entry, type, bind); -} - static void mlxsw_sp_span_respin_work(struct work_struct *work) { struct mlxsw_sp_span *span; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h index 6821eeb3906b..9f6dd2d0f4e6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h @@ -13,20 +13,6 @@ struct mlxsw_sp; struct mlxsw_sp_port; -enum mlxsw_sp_span_type { - MLXSW_SP_SPAN_EGRESS, - MLXSW_SP_SPAN_INGRESS -}; - -struct mlxsw_sp_span_inspected_port { - struct list_head list; - enum mlxsw_sp_span_type type; - u8 local_port; - - /* Whether this is a directly bound mirror (port-to-port) or an ACL. */ - bool bound; -}; - struct mlxsw_sp_span_parms { struct mlxsw_sp_port *dest_port; /* NULL for unoffloaded SPAN. */ unsigned int ttl; @@ -52,7 +38,6 @@ struct mlxsw_sp_span_entry { const struct net_device *to_dev; const struct mlxsw_sp_span_entry_ops *ops; struct mlxsw_sp_span_parms parms; - struct list_head bound_ports_list; refcount_t ref_count; int id; }; @@ -70,12 +55,6 @@ int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp); void mlxsw_sp_span_fini(struct mlxsw_sp *mlxsw_sp); void mlxsw_sp_span_respin(struct mlxsw_sp *mlxsw_sp); -int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from, - const struct net_device *to_dev, - enum mlxsw_sp_span_type type, - bool bind, int *p_span_id); -void mlxsw_sp_span_mirror_del(struct mlxsw_sp_port *from, int span_id, - enum mlxsw_sp_span_type type, bool bind); struct mlxsw_sp_span_entry * mlxsw_sp_span_entry_find_by_port(struct mlxsw_sp *mlxsw_sp, const struct net_device *to_dev); -- cgit v1.2.3-59-g8ed1b From 2b195850128f5bafde177b12489d9fa27962cc1e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:41 -0700 Subject: tcp: add tp->dup_ack_counter In commit 86de5921a3d5 ("tcp: defer SACK compression after DupThresh") I added a TCP_FASTRETRANS_THRESH bias to tp->compressed_ack in order to enable sack compression only after 3 dupacks. Since we plan to relax this rule for flows that involve stacks not requiring this old rule, this patch adds a distinct tp->dup_ack_counter. This means the TCP_FASTRETRANS_THRESH value is now used in a single location that a future patch can adjust: if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { tp->dup_ack_counter++; goto send_now; } This patch also introduces tcp_sack_compress_send_ack() helper to ease following patch comprehension. This patch refines LINUX_MIB_TCPACKCOMPRESSED to not count the acks that we had to send if the timer expires or tcp_sack_compress_send_ack() is sending an ack. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++--------- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 8 +++++++- 4 files changed, 38 insertions(+), 13 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 421c99c12291..2c6f87e9f0cf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,7 @@ struct tcp_sock { } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; + u8 dup_ack_counter; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf4ced9273e8..da777df0a0ba 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4327,6 +4327,27 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } +static void tcp_sack_compress_send_ack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->compressed_ack) + return; + + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) + __sock_put(sk); + + /* Since we have to send one ack finally, + * substract one from tp->compressed_ack to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. + */ + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - 1); + + tp->compressed_ack = 0; + tcp_send_ack(sk); +} + static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4355,8 +4376,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - tcp_send_ack(sk); + tcp_sack_compress_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -5275,15 +5295,13 @@ send_now: if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { tp->compressed_ack_rcv_nxt = tp->rcv_nxt; - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = 0; + tp->dup_ack_counter = 0; } - - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { + tp->dup_ack_counter++; goto send_now; - + } + tp->compressed_ack++; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ba4482130f08..c414aeb1efa9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -184,10 +184,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { + if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = TCP_FASTRETRANS_THRESH; + tp->compressed_ack); + tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c3f26dcd6704..ada046f425d2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -753,8 +753,14 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) + if (tp->compressed_ack) { + /* Since we have to send one ack finally, + * substract one from tp->compressed_ack to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. + */ + tp->compressed_ack--; tcp_send_ack(sk); + } } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) -- cgit v1.2.3-59-g8ed1b From ccd0628fca440268711560a1dbacc727b4f9e214 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:42 -0700 Subject: tcp: tcp_sack_new_ofo_skb() should be more conservative Currently, tcp_sack_new_ofo_skb() sends an ack if prior acks were 'compressed', if room has to be made in tp->selective_acks[] But there is no guarantee all four sack ranges can be included in SACK option. As a matter of fact, when TCP timestamps option is used, only three SACK ranges can be included. Lets assume only two ranges can be included, and force the ack: - When we touch more than 2 ranges in the reordering done if tcp_sack_extend() could be done. - If we have at least 2 ranges when adding a new one. This enforces that before a range is in third or fourth position, at least one ACK packet included it in first/second position. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da777df0a0ba..ef921ecba415 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4348,6 +4348,12 @@ static void tcp_sack_compress_send_ack(struct sock *sk) tcp_send_ack(sk); } +/* Reasonable amount of sack blocks included in TCP SACK option + * The max is 4, but this becomes 3 if TCP timestamps are there. + * Given that SACK packets might be lost, be conservative and use 2. + */ +#define TCP_SACK_BLOCKS_EXPECTED 2 + static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4360,6 +4366,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { if (tcp_sack_extend(sp, seq, end_seq)) { + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); /* Rotate this_sack to the first one. */ for (; this_sack > 0; this_sack--, sp--) swap(*sp, *(sp - 1)); @@ -4369,6 +4377,9 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) } } + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); + /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. @@ -4376,7 +4387,6 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - tcp_sack_compress_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; -- cgit v1.2.3-59-g8ed1b From a70437cc09a11771870e9f6bfc0ba1237161daa8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:43 -0700 Subject: tcp: add hrtimer slack to sack compression Add a sysctl to control hrtimer slack, default of 100 usec. This gives the opportunity to reduce system overhead, and help very short RTT flows. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 8 ++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_input.c | 5 +++-- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 3266aee9e052..50b440d29a13 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -651,6 +651,14 @@ tcp_comp_sack_delay_ns - LONG INTEGER Default : 1,000,000 ns (1 ms) +tcp_comp_sack_slack_ns - LONG INTEGER + This sysctl control the slack used when arming the + timer used by SACK compression. This gives extra time + for small RTT flows, and reduces system overhead by allowing + opportunistic reduction of timer interrupts. + + Default : 100,000 ns (100 us) + tcp_comp_sack_nr - INTEGER Max number of SACK that can be compressed. Using 0 disables SACK compression. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5acdb4d414c4..9e36738c1fe1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -173,6 +173,7 @@ struct netns_ipv4 { int sysctl_tcp_rmem[3]; int sysctl_tcp_comp_sack_nr; unsigned long sysctl_tcp_comp_sack_delay_ns; + unsigned long sysctl_tcp_comp_sack_slack_ns; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 95ad71e76cc3..3a628423d27b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1329,6 +1329,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "tcp_comp_sack_slack_ns", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ef921ecba415..d68128a672ab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5324,8 +5324,9 @@ send_now: delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); - hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_SOFT); + hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), + sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns, + HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 83a5d24e13b8..6c05f1ceb538 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2780,6 +2780,7 @@ static int __net_init tcp_sk_init(struct net *net) sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; + net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3-59-g8ed1b From 34a9c361dd480041d790fff3d6ea58513c8769e8 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 30 Apr 2020 17:37:02 +0000 Subject: hsr: remove hsr interface if all slaves are removed When all hsr slave interfaces are removed, hsr interface doesn't work. At that moment, it's fine to remove an unused hsr interface automatically for saving resources. That's a common behavior of virtual interfaces. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_main.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 26d6c39f24e1..e2564de67603 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -15,12 +15,23 @@ #include "hsr_framereg.h" #include "hsr_slave.h" +static bool hsr_slave_empty(struct hsr_priv *hsr) +{ + struct hsr_port *port; + + hsr_for_each_port(hsr, port) + if (port->type != HSR_PT_MASTER) + return false; + return true; +} + static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, void *ptr) { - struct net_device *dev; struct hsr_port *port, *master; + struct net_device *dev; struct hsr_priv *hsr; + LIST_HEAD(list_kill); int mtu_max; int res; @@ -85,8 +96,15 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, master->dev->mtu = mtu_max; break; case NETDEV_UNREGISTER: - if (!is_hsr_master(dev)) + if (!is_hsr_master(dev)) { + master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); hsr_del_port(port); + if (hsr_slave_empty(master->hsr)) { + unregister_netdevice_queue(master->dev, + &list_kill); + unregister_netdevice_many(&list_kill); + } + } break; case NETDEV_PRE_TYPE_CHANGE: /* HSR works only on Ethernet devices. Refuse slave to change -- cgit v1.2.3-59-g8ed1b From ef2c0a78aee10113d1299eb81e642470308e32ca Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Apr 2020 21:55:36 +0200 Subject: r8169: don't pass net_device to irq coalescing sub-functions The net_device argument is just used to get a struct rtl8169_private pointer via netdev_priv(). Therefore pass the struct rtl8169_private pointer directly. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 68d5255568a5..1c3974ad88eb 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1805,9 +1805,9 @@ static const struct rtl_coalesce_info rtl_coalesce_info_8168_8136[] = { #undef rxtx_x1822 /* get rx/tx scale vector corresponding to current speed */ -static const struct rtl_coalesce_info *rtl_coalesce_info(struct net_device *dev) +static const struct rtl_coalesce_info * +rtl_coalesce_info(struct rtl8169_private *tp) { - struct rtl8169_private *tp = netdev_priv(dev); const struct rtl_coalesce_info *ci; if (tp->mac_version <= RTL_GIGA_MAC_VER_06) @@ -1844,7 +1844,7 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) memset(ec, 0, sizeof(*ec)); /* get rx/tx scale corresponding to current speed and CPlusCmd[0:1] */ - ci = rtl_coalesce_info(dev); + ci = rtl_coalesce_info(tp); if (IS_ERR(ci)) return PTR_ERR(ci); @@ -1874,12 +1874,12 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) /* choose appropriate scale factor and CPlusCmd[0:1] for (speed, nsec) */ static const struct rtl_coalesce_scale *rtl_coalesce_choose_scale( - struct net_device *dev, u32 nsec, u16 *cp01) + struct rtl8169_private *tp, u32 nsec, u16 *cp01) { const struct rtl_coalesce_info *ci; u16 i; - ci = rtl_coalesce_info(dev); + ci = rtl_coalesce_info(tp); if (IS_ERR(ci)) return ERR_CAST(ci); @@ -1912,7 +1912,7 @@ static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) if (rtl_is_8125(tp)) return -EOPNOTSUPP; - scale = rtl_coalesce_choose_scale(dev, + scale = rtl_coalesce_choose_scale(tp, max(p[0].usecs, p[1].usecs) * 1000, &cp01); if (IS_ERR(scale)) return PTR_ERR(scale); -- cgit v1.2.3-59-g8ed1b From 2815b30535a0613ee07d477d0c628100f40b6059 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Apr 2020 21:56:20 +0200 Subject: r8169: merge scale for tx and rx irq coalescing Rx and tx scale are the same always. Simplify the code by using one scale for rx and tx only. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 65 ++++++++++++------------------- 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 1c3974ad88eb..9932b6ffae02 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1768,41 +1768,29 @@ static void rtl8169_get_strings(struct net_device *dev, u32 stringset, u8 *data) * 1 1 160us 81.92us 1.31ms */ -/* rx/tx scale factors for one particular CPlusCmd[0:1] value */ -struct rtl_coalesce_scale { - /* Rx / Tx */ - u32 nsecs[2]; -}; - /* rx/tx scale factors for all CPlusCmd[0:1] cases */ struct rtl_coalesce_info { u32 speed; - struct rtl_coalesce_scale scalev[4]; /* each CPlusCmd[0:1] case */ + u32 scale_nsecs[4]; }; -/* produce (r,t) pairs with each being in series of *1, *8, *8*2, *8*2*2 */ -#define rxtx_x1822(r, t) { \ - {{(r), (t)}}, \ - {{(r)*8, (t)*8}}, \ - {{(r)*8*2, (t)*8*2}}, \ - {{(r)*8*2*2, (t)*8*2*2}}, \ -} +/* produce array with base delay *1, *8, *8*2, *8*2*2 */ +#define COALESCE_DELAY(d) { (d), 8 * (d), 16 * (d), 32 * (d) } + static const struct rtl_coalesce_info rtl_coalesce_info_8169[] = { - /* speed delays: rx00 tx00 */ - { SPEED_10, rxtx_x1822(40960, 40960) }, - { SPEED_100, rxtx_x1822( 2560, 2560) }, - { SPEED_1000, rxtx_x1822( 320, 320) }, + { SPEED_10, COALESCE_DELAY(40960) }, + { SPEED_100, COALESCE_DELAY(2560) }, + { SPEED_1000, COALESCE_DELAY(320) }, { 0 }, }; static const struct rtl_coalesce_info rtl_coalesce_info_8168_8136[] = { - /* speed delays: rx00 tx00 */ - { SPEED_10, rxtx_x1822(40960, 40960) }, - { SPEED_100, rxtx_x1822( 2560, 2560) }, - { SPEED_1000, rxtx_x1822( 5000, 5000) }, + { SPEED_10, COALESCE_DELAY(40960) }, + { SPEED_100, COALESCE_DELAY(2560) }, + { SPEED_1000, COALESCE_DELAY(5000) }, { 0 }, }; -#undef rxtx_x1822 +#undef COALESCE_DELAY /* get rx/tx scale vector corresponding to current speed */ static const struct rtl_coalesce_info * @@ -1827,7 +1815,6 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) { struct rtl8169_private *tp = netdev_priv(dev); const struct rtl_coalesce_info *ci; - const struct rtl_coalesce_scale *scale; struct { u32 *max_frames; u32 *usecs; @@ -1835,6 +1822,7 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) { &ec->rx_max_coalesced_frames, &ec->rx_coalesce_usecs }, { &ec->tx_max_coalesced_frames, &ec->tx_coalesce_usecs } }, *p = coal_settings; + u32 scale; int i; u16 w; @@ -1848,7 +1836,7 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) if (IS_ERR(ci)) return PTR_ERR(ci); - scale = &ci->scalev[tp->cp_cmd & INTT_MASK]; + scale = ci->scale_nsecs[tp->cp_cmd & INTT_MASK]; /* read IntrMitigate and adjust according to scale */ for (w = RTL_R16(tp, IntrMitigate); w; w >>= RTL_COALESCE_SHIFT, p++) { @@ -1859,7 +1847,7 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) for (i = 0; i < 2; i++) { p = coal_settings + i; - *p->usecs = (*p->usecs * scale->nsecs[i]) / 1000; + *p->usecs = (*p->usecs * scale) / 1000; /* * ethtool_coalesce says it is illegal to set both usecs and @@ -1873,32 +1861,29 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) } /* choose appropriate scale factor and CPlusCmd[0:1] for (speed, nsec) */ -static const struct rtl_coalesce_scale *rtl_coalesce_choose_scale( - struct rtl8169_private *tp, u32 nsec, u16 *cp01) +static int rtl_coalesce_choose_scale(struct rtl8169_private *tp, u32 nsec, + u16 *cp01) { const struct rtl_coalesce_info *ci; u16 i; ci = rtl_coalesce_info(tp); if (IS_ERR(ci)) - return ERR_CAST(ci); + return PTR_ERR(ci); for (i = 0; i < 4; i++) { - u32 rxtx_maxscale = max(ci->scalev[i].nsecs[0], - ci->scalev[i].nsecs[1]); - if (nsec <= rxtx_maxscale * RTL_COALESCE_T_MAX) { + if (nsec <= ci->scale_nsecs[i] * RTL_COALESCE_T_MAX) { *cp01 = i; - return &ci->scalev[i]; + return ci->scale_nsecs[i]; } } - return ERR_PTR(-EINVAL); + return -EINVAL; } static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) { struct rtl8169_private *tp = netdev_priv(dev); - const struct rtl_coalesce_scale *scale; struct { u32 frames; u32 usecs; @@ -1906,16 +1891,16 @@ static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) { ec->rx_max_coalesced_frames, ec->rx_coalesce_usecs }, { ec->tx_max_coalesced_frames, ec->tx_coalesce_usecs } }, *p = coal_settings; - u16 w = 0, cp01; - int i; + u16 w = 0, cp01 = 0; + int scale, i; if (rtl_is_8125(tp)) return -EOPNOTSUPP; scale = rtl_coalesce_choose_scale(tp, max(p[0].usecs, p[1].usecs) * 1000, &cp01); - if (IS_ERR(scale)) - return PTR_ERR(scale); + if (scale < 0) + return scale; for (i = 0; i < 2; i++, p++) { u32 units; @@ -1936,7 +1921,7 @@ static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) p->frames = 0; } - units = p->usecs * 1000 / scale->nsecs[i]; + units = p->usecs * 1000 / scale; if (p->frames > RTL_COALESCE_FRAME_MAX || p->frames % 4) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From 6cf96dd4272537baf4ceab452f6276da1b8d82af Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Apr 2020 21:56:58 +0200 Subject: r8169: improve rtl_get_coalesce Use FIELD_GET() macro to make the code better readable. In addition change the logic to round the time limit up, not down. Reason is that a time limit <1us would be rounded to 0 currently, what would be interpreted as "no time limit set". Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 46 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 9932b6ffae02..a36a48f713fa 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -229,6 +230,11 @@ enum rtl_registers { CPlusCmd = 0xe0, IntrMitigate = 0xe2, +#define RTL_COALESCE_TX_USECS GENMASK(15, 12) +#define RTL_COALESCE_TX_FRAMES GENMASK(11, 8) +#define RTL_COALESCE_RX_USECS GENMASK(7, 4) +#define RTL_COALESCE_RX_FRAMES GENMASK(3, 0) + #define RTL_COALESCE_MASK 0x0f #define RTL_COALESCE_SHIFT 4 #define RTL_COALESCE_T_MAX (RTL_COALESCE_MASK) @@ -1815,16 +1821,8 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) { struct rtl8169_private *tp = netdev_priv(dev); const struct rtl_coalesce_info *ci; - struct { - u32 *max_frames; - u32 *usecs; - } coal_settings [] = { - { &ec->rx_max_coalesced_frames, &ec->rx_coalesce_usecs }, - { &ec->tx_max_coalesced_frames, &ec->tx_coalesce_usecs } - }, *p = coal_settings; - u32 scale; - int i; - u16 w; + u32 scale, c_us, c_fr; + u16 intrmit; if (rtl_is_8125(tp)) return -EOPNOTSUPP; @@ -1838,24 +1836,20 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) scale = ci->scale_nsecs[tp->cp_cmd & INTT_MASK]; - /* read IntrMitigate and adjust according to scale */ - for (w = RTL_R16(tp, IntrMitigate); w; w >>= RTL_COALESCE_SHIFT, p++) { - *p->max_frames = (w & RTL_COALESCE_MASK) << 2; - w >>= RTL_COALESCE_SHIFT; - *p->usecs = w & RTL_COALESCE_MASK; - } + intrmit = RTL_R16(tp, IntrMitigate); - for (i = 0; i < 2; i++) { - p = coal_settings + i; - *p->usecs = (*p->usecs * scale) / 1000; + c_us = FIELD_GET(RTL_COALESCE_TX_USECS, intrmit); + ec->tx_coalesce_usecs = DIV_ROUND_UP(c_us * scale, 1000); - /* - * ethtool_coalesce says it is illegal to set both usecs and - * max_frames to 0. - */ - if (!*p->usecs && !*p->max_frames) - *p->max_frames = 1; - } + c_fr = FIELD_GET(RTL_COALESCE_TX_FRAMES, intrmit); + /* ethtool_coalesce states usecs and max_frames must not both be 0 */ + ec->tx_max_coalesced_frames = (c_us || c_fr) ? c_fr * 4 : 1; + + c_us = FIELD_GET(RTL_COALESCE_RX_USECS, intrmit); + ec->rx_coalesce_usecs = DIV_ROUND_UP(c_us * scale, 1000); + + c_fr = FIELD_GET(RTL_COALESCE_RX_FRAMES, intrmit); + ec->rx_max_coalesced_frames = (c_us || c_fr) ? c_fr * 4 : 1; return 0; } -- cgit v1.2.3-59-g8ed1b From cb9d97de05646de69b997da0137b94e00cba7f99 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Apr 2020 21:57:32 +0200 Subject: r8169: improve rtl_coalesce_choose_scale The time limit provided by userspace is multiplied with 1000, what could result in an overflow. Therefore change the time limit parameter unit from ns to us, and avoid the problematic operation. If there's no matching scale because provided time limit is too big, return ERANGE instead of EINVAL to provide a hint to the user what's wrong. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index a36a48f713fa..6c17c234bc06 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1854,8 +1854,8 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) return 0; } -/* choose appropriate scale factor and CPlusCmd[0:1] for (speed, nsec) */ -static int rtl_coalesce_choose_scale(struct rtl8169_private *tp, u32 nsec, +/* choose appropriate scale factor and CPlusCmd[0:1] for (speed, usec) */ +static int rtl_coalesce_choose_scale(struct rtl8169_private *tp, u32 usec, u16 *cp01) { const struct rtl_coalesce_info *ci; @@ -1866,13 +1866,13 @@ static int rtl_coalesce_choose_scale(struct rtl8169_private *tp, u32 nsec, return PTR_ERR(ci); for (i = 0; i < 4; i++) { - if (nsec <= ci->scale_nsecs[i] * RTL_COALESCE_T_MAX) { + if (usec <= ci->scale_nsecs[i] * RTL_COALESCE_T_MAX / 1000U) { *cp01 = i; return ci->scale_nsecs[i]; } } - return -EINVAL; + return -ERANGE; } static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) @@ -1886,13 +1886,14 @@ static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) { ec->tx_max_coalesced_frames, ec->tx_coalesce_usecs } }, *p = coal_settings; u16 w = 0, cp01 = 0; + u32 coal_usec_max; int scale, i; if (rtl_is_8125(tp)) return -EOPNOTSUPP; - scale = rtl_coalesce_choose_scale(tp, - max(p[0].usecs, p[1].usecs) * 1000, &cp01); + coal_usec_max = max(ec->rx_coalesce_usecs, ec->tx_coalesce_usecs); + scale = rtl_coalesce_choose_scale(tp, coal_usec_max, &cp01); if (scale < 0) return scale; -- cgit v1.2.3-59-g8ed1b From bdd2be3adb7d139a598f2277af7fa625fc399af1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Apr 2020 21:58:06 +0200 Subject: r8169: improve interrupt coalescing parameter handling The chip supports only frame limits 0, 4, 8, .. 60 internally. Returning EINVAL for all val % 4 != 0 seems to be a little bit too unfriendly to the user. Therefore round up the frame limit to the next supported value. In addition round up the time limit, else a very low limit could be rounded down to 0, and interpreted as "ignore value" by the chip. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 6c17c234bc06..a81d46abe3c2 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1909,21 +1909,21 @@ static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) * - then user does `ethtool -C eth0 rx-usecs 100` * * since ethtool sends to kernel whole ethtool_coalesce - * settings, if we do not handle rx_usecs=!0, rx_frames=1 - * we'll reject it below in `frames % 4 != 0`. + * settings, if we want to ignore rx_frames then it has + * to be set to 0. */ if (p->frames == 1) { p->frames = 0; } - units = p->usecs * 1000 / scale; - if (p->frames > RTL_COALESCE_FRAME_MAX || p->frames % 4) - return -EINVAL; + units = DIV_ROUND_UP(p->usecs * 1000, scale); + if (p->frames > RTL_COALESCE_FRAME_MAX) + return -ERANGE; w <<= RTL_COALESCE_SHIFT; w |= units; w <<= RTL_COALESCE_SHIFT; - w |= p->frames >> 2; + w |= DIV_ROUND_UP(p->frames, 4); } rtl_lock_work(tp); -- cgit v1.2.3-59-g8ed1b From 2b3e48b66516602d7d63142cda84b554f908eb54 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Apr 2020 21:58:47 +0200 Subject: r8169: improve rtl_set_coalesce Use FIELD_PREP() to make the code better readable, and avoid the loop. No functional change intended. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 70 +++++++++++++------------------ 1 file changed, 30 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index a81d46abe3c2..4fe8b1d35b69 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -235,10 +235,8 @@ enum rtl_registers { #define RTL_COALESCE_RX_USECS GENMASK(7, 4) #define RTL_COALESCE_RX_FRAMES GENMASK(3, 0) -#define RTL_COALESCE_MASK 0x0f -#define RTL_COALESCE_SHIFT 4 -#define RTL_COALESCE_T_MAX (RTL_COALESCE_MASK) -#define RTL_COALESCE_FRAME_MAX (RTL_COALESCE_MASK << 2) +#define RTL_COALESCE_T_MAX 0x0fU +#define RTL_COALESCE_FRAME_MAX (RTL_COALESCE_T_MAX * 4) RxDescAddrLow = 0xe4, RxDescAddrHigh = 0xe8, @@ -1878,57 +1876,49 @@ static int rtl_coalesce_choose_scale(struct rtl8169_private *tp, u32 usec, static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) { struct rtl8169_private *tp = netdev_priv(dev); - struct { - u32 frames; - u32 usecs; - } coal_settings [] = { - { ec->rx_max_coalesced_frames, ec->rx_coalesce_usecs }, - { ec->tx_max_coalesced_frames, ec->tx_coalesce_usecs } - }, *p = coal_settings; + u32 tx_fr = ec->tx_max_coalesced_frames; + u32 rx_fr = ec->rx_max_coalesced_frames; + u32 coal_usec_max, units; u16 w = 0, cp01 = 0; - u32 coal_usec_max; - int scale, i; + int scale; if (rtl_is_8125(tp)) return -EOPNOTSUPP; + if (rx_fr > RTL_COALESCE_FRAME_MAX || tx_fr > RTL_COALESCE_FRAME_MAX) + return -ERANGE; + coal_usec_max = max(ec->rx_coalesce_usecs, ec->tx_coalesce_usecs); scale = rtl_coalesce_choose_scale(tp, coal_usec_max, &cp01); if (scale < 0) return scale; - for (i = 0; i < 2; i++, p++) { - u32 units; - - /* - * accept max_frames=1 we returned in rtl_get_coalesce. - * accept it not only when usecs=0 because of e.g. the following scenario: - * - * - both rx_usecs=0 & rx_frames=0 in hardware (no delay on RX) - * - rtl_get_coalesce returns rx_usecs=0, rx_frames=1 - * - then user does `ethtool -C eth0 rx-usecs 100` - * - * since ethtool sends to kernel whole ethtool_coalesce - * settings, if we want to ignore rx_frames then it has - * to be set to 0. - */ - if (p->frames == 1) { - p->frames = 0; - } + /* Accept max_frames=1 we returned in rtl_get_coalesce. Accept it + * not only when usecs=0 because of e.g. the following scenario: + * + * - both rx_usecs=0 & rx_frames=0 in hardware (no delay on RX) + * - rtl_get_coalesce returns rx_usecs=0, rx_frames=1 + * - then user does `ethtool -C eth0 rx-usecs 100` + * + * Since ethtool sends to kernel whole ethtool_coalesce settings, + * if we want to ignore rx_frames then it has to be set to 0. + */ + if (rx_fr == 1) + rx_fr = 0; + if (tx_fr == 1) + tx_fr = 0; - units = DIV_ROUND_UP(p->usecs * 1000, scale); - if (p->frames > RTL_COALESCE_FRAME_MAX) - return -ERANGE; + w |= FIELD_PREP(RTL_COALESCE_TX_FRAMES, DIV_ROUND_UP(tx_fr, 4)); + w |= FIELD_PREP(RTL_COALESCE_RX_FRAMES, DIV_ROUND_UP(rx_fr, 4)); - w <<= RTL_COALESCE_SHIFT; - w |= units; - w <<= RTL_COALESCE_SHIFT; - w |= DIV_ROUND_UP(p->frames, 4); - } + units = DIV_ROUND_UP(ec->tx_coalesce_usecs * 1000U, scale); + w |= FIELD_PREP(RTL_COALESCE_TX_USECS, units); + units = DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000U, scale); + w |= FIELD_PREP(RTL_COALESCE_RX_USECS, units); rtl_lock_work(tp); - RTL_W16(tp, IntrMitigate, swab16(w)); + RTL_W16(tp, IntrMitigate, w); tp->cp_cmd = (tp->cp_cmd & ~INTT_MASK) | cp01; RTL_W16(tp, CPlusCmd, tp->cp_cmd); -- cgit v1.2.3-59-g8ed1b From 81496b72e9ba1999d4ed7bb7fa407a1edef020a4 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Apr 2020 21:59:48 +0200 Subject: r8169: add check for invalid parameter combination in rtl_set_coalesce Realtek provided information about a HW constraint that time limit must not be set to 0 if the frame limit is >0. Add a check for this and reject invalid parameter combinations. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4fe8b1d35b69..aa3e63e031da 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1908,6 +1908,11 @@ static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) if (tx_fr == 1) tx_fr = 0; + /* HW requires time limit to be set if frame limit is set */ + if ((tx_fr && !ec->tx_coalesce_usecs) || + (rx_fr && !ec->rx_coalesce_usecs)) + return -EINVAL; + w |= FIELD_PREP(RTL_COALESCE_TX_FRAMES, DIV_ROUND_UP(tx_fr, 4)); w |= FIELD_PREP(RTL_COALESCE_RX_FRAMES, DIV_ROUND_UP(rx_fr, 4)); -- cgit v1.2.3-59-g8ed1b From 673e69a67dd63fc3b40f109d1677a5dc72185fbb Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 30 Apr 2020 11:49:08 -0700 Subject: net: dsa: b53: Rename num_arl_entries to num_arl_bins The variable currently holds the number of ARL bins per ARL buckets, which is different from the number of ARL entries which would be bins times buckets. We will be adding a num_arl_buckets in a subsequent patch so get variables straight now. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 52 ++++++++++++++++++++-------------------- drivers/net/dsa/b53/b53_priv.h | 2 +- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index c283593bef17..41b75f41677a 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1495,10 +1495,10 @@ static int b53_arl_read(struct b53_device *dev, u64 mac, if (ret) return ret; - bitmap_zero(free_bins, dev->num_arl_entries); + bitmap_zero(free_bins, dev->num_arl_bins); /* Read the bins */ - for (i = 0; i < dev->num_arl_entries; i++) { + for (i = 0; i < dev->num_arl_bins; i++) { u64 mac_vid; u32 fwd_entry; @@ -1521,10 +1521,10 @@ static int b53_arl_read(struct b53_device *dev, u64 mac, return 0; } - if (bitmap_weight(free_bins, dev->num_arl_entries) == 0) + if (bitmap_weight(free_bins, dev->num_arl_bins) == 0) return -ENOSPC; - *idx = find_first_bit(free_bins, dev->num_arl_entries); + *idx = find_first_bit(free_bins, dev->num_arl_bins); return -ENOENT; } @@ -1692,7 +1692,7 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, if (ret) return ret; - if (priv->num_arl_entries > 2) { + if (priv->num_arl_bins > 2) { b53_arl_search_rd(priv, 1, &results[1]); ret = b53_fdb_copy(port, &results[1], cb, data); if (ret) @@ -2185,7 +2185,7 @@ struct b53_chip_data { u16 enabled_ports; u8 cpu_port; u8 vta_regs[3]; - u8 arl_entries; + u8 arl_bins; u8 duplex_reg; u8 jumbo_pm_reg; u8 jumbo_size_reg; @@ -2204,7 +2204,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM5325", .vlans = 16, .enabled_ports = 0x1f, - .arl_entries = 2, + .arl_bins = 2, .cpu_port = B53_CPU_PORT_25, .duplex_reg = B53_DUPLEX_STAT_FE, }, @@ -2213,7 +2213,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM5365", .vlans = 256, .enabled_ports = 0x1f, - .arl_entries = 2, + .arl_bins = 2, .cpu_port = B53_CPU_PORT_25, .duplex_reg = B53_DUPLEX_STAT_FE, }, @@ -2222,7 +2222,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM5389", .vlans = 4096, .enabled_ports = 0x1f, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2234,7 +2234,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM5395", .vlans = 4096, .enabled_ports = 0x1f, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2246,7 +2246,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM5397", .vlans = 4096, .enabled_ports = 0x1f, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS_9798, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2258,7 +2258,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM5398", .vlans = 4096, .enabled_ports = 0x7f, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS_9798, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2270,7 +2270,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM53115", .vlans = 4096, .enabled_ports = 0x1f, - .arl_entries = 4, + .arl_bins = 4, .vta_regs = B53_VTA_REGS, .cpu_port = B53_CPU_PORT, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2282,7 +2282,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM53125", .vlans = 4096, .enabled_ports = 0xff, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2294,7 +2294,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM53128", .vlans = 4096, .enabled_ports = 0x1ff, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2306,7 +2306,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM63xx", .vlans = 4096, .enabled_ports = 0, /* pdata must provide them */ - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS_63XX, .duplex_reg = B53_DUPLEX_STAT_63XX, @@ -2318,7 +2318,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM53010", .vlans = 4096, .enabled_ports = 0x1f, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2330,7 +2330,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM53011", .vlans = 4096, .enabled_ports = 0x1bf, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2342,7 +2342,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM53012", .vlans = 4096, .enabled_ports = 0x1bf, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2354,7 +2354,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM53018", .vlans = 4096, .enabled_ports = 0x1f, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2366,7 +2366,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM53019", .vlans = 4096, .enabled_ports = 0x1f, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2378,7 +2378,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM585xx/586xx/88312", .vlans = 4096, .enabled_ports = 0x1ff, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2390,7 +2390,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM583xx/11360", .vlans = 4096, .enabled_ports = 0x103, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2402,7 +2402,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM7445", .vlans = 4096, .enabled_ports = 0x1ff, - .arl_entries = 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2414,7 +2414,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .dev_name = "BCM7278", .vlans = 4096, .enabled_ports = 0x1ff, - .arl_entries= 4, + .arl_bins = 4, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2442,7 +2442,7 @@ static int b53_switch_init(struct b53_device *dev) dev->jumbo_pm_reg = chip->jumbo_pm_reg; dev->cpu_port = chip->cpu_port; dev->num_vlans = chip->vlans; - dev->num_arl_entries = chip->arl_entries; + dev->num_arl_bins = chip->arl_bins; break; } } diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 3d42318bc3f1..1c5c443d571f 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -117,7 +117,7 @@ struct b53_device { u8 jumbo_pm_reg; u8 jumbo_size_reg; int reset_gpio; - u8 num_arl_entries; + u8 num_arl_bins; enum dsa_tag_protocol tag_protocol; /* used ports mask */ -- cgit v1.2.3-59-g8ed1b From e3da4038f4ca1094596a7604c6edac4a6a4f6ee9 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 30 Apr 2020 11:49:09 -0700 Subject: net: dsa: b53: Provide number of ARL buckets In preparation for doing proper upper bound checking of FDB/MDB entries being added to the ARL, provide the number of ARL buckets for each switch chip we support. All chips have 1024 buckets, except 7278 which has only 256. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 21 +++++++++++++++++++++ drivers/net/dsa/b53/b53_priv.h | 1 + 2 files changed, 22 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 41b75f41677a..aa0836ac751c 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2186,6 +2186,7 @@ struct b53_chip_data { u8 cpu_port; u8 vta_regs[3]; u8 arl_bins; + u16 arl_buckets; u8 duplex_reg; u8 jumbo_pm_reg; u8 jumbo_size_reg; @@ -2205,6 +2206,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 16, .enabled_ports = 0x1f, .arl_bins = 2, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT_25, .duplex_reg = B53_DUPLEX_STAT_FE, }, @@ -2214,6 +2216,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 256, .enabled_ports = 0x1f, .arl_bins = 2, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT_25, .duplex_reg = B53_DUPLEX_STAT_FE, }, @@ -2223,6 +2226,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1f, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2235,6 +2239,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1f, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2247,6 +2252,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1f, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS_9798, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2259,6 +2265,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x7f, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS_9798, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2271,6 +2278,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1f, .arl_bins = 4, + .arl_buckets = 1024, .vta_regs = B53_VTA_REGS, .cpu_port = B53_CPU_PORT, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2283,6 +2291,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0xff, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2295,6 +2304,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1ff, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2307,6 +2317,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0, /* pdata must provide them */ .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS_63XX, .duplex_reg = B53_DUPLEX_STAT_63XX, @@ -2319,6 +2330,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1f, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2331,6 +2343,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1bf, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2343,6 +2356,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1bf, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2355,6 +2369,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1f, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2367,6 +2382,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1f, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2379,6 +2395,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1ff, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2391,6 +2408,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x103, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2403,6 +2421,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1ff, .arl_bins = 4, + .arl_buckets = 1024, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2415,6 +2434,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .vlans = 4096, .enabled_ports = 0x1ff, .arl_bins = 4, + .arl_buckets = 256, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2443,6 +2463,7 @@ static int b53_switch_init(struct b53_device *dev) dev->cpu_port = chip->cpu_port; dev->num_vlans = chip->vlans; dev->num_arl_bins = chip->arl_bins; + dev->num_arl_buckets = chip->arl_buckets; break; } } diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 1c5c443d571f..694e26cdfd4d 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -118,6 +118,7 @@ struct b53_device { u8 jumbo_size_reg; int reset_gpio; u8 num_arl_bins; + u16 num_arl_buckets; enum dsa_tag_protocol tag_protocol; /* used ports mask */ -- cgit v1.2.3-59-g8ed1b From cd169d799beeb738fa2d3e891960924cdcaf8414 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 30 Apr 2020 11:49:10 -0700 Subject: net: dsa: b53: Bound check ARL searches ARL searches are done by reading two ARL entries at a time, do not cap the search at 1024 which would only limit us to half of the possible ARL capacity, but use b53_max_arl_entries() instead which does the right multiplication between bins and indexes. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 2 +- drivers/net/dsa/b53/b53_priv.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index aa0836ac751c..9550d972f8c5 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1702,7 +1702,7 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, break; } - } while (count++ < 1024); + } while (count++ < b53_max_arl_entries(priv) / 2); return 0; } diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 694e26cdfd4d..e942c60e4365 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -213,6 +213,11 @@ static inline int is58xx(struct b53_device *dev) #define B53_CPU_PORT_25 5 #define B53_CPU_PORT 8 +static inline unsigned int b53_max_arl_entries(struct b53_device *dev) +{ + return dev->num_arl_buckets * dev->num_arl_bins; +} + struct b53_device *b53_switch_alloc(struct device *base, const struct b53_io_ops *ops, void *priv); -- cgit v1.2.3-59-g8ed1b From ef2a0bd99b1549a3a4253355be247d5dff25d720 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 30 Apr 2020 11:49:11 -0700 Subject: net: dsa: b53: Remove is_static argument to b53_read_op() This argument is not used. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 9550d972f8c5..ceb8be653182 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1484,8 +1484,7 @@ static int b53_arl_rw_op(struct b53_device *dev, unsigned int op) } static int b53_arl_read(struct b53_device *dev, u64 mac, - u16 vid, struct b53_arl_entry *ent, u8 *idx, - bool is_valid) + u16 vid, struct b53_arl_entry *ent, u8 *idx) { DECLARE_BITMAP(free_bins, B53_ARLTBL_MAX_BIN_ENTRIES); unsigned int i; @@ -1550,7 +1549,8 @@ static int b53_arl_op(struct b53_device *dev, int op, int port, if (ret) return ret; - ret = b53_arl_read(dev, mac, vid, &ent, &idx, is_valid); + ret = b53_arl_read(dev, mac, vid, &ent, &idx); + /* If this is a read, just finish now */ if (op) return ret; -- cgit v1.2.3-59-g8ed1b From 47a1494b8208461094923400c396ce4b8163c064 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:05 +0200 Subject: netlink: remove type-unsafe validation_data pointer In the netlink policy, we currently have a void *validation_data that's pointing to different things: * a u32 value for bitfield32, * the netlink policy for nested/nested array * the string for NLA_REJECT Remove the pointer and place appropriate type-safe items in the union instead. While at it, completely dissolve the pointer for the bitfield32 case and just put the value there directly. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 60 +++++++++++++++++++++++++++------------------------ lib/nlattr.c | 20 ++++++++--------- net/sched/act_api.c | 13 +++-------- net/sched/sch_red.c | 9 ++++---- 4 files changed, 49 insertions(+), 53 deletions(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index 67c57d6942e3..671b29d170a8 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -217,7 +217,7 @@ enum nla_policy_validation { * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if - * validation_data is also used, for the max attr + * nested_policy is also used, for the max attr * number in the nested policy. * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, @@ -235,27 +235,25 @@ enum nla_policy_validation { * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * - * Meaning of `validation_data' field: + * Meaning of validation union: * NLA_BITFIELD32 This is a 32-bit bitmap/bitselector attribute and - * validation data must point to a u32 value of valid - * flags - * NLA_REJECT This attribute is always rejected and validation data + * `bitfield32_valid' is the u32 value of valid flags + * NLA_REJECT This attribute is always rejected and `reject_message' * may point to a string to report as the error instead * of the generic one in extended ACK. - * NLA_NESTED Points to a nested policy to validate, must also set - * `len' to the max attribute number. + * NLA_NESTED `nested_policy' to a nested policy to validate, must + * also set `len' to the max attribute number. Use the + * provided NLA_POLICY_NESTED() macro. * Note that nla_parse() will validate, but of course not * parse, the nested sub-policies. - * NLA_NESTED_ARRAY Points to a nested policy to validate, must also set - * `len' to the max attribute number. The difference to - * NLA_NESTED is the structure - NLA_NESTED has the - * nested attributes directly inside, while an array has - * the nested attributes at another level down and the - * attributes directly in the nesting don't matter. - * All other Unused - but note that it's a union - * - * Meaning of `min' and `max' fields, use via NLA_POLICY_MIN, NLA_POLICY_MAX - * and NLA_POLICY_RANGE: + * NLA_NESTED_ARRAY `nested_policy' points to a nested policy to validate, + * must also set `len' to the max attribute number. Use + * the provided NLA_POLICY_NESTED_ARRAY() macro. + * The difference to NLA_NESTED is the structure: + * NLA_NESTED has the nested attributes directly inside + * while an array has the nested attributes at another + * level down and the attribute types directly in the + * nesting don't matter. * NLA_U8, * NLA_U16, * NLA_U32, @@ -263,29 +261,31 @@ enum nla_policy_validation { * NLA_S8, * NLA_S16, * NLA_S32, - * NLA_S64 These are used depending on the validation_type - * field, if that is min/max/range then the minimum, - * maximum and both are used (respectively) to check + * NLA_S64 The `min' and `max' fields are used depending on the + * validation_type field, if that is min/max/range then + * the min, max or both are used (respectively) to check * the value of the integer attribute. * Note that in the interest of code simplicity and * struct size both limits are s16, so you cannot * enforce a range that doesn't fall within the range * of s16 - do that as usual in the code instead. + * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and + * NLA_POLICY_RANGE() macros. * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: - * NLA_BINARY Validation function called for the attribute, - * not compatible with use of the validation_data - * as in NLA_BITFIELD32, NLA_REJECT, NLA_NESTED and - * NLA_NESTED_ARRAY. + * NLA_BINARY Validation function called for the attribute. * All other Unused - but note that it's a union * * Example: + * + * static const u32 myvalidflags = 0xff231023; + * * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = { .type = NLA_EXACT_LEN, .len = sizeof(struct mystruct) }, - * [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, + * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ struct nla_policy { @@ -293,7 +293,9 @@ struct nla_policy { u8 validation_type; u16 len; union { - const void *validation_data; + const u32 bitfield32_valid; + const char *reject_message; + const struct nla_policy *nested_policy; struct { s16 min, max; }; @@ -329,13 +331,15 @@ struct nla_policy { #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) #define _NLA_POLICY_NESTED(maxattr, policy) \ - { .type = NLA_NESTED, .validation_data = policy, .len = maxattr } + { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr } #define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \ - { .type = NLA_NESTED_ARRAY, .validation_data = policy, .len = maxattr } + { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr } #define NLA_POLICY_NESTED(policy) \ _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_NESTED_ARRAY(policy) \ _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy) +#define NLA_POLICY_BITFIELD32(valid) \ + { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_INT_TYPE(tp) \ diff --git a/lib/nlattr.c b/lib/nlattr.c index cace9b307781..3df05db732ca 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -45,7 +45,7 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { }; static int validate_nla_bitfield32(const struct nlattr *nla, - const u32 *valid_flags_mask) + const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); @@ -53,11 +53,11 @@ static int validate_nla_bitfield32(const struct nlattr *nla, return -EINVAL; /*disallow invalid bit selector */ - if (bf->selector & ~*valid_flags_mask) + if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ - if (bf->value & ~*valid_flags_mask) + if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ @@ -206,9 +206,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype, break; case NLA_REJECT: - if (extack && pt->validation_data) { + if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); - extack->_msg = pt->validation_data; + extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; @@ -223,7 +223,7 @@ static int validate_nla(const struct nlattr *nla, int maxtype, if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; - err = validate_nla_bitfield32(nla, pt->validation_data); + err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; @@ -268,9 +268,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype, break; if (attrlen < NLA_HDRLEN) goto out_err; - if (pt->validation_data) { + if (pt->nested_policy) { err = __nla_validate(nla_data(nla), nla_len(nla), pt->len, - pt->validation_data, validate, + pt->nested_policy, validate, extack); if (err < 0) { /* @@ -289,11 +289,11 @@ static int validate_nla(const struct nlattr *nla, int maxtype, break; if (attrlen < NLA_HDRLEN) goto out_err; - if (pt->validation_data) { + if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), - pt->len, pt->validation_data, + pt->len, pt->nested_policy, extack, validate); if (err < 0) { /* diff --git a/net/sched/act_api.c b/net/sched/act_api.c index df4560909157..fbbec2e562f5 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -876,19 +876,14 @@ static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) return hw_stats_bf.value; } -static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; -static const u32 tca_act_hw_stats_allowed = TCA_ACT_HW_STATS_ANY; - static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, - [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_flags_allowed }, - [TCA_ACT_HW_STATS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_hw_stats_allowed }, + [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS), + [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -1454,10 +1449,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return ret; } -static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { - [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tcaa_root_flags_allowed }, + [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_FLAG_LARGE_DUMP_ON), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index c7de47c942e3..555a1b9e467f 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -48,7 +48,7 @@ struct red_sched_data { struct Qdisc *qdisc; }; -static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS | TC_RED_NODROP; +#define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP) static inline int red_use_ecn(struct red_sched_data *q) { @@ -212,8 +212,7 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, - [TCA_RED_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &red_supported_flags }, + [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), }; static int red_change(struct Qdisc *sch, struct nlattr *opt, @@ -248,7 +247,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, - tb[TCA_RED_FLAGS], red_supported_flags, + tb[TCA_RED_FLAGS], TC_RED_SUPPORTED_FLAGS, &flags_bf, &userbits, extack); if (err) return err; @@ -372,7 +371,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || nla_put_bitfield32(skb, TCA_RED_FLAGS, - q->flags, red_supported_flags)) + q->flags, TC_RED_SUPPORTED_FLAGS)) goto nla_put_failure; return nla_nest_end(skb, opts); -- cgit v1.2.3-59-g8ed1b From 7690aa1cdf7c4565ad6b013b324c28b685505e24 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:06 +0200 Subject: netlink: limit recursion depth in policy validation Now that we have nested policies, we can theoretically recurse forever parsing attributes if a (sub-)policy refers back to a higher level one. This is a situation that has happened in nl80211, and we've avoided it there by not linking it. Add some code to netlink parsing to limit recursion depth. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- lib/nlattr.c | 46 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/nlattr.c b/lib/nlattr.c index 3df05db732ca..7f7ebd89caa4 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -44,6 +44,20 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_S64] = sizeof(s64), }; +/* + * Nested policies might refer back to the original + * policy in some cases, and userspace could try to + * abuse that and recurse by nesting in the right + * ways. Limit recursion to avoid this problem. + */ +#define MAX_POLICY_RECURSION_DEPTH 10 + +static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, + const struct nla_policy *policy, + unsigned int validate, + struct netlink_ext_ack *extack, + struct nlattr **tb, unsigned int depth); + static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { @@ -70,7 +84,7 @@ static int validate_nla_bitfield32(const struct nlattr *nla, static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, - unsigned int validate) + unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; @@ -87,8 +101,9 @@ static int nla_validate_array(const struct nlattr *head, int len, int maxtype, return -ERANGE; } - ret = __nla_validate(nla_data(entry), nla_len(entry), - maxtype, policy, validate, extack); + ret = __nla_validate_parse(nla_data(entry), nla_len(entry), + maxtype, policy, validate, extack, + NULL, depth + 1); if (ret < 0) return ret; } @@ -156,7 +171,7 @@ static int nla_validate_int_range(const struct nla_policy *pt, static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; @@ -269,9 +284,10 @@ static int validate_nla(const struct nlattr *nla, int maxtype, if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { - err = __nla_validate(nla_data(nla), nla_len(nla), pt->len, - pt->nested_policy, validate, - extack); + err = __nla_validate_parse(nla_data(nla), nla_len(nla), + pt->len, pt->nested_policy, + validate, extack, NULL, + depth + 1); if (err < 0) { /* * return directly to preserve the inner @@ -294,7 +310,7 @@ static int validate_nla(const struct nlattr *nla, int maxtype, err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, - extack, validate); + extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner @@ -358,11 +374,17 @@ static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, - struct nlattr **tb) + struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; + if (depth >= MAX_POLICY_RECURSION_DEPTH) { + NL_SET_ERR_MSG(extack, + "allowed policy recursion depth exceeded"); + return -EINVAL; + } + if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); @@ -379,7 +401,7 @@ static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, } if (policy) { int err = validate_nla(nla, maxtype, policy, - validate, extack); + validate, extack, depth); if (err < 0) return err; @@ -421,7 +443,7 @@ int __nla_validate(const struct nlattr *head, int len, int maxtype, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, - extack, NULL); + extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); @@ -476,7 +498,7 @@ int __nla_parse(struct nlattr **tb, int maxtype, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, - extack, tb); + extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); -- cgit v1.2.3-59-g8ed1b From d15da2a2e813679aeac8bff3be38d3adc849c1a6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:07 +0200 Subject: nl80211: link recursive netlink nested policy Now that we have limited recursive policy validation to avoid stack overflows, change nl80211 to actually link the nested policy (linking back to itself eventually), which allows some code cleanups. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/wireless/nl80211.c | 10 ++++------ net/wireless/nl80211.h | 2 -- net/wireless/pmsr.c | 3 +-- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 692bcd35f809..57c618b6cb0e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -253,6 +253,8 @@ static int validate_ie_attr(const struct nlattr *attr, } /* policy for the attributes */ +static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; + static const struct nla_policy nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = { [NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, }, @@ -296,11 +298,7 @@ nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = { static const struct nla_policy nl80211_psmr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR, - /* - * we could specify this again to be the top-level policy, - * but that would open us up to recursion problems ... - */ - [NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_NESTED }, + [NL80211_PMSR_PEER_ATTR_CHAN] = NLA_POLICY_NESTED(nl80211_policy), [NL80211_PMSR_PEER_ATTR_REQ] = NLA_POLICY_NESTED(nl80211_pmsr_req_attr_policy), [NL80211_PMSR_PEER_ATTR_RESP] = { .type = NLA_REJECT }, @@ -347,7 +345,7 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; -const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { +static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index a41e94a49a89..d3e8e426c486 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -11,8 +11,6 @@ int nl80211_init(void); void nl80211_exit(void); -extern const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; - void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd); bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 63dc8023447f..a95c79d18349 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -187,10 +187,9 @@ static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, /* reuse info->attrs */ memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1)); - /* need to validate here, we don't want to have validation recursion */ err = nla_parse_nested_deprecated(info->attrs, NL80211_ATTR_MAX, tb[NL80211_PMSR_PEER_ATTR_CHAN], - nl80211_policy, info->extack); + NULL, info->extack); if (err) return err; -- cgit v1.2.3-59-g8ed1b From d06a09b94c618c96ced584dd4611a888c8856b8d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:08 +0200 Subject: netlink: extend policy range validation Using a pointer to a struct indicating the min/max values, extend the ability to do range validation for arbitrary values. Small values in the s16 range can be kept in the policy directly. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 45 ++++++++++++++++++++ lib/nlattr.c | 113 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 137 insertions(+), 21 deletions(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index 671b29d170a8..94a7df4ab122 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -189,11 +189,20 @@ enum { #define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1) +struct netlink_range_validation { + u64 min, max; +}; + +struct netlink_range_validation_signed { + s64 min, max; +}; + enum nla_policy_validation { NLA_VALIDATE_NONE, NLA_VALIDATE_RANGE, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, + NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, }; @@ -271,6 +280,22 @@ enum nla_policy_validation { * of s16 - do that as usual in the code instead. * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and * NLA_POLICY_RANGE() macros. + * NLA_U8, + * NLA_U16, + * NLA_U32, + * NLA_U64 If the validation_type field instead is set to + * NLA_VALIDATE_RANGE_PTR, `range' must be a pointer + * to a struct netlink_range_validation that indicates + * the min/max values. + * Use NLA_POLICY_FULL_RANGE(). + * NLA_S8, + * NLA_S16, + * NLA_S32, + * NLA_S64 If the validation_type field instead is set to + * NLA_VALIDATE_RANGE_PTR, `range_signed' must be a + * pointer to a struct netlink_range_validation_signed + * that indicates the min/max values. + * Use NLA_POLICY_FULL_RANGE_SIGNED(). * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: @@ -296,6 +321,8 @@ struct nla_policy { const u32 bitfield32_valid; const char *reject_message; const struct nla_policy *nested_policy; + struct netlink_range_validation *range; + struct netlink_range_validation_signed *range_signed; struct { s16 min, max; }; @@ -342,6 +369,12 @@ struct nla_policy { { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) +#define NLA_ENSURE_UINT_TYPE(tp) \ + (__NLA_ENSURE(tp == NLA_U8 || tp == NLA_U16 || \ + tp == NLA_U32 || tp == NLA_U64) + tp) +#define NLA_ENSURE_SINT_TYPE(tp) \ + (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_S16 || \ + tp == NLA_S32 || tp == NLA_S64) + tp) #define NLA_ENSURE_INT_TYPE(tp) \ (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_U8 || \ tp == NLA_S16 || tp == NLA_U16 || \ @@ -360,6 +393,18 @@ struct nla_policy { .max = _max \ } +#define NLA_POLICY_FULL_RANGE(tp, _range) { \ + .type = NLA_ENSURE_UINT_TYPE(tp), \ + .validation_type = NLA_VALIDATE_RANGE_PTR, \ + .range = _range, \ +} + +#define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) { \ + .type = NLA_ENSURE_SINT_TYPE(tp), \ + .validation_type = NLA_VALIDATE_RANGE_PTR, \ + .range_signed = _range, \ +} + #define NLA_POLICY_MIN(tp, _min) { \ .type = NLA_ENSURE_INT_TYPE(tp), \ .validation_type = NLA_VALIDATE_MIN, \ diff --git a/lib/nlattr.c b/lib/nlattr.c index 7f7ebd89caa4..a8beb173f558 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -111,17 +111,34 @@ static int nla_validate_array(const struct nlattr *head, int len, int maxtype, return 0; } -static int nla_validate_int_range(const struct nla_policy *pt, - const struct nlattr *nla, - struct netlink_ext_ack *extack) +static int nla_validate_int_range_unsigned(const struct nla_policy *pt, + const struct nlattr *nla, + struct netlink_ext_ack *extack) { - bool validate_min, validate_max; - s64 value; + struct netlink_range_validation _range = { + .min = 0, + .max = U64_MAX, + }, *range = &_range; + u64 value; - validate_min = pt->validation_type == NLA_VALIDATE_RANGE || - pt->validation_type == NLA_VALIDATE_MIN; - validate_max = pt->validation_type == NLA_VALIDATE_RANGE || - pt->validation_type == NLA_VALIDATE_MAX; + WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && + (pt->min < 0 || pt->max < 0)); + + switch (pt->validation_type) { + case NLA_VALIDATE_RANGE: + range->min = pt->min; + range->max = pt->max; + break; + case NLA_VALIDATE_RANGE_PTR: + range = pt->range; + break; + case NLA_VALIDATE_MIN: + range->min = pt->min; + break; + case NLA_VALIDATE_MAX: + range->max = pt->max; + break; + } switch (pt->type) { case NLA_U8: @@ -133,6 +150,49 @@ static int nla_validate_int_range(const struct nla_policy *pt, case NLA_U32: value = nla_get_u32(nla); break; + case NLA_U64: + value = nla_get_u64(nla); + break; + default: + return -EINVAL; + } + + if (value < range->min || value > range->max) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "integer out of range"); + return -ERANGE; + } + + return 0; +} + +static int nla_validate_int_range_signed(const struct nla_policy *pt, + const struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + struct netlink_range_validation_signed _range = { + .min = S64_MIN, + .max = S64_MAX, + }, *range = &_range; + s64 value; + + switch (pt->validation_type) { + case NLA_VALIDATE_RANGE: + range->min = pt->min; + range->max = pt->max; + break; + case NLA_VALIDATE_RANGE_PTR: + range = pt->range_signed; + break; + case NLA_VALIDATE_MIN: + range->min = pt->min; + break; + case NLA_VALIDATE_MAX: + range->max = pt->max; + break; + } + + switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; @@ -145,22 +205,11 @@ static int nla_validate_int_range(const struct nla_policy *pt, case NLA_S64: value = nla_get_s64(nla); break; - case NLA_U64: - /* treat this one specially, since it may not fit into s64 */ - if ((validate_min && nla_get_u64(nla) < pt->min) || - (validate_max && nla_get_u64(nla) > pt->max)) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "integer out of range"); - return -ERANGE; - } - return 0; default: - WARN_ON(1); return -EINVAL; } - if ((validate_min && value < pt->min) || - (validate_max && value > pt->max)) { + if (value < range->min || value > range->max) { NL_SET_ERR_MSG_ATTR(extack, nla, "integer out of range"); return -ERANGE; @@ -169,6 +218,27 @@ static int nla_validate_int_range(const struct nla_policy *pt, return 0; } +static int nla_validate_int_range(const struct nla_policy *pt, + const struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + switch (pt->type) { + case NLA_U8: + case NLA_U16: + case NLA_U32: + case NLA_U64: + return nla_validate_int_range_unsigned(pt, nla, extack); + case NLA_S8: + case NLA_S16: + case NLA_S32: + case NLA_S64: + return nla_validate_int_range_signed(pt, nla, extack); + default: + WARN_ON(1); + return -EINVAL; + } +} + static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) @@ -348,6 +418,7 @@ static int validate_nla(const struct nlattr *nla, int maxtype, case NLA_VALIDATE_NONE: /* nothing to do */ break; + case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: -- cgit v1.2.3-59-g8ed1b From da4063bdfcfa70ec57a6c25f772ac6378b1584ad Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:09 +0200 Subject: netlink: allow NLA_MSECS to have range validation Since NLA_MSECS is really equivalent to NLA_U64, allow it to have range validation as well. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 6 ++++-- lib/nlattr.c | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index 94a7df4ab122..4acd7165e900 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -371,7 +371,8 @@ struct nla_policy { #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ (__NLA_ENSURE(tp == NLA_U8 || tp == NLA_U16 || \ - tp == NLA_U32 || tp == NLA_U64) + tp) + tp == NLA_U32 || tp == NLA_U64 || \ + tp == NLA_MSECS) + tp) #define NLA_ENSURE_SINT_TYPE(tp) \ (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_S16 || \ tp == NLA_S32 || tp == NLA_S64) + tp) @@ -379,7 +380,8 @@ struct nla_policy { (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_U8 || \ tp == NLA_S16 || tp == NLA_U16 || \ tp == NLA_S32 || tp == NLA_U32 || \ - tp == NLA_S64 || tp == NLA_U64) + tp) + tp == NLA_S64 || tp == NLA_U64 || \ + tp == NLA_MSECS) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ (__NLA_ENSURE(tp != NLA_BITFIELD32 && \ tp != NLA_REJECT && \ diff --git a/lib/nlattr.c b/lib/nlattr.c index a8beb173f558..21ef3998b9d9 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -151,6 +151,7 @@ static int nla_validate_int_range_unsigned(const struct nla_policy *pt, value = nla_get_u32(nla); break; case NLA_U64: + case NLA_MSECS: value = nla_get_u64(nla); break; default: @@ -227,6 +228,7 @@ static int nla_validate_int_range(const struct nla_policy *pt, case NLA_U16: case NLA_U32: case NLA_U64: + case NLA_MSECS: return nla_validate_int_range_unsigned(pt, nla, extack); case NLA_S8: case NLA_S16: -- cgit v1.2.3-59-g8ed1b From c7721c05a6217491810f406ec28df80a9bcf3546 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:10 +0200 Subject: netlink: remove NLA_EXACT_LEN_WARN Use a validation type instead, so we can later expose the NLA_* values to userspace for policy descriptions. Some transformations were done with this spatch: @@ identifier p; expression X, L, A; @@ struct nla_policy p[X] = { [A] = -{ .type = NLA_EXACT_LEN_WARN, .len = L }, +NLA_POLICY_EXACT_LEN_WARN(L), ... }; Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 15 +++++----- lib/nlattr.c | 16 ++++++---- net/wireless/nl80211.c | 81 +++++++++++--------------------------------------- 3 files changed, 36 insertions(+), 76 deletions(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index 4acd7165e900..4d4a733f1e8d 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -182,7 +182,6 @@ enum { NLA_BITFIELD32, NLA_REJECT, NLA_EXACT_LEN, - NLA_EXACT_LEN_WARN, NLA_MIN_LEN, __NLA_TYPE_MAX, }; @@ -204,6 +203,7 @@ enum nla_policy_validation { NLA_VALIDATE_MAX, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, + NLA_VALIDATE_WARN_TOO_LONG, }; /** @@ -237,10 +237,10 @@ enum nla_policy_validation { * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused - * NLA_EXACT_LEN Attribute must have exactly this length, otherwise - * it is rejected. - * NLA_EXACT_LEN_WARN Attribute should have exactly this length, a warning - * is logged if it is longer, shorter is rejected. + * NLA_EXACT_LEN Attribute should have exactly this length, otherwise + * it is rejected or warned about, the latter happening + * if and only if the `validation_type' is set to + * NLA_VALIDATE_WARN_TOO_LONG. * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * @@ -350,8 +350,9 @@ struct nla_policy { }; #define NLA_POLICY_EXACT_LEN(_len) { .type = NLA_EXACT_LEN, .len = _len } -#define NLA_POLICY_EXACT_LEN_WARN(_len) { .type = NLA_EXACT_LEN_WARN, \ - .len = _len } +#define NLA_POLICY_EXACT_LEN_WARN(_len) \ + { .type = NLA_EXACT_LEN, .len = _len, \ + .validation_type = NLA_VALIDATE_WARN_TOO_LONG, } #define NLA_POLICY_MIN_LEN(_len) { .type = NLA_MIN_LEN, .len = _len } #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) diff --git a/lib/nlattr.c b/lib/nlattr.c index 21ef3998b9d9..6dcbe1bedd3b 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -261,7 +261,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype, BUG_ON(pt->type > NLA_TYPE_MAX); if ((nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) || - (pt->type == NLA_EXACT_LEN_WARN && attrlen != pt->len)) { + (pt->type == NLA_EXACT_LEN && + pt->validation_type == NLA_VALIDATE_WARN_TOO_LONG && + attrlen != pt->len)) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { @@ -287,11 +289,6 @@ static int validate_nla(const struct nlattr *nla, int maxtype, } switch (pt->type) { - case NLA_EXACT_LEN: - if (attrlen != pt->len) - goto out_err; - break; - case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); @@ -405,6 +402,13 @@ static int validate_nla(const struct nlattr *nla, int maxtype, goto out_err; break; + case NLA_EXACT_LEN: + if (pt->validation_type != NLA_VALIDATE_WARN_TOO_LONG) { + if (attrlen != pt->len) + goto out_err; + break; + } + /* fall through */ default: if (pt->len) minlen = pt->len; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 57c618b6cb0e..519414468b5d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -376,11 +376,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, - [NL80211_ATTR_MAC] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN }, - [NL80211_ATTR_PREV_BSSID] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), + [NL80211_ATTR_PREV_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_KEY] = { .type = NLA_NESTED, }, [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY, @@ -432,10 +429,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MESH_CONFIG] = { .type = NLA_NESTED }, [NL80211_ATTR_SUPPORT_MESH_AUTH] = { .type = NLA_FLAG }, - [NL80211_ATTR_HT_CAPABILITY] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_HT_CAPABILITY_LEN - }, + [NL80211_ATTR_HT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_HT_CAPABILITY_LEN), [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, [NL80211_ATTR_IE] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, @@ -466,10 +460,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, [NL80211_ATTR_PID] = { .type = NLA_U32 }, [NL80211_ATTR_4ADDR] = { .type = NLA_U8 }, - [NL80211_ATTR_PMKID] = { - .type = NLA_EXACT_LEN_WARN, - .len = WLAN_PMKID_LEN - }, + [NL80211_ATTR_PMKID] = NLA_POLICY_EXACT_LEN_WARN(WLAN_PMKID_LEN), [NL80211_ATTR_DURATION] = { .type = NLA_U32 }, [NL80211_ATTR_COOKIE] = { .type = NLA_U64 }, [NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED }, @@ -533,10 +524,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WDEV] = { .type = NLA_U64 }, [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 }, [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, }, - [NL80211_ATTR_VHT_CAPABILITY] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_VHT_CAPABILITY_LEN - }, + [NL80211_ATTR_VHT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_VHT_CAPABILITY_LEN), [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 }, [NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127), [NL80211_ATTR_P2P_OPPPS] = NLA_POLICY_MAX(NLA_U8, 1), @@ -574,10 +562,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, [NL80211_ATTR_QOS_MAP] = { .type = NLA_BINARY, .len = IEEE80211_QOS_MAP_LEN_MAX }, - [NL80211_ATTR_MAC_HINT] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC_HINT] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 }, [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG }, @@ -589,10 +574,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, [NL80211_ATTR_OPER_CLASS] = { .type = NLA_U8 }, - [NL80211_ATTR_MAC_MASK] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC_MASK] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG }, [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, @@ -604,21 +586,15 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_GROUP_DATA] = { .len = VHT_MUMIMO_GROUPS_DATA_LEN }, - [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, - [NL80211_ATTR_FILS_NONCES] = { - .type = NLA_EXACT_LEN_WARN, - .len = 2 * FILS_NONCE_LEN - }, + [NL80211_ATTR_FILS_NONCES] = NLA_POLICY_EXACT_LEN_WARN(2 * FILS_NONCE_LEN), [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, - [NL80211_ATTR_BSSID] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN }, + [NL80211_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 }, [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = { .len = sizeof(struct nl80211_bss_select_rssi_adjust) @@ -631,7 +607,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] = { .type = NLA_U16 }, [NL80211_ATTR_FILS_ERP_RRK] = { .type = NLA_BINARY, .len = FILS_ERP_MAX_RRK_LEN }, - [NL80211_ATTR_FILS_CACHE_ID] = { .type = NLA_EXACT_LEN_WARN, .len = 2 }, + [NL80211_ATTR_FILS_CACHE_ID] = NLA_POLICY_EXACT_LEN_WARN(2), [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, @@ -701,10 +677,7 @@ static const struct nla_policy nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = { [NL80211_WOWLAN_TCP_SRC_IPV4] = { .type = NLA_U32 }, [NL80211_WOWLAN_TCP_DST_IPV4] = { .type = NLA_U32 }, - [NL80211_WOWLAN_TCP_DST_MAC] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_WOWLAN_TCP_DST_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 }, [NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 }, [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 }, @@ -734,18 +707,9 @@ nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { /* policy for GTK rekey offload attributes */ static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { - [NL80211_REKEY_DATA_KEK] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_KEK_LEN, - }, - [NL80211_REKEY_DATA_KCK] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_KCK_LEN, - }, - [NL80211_REKEY_DATA_REPLAY_CTR] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_REPLAY_CTR_LEN - }, + [NL80211_REKEY_DATA_KEK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KEK_LEN), + [NL80211_REKEY_DATA_KCK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KCK_LEN), + [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN), }; static const struct nla_policy @@ -760,10 +724,7 @@ static const struct nla_policy nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_SSID_LEN }, - [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, [NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI] = NLA_POLICY_NESTED(nl80211_match_band_rssi_policy), @@ -795,10 +756,7 @@ nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = { [NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG }, [NL80211_NAN_FUNC_FOLLOW_UP_ID] = { .type = NLA_U8 }, [NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 }, - [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_NAN_FUNC_CLOSE_RANGE] = { .type = NLA_FLAG }, [NL80211_NAN_FUNC_TTL] = { .type = NLA_U32 }, [NL80211_NAN_FUNC_SERVICE_INFO] = { .type = NLA_BINARY, @@ -4404,10 +4362,7 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { .len = NL80211_MAX_SUPP_RATES }, [NL80211_TXRATE_HT] = { .type = NLA_BINARY, .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = { - .type = NLA_EXACT_LEN_WARN, - .len = sizeof(struct nl80211_txrate_vht), - }, + [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), [NL80211_TXRATE_GI] = { .type = NLA_U8 }, }; -- cgit v1.2.3-59-g8ed1b From 2c28ae48f24d84fcda31fb8acaf2edca6ec46c49 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:11 +0200 Subject: netlink: factor out policy range helpers Add helpers to get the policy's signed/unsigned range validation data. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 5 +++ lib/nlattr.c | 95 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 79 insertions(+), 21 deletions(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index 4d4a733f1e8d..557b67f1db99 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1928,4 +1928,9 @@ static inline bool nla_is_last(const struct nlattr *nla, int rem) return nla->nla_len == rem; } +void nla_get_range_unsigned(const struct nla_policy *pt, + struct netlink_range_validation *range); +void nla_get_range_signed(const struct nla_policy *pt, + struct netlink_range_validation_signed *range); + #endif diff --git a/lib/nlattr.c b/lib/nlattr.c index 6dcbe1bedd3b..bc5b5cf608c4 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -111,26 +111,40 @@ static int nla_validate_array(const struct nlattr *head, int len, int maxtype, return 0; } -static int nla_validate_int_range_unsigned(const struct nla_policy *pt, - const struct nlattr *nla, - struct netlink_ext_ack *extack) +void nla_get_range_unsigned(const struct nla_policy *pt, + struct netlink_range_validation *range) { - struct netlink_range_validation _range = { - .min = 0, - .max = U64_MAX, - }, *range = &_range; - u64 value; - WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); + range->min = 0; + + switch (pt->type) { + case NLA_U8: + range->max = U8_MAX; + break; + case NLA_U16: + range->max = U16_MAX; + break; + case NLA_U32: + range->max = U32_MAX; + break; + case NLA_U64: + case NLA_MSECS: + range->max = U64_MAX; + break; + default: + WARN_ON_ONCE(1); + return; + } + switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: - range = pt->range; + *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; @@ -138,7 +152,17 @@ static int nla_validate_int_range_unsigned(const struct nla_policy *pt, case NLA_VALIDATE_MAX: range->max = pt->max; break; + default: + break; } +} + +static int nla_validate_int_range_unsigned(const struct nla_policy *pt, + const struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + struct netlink_range_validation range; + u64 value; switch (pt->type) { case NLA_U8: @@ -158,7 +182,9 @@ static int nla_validate_int_range_unsigned(const struct nla_policy *pt, return -EINVAL; } - if (value < range->min || value > range->max) { + nla_get_range_unsigned(pt, &range); + + if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR(extack, nla, "integer out of range"); return -ERANGE; @@ -167,15 +193,30 @@ static int nla_validate_int_range_unsigned(const struct nla_policy *pt, return 0; } -static int nla_validate_int_range_signed(const struct nla_policy *pt, - const struct nlattr *nla, - struct netlink_ext_ack *extack) +void nla_get_range_signed(const struct nla_policy *pt, + struct netlink_range_validation_signed *range) { - struct netlink_range_validation_signed _range = { - .min = S64_MIN, - .max = S64_MAX, - }, *range = &_range; - s64 value; + switch (pt->type) { + case NLA_S8: + range->min = S8_MIN; + range->max = S8_MAX; + break; + case NLA_S16: + range->min = S16_MIN; + range->max = S16_MAX; + break; + case NLA_S32: + range->min = S32_MIN; + range->max = S32_MAX; + break; + case NLA_S64: + range->min = S64_MIN; + range->max = S64_MAX; + break; + default: + WARN_ON_ONCE(1); + return; + } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: @@ -183,7 +224,7 @@ static int nla_validate_int_range_signed(const struct nla_policy *pt, range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: - range = pt->range_signed; + *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; @@ -191,7 +232,17 @@ static int nla_validate_int_range_signed(const struct nla_policy *pt, case NLA_VALIDATE_MAX: range->max = pt->max; break; + default: + break; } +} + +static int nla_validate_int_range_signed(const struct nla_policy *pt, + const struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + struct netlink_range_validation_signed range; + s64 value; switch (pt->type) { case NLA_S8: @@ -210,7 +261,9 @@ static int nla_validate_int_range_signed(const struct nla_policy *pt, return -EINVAL; } - if (value < range->min || value > range->max) { + nla_get_range_signed(pt, &range); + + if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR(extack, nla, "integer out of range"); return -ERANGE; -- cgit v1.2.3-59-g8ed1b From d07dcf9aadd6b2842b439e8668ff7ea2873f28d7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:12 +0200 Subject: netlink: add infrastructure to expose policies to userspace Add, and use in generic netlink, helpers to dump out a netlink policy to userspace, including all the range validation data, nested policies etc. This lets userspace discover what the kernel understands. For families/commands other than generic netlink, the helpers need to be used directly in an appropriate command, or we can add some infrastructure (a new netlink family) that those can register their policies with for introspection. I'm not that familiar with non-generic netlink, so that's left out for now. The data exposed to userspace also includes min and max length for binary/string data, I've done that instead of letting the userspace tools figure out whether min/max is intended based on the type so that we can extend this later in the kernel, we might want to just use the range data for example. Because of this, I opted to not directly expose the NLA_* values, even if some of them are already exposed via BPF, as with min/max length we don't need to have different types here for NLA_BINARY/NLA_MIN_LEN/NLA_EXACT_LEN, we just make them all NL_ATTR_TYPE_BINARY with min/max length optionally set. Similarly, we don't really need NLA_MSECS, and perhaps can remove it in the future - but not if we encode it into the userspace API now. It gets mapped to NL_ATTR_TYPE_U64 here. Note that the exposing here corresponds to the strict policy interpretation, and NLA_UNSPEC items are omitted entirely. To get those, change them to NLA_MIN_LEN which behaves in exactly the same way, but is exposed. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 6 + include/uapi/linux/genetlink.h | 2 + include/uapi/linux/netlink.h | 103 ++++++++++++++ net/netlink/Makefile | 2 +- net/netlink/genetlink.c | 78 +++++++++++ net/netlink/policy.c | 308 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 net/netlink/policy.c diff --git a/include/net/netlink.h b/include/net/netlink.h index 557b67f1db99..c0411f14fb53 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1933,4 +1933,10 @@ void nla_get_range_unsigned(const struct nla_policy *pt, void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); +int netlink_policy_dump_start(const struct nla_policy *policy, + unsigned int maxtype, + unsigned long *state); +bool netlink_policy_dump_loop(unsigned long *state); +int netlink_policy_dump_write(struct sk_buff *skb, unsigned long state); + #endif diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 877f7fa95466..9c0636ec2286 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -48,6 +48,7 @@ enum { CTRL_CMD_NEWMCAST_GRP, CTRL_CMD_DELMCAST_GRP, CTRL_CMD_GETMCAST_GRP, /* unused */ + CTRL_CMD_GETPOLICY, __CTRL_CMD_MAX, }; @@ -62,6 +63,7 @@ enum { CTRL_ATTR_MAXATTR, CTRL_ATTR_OPS, CTRL_ATTR_MCAST_GROUPS, + CTRL_ATTR_POLICY, __CTRL_ATTR_MAX, }; diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 0a4d73317759..eac8a6a648ea 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -249,4 +249,107 @@ struct nla_bitfield32 { __u32 selector; }; +/* + * policy descriptions - it's specific to each family how this is used + * Normally, it should be retrieved via a dump inside another attribute + * specifying where it applies. + */ + +/** + * enum netlink_attribute_type - type of an attribute + * @NL_ATTR_TYPE_INVALID: unused + * @NL_ATTR_TYPE_FLAG: flag attribute (present/not present) + * @NL_ATTR_TYPE_U8: 8-bit unsigned attribute + * @NL_ATTR_TYPE_U16: 16-bit unsigned attribute + * @NL_ATTR_TYPE_U32: 32-bit unsigned attribute + * @NL_ATTR_TYPE_U64: 64-bit unsigned attribute + * @NL_ATTR_TYPE_S8: 8-bit signed attribute + * @NL_ATTR_TYPE_S16: 16-bit signed attribute + * @NL_ATTR_TYPE_S32: 32-bit signed attribute + * @NL_ATTR_TYPE_S64: 64-bit signed attribute + * @NL_ATTR_TYPE_BINARY: binary data, min/max length may be specified + * @NL_ATTR_TYPE_STRING: string, min/max length may be specified + * @NL_ATTR_TYPE_NUL_STRING: NUL-terminated string, + * min/max length may be specified + * @NL_ATTR_TYPE_NESTED: nested, i.e. the content of this attribute + * consists of sub-attributes. The nested policy and maxtype + * inside may be specified. + * @NL_ATTR_TYPE_NESTED_ARRAY: nested array, i.e. the content of this + * attribute contains sub-attributes whose type is irrelevant + * (just used to separate the array entries) and each such array + * entry has attributes again, the policy for those inner ones + * and the corresponding maxtype may be specified. + * @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute + */ +enum netlink_attribute_type { + NL_ATTR_TYPE_INVALID, + + NL_ATTR_TYPE_FLAG, + + NL_ATTR_TYPE_U8, + NL_ATTR_TYPE_U16, + NL_ATTR_TYPE_U32, + NL_ATTR_TYPE_U64, + + NL_ATTR_TYPE_S8, + NL_ATTR_TYPE_S16, + NL_ATTR_TYPE_S32, + NL_ATTR_TYPE_S64, + + NL_ATTR_TYPE_BINARY, + NL_ATTR_TYPE_STRING, + NL_ATTR_TYPE_NUL_STRING, + + NL_ATTR_TYPE_NESTED, + NL_ATTR_TYPE_NESTED_ARRAY, + + NL_ATTR_TYPE_BITFIELD32, +}; + +/** + * enum netlink_policy_type_attr - policy type attributes + * @NL_POLICY_TYPE_ATTR_UNSPEC: unused + * @NL_POLICY_TYPE_ATTR_TYPE: type of the attribute, + * &enum netlink_attribute_type (U32) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_S: minimum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_S: maximum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_U: minimum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_U: maximum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MIN_LENGTH: minimum length for binary + * attributes, no minimum if not given (U32) + * @NL_POLICY_TYPE_ATTR_MAX_LENGTH: maximum length for binary + * attributes, no maximum if not given (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_IDX: sub policy for nested and + * nested array types (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE: maximum sub policy + * attribute for nested and nested array types, this can + * in theory be < the size of the policy pointed to by + * the index, if limited inside the nesting (U32) + * @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the + * bitfield32 type (U32) + * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment + */ +enum netlink_policy_type_attr { + NL_POLICY_TYPE_ATTR_UNSPEC, + NL_POLICY_TYPE_ATTR_TYPE, + NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + NL_POLICY_TYPE_ATTR_MIN_LENGTH, + NL_POLICY_TYPE_ATTR_MAX_LENGTH, + NL_POLICY_TYPE_ATTR_POLICY_IDX, + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + NL_POLICY_TYPE_ATTR_PAD, + + /* keep last */ + __NL_POLICY_TYPE_ATTR_MAX, + NL_POLICY_TYPE_ATTR_MAX = __NL_POLICY_TYPE_ATTR_MAX - 1 +}; + #endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/net/netlink/Makefile b/net/netlink/Makefile index de42df7f0068..e05202708c90 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -3,7 +3,7 @@ # Makefile for the netlink driver. # -obj-y := af_netlink.o genetlink.o +obj-y := af_netlink.o genetlink.o policy.o obj-$(CONFIG_NETLINK_DIAG) += netlink_diag.o netlink_diag-y := diag.o diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 9f357aa22b94..2f049692e012 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1043,6 +1043,80 @@ static int genl_ctrl_event(int event, const struct genl_family *family, return 0; } +static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct genl_family *rt; + unsigned int fam_id = cb->args[0]; + int err; + + if (!fam_id) { + struct nlattr *tb[CTRL_ATTR_MAX + 1]; + + err = genlmsg_parse(cb->nlh, &genl_ctrl, tb, + genl_ctrl.maxattr, + genl_ctrl.policy, cb->extack); + if (err) + return err; + + if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME]) + return -EINVAL; + + if (tb[CTRL_ATTR_FAMILY_ID]) { + fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); + } else { + rt = genl_family_find_byname( + nla_data(tb[CTRL_ATTR_FAMILY_NAME])); + if (!rt) + return -ENOENT; + fam_id = rt->id; + } + } + + rt = genl_family_find_byid(fam_id); + if (!rt) + return -ENOENT; + + if (!rt->policy) + return -ENODATA; + + err = netlink_policy_dump_start(rt->policy, rt->maxattr, &cb->args[1]); + if (err) + return err; + + while (netlink_policy_dump_loop(&cb->args[1])) { + void *hdr; + struct nlattr *nest; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &genl_ctrl, + NLM_F_MULTI, CTRL_CMD_GETPOLICY); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, rt->id)) + goto nla_put_failure; + + nest = nla_nest_start(skb, CTRL_ATTR_POLICY); + if (!nest) + goto nla_put_failure; + + if (netlink_policy_dump_write(skb, cb->args[1])) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + genlmsg_end(skb, hdr); + continue; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + break; + } + + cb->args[0] = fam_id; + return skb->len; +} + static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, @@ -1050,6 +1124,10 @@ static const struct genl_ops genl_ctrl_ops[] = { .doit = ctrl_getfamily, .dumpit = ctrl_dumpfamily, }, + { + .cmd = CTRL_CMD_GETPOLICY, + .dumpit = ctrl_dumppolicy, + }, }; static const struct genl_multicast_group genl_ctrl_groups[] = { diff --git a/net/netlink/policy.c b/net/netlink/policy.c new file mode 100644 index 000000000000..f6491853c797 --- /dev/null +++ b/net/netlink/policy.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NETLINK Policy advertisement to userspace + * + * Authors: Johannes Berg + * + * Copyright 2019 Intel Corporation + */ + +#include +#include +#include +#include + +#define INITIAL_POLICIES_ALLOC 10 + +struct nl_policy_dump { + unsigned int policy_idx; + unsigned int attr_idx; + unsigned int n_alloc; + struct { + const struct nla_policy *policy; + unsigned int maxtype; + } policies[]; +}; + +static int add_policy(struct nl_policy_dump **statep, + const struct nla_policy *policy, + unsigned int maxtype) +{ + struct nl_policy_dump *state = *statep; + unsigned int n_alloc, i; + + if (!policy || !maxtype) + return 0; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return 0; + + if (!state->policies[i].policy) { + state->policies[i].policy = policy; + state->policies[i].maxtype = maxtype; + return 0; + } + } + + n_alloc = state->n_alloc + INITIAL_POLICIES_ALLOC; + state = krealloc(state, struct_size(state, policies, n_alloc), + GFP_KERNEL); + if (!state) + return -ENOMEM; + + state->policies[state->n_alloc].policy = policy; + state->policies[state->n_alloc].maxtype = maxtype; + state->n_alloc = n_alloc; + *statep = state; + + return 0; +} + +static unsigned int get_policy_idx(struct nl_policy_dump *state, + const struct nla_policy *policy) +{ + unsigned int i; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return i; + } + + WARN_ON_ONCE(1); + return -1; +} + +int netlink_policy_dump_start(const struct nla_policy *policy, + unsigned int maxtype, + unsigned long *_state) +{ + struct nl_policy_dump *state; + unsigned int policy_idx; + int err; + + /* also returns 0 if "*_state" is our ERR_PTR() end marker */ + if (*_state) + return 0; + + /* + * walk the policies and nested ones first, and build + * a linear list of them. + */ + + state = kzalloc(struct_size(state, policies, INITIAL_POLICIES_ALLOC), + GFP_KERNEL); + if (!state) + return -ENOMEM; + state->n_alloc = INITIAL_POLICIES_ALLOC; + + err = add_policy(&state, policy, maxtype); + if (err) + return err; + + for (policy_idx = 0; + policy_idx < state->n_alloc && state->policies[policy_idx].policy; + policy_idx++) { + const struct nla_policy *policy; + unsigned int type; + + policy = state->policies[policy_idx].policy; + + for (type = 0; + type <= state->policies[policy_idx].maxtype; + type++) { + switch (policy[type].type) { + case NLA_NESTED: + case NLA_NESTED_ARRAY: + err = add_policy(&state, + policy[type].nested_policy, + policy[type].len); + if (err) + return err; + break; + default: + break; + } + } + } + + *_state = (unsigned long)state; + + return 0; +} + +static bool netlink_policy_dump_finished(struct nl_policy_dump *state) +{ + return state->policy_idx >= state->n_alloc || + !state->policies[state->policy_idx].policy; +} + +bool netlink_policy_dump_loop(unsigned long *_state) +{ + struct nl_policy_dump *state = (void *)*_state; + + if (IS_ERR(state)) + return false; + + if (netlink_policy_dump_finished(state)) { + kfree(state); + /* store end marker instead of freed state */ + *_state = (unsigned long)ERR_PTR(-ENOENT); + return false; + } + + return true; +} + +int netlink_policy_dump_write(struct sk_buff *skb, unsigned long _state) +{ + struct nl_policy_dump *state = (void *)_state; + const struct nla_policy *pt; + struct nlattr *policy, *attr; + enum netlink_attribute_type type; + bool again; + +send_attribute: + again = false; + + pt = &state->policies[state->policy_idx].policy[state->attr_idx]; + + policy = nla_nest_start(skb, state->policy_idx); + if (!policy) + return -ENOBUFS; + + attr = nla_nest_start(skb, state->attr_idx); + if (!attr) + goto nla_put_failure; + + switch (pt->type) { + default: + case NLA_UNSPEC: + case NLA_REJECT: + /* skip - use NLA_MIN_LEN to advertise such */ + nla_nest_cancel(skb, policy); + again = true; + goto next; + case NLA_NESTED: + type = NL_ATTR_TYPE_NESTED; + /* fall through */ + case NLA_NESTED_ARRAY: + if (pt->type == NLA_NESTED_ARRAY) + type = NL_ATTR_TYPE_NESTED_ARRAY; + if (pt->nested_policy && pt->len && + (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_IDX, + get_policy_idx(state, pt->nested_policy)) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + pt->len))) + goto nla_put_failure; + break; + case NLA_U8: + case NLA_U16: + case NLA_U32: + case NLA_U64: + case NLA_MSECS: { + struct netlink_range_validation range; + + if (pt->type == NLA_U8) + type = NL_ATTR_TYPE_U8; + else if (pt->type == NLA_U16) + type = NL_ATTR_TYPE_U16; + else if (pt->type == NLA_U32) + type = NL_ATTR_TYPE_U32; + else + type = NL_ATTR_TYPE_U64; + + nla_get_range_unsigned(pt, &range); + + if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_S8: + case NLA_S16: + case NLA_S32: + case NLA_S64: { + struct netlink_range_validation_signed range; + + if (pt->type == NLA_S8) + type = NL_ATTR_TYPE_S8; + else if (pt->type == NLA_S16) + type = NL_ATTR_TYPE_S16; + else if (pt->type == NLA_S32) + type = NL_ATTR_TYPE_S32; + else + type = NL_ATTR_TYPE_S64; + + nla_get_range_signed(pt, &range); + + if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_BITFIELD32: + type = NL_ATTR_TYPE_BITFIELD32; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + pt->bitfield32_valid)) + goto nla_put_failure; + break; + case NLA_EXACT_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_STRING: + case NLA_NUL_STRING: + case NLA_BINARY: + if (pt->type == NLA_STRING) + type = NL_ATTR_TYPE_STRING; + else if (pt->type == NLA_NUL_STRING) + type = NL_ATTR_TYPE_NUL_STRING; + else + type = NL_ATTR_TYPE_BINARY; + if (pt->len && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, + pt->len)) + goto nla_put_failure; + break; + case NLA_MIN_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_FLAG: + type = NL_ATTR_TYPE_FLAG; + break; + } + + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_TYPE, type)) + goto nla_put_failure; + + /* finish and move state to next attribute */ + nla_nest_end(skb, attr); + nla_nest_end(skb, policy); + +next: + state->attr_idx += 1; + if (state->attr_idx > state->policies[state->policy_idx].maxtype) { + state->attr_idx = 0; + state->policy_idx++; + } + + if (again) { + if (netlink_policy_dump_finished(state)) + return -ENODATA; + goto send_attribute; + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, policy); + return -ENOBUFS; +} -- cgit v1.2.3-59-g8ed1b From f256356f65e6449a9fcf6089ea25882c91768665 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 27 Apr 2020 11:39:03 +0800 Subject: ptp_qoriq: output PPS signal on FIPER2 in default Output PPS signal on FIPER2 (Fixed Period Interval Pulse) in default which is more desired by user. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- include/linux/fsl/ptp_qoriq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index 75884563059f..884b8f8ca06d 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -135,7 +135,7 @@ struct ptp_qoriq_registers { #define DEFAULT_CKSEL 1 #define DEFAULT_TMR_PRSC 2 #define DEFAULT_FIPER1_PERIOD 1000000000 -#define DEFAULT_FIPER2_PERIOD 100000 +#define DEFAULT_FIPER2_PERIOD 1000000000 struct ptp_qoriq { void __iomem *base; -- cgit v1.2.3-59-g8ed1b From 7ae9a4f483eccfd46f40ede8d2bd28fd2f7a9c5f Mon Sep 17 00:00:00 2001 From: Aishwarya Ramakrishnan Date: Mon, 27 Apr 2020 16:02:30 +0530 Subject: dpaa_eth: Fix comparing pointer to 0 Fixes coccicheck warning: ./drivers/net/ethernet/freescale/dpaa/dpaa_eth.c:2110:30-31: WARNING comparing pointer to 0 Avoid pointer type value compared to 0. Signed-off-by: Aishwarya Ramakrishnan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 2cd1f8efdfa3..c4416a5f8816 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2107,7 +2107,7 @@ workaround: /* Workaround for DPAA_A050385 requires data start to be aligned */ start = PTR_ALIGN(new_skb->data, DPAA_A050385_ALIGN); - if (start - new_skb->data != 0) + if (start - new_skb->data) skb_reserve(new_skb, start - new_skb->data); skb_put(new_skb, skb->len); -- cgit v1.2.3-59-g8ed1b From 654cad8b6a17dcb00077070b27bc65873951a568 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 27 Apr 2020 12:11:10 +0000 Subject: octeontx2-pf: Fix error return code in otx2_probe() Fix to return negative error code -ENOMEM from the error handling case instead of 0, as done elsewhere in this function. Fixes: 5a6d7c9daef3 ("octeontx2-pf: Mailbox communication with AF") Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 411e5ea1031e..64786568af0d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1856,13 +1856,17 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) num_vec = pci_msix_vec_count(pdev); hw->irq_name = devm_kmalloc_array(&hw->pdev->dev, num_vec, NAME_SIZE, GFP_KERNEL); - if (!hw->irq_name) + if (!hw->irq_name) { + err = -ENOMEM; goto err_free_netdev; + } hw->affinity_mask = devm_kcalloc(&hw->pdev->dev, num_vec, sizeof(cpumask_var_t), GFP_KERNEL); - if (!hw->affinity_mask) + if (!hw->affinity_mask) { + err = -ENOMEM; goto err_free_netdev; + } /* Map CSRs */ pf->reg_base = pcim_iomap(pdev, PCI_CFG_REG_BAR_NUM, 0); -- cgit v1.2.3-59-g8ed1b From f8d530ac29fe9248f5e58ca5bcf4c368f8393ccf Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 27 Apr 2020 12:12:28 +0000 Subject: ice: Fix error return code in ice_add_prof() Fix to return a error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: 31ad4e4ee1e4 ("ice: Allocate flow profile") Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice_flex_pipe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c index 42bac3ec5526..e7a2671222d2 100644 --- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c +++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c @@ -2962,8 +2962,10 @@ ice_add_prof(struct ice_hw *hw, enum ice_block blk, u64 id, u8 ptypes[], /* add profile info */ prof = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*prof), GFP_KERNEL); - if (!prof) + if (!prof) { + status = ICE_ERR_NO_MEMORY; goto err_ice_add_prof; + } prof->profile_cookie = id; prof->prof_id = prof_id; -- cgit v1.2.3-59-g8ed1b From 88ec7cb22ddde725ed4ce15991f0bd9dd817fd85 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 27 Apr 2020 12:15:07 +0000 Subject: net: lpc-enet: fix error return code in lpc_mii_init() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: b7370112f519 ("lpc32xx: Added ethernet driver") Signed-off-by: Wei Yongjun Acked-by: Vladimir Zapolskiy Signed-off-by: David S. Miller --- drivers/net/ethernet/nxp/lpc_eth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index d20cf03a3ea0..311454d9b0bc 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -823,7 +823,8 @@ static int lpc_mii_init(struct netdata_local *pldat) if (err) goto err_out_unregister_bus; - if (lpc_mii_probe(pldat->ndev) != 0) + err = lpc_mii_probe(pldat->ndev); + if (err) goto err_out_unregister_bus; return 0; -- cgit v1.2.3-59-g8ed1b From 0a699302be5986307b3dcf84ac7a0dd30f9e9305 Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 27 Apr 2020 22:08:04 +0800 Subject: net: ethernet: fec: Revert "net: ethernet: fec: Replace interrupt driven MDIO with polled IO" This reverts commit 29ae6bd1b0d8a57d7c00ab12cbb949fc41986eef. The commit breaks ethernet function on i.MX6SX, i.MX7D, i.MX8MM, i.MX8MQ, and i.MX8QXP platforms. Boot yocto system by NFS mounting rootfs will be failed with the commit. Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec.h | 4 +- drivers/net/ethernet/freescale/fec_main.c | 67 +++++++++++++++---------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index a6cdd5b61921..e74dd1f86bba 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -376,7 +376,8 @@ struct bufdesc_ex { #define FEC_ENET_TS_AVAIL ((uint)0x00010000) #define FEC_ENET_TS_TIMER ((uint)0x00008000) -#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF) +#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII) +#define FEC_NAPI_IMASK FEC_ENET_MII #define FEC_RX_DISABLED_IMASK (FEC_DEFAULT_IMASK & (~FEC_ENET_RXF)) /* ENET interrupt coalescing macro define */ @@ -542,6 +543,7 @@ struct fec_enet_private { int link; int full_duplex; int speed; + struct completion mdio_done; int irq[FEC_IRQ_NUM]; bool bufdesc_ex; int pause_flag; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 1ae075a246a3..c7b84bb22f75 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -976,8 +976,8 @@ fec_restart(struct net_device *ndev) writel((__force u32)cpu_to_be32(temp_mac[1]), fep->hwp + FEC_ADDR_HIGH); - /* Clear any outstanding interrupt, except MDIO. */ - writel((0xffffffff & ~FEC_ENET_MII), fep->hwp + FEC_IEVENT); + /* Clear any outstanding interrupt. */ + writel(0xffffffff, fep->hwp + FEC_IEVENT); fec_enet_bd_init(ndev); @@ -1123,7 +1123,7 @@ fec_restart(struct net_device *ndev) if (fep->link) writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK); else - writel(0, fep->hwp + FEC_IMASK); + writel(FEC_ENET_MII, fep->hwp + FEC_IMASK); /* Init the interrupt coalescing */ fec_enet_itr_coal_init(ndev); @@ -1652,10 +1652,6 @@ fec_enet_interrupt(int irq, void *dev_id) irqreturn_t ret = IRQ_NONE; int_events = readl(fep->hwp + FEC_IEVENT); - - /* Don't clear MDIO events, we poll for those */ - int_events &= ~FEC_ENET_MII; - writel(int_events, fep->hwp + FEC_IEVENT); fec_enet_collect_events(fep, int_events); @@ -1663,12 +1659,16 @@ fec_enet_interrupt(int irq, void *dev_id) ret = IRQ_HANDLED; if (napi_schedule_prep(&fep->napi)) { - /* Disable interrupts */ - writel(0, fep->hwp + FEC_IMASK); + /* Disable the NAPI interrupts */ + writel(FEC_NAPI_IMASK, fep->hwp + FEC_IMASK); __napi_schedule(&fep->napi); } } + if (int_events & FEC_ENET_MII) { + ret = IRQ_HANDLED; + complete(&fep->mdio_done); + } return ret; } @@ -1818,24 +1818,11 @@ static void fec_enet_adjust_link(struct net_device *ndev) phy_print_status(phy_dev); } -static int fec_enet_mdio_wait(struct fec_enet_private *fep) -{ - uint ievent; - int ret; - - ret = readl_poll_timeout_atomic(fep->hwp + FEC_IEVENT, ievent, - ievent & FEC_ENET_MII, 2, 30000); - - if (!ret) - writel(FEC_ENET_MII, fep->hwp + FEC_IEVENT); - - return ret; -} - static int fec_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum) { struct fec_enet_private *fep = bus->priv; struct device *dev = &fep->pdev->dev; + unsigned long time_left; int ret = 0, frame_start, frame_addr, frame_op; bool is_c45 = !!(regnum & MII_ADDR_C45); @@ -1843,6 +1830,8 @@ static int fec_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum) if (ret < 0) return ret; + reinit_completion(&fep->mdio_done); + if (is_c45) { frame_start = FEC_MMFR_ST_C45; @@ -1854,9 +1843,11 @@ static int fec_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum) fep->hwp + FEC_MII_DATA); /* wait for end of transfer */ - ret = fec_enet_mdio_wait(fep); - if (ret) { + time_left = wait_for_completion_timeout(&fep->mdio_done, + usecs_to_jiffies(FEC_MII_TIMEOUT)); + if (time_left == 0) { netdev_err(fep->netdev, "MDIO address write timeout\n"); + ret = -ETIMEDOUT; goto out; } @@ -1875,9 +1866,11 @@ static int fec_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum) FEC_MMFR_TA, fep->hwp + FEC_MII_DATA); /* wait for end of transfer */ - ret = fec_enet_mdio_wait(fep); - if (ret) { + time_left = wait_for_completion_timeout(&fep->mdio_done, + usecs_to_jiffies(FEC_MII_TIMEOUT)); + if (time_left == 0) { netdev_err(fep->netdev, "MDIO read timeout\n"); + ret = -ETIMEDOUT; goto out; } @@ -1895,6 +1888,7 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, { struct fec_enet_private *fep = bus->priv; struct device *dev = &fep->pdev->dev; + unsigned long time_left; int ret, frame_start, frame_addr; bool is_c45 = !!(regnum & MII_ADDR_C45); @@ -1904,6 +1898,8 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, else ret = 0; + reinit_completion(&fep->mdio_done); + if (is_c45) { frame_start = FEC_MMFR_ST_C45; @@ -1915,9 +1911,11 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, fep->hwp + FEC_MII_DATA); /* wait for end of transfer */ - ret = fec_enet_mdio_wait(fep); - if (ret) { + time_left = wait_for_completion_timeout(&fep->mdio_done, + usecs_to_jiffies(FEC_MII_TIMEOUT)); + if (time_left == 0) { netdev_err(fep->netdev, "MDIO address write timeout\n"); + ret = -ETIMEDOUT; goto out; } } else { @@ -1933,9 +1931,12 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, fep->hwp + FEC_MII_DATA); /* wait for end of transfer */ - ret = fec_enet_mdio_wait(fep); - if (ret) + time_left = wait_for_completion_timeout(&fep->mdio_done, + usecs_to_jiffies(FEC_MII_TIMEOUT)); + if (time_left == 0) { netdev_err(fep->netdev, "MDIO write timeout\n"); + ret = -ETIMEDOUT; + } out: pm_runtime_mark_last_busy(dev); @@ -2144,9 +2145,6 @@ static int fec_enet_mii_init(struct platform_device *pdev) writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); - /* Clear any pending transaction complete indication */ - writel(FEC_ENET_MII, fep->hwp + FEC_IEVENT); - fep->mii_bus = mdiobus_alloc(); if (fep->mii_bus == NULL) { err = -ENOMEM; @@ -3688,6 +3686,7 @@ fec_probe(struct platform_device *pdev) fep->irq[i] = irq; } + init_completion(&fep->mdio_done); ret = fec_enet_mii_init(pdev); if (ret) goto failed_mii_init; -- cgit v1.2.3-59-g8ed1b From c4db9934a33e5f276965a14b3eea7a6d64c85065 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 27 Apr 2020 09:40:52 +0000 Subject: net: ll_temac: Fix return value check in temac_probe() In case of error, the function devm_ioremap() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Signed-off-by: Wei Yongjun Acked-by: Esben Haabendal Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/ll_temac_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 3e313e71ae36..929244064abd 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1410,9 +1410,9 @@ static int temac_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); lp->regs = devm_ioremap(&pdev->dev, res->start, resource_size(res)); - if (IS_ERR(lp->regs)) { + if (!lp->regs) { dev_err(&pdev->dev, "could not map TEMAC registers\n"); - return PTR_ERR(lp->regs); + return -ENOMEM; } /* Select register access functions with the specified @@ -1505,10 +1505,10 @@ static int temac_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 1); lp->sdma_regs = devm_ioremap(&pdev->dev, res->start, resource_size(res)); - if (IS_ERR(lp->sdma_regs)) { + if (!lp->sdma_regs) { dev_err(&pdev->dev, "could not map DMA registers\n"); - return PTR_ERR(lp->sdma_regs); + return -ENOMEM; } if (pdata->dma_little_endian) { lp->dma_in = temac_dma_in32_le; -- cgit v1.2.3-59-g8ed1b From 97fff7c8de1e54e5326dfeb66085796864bceb64 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 27 Apr 2020 10:43:22 +0000 Subject: dpaa2-eth: fix error return code in setup_dpni() Fix to return negative error code -ENOMEM from the error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 8ec435ba7d27..11accab81ea1 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -2702,8 +2702,10 @@ static int setup_dpni(struct fsl_mc_device *ls_dev) priv->cls_rules = devm_kzalloc(dev, sizeof(struct dpaa2_eth_cls_rule) * dpaa2_eth_fs_count(priv), GFP_KERNEL); - if (!priv->cls_rules) + if (!priv->cls_rules) { + err = -ENOMEM; goto close; + } return 0; -- cgit v1.2.3-59-g8ed1b From 78734404ef9c133eac70339415c8028dbe19109a Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 28 Apr 2020 00:01:39 -0700 Subject: net: usb: ax88179_178a: Implement ethtool_ops set_eeprom The vendor driver does upon failing to read a valid MAC address from EEPROM write the netdev's address back to EEPROM and invoking a EEPROM reload operation. Based on this we can implement the ethtool_ops set_eeprom and provide the means to populate the EEPROM from within Linux. It's worth noting that ax88179_get_eeprom() will return some default data unless the content of the EEPROM is deemed "complete", so until the EEPROM is fully populated (e.g. by running ethtool -e | ethtool -E) data written with ax88179_set_eeprom() will appear not to stick. The implementation is based on asix_set_eeprom(), from asix_common.c Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- drivers/net/usb/ax88179_178a.c | 77 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 93044cf1417a..b05bb11a02cb 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -31,6 +31,7 @@ #define AX_ACCESS_PHY 0x02 #define AX_ACCESS_EEPROM 0x04 #define AX_ACCESS_EFUS 0x05 +#define AX_RELOAD_EEPROM_EFUSE 0x06 #define AX_PAUSE_WATERLVL_HIGH 0x54 #define AX_PAUSE_WATERLVL_LOW 0x55 @@ -611,6 +612,81 @@ ax88179_get_eeprom(struct net_device *net, struct ethtool_eeprom *eeprom, return 0; } +static int +ax88179_set_eeprom(struct net_device *net, struct ethtool_eeprom *eeprom, + u8 *data) +{ + struct usbnet *dev = netdev_priv(net); + u16 *eeprom_buff; + int first_word; + int last_word; + int ret; + int i; + + netdev_dbg(net, "write EEPROM len %d, offset %d, magic 0x%x\n", + eeprom->len, eeprom->offset, eeprom->magic); + + if (eeprom->len == 0) + return -EINVAL; + + if (eeprom->magic != AX88179_EEPROM_MAGIC) + return -EINVAL; + + first_word = eeprom->offset >> 1; + last_word = (eeprom->offset + eeprom->len - 1) >> 1; + + eeprom_buff = kmalloc_array(last_word - first_word + 1, sizeof(u16), + GFP_KERNEL); + if (!eeprom_buff) + return -ENOMEM; + + /* align data to 16 bit boundaries, read the missing data from + the EEPROM */ + if (eeprom->offset & 1) { + ret = ax88179_read_cmd(dev, AX_ACCESS_EEPROM, first_word, 1, 2, + &eeprom_buff[0]); + if (ret < 0) { + netdev_err(net, "Failed to read EEPROM at offset 0x%02x.\n", first_word); + goto free; + } + } + + if ((eeprom->offset + eeprom->len) & 1) { + ret = ax88179_read_cmd(dev, AX_ACCESS_EEPROM, last_word, 1, 2, + &eeprom_buff[last_word - first_word]); + if (ret < 0) { + netdev_err(net, "Failed to read EEPROM at offset 0x%02x.\n", last_word); + goto free; + } + } + + memcpy((u8 *)eeprom_buff + (eeprom->offset & 1), data, eeprom->len); + + for (i = first_word; i <= last_word; i++) { + netdev_dbg(net, "write to EEPROM at offset 0x%02x, data 0x%04x\n", + i, eeprom_buff[i - first_word]); + ret = ax88179_write_cmd(dev, AX_ACCESS_EEPROM, i, 1, 2, + &eeprom_buff[i - first_word]); + if (ret < 0) { + netdev_err(net, "Failed to write EEPROM at offset 0x%02x.\n", i); + goto free; + } + msleep(20); + } + + /* reload EEPROM data */ + ret = ax88179_write_cmd(dev, AX_RELOAD_EEPROM_EFUSE, 0x0000, 0, 0, NULL); + if (ret < 0) { + netdev_err(net, "Failed to reload EEPROM data\n"); + goto free; + } + + ret = 0; +free: + kfree(eeprom_buff); + return ret; +} + static int ax88179_get_link_ksettings(struct net_device *net, struct ethtool_link_ksettings *cmd) { @@ -822,6 +898,7 @@ static const struct ethtool_ops ax88179_ethtool_ops = { .set_wol = ax88179_set_wol, .get_eeprom_len = ax88179_get_eeprom_len, .get_eeprom = ax88179_get_eeprom, + .set_eeprom = ax88179_set_eeprom, .get_eee = ax88179_get_eee, .set_eee = ax88179_set_eee, .nway_reset = usbnet_nway_reset, -- cgit v1.2.3-59-g8ed1b From b410439ca371334cd9e5772e411b5e2b6b2d0c9a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 28 Apr 2020 22:54:12 +0200 Subject: r8169: improve max jumbo packet size definition Sync definition of max jumbo packet size with vendor driver and reserve 22 bytes for VLAN ethernet header plus checksum. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index aa3e63e031da..82f2ae7b6cf7 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -88,10 +88,10 @@ #define RTL_R16(tp, reg) readw(tp->mmio_addr + (reg)) #define RTL_R32(tp, reg) readl(tp->mmio_addr + (reg)) -#define JUMBO_4K (4*1024 - ETH_HLEN - 2) -#define JUMBO_6K (6*1024 - ETH_HLEN - 2) -#define JUMBO_7K (7*1024 - ETH_HLEN - 2) -#define JUMBO_9K (9*1024 - ETH_HLEN - 2) +#define JUMBO_4K (4 * SZ_1K - VLAN_ETH_HLEN - ETH_FCS_LEN) +#define JUMBO_6K (6 * SZ_1K - VLAN_ETH_HLEN - ETH_FCS_LEN) +#define JUMBO_7K (7 * SZ_1K - VLAN_ETH_HLEN - ETH_FCS_LEN) +#define JUMBO_9K (9 * SZ_1K - VLAN_ETH_HLEN - ETH_FCS_LEN) static const struct { const char *name; -- cgit v1.2.3-59-g8ed1b From 838974e1e08a9724525fa13ef15d6e021c23c99d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 28 Apr 2020 22:55:59 +0200 Subject: r8169: configure PME_SIGNAL for RTL8125 too RTL8125 supports the same PME_SIGNAL handling as all later RTL8168 chip variants. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 82f2ae7b6cf7..0ac3976e3204 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1429,7 +1429,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) break; case RTL_GIGA_MAC_VER_34: case RTL_GIGA_MAC_VER_37: - case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_52: + case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_61: options = RTL_R8(tp, Config2) & ~PME_SIGNAL; if (wolopts) options |= PME_SIGNAL; -- cgit v1.2.3-59-g8ed1b From cde0f4f81d1c11ccc214146e1c550bfe48629fac Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 28 Apr 2020 23:15:02 +0200 Subject: net: phy: at803x: add downshift support The AR8031 and AR8035 support the link speed downshift. Add driver support for it. One peculiarity of these PHYs is that it needs a software reset after changing the setting, thus add the .soft_reset() op and do a phy_init_hw() if necessary. This was tested on a custom board with the AR8031. Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 31b6edcc1fd1..f4fec5f644e9 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -43,6 +43,9 @@ #define AT803X_INTR_STATUS 0x13 #define AT803X_SMART_SPEED 0x14 +#define AT803X_SMART_SPEED_ENABLE BIT(5) +#define AT803X_SMART_SPEED_RETRY_LIMIT_MASK GENMASK(4, 2) +#define AT803X_SMART_SPEED_BYPASS_TIMER BIT(1) #define AT803X_LED_CONTROL 0x18 #define AT803X_DEVICE_ADDR 0x03 @@ -103,6 +106,10 @@ #define AT803X_CLK_OUT_STRENGTH_HALF 1 #define AT803X_CLK_OUT_STRENGTH_QUARTER 2 +#define AT803X_DEFAULT_DOWNSHIFT 5 +#define AT803X_MIN_DOWNSHIFT 2 +#define AT803X_MAX_DOWNSHIFT 9 + #define ATH9331_PHY_ID 0x004dd041 #define ATH8030_PHY_ID 0x004dd076 #define ATH8031_PHY_ID 0x004dd074 @@ -713,6 +720,80 @@ static int at803x_read_status(struct phy_device *phydev) return 0; } +static int at803x_get_downshift(struct phy_device *phydev, u8 *d) +{ + int val; + + val = phy_read(phydev, AT803X_SMART_SPEED); + if (val < 0) + return val; + + if (val & AT803X_SMART_SPEED_ENABLE) + *d = FIELD_GET(AT803X_SMART_SPEED_RETRY_LIMIT_MASK, val) + 2; + else + *d = DOWNSHIFT_DEV_DISABLE; + + return 0; +} + +static int at803x_set_downshift(struct phy_device *phydev, u8 cnt) +{ + u16 mask, set; + int ret; + + switch (cnt) { + case DOWNSHIFT_DEV_DEFAULT_COUNT: + cnt = AT803X_DEFAULT_DOWNSHIFT; + fallthrough; + case AT803X_MIN_DOWNSHIFT ... AT803X_MAX_DOWNSHIFT: + set = AT803X_SMART_SPEED_ENABLE | + AT803X_SMART_SPEED_BYPASS_TIMER | + FIELD_PREP(AT803X_SMART_SPEED_RETRY_LIMIT_MASK, cnt - 2); + mask = AT803X_SMART_SPEED_RETRY_LIMIT_MASK; + break; + case DOWNSHIFT_DEV_DISABLE: + set = 0; + mask = AT803X_SMART_SPEED_ENABLE | + AT803X_SMART_SPEED_BYPASS_TIMER; + break; + default: + return -EINVAL; + } + + ret = phy_modify_changed(phydev, AT803X_SMART_SPEED, mask, set); + + /* After changing the smart speed settings, we need to perform a + * software reset, use phy_init_hw() to make sure we set the + * reapply any values which might got lost during software reset. + */ + if (ret == 1) + ret = phy_init_hw(phydev); + + return ret; +} + +static int at803x_get_tunable(struct phy_device *phydev, + struct ethtool_tunable *tuna, void *data) +{ + switch (tuna->id) { + case ETHTOOL_PHY_DOWNSHIFT: + return at803x_get_downshift(phydev, data); + default: + return -EOPNOTSUPP; + } +} + +static int at803x_set_tunable(struct phy_device *phydev, + struct ethtool_tunable *tuna, const void *data) +{ + switch (tuna->id) { + case ETHTOOL_PHY_DOWNSHIFT: + return at803x_set_downshift(phydev, *(const u8 *)data); + default: + return -EOPNOTSUPP; + } +} + static struct phy_driver at803x_driver[] = { { /* Qualcomm Atheros AR8035 */ @@ -722,6 +803,7 @@ static struct phy_driver at803x_driver[] = { .probe = at803x_probe, .remove = at803x_remove, .config_init = at803x_config_init, + .soft_reset = genphy_soft_reset, .set_wol = at803x_set_wol, .get_wol = at803x_get_wol, .suspend = at803x_suspend, @@ -730,6 +812,8 @@ static struct phy_driver at803x_driver[] = { .read_status = at803x_read_status, .ack_interrupt = at803x_ack_interrupt, .config_intr = at803x_config_intr, + .get_tunable = at803x_get_tunable, + .set_tunable = at803x_set_tunable, }, { /* Qualcomm Atheros AR8030 */ .phy_id = ATH8030_PHY_ID, @@ -754,6 +838,7 @@ static struct phy_driver at803x_driver[] = { .probe = at803x_probe, .remove = at803x_remove, .config_init = at803x_config_init, + .soft_reset = genphy_soft_reset, .set_wol = at803x_set_wol, .get_wol = at803x_get_wol, .suspend = at803x_suspend, @@ -763,6 +848,8 @@ static struct phy_driver at803x_driver[] = { .aneg_done = at803x_aneg_done, .ack_interrupt = &at803x_ack_interrupt, .config_intr = &at803x_config_intr, + .get_tunable = at803x_get_tunable, + .set_tunable = at803x_set_tunable, }, { /* Qualcomm Atheros AR8032 */ PHY_ID_MATCH_EXACT(ATH8032_PHY_ID), -- cgit v1.2.3-59-g8ed1b From 86570d8a2f768485a515f646d702a37b34b27260 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 29 Apr 2020 01:06:56 +0200 Subject: net: phy: bcm54140: use genphy_soft_reset() Set the .soft_reset() op to be sure there will be a reset even if there is no hardware reset line registered. Signed-off-by: Michael Walle Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm54140.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c index c009ac2856a5..18d1e798a4c3 100644 --- a/drivers/net/phy/bcm54140.c +++ b/drivers/net/phy/bcm54140.c @@ -862,6 +862,7 @@ static struct phy_driver bcm54140_drivers[] = { .probe = bcm54140_probe, .suspend = genphy_suspend, .resume = genphy_resume, + .soft_reset = genphy_soft_reset, .get_tunable = bcm54140_get_tunable, .set_tunable = bcm54140_set_tunable, }, -- cgit v1.2.3-59-g8ed1b From afcecca56f0cd287c7895511dc380b95c633b1a2 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 29 Apr 2020 01:06:57 +0200 Subject: net: phy: bcm54140: fix phy_id_mask Broadcom defines the bits for this PHY as follows: { oui[24:3], model[6:0], revision[2:0] } Thus we have to mask the lower three bits only. Fixes: 6937602ed3f9 ("net: phy: add Broadcom BCM54140 support") Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm54140.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c index 18d1e798a4c3..63acf34663d9 100644 --- a/drivers/net/phy/bcm54140.c +++ b/drivers/net/phy/bcm54140.c @@ -852,7 +852,7 @@ static int bcm54140_set_tunable(struct phy_device *phydev, static struct phy_driver bcm54140_drivers[] = { { .phy_id = PHY_ID_BCM54140, - .phy_id_mask = 0xfffffff0, + .phy_id_mask = 0xfffffff8, .name = "Broadcom BCM54140", .features = PHY_GBIT_FEATURES, .config_init = bcm54140_config_init, @@ -870,7 +870,7 @@ static struct phy_driver bcm54140_drivers[] = { module_phy_driver(bcm54140_drivers); static struct mdio_device_id __maybe_unused bcm54140_tbl[] = { - { PHY_ID_BCM54140, 0xfffffff0 }, + { PHY_ID_BCM54140, 0xfffffff8 }, { } }; -- cgit v1.2.3-59-g8ed1b From e9a66851de722dfa97a41e982266603cdb97ea3b Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 29 Apr 2020 01:06:58 +0200 Subject: net: phy: bcm54140: apply the workaround on b0 chips The lower three bits of the phy_id specifies the chip stepping. The workaround is specifically for the B0 stepping. Apply it only on these chips. Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm54140.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c index 63acf34663d9..d0498ed47878 100644 --- a/drivers/net/phy/bcm54140.c +++ b/drivers/net/phy/bcm54140.c @@ -115,6 +115,9 @@ #define BCM54140_HWMON_IN_ALARM_BIT(ch) ((ch) ? BCM54140_RDB_MON_ISR_3V3 \ : BCM54140_RDB_MON_ISR_1V0) +#define BCM54140_PHY_ID_REV(phy_id) ((phy_id) & 0x7) +#define BCM54140_REV_B0 1 + #define BCM54140_DEFAULT_DOWNSHIFT 5 #define BCM54140_MAX_DOWNSHIFT 9 @@ -632,9 +635,11 @@ static int bcm54140_config_init(struct phy_device *phydev) int ret; /* Apply hardware errata */ - ret = bcm54140_b0_workaround(phydev); - if (ret) - return ret; + if (BCM54140_PHY_ID_REV(phydev->phy_id) == BCM54140_REV_B0) { + ret = bcm54140_b0_workaround(phydev); + if (ret) + return ret; + } /* Unmask events we are interested in. */ reg &= ~(BCM54140_RDB_INT_DUPLEX | -- cgit v1.2.3-59-g8ed1b From e4e51da66dc812176cca16b0f8a5b87b173deb5d Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 29 Apr 2020 01:06:59 +0200 Subject: net: phy: bcm54140: add second PHY ID This PHY has two PHY IDs depending on its mode. Adjust the mask so that it includes both IDs. Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm54140.c | 11 +++++++++-- include/linux/brcmphy.h | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c index d0498ed47878..400d7c3c405a 100644 --- a/drivers/net/phy/bcm54140.c +++ b/drivers/net/phy/bcm54140.c @@ -115,6 +115,13 @@ #define BCM54140_HWMON_IN_ALARM_BIT(ch) ((ch) ? BCM54140_RDB_MON_ISR_3V3 \ : BCM54140_RDB_MON_ISR_1V0) +/* This PHY has two different PHY IDs depening on its MODE_SEL pin. This + * pin choses between 4x SGMII and QSGMII mode: + * AE02_5009 4x SGMII + * AE02_5019 QSGMII + */ +#define BCM54140_PHY_ID_MASK 0xffffffe8 + #define BCM54140_PHY_ID_REV(phy_id) ((phy_id) & 0x7) #define BCM54140_REV_B0 1 @@ -857,7 +864,7 @@ static int bcm54140_set_tunable(struct phy_device *phydev, static struct phy_driver bcm54140_drivers[] = { { .phy_id = PHY_ID_BCM54140, - .phy_id_mask = 0xfffffff8, + .phy_id_mask = BCM54140_PHY_ID_MASK, .name = "Broadcom BCM54140", .features = PHY_GBIT_FEATURES, .config_init = bcm54140_config_init, @@ -875,7 +882,7 @@ static struct phy_driver bcm54140_drivers[] = { module_phy_driver(bcm54140_drivers); static struct mdio_device_id __maybe_unused bcm54140_tbl[] = { - { PHY_ID_BCM54140, 0xfffffff8 }, + { PHY_ID_BCM54140, BCM54140_PHY_ID_MASK }, { } }; diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 8be150e69c7c..58d0150acc3e 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -25,7 +25,7 @@ #define PHY_ID_BCM5461 0x002060c0 #define PHY_ID_BCM54612E 0x03625e60 #define PHY_ID_BCM54616S 0x03625d10 -#define PHY_ID_BCM54140 0xae025019 +#define PHY_ID_BCM54140 0xae025009 #define PHY_ID_BCM57780 0x03625d90 #define PHY_ID_BCM89610 0x03625cd0 -- cgit v1.2.3-59-g8ed1b From 2597912514925f6f06e85807cc9fc0fdfa630d11 Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Wed, 29 Apr 2020 09:58:24 +0800 Subject: hinic: make symbol 'dump_mox_reg' static Fix sparse warnings: drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c:601:6: warning: symbol 'dump_mox_reg' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Zheng Bin Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c index f8626dfd192e..d5cf31529dbf 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c @@ -598,7 +598,7 @@ static void write_mbox_msg_attr(struct hinic_mbox_func_to_func *func_to_func, HINIC_FUNC_CSR_MAILBOX_CONTROL_OFF, mbox_ctrl); } -void dump_mox_reg(struct hinic_hwdev *hwdev) +static void dump_mox_reg(struct hinic_hwdev *hwdev) { u32 val; -- cgit v1.2.3-59-g8ed1b From 37ecb5b8b8cd3156e739fd1c56a8e3842b72ebad Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Wed, 29 Apr 2020 11:35:28 +0800 Subject: hinic: Use kmemdup instead of kzalloc and memcpy Fixes coccicheck warnings: drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c:452:17-24: WARNING opportunity for kmemdup drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c:458:23-30: WARNING opportunity for kmemdup Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c index d5cf31529dbf..564fb2294a29 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mbox.c @@ -449,18 +449,15 @@ static void recv_mbox_handler(struct hinic_mbox_func_to_func *func_to_func, return; } - rcv_mbox_temp = kzalloc(sizeof(*rcv_mbox_temp), GFP_KERNEL); + rcv_mbox_temp = kmemdup(recv_mbox, sizeof(*rcv_mbox_temp), GFP_KERNEL); if (!rcv_mbox_temp) return; - memcpy(rcv_mbox_temp, recv_mbox, sizeof(*rcv_mbox_temp)); - - rcv_mbox_temp->mbox = kzalloc(MBOX_MAX_BUF_SZ, GFP_KERNEL); + rcv_mbox_temp->mbox = kmemdup(recv_mbox->mbox, MBOX_MAX_BUF_SZ, + GFP_KERNEL); if (!rcv_mbox_temp->mbox) goto err_alloc_rcv_mbox_msg; - memcpy(rcv_mbox_temp->mbox, recv_mbox->mbox, MBOX_MAX_BUF_SZ); - rcv_mbox_temp->buf_out = kzalloc(MBOX_MAX_BUF_SZ, GFP_KERNEL); if (!rcv_mbox_temp->buf_out) goto err_alloc_rcv_mbox_buf; -- cgit v1.2.3-59-g8ed1b From d46edd671a147032e22cfeb271a5734703093649 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:04 -0700 Subject: bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats. Typical userspace tools use kernel.bpf_stats_enabled as follows: 1. Enable kernel.bpf_stats_enabled; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Disable kernel.bpf_stats_enabled. The problem with this approach is that only one userspace tool can toggle this sysctl. If multiple tools toggle the sysctl at the same time, the measurement may be inaccurate. To fix this problem while keep backward compatibility, introduce a new bpf command BPF_ENABLE_STATS. On success, this command enables stats and returns a valid fd. BPF_ENABLE_STATS takes argument "type". Currently, only one type, BPF_STATS_RUN_TIME, is supported. We can extend the command to support other types of stats in the future. With BPF_ENABLE_STATS, user space tool would have the following flow: 1. Get a fd with BPF_ENABLE_STATS, and make sure it is valid; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Close the fd. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200430071506.1408910-2-songliubraving@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 ++++++++ kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 36 +++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 11 ++++++++ 5 files changed, 115 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c07b1d2f3824..1262ec460ab3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -987,6 +987,7 @@ _out: \ #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); +extern struct mutex bpf_stats_enabled_mutex; /* * Block execution of BPF programs attached to instrumentation (perf, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eccafae55bb..705e4822f997 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -115,6 +115,7 @@ enum bpf_cmd { BPF_LINK_UPDATE, BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, }; enum bpf_map_type { @@ -390,6 +391,12 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -601,6 +608,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c75b2dd2459c..4f34eecec9ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3872,6 +3872,60 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr) return fd; } +DEFINE_MUTEX(bpf_stats_enabled_mutex); + +static int bpf_stats_release(struct inode *inode, struct file *file) +{ + mutex_lock(&bpf_stats_enabled_mutex); + static_key_slow_dec(&bpf_stats_enabled_key.key); + mutex_unlock(&bpf_stats_enabled_mutex); + return 0; +} + +static const struct file_operations bpf_stats_fops = { + .release = bpf_stats_release, +}; + +static int bpf_enable_runtime_stats(void) +{ + int fd; + + mutex_lock(&bpf_stats_enabled_mutex); + + /* Set a very high limit to avoid overflow */ + if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { + mutex_unlock(&bpf_stats_enabled_mutex); + return -EBUSY; + } + + fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); + if (fd >= 0) + static_key_slow_inc(&bpf_stats_enabled_key.key); + + mutex_unlock(&bpf_stats_enabled_mutex); + return fd; +} + +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type + +static int bpf_enable_stats(union bpf_attr *attr) +{ + + if (CHECK_ATTR(BPF_ENABLE_STATS)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (attr->enable_stats.type) { + case BPF_STATS_RUN_TIME: + return bpf_enable_runtime_stats(); + default: + break; + } + return -EINVAL; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3996,6 +4050,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &link_idr, &link_idr_lock); break; + case BPF_ENABLE_STATS: + err = bpf_enable_stats(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e961286d0e14..7adfe5dbce9d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -201,6 +201,40 @@ static int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL +static int bpf_stats_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} +#endif + /* * /proc/sys support */ @@ -2549,7 +2583,7 @@ static struct ctl_table kern_table[] = { .data = &bpf_stats_enabled_key.key, .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_do_static_key, + .proc_handler = bpf_stats_handler, }, #endif #if defined(CONFIG_TREE_RCU) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0eccafae55bb..705e4822f997 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -115,6 +115,7 @@ enum bpf_cmd { BPF_LINK_UPDATE, BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, }; enum bpf_map_type { @@ -390,6 +391,12 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -601,6 +608,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3-59-g8ed1b From 0bee106716cfb2c6da81916b968395db22bd7755 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:05 -0700 Subject: libbpf: Add support for command BPF_ENABLE_STATS bpf_enable_stats() is added to enable given stats. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200430071506.1408910-3-songliubraving@fb.com --- tools/lib/bpf/bpf.c | 10 ++++++++++ tools/lib/bpf/bpf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 12 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 8f2f0958d446..43322f0d6c7f 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -841,3 +841,13 @@ int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, return err; } + +int bpf_enable_stats(enum bpf_stats_type type) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.enable_stats.type = type; + + return sys_bpf(BPF_ENABLE_STATS, &attr, sizeof(attr)); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 335b457b3a25..1901b2777854 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -231,6 +231,7 @@ LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr); +LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); #ifdef __cplusplus } /* extern "C" */ diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 7cd49aa38005..e03bd4db827e 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -257,6 +257,7 @@ LIBBPF_0.0.8 { LIBBPF_0.0.9 { global: + bpf_enable_stats; bpf_link_get_fd_by_id; bpf_link_get_next_id; } LIBBPF_0.0.8; -- cgit v1.2.3-59-g8ed1b From 31a9f7fe93378ab587d758d5b2e96a237caa7b8c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:06 -0700 Subject: bpf: Add selftest for BPF_ENABLE_STATS Add test for BPF_ENABLE_STATS, which should enable run_time_ns stats. ~/selftests/bpf# ./test_progs -t enable_stats -v test_enable_stats:PASS:skel_open_and_load 0 nsec test_enable_stats:PASS:get_stats_fd 0 nsec test_enable_stats:PASS:attach_raw_tp 0 nsec test_enable_stats:PASS:get_prog_info 0 nsec test_enable_stats:PASS:check_stats_enabled 0 nsec test_enable_stats:PASS:check_run_cnt_valid 0 nsec Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200430071506.1408910-4-songliubraving@fb.com --- .../selftests/bpf/prog_tests/enable_stats.c | 45 ++++++++++++++++++++++ .../selftests/bpf/progs/test_enable_stats.c | 18 +++++++++ 2 files changed, 63 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/enable_stats.c create mode 100644 tools/testing/selftests/bpf/progs/test_enable_stats.c diff --git a/tools/testing/selftests/bpf/prog_tests/enable_stats.c b/tools/testing/selftests/bpf/prog_tests/enable_stats.c new file mode 100644 index 000000000000..2cb2085917e7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/enable_stats.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "test_enable_stats.skel.h" + +void test_enable_stats(void) +{ + struct test_enable_stats *skel; + int stats_fd, err, prog_fd; + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + int duration = 0; + + skel = test_enable_stats__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + return; + + stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME); + if (CHECK(stats_fd < 0, "get_stats_fd", "failed %d\n", errno)) { + test_enable_stats__destroy(skel); + return; + } + + err = test_enable_stats__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) + goto cleanup; + + test_enable_stats__detach(skel); + + prog_fd = bpf_program__fd(skel->progs.test_enable_stats); + memset(&info, 0, info_len); + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (CHECK(err, "get_prog_info", + "failed to get bpf_prog_info for fd %d\n", prog_fd)) + goto cleanup; + if (CHECK(info.run_time_ns == 0, "check_stats_enabled", + "failed to enable run_time_ns stats\n")) + goto cleanup; + + CHECK(info.run_cnt != skel->bss->count, "check_run_cnt_valid", + "invalid run_cnt stats\n"); + +cleanup: + test_enable_stats__destroy(skel); + close(stats_fd); +} diff --git a/tools/testing/selftests/bpf/progs/test_enable_stats.c b/tools/testing/selftests/bpf/progs/test_enable_stats.c new file mode 100644 index 000000000000..01a002ade529 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_enable_stats.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u64 count = 0; + +SEC("raw_tracepoint/sys_enter") +int test_enable_stats(void *ctx) +{ + count += 1; + return 0; +} -- cgit v1.2.3-59-g8ed1b From ef891284b130c365a7b60afaf7fe3c92c4260bfb Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 1 May 2020 10:11:28 +0200 Subject: r8169: remove not needed parameter in rtl8169_set_magic_reg Remove a not needed parameter in rtl8169_set_magic_reg. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0ac3976e3204..0f869a761d8c 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2608,7 +2608,7 @@ static void rtl_set_rx_tx_desc_registers(struct rtl8169_private *tp) RTL_W32(tp, RxDescAddrLow, ((u64) tp->RxPhyAddr) & DMA_BIT_MASK(32)); } -static void rtl8169_set_magic_reg(struct rtl8169_private *tp, unsigned mac_version) +static void rtl8169_set_magic_reg(struct rtl8169_private *tp) { u32 val; @@ -3811,7 +3811,7 @@ static void rtl_hw_start_8169(struct rtl8169_private *tp) RTL_W16(tp, CPlusCmd, tp->cp_cmd); - rtl8169_set_magic_reg(tp, tp->mac_version); + rtl8169_set_magic_reg(tp); /* disable interrupt coalescing */ RTL_W16(tp, IntrMitigate, 0x0000); -- cgit v1.2.3-59-g8ed1b From cff9f12b18915d957a2130885a00f8ab15cff7e4 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:31 +0300 Subject: net/core: Introduce netdev_get_xmit_slave Add new ndo to get the xmit slave of master device. The reference counters are not incremented so the caller must be careful with locks. User can ask to get the xmit slave assume all the slaves can transmit by set all_slaves arg to true. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 12 ++++++++++++ net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..26bc0f11b7ad 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1146,6 +1146,12 @@ struct netdev_net_notifier { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * + * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, + * struct sk_buff *skb, + * bool all_slaves); + * Get the xmit slave of master device. If all_slaves is true, function + * assume all the slaves can transmit. + * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1389,6 +1395,9 @@ struct net_device_ops { struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, @@ -2731,6 +2740,9 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index 9c9e763bfe0e..e6c10980abfd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7785,6 +7785,28 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; -- cgit v1.2.3-59-g8ed1b From 119d48fd4298594beccf4f2ecd00627826ce2646 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:32 +0300 Subject: bonding: Export skip slave logic to function As a preparation for following change that add array of all slaves, extract code that skip slave to function. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: Jay Vosburgh Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/bonding/bond_main.c | 47 +++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2e70e43c5df5..f7aded014f08 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4087,6 +4087,29 @@ err: bond_slave_arr_work_rearm(bond, 1); } +static void bond_skip_slave(struct bond_up_slave *slaves, + struct slave *skipslave) +{ + int idx; + + /* Rare situation where caller has asked to skip a specific + * slave but allocation failed (most likely!). BTW this is + * only possible when the call is initiated from + * __bond_release_one(). In this situation; overwrite the + * skipslave entry in the array with the last entry from the + * array to avoid a situation where the xmit path may choose + * this to-be-skipped slave to send a packet out. + */ + for (idx = 0; slaves && idx < slaves->count; idx++) { + if (skipslave == slaves->arr[idx]) { + slaves->arr[idx] = + slaves->arr[slaves->count - 1]; + slaves->count--; + break; + } + } +} + /* Build the usable slaves array in control path for modes that use xmit-hash * to determine the slave interface - * (a) BOND_MODE_8023AD @@ -4156,27 +4179,9 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) if (old_arr) kfree_rcu(old_arr, rcu); out: - if (ret != 0 && skipslave) { - int idx; - - /* Rare situation where caller has asked to skip a specific - * slave but allocation failed (most likely!). BTW this is - * only possible when the call is initiated from - * __bond_release_one(). In this situation; overwrite the - * skipslave entry in the array with the last entry from the - * array to avoid a situation where the xmit path may choose - * this to-be-skipped slave to send a packet out. - */ - old_arr = rtnl_dereference(bond->slave_arr); - for (idx = 0; old_arr != NULL && idx < old_arr->count; idx++) { - if (skipslave == old_arr->arr[idx]) { - old_arr->arr[idx] = - old_arr->arr[old_arr->count-1]; - old_arr->count--; - break; - } - } - } + if (ret != 0 && skipslave) + bond_skip_slave(rtnl_dereference(bond->slave_arr), skipslave); + return ret; } -- cgit v1.2.3-59-g8ed1b From ed7d4f023b1a9b0578f20d66557c66452ab845ec Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:33 +0300 Subject: bonding: Rename slave_arr to usable_slaves Rename slave_arr to usable_slaves, since we will have two arrays, one for the usable slaves and the other to all slaves. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: Jay Vosburgh Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/bonding/bond_alb.c | 4 ++-- drivers/net/bonding/bond_main.c | 40 ++++++++++++++++++++-------------------- include/net/bonding.h | 2 +- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index c81698550e5a..7bb49b049dcc 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1360,7 +1360,7 @@ netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev) struct bond_up_slave *slaves; unsigned int count; - slaves = rcu_dereference(bond->slave_arr); + slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[hash_index % @@ -1494,7 +1494,7 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) struct bond_up_slave *slaves; unsigned int count; - slaves = rcu_dereference(bond->slave_arr); + slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[bond_xmit_hash(bond, skb) % diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index f7aded014f08..2cb41d480ae2 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4120,9 +4120,9 @@ static void bond_skip_slave(struct bond_up_slave *slaves, */ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) { + struct bond_up_slave *usable_slaves, *old_usable_slaves; struct slave *slave; struct list_head *iter; - struct bond_up_slave *new_arr, *old_arr; int agg_id = 0; int ret = 0; @@ -4130,11 +4130,10 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) WARN_ON(lockdep_is_held(&bond->mode_lock)); #endif - new_arr = kzalloc(offsetof(struct bond_up_slave, arr[bond->slave_cnt]), - GFP_KERNEL); - if (!new_arr) { + usable_slaves = kzalloc(struct_size(usable_slaves, arr, + bond->slave_cnt), GFP_KERNEL); + if (!usable_slaves) { ret = -ENOMEM; - pr_err("Failed to build slave-array.\n"); goto out; } if (BOND_MODE(bond) == BOND_MODE_8023AD) { @@ -4142,14 +4141,14 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) if (bond_3ad_get_active_agg_info(bond, &ad_info)) { pr_debug("bond_3ad_get_active_agg_info failed\n"); - kfree_rcu(new_arr, rcu); + kfree_rcu(usable_slaves, rcu); /* No active aggragator means it's not safe to use * the previous array. */ - old_arr = rtnl_dereference(bond->slave_arr); - if (old_arr) { - RCU_INIT_POINTER(bond->slave_arr, NULL); - kfree_rcu(old_arr, rcu); + old_usable_slaves = rtnl_dereference(bond->usable_slaves); + if (old_usable_slaves) { + RCU_INIT_POINTER(bond->usable_slaves, NULL); + kfree_rcu(old_usable_slaves, rcu); } goto out; } @@ -4169,18 +4168,19 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) continue; slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n", - new_arr->count); + usable_slaves->count); - new_arr->arr[new_arr->count++] = slave; + usable_slaves->arr[usable_slaves->count++] = slave; } - old_arr = rtnl_dereference(bond->slave_arr); - rcu_assign_pointer(bond->slave_arr, new_arr); - if (old_arr) - kfree_rcu(old_arr, rcu); + old_usable_slaves = rtnl_dereference(bond->usable_slaves); + rcu_assign_pointer(bond->usable_slaves, usable_slaves); + if (old_usable_slaves) + kfree_rcu(old_usable_slaves, rcu); out: if (ret != 0 && skipslave) - bond_skip_slave(rtnl_dereference(bond->slave_arr), skipslave); + bond_skip_slave(rtnl_dereference(bond->usable_slaves), + skipslave); return ret; } @@ -4197,7 +4197,7 @@ static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb, struct bond_up_slave *slaves; unsigned int count; - slaves = rcu_dereference(bond->slave_arr); + slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) { slave = slaves->arr[bond_xmit_hash(bond, skb) % count]; @@ -4488,9 +4488,9 @@ static void bond_uninit(struct net_device *bond_dev) __bond_release_one(bond_dev, slave->dev, true, true); netdev_info(bond_dev, "Released all slaves\n"); - arr = rtnl_dereference(bond->slave_arr); + arr = rtnl_dereference(bond->usable_slaves); if (arr) { - RCU_INIT_POINTER(bond->slave_arr, NULL); + RCU_INIT_POINTER(bond->usable_slaves, NULL); kfree_rcu(arr, rcu); } diff --git a/include/net/bonding.h b/include/net/bonding.h index dc2ce31a1f52..33bdb6d5182d 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -200,7 +200,7 @@ struct bonding { struct slave __rcu *curr_active_slave; struct slave __rcu *current_arp_slave; struct slave __rcu *primary_slave; - struct bond_up_slave __rcu *slave_arr; /* Array of usable slaves */ + struct bond_up_slave __rcu *usable_slaves; /* Array of usable slaves */ bool force_primary; s32 slave_cnt; /* never change this value outside the attach/detach wrappers */ int (*recv_probe)(const struct sk_buff *, struct bonding *, -- cgit v1.2.3-59-g8ed1b From 34b37e204dfc8b20a09bb7b7f4c5e970c87420dd Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:34 +0300 Subject: bonding/alb: Add helper functions to get the xmit slave Add two helper functions to get the xmit slave of bond in alb or tlb mode. Extract the logic of find the xmit slave from the xmit flow to function. Xmit flow will xmit through this slave and in the following patches the new .ndo will call to the helper function to return the xmit slave. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: Jay Vosburgh Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/bonding/bond_alb.c | 35 ++++++++++++++++++++++++++--------- include/net/bond_alb.h | 4 ++++ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 7bb49b049dcc..e863c694c309 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1334,11 +1334,11 @@ out: return NETDEV_TX_OK; } -netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev) +struct slave *bond_xmit_tlb_slave_get(struct bonding *bond, + struct sk_buff *skb) { - struct bonding *bond = netdev_priv(bond_dev); - struct ethhdr *eth_data; struct slave *tx_slave = NULL; + struct ethhdr *eth_data; u32 hash_index; skb_reset_mac_header(skb); @@ -1369,20 +1369,29 @@ netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev) break; } } - return bond_do_alb_xmit(skb, bond, tx_slave); + return tx_slave; } -netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) +netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); - struct ethhdr *eth_data; + struct slave *tx_slave; + + tx_slave = bond_xmit_tlb_slave_get(bond, skb); + return bond_do_alb_xmit(skb, bond, tx_slave); +} + +struct slave *bond_xmit_alb_slave_get(struct bonding *bond, + struct sk_buff *skb) +{ struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); - struct slave *tx_slave = NULL; static const __be32 ip_bcast = htonl(0xffffffff); - int hash_size = 0; + struct slave *tx_slave = NULL; + const u8 *hash_start = NULL; bool do_tx_balance = true; + struct ethhdr *eth_data; u32 hash_index = 0; - const u8 *hash_start = NULL; + int hash_size = 0; skb_reset_mac_header(skb); eth_data = eth_hdr(skb); @@ -1501,7 +1510,15 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) count]; } } + return tx_slave; +} + +netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) +{ + struct bonding *bond = netdev_priv(bond_dev); + struct slave *tx_slave = NULL; + tx_slave = bond_xmit_alb_slave_get(bond, skb); return bond_do_alb_xmit(skb, bond, tx_slave); } diff --git a/include/net/bond_alb.h b/include/net/bond_alb.h index b3504fcd773d..f6af76c87a6c 100644 --- a/include/net/bond_alb.h +++ b/include/net/bond_alb.h @@ -158,6 +158,10 @@ void bond_alb_handle_link_change(struct bonding *bond, struct slave *slave, char void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave); int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev); int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev); +struct slave *bond_xmit_alb_slave_get(struct bonding *bond, + struct sk_buff *skb); +struct slave *bond_xmit_tlb_slave_get(struct bonding *bond, + struct sk_buff *skb); void bond_alb_monitor(struct work_struct *); int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr); void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id); -- cgit v1.2.3-59-g8ed1b From c071d91d2a89b0dac1354673810b36453aed62c4 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:35 +0300 Subject: bonding: Add helper function to get the xmit slave based on hash Both xor and 802.3ad modes use bond_xmit_hash to get the xmit slave. Export the logic to helper function so it could be used in the following patches by the .ndo to get the xmit slave. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: Jay Vosburgh Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/bonding/bond_main.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2cb41d480ae2..8e6305955c75 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4185,6 +4185,23 @@ out: return ret; } +static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond, + struct sk_buff *skb, + struct bond_up_slave *slaves) +{ + struct slave *slave; + unsigned int count; + u32 hash; + + hash = bond_xmit_hash(bond, skb); + count = slaves ? READ_ONCE(slaves->count) : 0; + if (unlikely(!count)) + return NULL; + + slave = slaves->arr[hash % count]; + return slave; +} + /* Use this Xmit function for 3AD as well as XOR modes. The current * usable slave array is formed in the control path. The xmit function * just calculates hash and sends the packet out. @@ -4193,18 +4210,15 @@ static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); - struct slave *slave; struct bond_up_slave *slaves; - unsigned int count; + struct slave *slave; slaves = rcu_dereference(bond->usable_slaves); - count = slaves ? READ_ONCE(slaves->count) : 0; - if (likely(count)) { - slave = slaves->arr[bond_xmit_hash(bond, skb) % count]; + slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); + if (likely(slave)) bond_dev_queue_xmit(bond, skb, slave->dev); - } else { + else bond_tx_drop(dev, skb); - } return NETDEV_TX_OK; } -- cgit v1.2.3-59-g8ed1b From 29d5bbccb3a171eb146c94efeb3d752fad3ddf7d Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:36 +0300 Subject: bonding: Add helper function to get the xmit slave in rr mode Add helper function to get the xmit slave when bond is in round robin mode. Change bond_xmit_slave_id to bond_get_slave_by_id, then the logic for find the next slave for transmit could be used both by the xmit flow and the .ndo to get the xmit slave. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: Jay Vosburgh Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/bonding/bond_main.c | 56 ++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8e6305955c75..09c8485e965d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3923,16 +3923,15 @@ unwind: } /** - * bond_xmit_slave_id - transmit skb through slave with slave_id + * bond_get_slave_by_id - get xmit slave with slave_id * @bond: bonding device that is transmitting - * @skb: buffer to transmit * @slave_id: slave id up to slave_cnt-1 through which to transmit * - * This function tries to transmit through slave with slave_id but in case + * This function tries to get slave with slave_id but in case * it fails, it tries to find the first available slave for transmission. - * The skb is consumed in all cases, thus the function is void. */ -static void bond_xmit_slave_id(struct bonding *bond, struct sk_buff *skb, int slave_id) +static struct slave *bond_get_slave_by_id(struct bonding *bond, + int slave_id) { struct list_head *iter; struct slave *slave; @@ -3941,10 +3940,8 @@ static void bond_xmit_slave_id(struct bonding *bond, struct sk_buff *skb, int sl /* Here we start from the slave with slave_id */ bond_for_each_slave_rcu(bond, slave, iter) { if (--i < 0) { - if (bond_slave_can_tx(slave)) { - bond_dev_queue_xmit(bond, skb, slave->dev); - return; - } + if (bond_slave_can_tx(slave)) + return slave; } } @@ -3953,13 +3950,11 @@ static void bond_xmit_slave_id(struct bonding *bond, struct sk_buff *skb, int sl bond_for_each_slave_rcu(bond, slave, iter) { if (--i < 0) break; - if (bond_slave_can_tx(slave)) { - bond_dev_queue_xmit(bond, skb, slave->dev); - return; - } + if (bond_slave_can_tx(slave)) + return slave; } - /* no slave that can tx has been found */ - bond_tx_drop(bond->dev, skb); + + return NULL; } /** @@ -3995,10 +3990,9 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond) return slave_id; } -static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb, - struct net_device *bond_dev) +static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond, + struct sk_buff *skb) { - struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; int slave_cnt; u32 slave_id; @@ -4020,21 +4014,31 @@ static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb, if (iph->protocol == IPPROTO_IGMP) { slave = rcu_dereference(bond->curr_active_slave); if (slave) - bond_dev_queue_xmit(bond, skb, slave->dev); - else - bond_xmit_slave_id(bond, skb, 0); - return NETDEV_TX_OK; + return slave; + return bond_get_slave_by_id(bond, 0); } } non_igmp: slave_cnt = READ_ONCE(bond->slave_cnt); if (likely(slave_cnt)) { - slave_id = bond_rr_gen_slave_id(bond); - bond_xmit_slave_id(bond, skb, slave_id % slave_cnt); - } else { - bond_tx_drop(bond_dev, skb); + slave_id = bond_rr_gen_slave_id(bond) % slave_cnt; + return bond_get_slave_by_id(bond, slave_id); } + return NULL; +} + +static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb, + struct net_device *bond_dev) +{ + struct bonding *bond = netdev_priv(bond_dev); + struct slave *slave; + + slave = bond_xmit_roundrobin_slave_get(bond, skb); + if (slave) + bond_dev_queue_xmit(bond, skb, slave->dev); + else + bond_tx_drop(bond_dev, skb); return NETDEV_TX_OK; } -- cgit v1.2.3-59-g8ed1b From 5a19f1c1a2a0f7d5fb80b130ab4a15fa99e792d7 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:37 +0300 Subject: bonding: Add function to get the xmit slave in active-backup mode Add helper function to get the xmit slave in active-backup mode. It's only one line function that return the curr_active_slave, but it will used both in the xmit flow and by the new .ndo to get the xmit slave. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: Jay Vosburgh Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/bonding/bond_main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 09c8485e965d..1b0ae750d732 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4042,6 +4042,12 @@ static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb, return NETDEV_TX_OK; } +static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond, + struct sk_buff *skb) +{ + return rcu_dereference(bond->curr_active_slave); +} + /* In active-backup mode, we know that bond->curr_active_slave is always valid if * the bond has a usable interface. */ @@ -4051,7 +4057,7 @@ static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb, struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; - slave = rcu_dereference(bond->curr_active_slave); + slave = bond_xmit_activebackup_slave_get(bond, skb); if (slave) bond_dev_queue_xmit(bond, skb, slave->dev); else -- cgit v1.2.3-59-g8ed1b From 6b447e76ed44cc354cd0a346b86efe393e603e0d Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:38 +0300 Subject: bonding: Add array of all slaves Keep all slaves in array so it could be used to get the xmit slave assume all the slaves are active. The logic to add slave to the array is like the usable slaves, except that we also add slaves that currently can't transmit - not up or active. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: Jay Vosburgh Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/bonding/bond_main.c | 78 +++++++++++++++++++++++++++++++---------- include/net/bonding.h | 3 +- 2 files changed, 61 insertions(+), 20 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 1b0ae750d732..2de693f0262e 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4120,6 +4120,38 @@ static void bond_skip_slave(struct bond_up_slave *slaves, } } +static void bond_set_slave_arr(struct bonding *bond, + struct bond_up_slave *usable_slaves, + struct bond_up_slave *all_slaves) +{ + struct bond_up_slave *usable, *all; + + usable = rtnl_dereference(bond->usable_slaves); + rcu_assign_pointer(bond->usable_slaves, usable_slaves); + kfree_rcu(usable, rcu); + + all = rtnl_dereference(bond->all_slaves); + rcu_assign_pointer(bond->all_slaves, all_slaves); + kfree_rcu(all, rcu); +} + +static void bond_reset_slave_arr(struct bonding *bond) +{ + struct bond_up_slave *usable, *all; + + usable = rtnl_dereference(bond->usable_slaves); + if (usable) { + RCU_INIT_POINTER(bond->usable_slaves, NULL); + kfree_rcu(usable, rcu); + } + + all = rtnl_dereference(bond->all_slaves); + if (all) { + RCU_INIT_POINTER(bond->all_slaves, NULL); + kfree_rcu(all, rcu); + } +} + /* Build the usable slaves array in control path for modes that use xmit-hash * to determine the slave interface - * (a) BOND_MODE_8023AD @@ -4130,7 +4162,7 @@ static void bond_skip_slave(struct bond_up_slave *slaves, */ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) { - struct bond_up_slave *usable_slaves, *old_usable_slaves; + struct bond_up_slave *usable_slaves = NULL, *all_slaves = NULL; struct slave *slave; struct list_head *iter; int agg_id = 0; @@ -4142,7 +4174,9 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) usable_slaves = kzalloc(struct_size(usable_slaves, arr, bond->slave_cnt), GFP_KERNEL); - if (!usable_slaves) { + all_slaves = kzalloc(struct_size(all_slaves, arr, + bond->slave_cnt), GFP_KERNEL); + if (!usable_slaves || !all_slaves) { ret = -ENOMEM; goto out; } @@ -4151,20 +4185,19 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) if (bond_3ad_get_active_agg_info(bond, &ad_info)) { pr_debug("bond_3ad_get_active_agg_info failed\n"); - kfree_rcu(usable_slaves, rcu); /* No active aggragator means it's not safe to use * the previous array. */ - old_usable_slaves = rtnl_dereference(bond->usable_slaves); - if (old_usable_slaves) { - RCU_INIT_POINTER(bond->usable_slaves, NULL); - kfree_rcu(old_usable_slaves, rcu); - } + bond_reset_slave_arr(bond); goto out; } agg_id = ad_info.aggregator_id; } bond_for_each_slave(bond, slave, iter) { + if (skipslave == slave) + continue; + + all_slaves->arr[all_slaves->count++] = slave; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg; @@ -4174,8 +4207,6 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) } if (!bond_slave_can_tx(slave)) continue; - if (skipslave == slave) - continue; slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n", usable_slaves->count); @@ -4183,14 +4214,17 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) usable_slaves->arr[usable_slaves->count++] = slave; } - old_usable_slaves = rtnl_dereference(bond->usable_slaves); - rcu_assign_pointer(bond->usable_slaves, usable_slaves); - if (old_usable_slaves) - kfree_rcu(old_usable_slaves, rcu); + bond_set_slave_arr(bond, usable_slaves, all_slaves); + return ret; out: - if (ret != 0 && skipslave) + if (ret != 0 && skipslave) { + bond_skip_slave(rtnl_dereference(bond->all_slaves), + skipslave); bond_skip_slave(rtnl_dereference(bond->usable_slaves), skipslave); + } + kfree_rcu(all_slaves, rcu); + kfree_rcu(usable_slaves, rcu); return ret; } @@ -4501,9 +4535,9 @@ void bond_setup(struct net_device *bond_dev) static void bond_uninit(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); + struct bond_up_slave *usable, *all; struct list_head *iter; struct slave *slave; - struct bond_up_slave *arr; bond_netpoll_cleanup(bond_dev); @@ -4512,10 +4546,16 @@ static void bond_uninit(struct net_device *bond_dev) __bond_release_one(bond_dev, slave->dev, true, true); netdev_info(bond_dev, "Released all slaves\n"); - arr = rtnl_dereference(bond->usable_slaves); - if (arr) { + usable = rtnl_dereference(bond->usable_slaves); + if (usable) { RCU_INIT_POINTER(bond->usable_slaves, NULL); - kfree_rcu(arr, rcu); + kfree_rcu(usable, rcu); + } + + all = rtnl_dereference(bond->all_slaves); + if (all) { + RCU_INIT_POINTER(bond->all_slaves, NULL); + kfree_rcu(all, rcu); } list_del(&bond->bond_list); diff --git a/include/net/bonding.h b/include/net/bonding.h index 33bdb6d5182d..b5e49bedbc9f 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -200,7 +200,8 @@ struct bonding { struct slave __rcu *curr_active_slave; struct slave __rcu *current_arp_slave; struct slave __rcu *primary_slave; - struct bond_up_slave __rcu *usable_slaves; /* Array of usable slaves */ + struct bond_up_slave __rcu *usable_slaves; + struct bond_up_slave __rcu *all_slaves; bool force_primary; s32 slave_cnt; /* never change this value outside the attach/detach wrappers */ int (*recv_probe)(const struct sk_buff *, struct bonding *, -- cgit v1.2.3-59-g8ed1b From 33720aaf8c2af5c0ff341a16b5048b9c7ecae569 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:39 +0300 Subject: bonding: Implement ndo_get_xmit_slave Add implementation of ndo_get_xmit_slave. Find the slave by using the helper function according to the bond mode. If the caller set all_slaves to true, then it assumes that all slaves are available to transmit. Signed-off-by: Maor Gottlieb Reviewed-by: Jay Vosburgh Reviewed-by: Jiri Pirko Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/bonding/bond_main.c | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2de693f0262e..39b1ad7edbb4 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4347,6 +4347,48 @@ static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, return txq; } +static struct net_device *bond_xmit_get_slave(struct net_device *master_dev, + struct sk_buff *skb, + bool all_slaves) +{ + struct bonding *bond = netdev_priv(master_dev); + struct bond_up_slave *slaves; + struct slave *slave = NULL; + + switch (BOND_MODE(bond)) { + case BOND_MODE_ROUNDROBIN: + slave = bond_xmit_roundrobin_slave_get(bond, skb); + break; + case BOND_MODE_ACTIVEBACKUP: + slave = bond_xmit_activebackup_slave_get(bond, skb); + break; + case BOND_MODE_8023AD: + case BOND_MODE_XOR: + if (all_slaves) + slaves = rcu_dereference(bond->all_slaves); + else + slaves = rcu_dereference(bond->usable_slaves); + slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); + break; + case BOND_MODE_BROADCAST: + break; + case BOND_MODE_ALB: + slave = bond_xmit_alb_slave_get(bond, skb); + break; + case BOND_MODE_TLB: + slave = bond_xmit_tlb_slave_get(bond, skb); + break; + default: + /* Should never happen, mode already checked */ + WARN_ONCE(true, "Unknown bonding mode"); + break; + } + + if (slave) + return slave->dev; + return NULL; +} + static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); @@ -4468,6 +4510,7 @@ static const struct net_device_ops bond_netdev_ops = { .ndo_del_slave = bond_release, .ndo_fix_features = bond_fix_features, .ndo_features_check = passthru_features_check, + .ndo_get_xmit_slave = bond_xmit_get_slave, }; static const struct device_type bond_type = { -- cgit v1.2.3-59-g8ed1b From 64363e61c7bbcfa4c7d6697d96ef2e18fc311cf3 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:40 +0300 Subject: net/mlx5: Change lag mutex lock to spin lock The lag lock could be a spin lock, the critical section is short and there is no need that the thread will sleep. Change the lock that protects the LAG structure from mutex to spin lock. It is required for next patch that need to access this structure from context that we can't sleep. In addition there is no need to hold this lock when query the congestion counters. Signed-off-by: Maor Gottlieb Reviewed-by: Leon Romanovsky Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 42 +++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index c6ad5ca46877..b17b80bcd045 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -42,7 +42,7 @@ * Beware of lock dependencies (preferably, no locks should be acquired * under it). */ -static DEFINE_MUTEX(lag_mutex); +static DEFINE_SPINLOCK(lag_lock); static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1, u8 remap_port2) @@ -274,9 +274,9 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) if (!dev0 || !dev1) return; - mutex_lock(&lag_mutex); + spin_lock(&lag_lock); tracker = ldev->tracker; - mutex_unlock(&lag_mutex); + spin_unlock(&lag_lock); do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev); @@ -458,9 +458,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this, break; } - mutex_lock(&lag_mutex); + spin_lock(&lag_lock); ldev->tracker = tracker; - mutex_unlock(&lag_mutex); + spin_unlock(&lag_lock); if (changed) mlx5_queue_bond_work(ldev, 0); @@ -502,7 +502,7 @@ static void mlx5_lag_dev_add_pf(struct mlx5_lag *ldev, if (fn >= MLX5_MAX_PORTS) return; - mutex_lock(&lag_mutex); + spin_lock(&lag_lock); ldev->pf[fn].dev = dev; ldev->pf[fn].netdev = netdev; ldev->tracker.netdev_state[fn].link_up = 0; @@ -510,7 +510,7 @@ static void mlx5_lag_dev_add_pf(struct mlx5_lag *ldev, dev->priv.lag = ldev; - mutex_unlock(&lag_mutex); + spin_unlock(&lag_lock); } static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev, @@ -525,11 +525,11 @@ static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev, if (i == MLX5_MAX_PORTS) return; - mutex_lock(&lag_mutex); + spin_lock(&lag_lock); memset(&ldev->pf[i], 0, sizeof(*ldev->pf)); dev->priv.lag = NULL; - mutex_unlock(&lag_mutex); + spin_unlock(&lag_lock); } /* Must be called with intf_mutex held */ @@ -607,10 +607,10 @@ bool mlx5_lag_is_roce(struct mlx5_core_dev *dev) struct mlx5_lag *ldev; bool res; - mutex_lock(&lag_mutex); + spin_lock(&lag_lock); ldev = mlx5_lag_dev_get(dev); res = ldev && __mlx5_lag_is_roce(ldev); - mutex_unlock(&lag_mutex); + spin_unlock(&lag_lock); return res; } @@ -621,10 +621,10 @@ bool mlx5_lag_is_active(struct mlx5_core_dev *dev) struct mlx5_lag *ldev; bool res; - mutex_lock(&lag_mutex); + spin_lock(&lag_lock); ldev = mlx5_lag_dev_get(dev); res = ldev && __mlx5_lag_is_active(ldev); - mutex_unlock(&lag_mutex); + spin_unlock(&lag_lock); return res; } @@ -635,10 +635,10 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev) struct mlx5_lag *ldev; bool res; - mutex_lock(&lag_mutex); + spin_lock(&lag_lock); ldev = mlx5_lag_dev_get(dev); res = ldev && __mlx5_lag_is_sriov(ldev); - mutex_unlock(&lag_mutex); + spin_unlock(&lag_lock); return res; } @@ -664,7 +664,7 @@ struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) struct net_device *ndev = NULL; struct mlx5_lag *ldev; - mutex_lock(&lag_mutex); + spin_lock(&lag_lock); ldev = mlx5_lag_dev_get(dev); if (!(ldev && __mlx5_lag_is_roce(ldev))) @@ -681,7 +681,7 @@ struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) dev_hold(ndev); unlock: - mutex_unlock(&lag_mutex); + spin_unlock(&lag_lock); return ndev; } @@ -723,7 +723,7 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, memset(values, 0, sizeof(*values) * num_counters); - mutex_lock(&lag_mutex); + spin_lock(&lag_lock); ldev = mlx5_lag_dev_get(dev); if (ldev && __mlx5_lag_is_roce(ldev)) { num_ports = MLX5_MAX_PORTS; @@ -733,6 +733,7 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, num_ports = 1; mdev[MLX5_LAG_P1] = dev; } + spin_unlock(&lag_lock); for (i = 0; i < num_ports; ++i) { u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {}; @@ -742,14 +743,13 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in, out); if (ret) - goto unlock; + goto free; for (j = 0; j < num_counters; ++j) values[j] += be64_to_cpup((__be64 *)(out + offsets[j])); } -unlock: - mutex_unlock(&lag_mutex); +free: kvfree(out); return ret; } -- cgit v1.2.3-59-g8ed1b From c6bc6041b10f70b617f2d13894311fe62027d292 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:41 +0300 Subject: net/mlx5: Add support to get lag physical port Add function to get the device physical port of the lag slave. Signed-off-by: Maor Gottlieb Reviewed-by: Leon Romanovsky Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 24 ++++++++++++++++++++++++ include/linux/mlx5/driver.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index b17b80bcd045..874c70e8cc54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -687,6 +687,30 @@ unlock: } EXPORT_SYMBOL(mlx5_lag_get_roce_netdev); +u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, + struct net_device *slave) +{ + struct mlx5_lag *ldev; + u8 port = 0; + + spin_lock(&lag_lock); + ldev = mlx5_lag_dev_get(dev); + if (!(ldev && __mlx5_lag_is_roce(ldev))) + goto unlock; + + if (ldev->pf[MLX5_LAG_P1].netdev == slave) + port = MLX5_LAG_P1; + else + port = MLX5_LAG_P2; + + port = ldev->v2p_map[port]; + +unlock: + spin_unlock(&lag_lock); + return port; +} +EXPORT_SYMBOL(mlx5_lag_get_slave_port); + bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv) { struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d82dbbab8179..267dfcc5493e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1074,6 +1074,8 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); +u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, + struct net_device *slave); int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, u64 *values, int num_counters, -- cgit v1.2.3-59-g8ed1b From 973d55e590beeca13fece60596ee3b511d36d9da Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:23 +0200 Subject: docs: networking: convert tuntap.txt to ReST - add SPDX header; - use copyright symbol; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/tuntap.rst | 259 ++++++++++++++++++++++++++++++++++++ Documentation/networking/tuntap.txt | 227 ------------------------------- MAINTAINERS | 2 +- drivers/net/Kconfig | 2 +- 5 files changed, 262 insertions(+), 229 deletions(-) create mode 100644 Documentation/networking/tuntap.rst delete mode 100644 Documentation/networking/tuntap.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b423b2db5f96..e7a683f0528d 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -111,6 +111,7 @@ Contents: team timestamping tproxy + tuntap .. only:: subproject and html diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst new file mode 100644 index 000000000000..a59d1dd6fdcc --- /dev/null +++ b/Documentation/networking/tuntap.rst @@ -0,0 +1,259 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=============================== +Universal TUN/TAP device driver +=============================== + +Copyright |copy| 1999-2000 Maxim Krasnyansky + + Linux, Solaris drivers + Copyright |copy| 1999-2000 Maxim Krasnyansky + + FreeBSD TAP driver + Copyright |copy| 1999-2000 Maksim Yevmenkin + + Revision of this document 2002 by Florian Thiel + +1. Description +============== + + TUN/TAP provides packet reception and transmission for user space programs. + It can be seen as a simple Point-to-Point or Ethernet device, which, + instead of receiving packets from physical media, receives them from + user space program and instead of sending packets via physical media + writes them to the user space program. + + In order to use the driver a program has to open /dev/net/tun and issue a + corresponding ioctl() to register a network device with the kernel. A network + device will appear as tunXX or tapXX, depending on the options chosen. When + the program closes the file descriptor, the network device and all + corresponding routes will disappear. + + Depending on the type of device chosen the userspace program has to read/write + IP packets (with tun) or ethernet frames (with tap). Which one is being used + depends on the flags given with the ioctl(). + + The package from http://vtun.sourceforge.net/tun contains two simple examples + for how to use tun and tap devices. Both programs work like a bridge between + two network interfaces. + br_select.c - bridge based on select system call. + br_sigio.c - bridge based on async io and SIGIO signal. + However, the best example is VTun http://vtun.sourceforge.net :)) + +2. Configuration +================ + + Create device node:: + + mkdir /dev/net (if it doesn't exist already) + mknod /dev/net/tun c 10 200 + + Set permissions:: + + e.g. chmod 0666 /dev/net/tun + + There's no harm in allowing the device to be accessible by non-root users, + since CAP_NET_ADMIN is required for creating network devices or for + connecting to network devices which aren't owned by the user in question. + If you want to create persistent devices and give ownership of them to + unprivileged users, then you need the /dev/net/tun device to be usable by + those users. + + Driver module autoloading + + Make sure that "Kernel module loader" - module auto-loading + support is enabled in your kernel. The kernel should load it on + first access. + + Manual loading + + insert the module by hand:: + + modprobe tun + + If you do it the latter way, you have to load the module every time you + need it, if you do it the other way it will be automatically loaded when + /dev/net/tun is being opened. + +3. Program interface +==================== + +3.1 Network device allocation +----------------------------- + +``char *dev`` should be the name of the device with a format string (e.g. +"tun%d"), but (as far as I can see) this can be any valid network device name. +Note that the character pointer becomes overwritten with the real device name +(e.g. "tun0"):: + + #include + #include + + int tun_alloc(char *dev) + { + struct ifreq ifr; + int fd, err; + + if( (fd = open("/dev/net/tun", O_RDWR)) < 0 ) + return tun_alloc_old(dev); + + memset(&ifr, 0, sizeof(ifr)); + + /* Flags: IFF_TUN - TUN device (no Ethernet headers) + * IFF_TAP - TAP device + * + * IFF_NO_PI - Do not provide packet information + */ + ifr.ifr_flags = IFF_TUN; + if( *dev ) + strncpy(ifr.ifr_name, dev, IFNAMSIZ); + + if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ){ + close(fd); + return err; + } + strcpy(dev, ifr.ifr_name); + return fd; + } + +3.2 Frame format +---------------- + +If flag IFF_NO_PI is not set each frame format is:: + + Flags [2 bytes] + Proto [2 bytes] + Raw protocol(IP, IPv6, etc) frame. + +3.3 Multiqueue tuntap interface +------------------------------- + +From version 3.8, Linux supports multiqueue tuntap which can uses multiple +file descriptors (queues) to parallelize packets sending or receiving. The +device allocation is the same as before, and if user wants to create multiple +queues, TUNSETIFF with the same device name must be called many times with +IFF_MULTI_QUEUE flag. + +``char *dev`` should be the name of the device, queues is the number of queues +to be created, fds is used to store and return the file descriptors (queues) +created to the caller. Each file descriptor were served as the interface of a +queue which could be accessed by userspace. + +:: + + #include + #include + + int tun_alloc_mq(char *dev, int queues, int *fds) + { + struct ifreq ifr; + int fd, err, i; + + if (!dev) + return -1; + + memset(&ifr, 0, sizeof(ifr)); + /* Flags: IFF_TUN - TUN device (no Ethernet headers) + * IFF_TAP - TAP device + * + * IFF_NO_PI - Do not provide packet information + * IFF_MULTI_QUEUE - Create a queue of multiqueue device + */ + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE; + strcpy(ifr.ifr_name, dev); + + for (i = 0; i < queues; i++) { + if ((fd = open("/dev/net/tun", O_RDWR)) < 0) + goto err; + err = ioctl(fd, TUNSETIFF, (void *)&ifr); + if (err) { + close(fd); + goto err; + } + fds[i] = fd; + } + + return 0; + err: + for (--i; i >= 0; i--) + close(fds[i]); + return err; + } + +A new ioctl(TUNSETQUEUE) were introduced to enable or disable a queue. When +calling it with IFF_DETACH_QUEUE flag, the queue were disabled. And when +calling it with IFF_ATTACH_QUEUE flag, the queue were enabled. The queue were +enabled by default after it was created through TUNSETIFF. + +fd is the file descriptor (queue) that we want to enable or disable, when +enable is true we enable it, otherwise we disable it:: + + #include + #include + + int tun_set_queue(int fd, int enable) + { + struct ifreq ifr; + + memset(&ifr, 0, sizeof(ifr)); + + if (enable) + ifr.ifr_flags = IFF_ATTACH_QUEUE; + else + ifr.ifr_flags = IFF_DETACH_QUEUE; + + return ioctl(fd, TUNSETQUEUE, (void *)&ifr); + } + +Universal TUN/TAP device driver Frequently Asked Question +========================================================= + +1. What platforms are supported by TUN/TAP driver ? + +Currently driver has been written for 3 Unices: + + - Linux kernels 2.2.x, 2.4.x + - FreeBSD 3.x, 4.x, 5.x + - Solaris 2.6, 7.0, 8.0 + +2. What is TUN/TAP driver used for? + +As mentioned above, main purpose of TUN/TAP driver is tunneling. +It is used by VTun (http://vtun.sourceforge.net). + +Another interesting application using TUN/TAP is pipsecd +(http://perso.enst.fr/~beyssac/pipsec/), a userspace IPSec +implementation that can use complete kernel routing (unlike FreeS/WAN). + +3. How does Virtual network device actually work ? + +Virtual network device can be viewed as a simple Point-to-Point or +Ethernet device, which instead of receiving packets from a physical +media, receives them from user space program and instead of sending +packets via physical media sends them to the user space program. + +Let's say that you configured IPv6 on the tap0, then whenever +the kernel sends an IPv6 packet to tap0, it is passed to the application +(VTun for example). The application encrypts, compresses and sends it to +the other side over TCP or UDP. The application on the other side decompresses +and decrypts the data received and writes the packet to the TAP device, +the kernel handles the packet like it came from real physical device. + +4. What is the difference between TUN driver and TAP driver? + +TUN works with IP frames. TAP works with Ethernet frames. + +This means that you have to read/write IP packets when you are using tun and +ethernet frames when using tap. + +5. What is the difference between BPF and TUN/TAP driver? + +BPF is an advanced packet filter. It can be attached to existing +network interface. It does not provide a virtual network interface. +A TUN/TAP driver does provide a virtual network interface and it is possible +to attach BPF to this interface. + +6. Does TAP driver support kernel Ethernet bridging? + +Yes. Linux and FreeBSD drivers support Ethernet bridging. diff --git a/Documentation/networking/tuntap.txt b/Documentation/networking/tuntap.txt deleted file mode 100644 index 0104830d5075..000000000000 --- a/Documentation/networking/tuntap.txt +++ /dev/null @@ -1,227 +0,0 @@ -Universal TUN/TAP device driver. -Copyright (C) 1999-2000 Maxim Krasnyansky - - Linux, Solaris drivers - Copyright (C) 1999-2000 Maxim Krasnyansky - - FreeBSD TAP driver - Copyright (c) 1999-2000 Maksim Yevmenkin - - Revision of this document 2002 by Florian Thiel - -1. Description - TUN/TAP provides packet reception and transmission for user space programs. - It can be seen as a simple Point-to-Point or Ethernet device, which, - instead of receiving packets from physical media, receives them from - user space program and instead of sending packets via physical media - writes them to the user space program. - - In order to use the driver a program has to open /dev/net/tun and issue a - corresponding ioctl() to register a network device with the kernel. A network - device will appear as tunXX or tapXX, depending on the options chosen. When - the program closes the file descriptor, the network device and all - corresponding routes will disappear. - - Depending on the type of device chosen the userspace program has to read/write - IP packets (with tun) or ethernet frames (with tap). Which one is being used - depends on the flags given with the ioctl(). - - The package from http://vtun.sourceforge.net/tun contains two simple examples - for how to use tun and tap devices. Both programs work like a bridge between - two network interfaces. - br_select.c - bridge based on select system call. - br_sigio.c - bridge based on async io and SIGIO signal. - However, the best example is VTun http://vtun.sourceforge.net :)) - -2. Configuration - Create device node: - mkdir /dev/net (if it doesn't exist already) - mknod /dev/net/tun c 10 200 - - Set permissions: - e.g. chmod 0666 /dev/net/tun - There's no harm in allowing the device to be accessible by non-root users, - since CAP_NET_ADMIN is required for creating network devices or for - connecting to network devices which aren't owned by the user in question. - If you want to create persistent devices and give ownership of them to - unprivileged users, then you need the /dev/net/tun device to be usable by - those users. - - Driver module autoloading - - Make sure that "Kernel module loader" - module auto-loading - support is enabled in your kernel. The kernel should load it on - first access. - - Manual loading - insert the module by hand: - modprobe tun - - If you do it the latter way, you have to load the module every time you - need it, if you do it the other way it will be automatically loaded when - /dev/net/tun is being opened. - -3. Program interface - 3.1 Network device allocation: - - char *dev should be the name of the device with a format string (e.g. - "tun%d"), but (as far as I can see) this can be any valid network device name. - Note that the character pointer becomes overwritten with the real device name - (e.g. "tun0") - - #include - #include - - int tun_alloc(char *dev) - { - struct ifreq ifr; - int fd, err; - - if( (fd = open("/dev/net/tun", O_RDWR)) < 0 ) - return tun_alloc_old(dev); - - memset(&ifr, 0, sizeof(ifr)); - - /* Flags: IFF_TUN - TUN device (no Ethernet headers) - * IFF_TAP - TAP device - * - * IFF_NO_PI - Do not provide packet information - */ - ifr.ifr_flags = IFF_TUN; - if( *dev ) - strncpy(ifr.ifr_name, dev, IFNAMSIZ); - - if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ){ - close(fd); - return err; - } - strcpy(dev, ifr.ifr_name); - return fd; - } - - 3.2 Frame format: - If flag IFF_NO_PI is not set each frame format is: - Flags [2 bytes] - Proto [2 bytes] - Raw protocol(IP, IPv6, etc) frame. - - 3.3 Multiqueue tuntap interface: - - From version 3.8, Linux supports multiqueue tuntap which can uses multiple - file descriptors (queues) to parallelize packets sending or receiving. The - device allocation is the same as before, and if user wants to create multiple - queues, TUNSETIFF with the same device name must be called many times with - IFF_MULTI_QUEUE flag. - - char *dev should be the name of the device, queues is the number of queues to - be created, fds is used to store and return the file descriptors (queues) - created to the caller. Each file descriptor were served as the interface of a - queue which could be accessed by userspace. - - #include - #include - - int tun_alloc_mq(char *dev, int queues, int *fds) - { - struct ifreq ifr; - int fd, err, i; - - if (!dev) - return -1; - - memset(&ifr, 0, sizeof(ifr)); - /* Flags: IFF_TUN - TUN device (no Ethernet headers) - * IFF_TAP - TAP device - * - * IFF_NO_PI - Do not provide packet information - * IFF_MULTI_QUEUE - Create a queue of multiqueue device - */ - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE; - strcpy(ifr.ifr_name, dev); - - for (i = 0; i < queues; i++) { - if ((fd = open("/dev/net/tun", O_RDWR)) < 0) - goto err; - err = ioctl(fd, TUNSETIFF, (void *)&ifr); - if (err) { - close(fd); - goto err; - } - fds[i] = fd; - } - - return 0; - err: - for (--i; i >= 0; i--) - close(fds[i]); - return err; - } - - A new ioctl(TUNSETQUEUE) were introduced to enable or disable a queue. When - calling it with IFF_DETACH_QUEUE flag, the queue were disabled. And when - calling it with IFF_ATTACH_QUEUE flag, the queue were enabled. The queue were - enabled by default after it was created through TUNSETIFF. - - fd is the file descriptor (queue) that we want to enable or disable, when - enable is true we enable it, otherwise we disable it - - #include - #include - - int tun_set_queue(int fd, int enable) - { - struct ifreq ifr; - - memset(&ifr, 0, sizeof(ifr)); - - if (enable) - ifr.ifr_flags = IFF_ATTACH_QUEUE; - else - ifr.ifr_flags = IFF_DETACH_QUEUE; - - return ioctl(fd, TUNSETQUEUE, (void *)&ifr); - } - -Universal TUN/TAP device driver Frequently Asked Question. - -1. What platforms are supported by TUN/TAP driver ? -Currently driver has been written for 3 Unices: - Linux kernels 2.2.x, 2.4.x - FreeBSD 3.x, 4.x, 5.x - Solaris 2.6, 7.0, 8.0 - -2. What is TUN/TAP driver used for? -As mentioned above, main purpose of TUN/TAP driver is tunneling. -It is used by VTun (http://vtun.sourceforge.net). - -Another interesting application using TUN/TAP is pipsecd -(http://perso.enst.fr/~beyssac/pipsec/), a userspace IPSec -implementation that can use complete kernel routing (unlike FreeS/WAN). - -3. How does Virtual network device actually work ? -Virtual network device can be viewed as a simple Point-to-Point or -Ethernet device, which instead of receiving packets from a physical -media, receives them from user space program and instead of sending -packets via physical media sends them to the user space program. - -Let's say that you configured IPv6 on the tap0, then whenever -the kernel sends an IPv6 packet to tap0, it is passed to the application -(VTun for example). The application encrypts, compresses and sends it to -the other side over TCP or UDP. The application on the other side decompresses -and decrypts the data received and writes the packet to the TAP device, -the kernel handles the packet like it came from real physical device. - -4. What is the difference between TUN driver and TAP driver? -TUN works with IP frames. TAP works with Ethernet frames. - -This means that you have to read/write IP packets when you are using tun and -ethernet frames when using tap. - -5. What is the difference between BPF and TUN/TAP driver? -BPF is an advanced packet filter. It can be attached to existing -network interface. It does not provide a virtual network interface. -A TUN/TAP driver does provide a virtual network interface and it is possible -to attach BPF to this interface. - -6. Does TAP driver support kernel Ethernet bridging? -Yes. Linux and FreeBSD drivers support Ethernet bridging. diff --git a/MAINTAINERS b/MAINTAINERS index 0ac9cec0bce6..6456c5bb02f1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17161,7 +17161,7 @@ TUN/TAP driver M: Maxim Krasnyansky S: Maintained W: http://vtun.sourceforge.net/tun -F: Documentation/networking/tuntap.txt +F: Documentation/networking/tuntap.rst F: arch/um/os-Linux/drivers/ TURBOCHANNEL SUBSYSTEM diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index ad64be98330f..3f2c98a7906c 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -355,7 +355,7 @@ config TUN devices, driver will automatically delete tunXX or tapXX device and all routes corresponding to it. - Please read for more + Please read for more information. To compile this driver as a module, choose M here: the module -- cgit v1.2.3-59-g8ed1b From 961fb1ff412a2cefaf50f4f56bb60a10ed071df5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:24 +0200 Subject: docs: networking: convert udplite.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark lists as such; - mark tables as such; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/udplite.rst | 291 +++++++++++++++++++++++++++++++++++ Documentation/networking/udplite.txt | 278 --------------------------------- 3 files changed, 292 insertions(+), 278 deletions(-) create mode 100644 Documentation/networking/udplite.rst delete mode 100644 Documentation/networking/udplite.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e7a683f0528d..ca0b0dbfd9ad 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -112,6 +112,7 @@ Contents: timestamping tproxy tuntap + udplite .. only:: subproject and html diff --git a/Documentation/networking/udplite.rst b/Documentation/networking/udplite.rst new file mode 100644 index 000000000000..2c225f28b7b2 --- /dev/null +++ b/Documentation/networking/udplite.rst @@ -0,0 +1,291 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================ +The UDP-Lite protocol (RFC 3828) +================================ + + + UDP-Lite is a Standards-Track IETF transport protocol whose characteristic + is a variable-length checksum. This has advantages for transport of multimedia + (video, VoIP) over wireless networks, as partly damaged packets can still be + fed into the codec instead of being discarded due to a failed checksum test. + + This file briefly describes the existing kernel support and the socket API. + For in-depth information, you can consult: + + - The UDP-Lite Homepage: + http://web.archive.org/web/%2E/http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ + + From here you can also download some example application source code. + + - The UDP-Lite HOWTO on + http://web.archive.org/web/%2E/http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt + + - The Wireshark UDP-Lite WiKi (with capture files): + https://wiki.wireshark.org/Lightweight_User_Datagram_Protocol + + - The Protocol Spec, RFC 3828, http://www.ietf.org/rfc/rfc3828.txt + + +1. Applications +=============== + + Several applications have been ported successfully to UDP-Lite. Ethereal + (now called wireshark) has UDP-Litev4/v6 support by default. + + Porting applications to UDP-Lite is straightforward: only socket level and + IPPROTO need to be changed; senders additionally set the checksum coverage + length (default = header length = 8). Details are in the next section. + +2. Programming API +================== + + UDP-Lite provides a connectionless, unreliable datagram service and hence + uses the same socket type as UDP. In fact, porting from UDP to UDP-Lite is + very easy: simply add ``IPPROTO_UDPLITE`` as the last argument of the + socket(2) call so that the statement looks like:: + + s = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE); + + or, respectively, + + :: + + s = socket(PF_INET6, SOCK_DGRAM, IPPROTO_UDPLITE); + + With just the above change you are able to run UDP-Lite services or connect + to UDP-Lite servers. The kernel will assume that you are not interested in + using partial checksum coverage and so emulate UDP mode (full coverage). + + To make use of the partial checksum coverage facilities requires setting a + single socket option, which takes an integer specifying the coverage length: + + * Sender checksum coverage: UDPLITE_SEND_CSCOV + + For example:: + + int val = 20; + setsockopt(s, SOL_UDPLITE, UDPLITE_SEND_CSCOV, &val, sizeof(int)); + + sets the checksum coverage length to 20 bytes (12b data + 8b header). + Of each packet only the first 20 bytes (plus the pseudo-header) will be + checksummed. This is useful for RTP applications which have a 12-byte + base header. + + + * Receiver checksum coverage: UDPLITE_RECV_CSCOV + + This option is the receiver-side analogue. It is truly optional, i.e. not + required to enable traffic with partial checksum coverage. Its function is + that of a traffic filter: when enabled, it instructs the kernel to drop + all packets which have a coverage _less_ than this value. For example, if + RTP and UDP headers are to be protected, a receiver can enforce that only + packets with a minimum coverage of 20 are admitted:: + + int min = 20; + setsockopt(s, SOL_UDPLITE, UDPLITE_RECV_CSCOV, &min, sizeof(int)); + + The calls to getsockopt(2) are analogous. Being an extension and not a stand- + alone protocol, all socket options known from UDP can be used in exactly the + same manner as before, e.g. UDP_CORK or UDP_ENCAP. + + A detailed discussion of UDP-Lite checksum coverage options is in section IV. + +3. Header Files +=============== + + The socket API requires support through header files in /usr/include: + + * /usr/include/netinet/in.h + to define IPPROTO_UDPLITE + + * /usr/include/netinet/udplite.h + for UDP-Lite header fields and protocol constants + + For testing purposes, the following can serve as a ``mini`` header file:: + + #define IPPROTO_UDPLITE 136 + #define SOL_UDPLITE 136 + #define UDPLITE_SEND_CSCOV 10 + #define UDPLITE_RECV_CSCOV 11 + + Ready-made header files for various distros are in the UDP-Lite tarball. + +4. Kernel Behaviour with Regards to the Various Socket Options +============================================================== + + + To enable debugging messages, the log level need to be set to 8, as most + messages use the KERN_DEBUG level (7). + + 1) Sender Socket Options + + If the sender specifies a value of 0 as coverage length, the module + assumes full coverage, transmits a packet with coverage length of 0 + and according checksum. If the sender specifies a coverage < 8 and + different from 0, the kernel assumes 8 as default value. Finally, + if the specified coverage length exceeds the packet length, the packet + length is used instead as coverage length. + + 2) Receiver Socket Options + + The receiver specifies the minimum value of the coverage length it + is willing to accept. A value of 0 here indicates that the receiver + always wants the whole of the packet covered. In this case, all + partially covered packets are dropped and an error is logged. + + It is not possible to specify illegal values (<0 and <8); in these + cases the default of 8 is assumed. + + All packets arriving with a coverage value less than the specified + threshold are discarded, these events are also logged. + + 3) Disabling the Checksum Computation + + On both sender and receiver, checksumming will always be performed + and cannot be disabled using SO_NO_CHECK. Thus:: + + setsockopt(sockfd, SOL_SOCKET, SO_NO_CHECK, ... ); + + will always will be ignored, while the value of:: + + getsockopt(sockfd, SOL_SOCKET, SO_NO_CHECK, &value, ...); + + is meaningless (as in TCP). Packets with a zero checksum field are + illegal (cf. RFC 3828, sec. 3.1) and will be silently discarded. + + 4) Fragmentation + + The checksum computation respects both buffersize and MTU. The size + of UDP-Lite packets is determined by the size of the send buffer. The + minimum size of the send buffer is 2048 (defined as SOCK_MIN_SNDBUF + in include/net/sock.h), the default value is configurable as + net.core.wmem_default or via setting the SO_SNDBUF socket(7) + option. The maximum upper bound for the send buffer is determined + by net.core.wmem_max. + + Given a payload size larger than the send buffer size, UDP-Lite will + split the payload into several individual packets, filling up the + send buffer size in each case. + + The precise value also depends on the interface MTU. The interface MTU, + in turn, may trigger IP fragmentation. In this case, the generated + UDP-Lite packet is split into several IP packets, of which only the + first one contains the L4 header. + + The send buffer size has implications on the checksum coverage length. + Consider the following example:: + + Payload: 1536 bytes Send Buffer: 1024 bytes + MTU: 1500 bytes Coverage Length: 856 bytes + + UDP-Lite will ship the 1536 bytes in two separate packets:: + + Packet 1: 1024 payload + 8 byte header + 20 byte IP header = 1052 bytes + Packet 2: 512 payload + 8 byte header + 20 byte IP header = 540 bytes + + The coverage packet covers the UDP-Lite header and 848 bytes of the + payload in the first packet, the second packet is fully covered. Note + that for the second packet, the coverage length exceeds the packet + length. The kernel always re-adjusts the coverage length to the packet + length in such cases. + + As an example of what happens when one UDP-Lite packet is split into + several tiny fragments, consider the following example:: + + Payload: 1024 bytes Send buffer size: 1024 bytes + MTU: 300 bytes Coverage length: 575 bytes + + +-+-----------+--------------+--------------+--------------+ + |8| 272 | 280 | 280 | 280 | + +-+-----------+--------------+--------------+--------------+ + 280 560 840 1032 + ^ + *****checksum coverage************* + + The UDP-Lite module generates one 1032 byte packet (1024 + 8 byte + header). According to the interface MTU, these are split into 4 IP + packets (280 byte IP payload + 20 byte IP header). The kernel module + sums the contents of the entire first two packets, plus 15 bytes of + the last packet before releasing the fragments to the IP module. + + To see the analogous case for IPv6 fragmentation, consider a link + MTU of 1280 bytes and a write buffer of 3356 bytes. If the checksum + coverage is less than 1232 bytes (MTU minus IPv6/fragment header + lengths), only the first fragment needs to be considered. When using + larger checksum coverage lengths, each eligible fragment needs to be + checksummed. Suppose we have a checksum coverage of 3062. The buffer + of 3356 bytes will be split into the following fragments:: + + Fragment 1: 1280 bytes carrying 1232 bytes of UDP-Lite data + Fragment 2: 1280 bytes carrying 1232 bytes of UDP-Lite data + Fragment 3: 948 bytes carrying 900 bytes of UDP-Lite data + + The first two fragments have to be checksummed in full, of the last + fragment only 598 (= 3062 - 2*1232) bytes are checksummed. + + While it is important that such cases are dealt with correctly, they + are (annoyingly) rare: UDP-Lite is designed for optimising multimedia + performance over wireless (or generally noisy) links and thus smaller + coverage lengths are likely to be expected. + +5. UDP-Lite Runtime Statistics and their Meaning +================================================ + + Exceptional and error conditions are logged to syslog at the KERN_DEBUG + level. Live statistics about UDP-Lite are available in /proc/net/snmp + and can (with newer versions of netstat) be viewed using:: + + netstat -svu + + This displays UDP-Lite statistics variables, whose meaning is as follows. + + ============ ===================================================== + InDatagrams The total number of datagrams delivered to users. + + NoPorts Number of packets received to an unknown port. + These cases are counted separately (not as InErrors). + + InErrors Number of erroneous UDP-Lite packets. Errors include: + + * internal socket queue receive errors + * packet too short (less than 8 bytes or stated + coverage length exceeds received length) + * xfrm4_policy_check() returned with error + * application has specified larger min. coverage + length than that of incoming packet + * checksum coverage violated + * bad checksum + + OutDatagrams Total number of sent datagrams. + ============ ===================================================== + + These statistics derive from the UDP MIB (RFC 2013). + +6. IPtables +=========== + + There is packet match support for UDP-Lite as well as support for the LOG target. + If you copy and paste the following line into /etc/protocols:: + + udplite 136 UDP-Lite # UDP-Lite [RFC 3828] + + then:: + + iptables -A INPUT -p udplite -j LOG + + will produce logging output to syslog. Dropping and rejecting packets also works. + +7. Maintainer Address +===================== + + The UDP-Lite patch was developed at + + University of Aberdeen + Electronics Research Group + Department of Engineering + Fraser Noble Building + Aberdeen AB24 3UE; UK + + The current maintainer is Gerrit Renker, . Initial + code was developed by William Stanislaus, . diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt deleted file mode 100644 index 53a726855e49..000000000000 --- a/Documentation/networking/udplite.txt +++ /dev/null @@ -1,278 +0,0 @@ - =========================================================================== - The UDP-Lite protocol (RFC 3828) - =========================================================================== - - - UDP-Lite is a Standards-Track IETF transport protocol whose characteristic - is a variable-length checksum. This has advantages for transport of multimedia - (video, VoIP) over wireless networks, as partly damaged packets can still be - fed into the codec instead of being discarded due to a failed checksum test. - - This file briefly describes the existing kernel support and the socket API. - For in-depth information, you can consult: - - o The UDP-Lite Homepage: - http://web.archive.org/web/*/http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ - From here you can also download some example application source code. - - o The UDP-Lite HOWTO on - http://web.archive.org/web/*/http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ - files/UDP-Lite-HOWTO.txt - - o The Wireshark UDP-Lite WiKi (with capture files): - https://wiki.wireshark.org/Lightweight_User_Datagram_Protocol - - o The Protocol Spec, RFC 3828, http://www.ietf.org/rfc/rfc3828.txt - - - I) APPLICATIONS - - Several applications have been ported successfully to UDP-Lite. Ethereal - (now called wireshark) has UDP-Litev4/v6 support by default. - Porting applications to UDP-Lite is straightforward: only socket level and - IPPROTO need to be changed; senders additionally set the checksum coverage - length (default = header length = 8). Details are in the next section. - - - II) PROGRAMMING API - - UDP-Lite provides a connectionless, unreliable datagram service and hence - uses the same socket type as UDP. In fact, porting from UDP to UDP-Lite is - very easy: simply add `IPPROTO_UDPLITE' as the last argument of the socket(2) - call so that the statement looks like: - - s = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE); - - or, respectively, - - s = socket(PF_INET6, SOCK_DGRAM, IPPROTO_UDPLITE); - - With just the above change you are able to run UDP-Lite services or connect - to UDP-Lite servers. The kernel will assume that you are not interested in - using partial checksum coverage and so emulate UDP mode (full coverage). - - To make use of the partial checksum coverage facilities requires setting a - single socket option, which takes an integer specifying the coverage length: - - * Sender checksum coverage: UDPLITE_SEND_CSCOV - - For example, - - int val = 20; - setsockopt(s, SOL_UDPLITE, UDPLITE_SEND_CSCOV, &val, sizeof(int)); - - sets the checksum coverage length to 20 bytes (12b data + 8b header). - Of each packet only the first 20 bytes (plus the pseudo-header) will be - checksummed. This is useful for RTP applications which have a 12-byte - base header. - - - * Receiver checksum coverage: UDPLITE_RECV_CSCOV - - This option is the receiver-side analogue. It is truly optional, i.e. not - required to enable traffic with partial checksum coverage. Its function is - that of a traffic filter: when enabled, it instructs the kernel to drop - all packets which have a coverage _less_ than this value. For example, if - RTP and UDP headers are to be protected, a receiver can enforce that only - packets with a minimum coverage of 20 are admitted: - - int min = 20; - setsockopt(s, SOL_UDPLITE, UDPLITE_RECV_CSCOV, &min, sizeof(int)); - - The calls to getsockopt(2) are analogous. Being an extension and not a stand- - alone protocol, all socket options known from UDP can be used in exactly the - same manner as before, e.g. UDP_CORK or UDP_ENCAP. - - A detailed discussion of UDP-Lite checksum coverage options is in section IV. - - - III) HEADER FILES - - The socket API requires support through header files in /usr/include: - - * /usr/include/netinet/in.h - to define IPPROTO_UDPLITE - - * /usr/include/netinet/udplite.h - for UDP-Lite header fields and protocol constants - - For testing purposes, the following can serve as a `mini' header file: - - #define IPPROTO_UDPLITE 136 - #define SOL_UDPLITE 136 - #define UDPLITE_SEND_CSCOV 10 - #define UDPLITE_RECV_CSCOV 11 - - Ready-made header files for various distros are in the UDP-Lite tarball. - - - IV) KERNEL BEHAVIOUR WITH REGARD TO THE VARIOUS SOCKET OPTIONS - - To enable debugging messages, the log level need to be set to 8, as most - messages use the KERN_DEBUG level (7). - - 1) Sender Socket Options - - If the sender specifies a value of 0 as coverage length, the module - assumes full coverage, transmits a packet with coverage length of 0 - and according checksum. If the sender specifies a coverage < 8 and - different from 0, the kernel assumes 8 as default value. Finally, - if the specified coverage length exceeds the packet length, the packet - length is used instead as coverage length. - - 2) Receiver Socket Options - - The receiver specifies the minimum value of the coverage length it - is willing to accept. A value of 0 here indicates that the receiver - always wants the whole of the packet covered. In this case, all - partially covered packets are dropped and an error is logged. - - It is not possible to specify illegal values (<0 and <8); in these - cases the default of 8 is assumed. - - All packets arriving with a coverage value less than the specified - threshold are discarded, these events are also logged. - - 3) Disabling the Checksum Computation - - On both sender and receiver, checksumming will always be performed - and cannot be disabled using SO_NO_CHECK. Thus - - setsockopt(sockfd, SOL_SOCKET, SO_NO_CHECK, ... ); - - will always will be ignored, while the value of - - getsockopt(sockfd, SOL_SOCKET, SO_NO_CHECK, &value, ...); - - is meaningless (as in TCP). Packets with a zero checksum field are - illegal (cf. RFC 3828, sec. 3.1) and will be silently discarded. - - 4) Fragmentation - - The checksum computation respects both buffersize and MTU. The size - of UDP-Lite packets is determined by the size of the send buffer. The - minimum size of the send buffer is 2048 (defined as SOCK_MIN_SNDBUF - in include/net/sock.h), the default value is configurable as - net.core.wmem_default or via setting the SO_SNDBUF socket(7) - option. The maximum upper bound for the send buffer is determined - by net.core.wmem_max. - - Given a payload size larger than the send buffer size, UDP-Lite will - split the payload into several individual packets, filling up the - send buffer size in each case. - - The precise value also depends on the interface MTU. The interface MTU, - in turn, may trigger IP fragmentation. In this case, the generated - UDP-Lite packet is split into several IP packets, of which only the - first one contains the L4 header. - - The send buffer size has implications on the checksum coverage length. - Consider the following example: - - Payload: 1536 bytes Send Buffer: 1024 bytes - MTU: 1500 bytes Coverage Length: 856 bytes - - UDP-Lite will ship the 1536 bytes in two separate packets: - - Packet 1: 1024 payload + 8 byte header + 20 byte IP header = 1052 bytes - Packet 2: 512 payload + 8 byte header + 20 byte IP header = 540 bytes - - The coverage packet covers the UDP-Lite header and 848 bytes of the - payload in the first packet, the second packet is fully covered. Note - that for the second packet, the coverage length exceeds the packet - length. The kernel always re-adjusts the coverage length to the packet - length in such cases. - - As an example of what happens when one UDP-Lite packet is split into - several tiny fragments, consider the following example. - - Payload: 1024 bytes Send buffer size: 1024 bytes - MTU: 300 bytes Coverage length: 575 bytes - - +-+-----------+--------------+--------------+--------------+ - |8| 272 | 280 | 280 | 280 | - +-+-----------+--------------+--------------+--------------+ - 280 560 840 1032 - ^ - *****checksum coverage************* - - The UDP-Lite module generates one 1032 byte packet (1024 + 8 byte - header). According to the interface MTU, these are split into 4 IP - packets (280 byte IP payload + 20 byte IP header). The kernel module - sums the contents of the entire first two packets, plus 15 bytes of - the last packet before releasing the fragments to the IP module. - - To see the analogous case for IPv6 fragmentation, consider a link - MTU of 1280 bytes and a write buffer of 3356 bytes. If the checksum - coverage is less than 1232 bytes (MTU minus IPv6/fragment header - lengths), only the first fragment needs to be considered. When using - larger checksum coverage lengths, each eligible fragment needs to be - checksummed. Suppose we have a checksum coverage of 3062. The buffer - of 3356 bytes will be split into the following fragments: - - Fragment 1: 1280 bytes carrying 1232 bytes of UDP-Lite data - Fragment 2: 1280 bytes carrying 1232 bytes of UDP-Lite data - Fragment 3: 948 bytes carrying 900 bytes of UDP-Lite data - - The first two fragments have to be checksummed in full, of the last - fragment only 598 (= 3062 - 2*1232) bytes are checksummed. - - While it is important that such cases are dealt with correctly, they - are (annoyingly) rare: UDP-Lite is designed for optimising multimedia - performance over wireless (or generally noisy) links and thus smaller - coverage lengths are likely to be expected. - - - V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING - - Exceptional and error conditions are logged to syslog at the KERN_DEBUG - level. Live statistics about UDP-Lite are available in /proc/net/snmp - and can (with newer versions of netstat) be viewed using - - netstat -svu - - This displays UDP-Lite statistics variables, whose meaning is as follows. - - InDatagrams: The total number of datagrams delivered to users. - - NoPorts: Number of packets received to an unknown port. - These cases are counted separately (not as InErrors). - - InErrors: Number of erroneous UDP-Lite packets. Errors include: - * internal socket queue receive errors - * packet too short (less than 8 bytes or stated - coverage length exceeds received length) - * xfrm4_policy_check() returned with error - * application has specified larger min. coverage - length than that of incoming packet - * checksum coverage violated - * bad checksum - - OutDatagrams: Total number of sent datagrams. - - These statistics derive from the UDP MIB (RFC 2013). - - - VI) IPTABLES - - There is packet match support for UDP-Lite as well as support for the LOG target. - If you copy and paste the following line into /etc/protocols, - - udplite 136 UDP-Lite # UDP-Lite [RFC 3828] - - then - iptables -A INPUT -p udplite -j LOG - - will produce logging output to syslog. Dropping and rejecting packets also works. - - - VII) MAINTAINER ADDRESS - - The UDP-Lite patch was developed at - University of Aberdeen - Electronics Research Group - Department of Engineering - Fraser Noble Building - Aberdeen AB24 3UE; UK - The current maintainer is Gerrit Renker, . Initial - code was developed by William Stanislaus, . -- cgit v1.2.3-59-g8ed1b From 58ccb2b2e87d52ec0b4cbd40b94e0b63e90af873 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:25 +0200 Subject: docs: networking: convert vrf.txt to ReST - add SPDX header; - adjust title markup; - Add a subtitle for the first section; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Acked-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/vrf.rst | 451 +++++++++++++++++++++++++++++++++++++ Documentation/networking/vrf.txt | 418 ---------------------------------- MAINTAINERS | 2 +- 4 files changed, 453 insertions(+), 419 deletions(-) create mode 100644 Documentation/networking/vrf.rst delete mode 100644 Documentation/networking/vrf.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index ca0b0dbfd9ad..2227b9f4509d 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -113,6 +113,7 @@ Contents: tproxy tuntap udplite + vrf .. only:: subproject and html diff --git a/Documentation/networking/vrf.rst b/Documentation/networking/vrf.rst new file mode 100644 index 000000000000..0dde145043bc --- /dev/null +++ b/Documentation/networking/vrf.rst @@ -0,0 +1,451 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================== +Virtual Routing and Forwarding (VRF) +==================================== + +The VRF Device +============== + +The VRF device combined with ip rules provides the ability to create virtual +routing and forwarding domains (aka VRFs, VRF-lite to be specific) in the +Linux network stack. One use case is the multi-tenancy problem where each +tenant has their own unique routing tables and in the very least need +different default gateways. + +Processes can be "VRF aware" by binding a socket to the VRF device. Packets +through the socket then use the routing table associated with the VRF +device. An important feature of the VRF device implementation is that it +impacts only Layer 3 and above so L2 tools (e.g., LLDP) are not affected +(ie., they do not need to be run in each VRF). The design also allows +the use of higher priority ip rules (Policy Based Routing, PBR) to take +precedence over the VRF device rules directing specific traffic as desired. + +In addition, VRF devices allow VRFs to be nested within namespaces. For +example network namespaces provide separation of network interfaces at the +device layer, VLANs on the interfaces within a namespace provide L2 separation +and then VRF devices provide L3 separation. + +Design +------ +A VRF device is created with an associated route table. Network interfaces +are then enslaved to a VRF device:: + + +-----------------------------+ + | vrf-blue | ===> route table 10 + +-----------------------------+ + | | | + +------+ +------+ +-------------+ + | eth1 | | eth2 | ... | bond1 | + +------+ +------+ +-------------+ + | | + +------+ +------+ + | eth8 | | eth9 | + +------+ +------+ + +Packets received on an enslaved device and are switched to the VRF device +in the IPv4 and IPv6 processing stacks giving the impression that packets +flow through the VRF device. Similarly on egress routing rules are used to +send packets to the VRF device driver before getting sent out the actual +interface. This allows tcpdump on a VRF device to capture all packets into +and out of the VRF as a whole\ [1]_. Similarly, netfilter\ [2]_ and tc rules +can be applied using the VRF device to specify rules that apply to the VRF +domain as a whole. + +.. [1] Packets in the forwarded state do not flow through the device, so those + packets are not seen by tcpdump. Will revisit this limitation in a + future release. + +.. [2] Iptables on ingress supports PREROUTING with skb->dev set to the real + ingress device and both INPUT and PREROUTING rules with skb->dev set to + the VRF device. For egress POSTROUTING and OUTPUT rules can be written + using either the VRF device or real egress device. + +Setup +----- +1. VRF device is created with an association to a FIB table. + e.g,:: + + ip link add vrf-blue type vrf table 10 + ip link set dev vrf-blue up + +2. An l3mdev FIB rule directs lookups to the table associated with the device. + A single l3mdev rule is sufficient for all VRFs. The VRF device adds the + l3mdev rule for IPv4 and IPv6 when the first device is created with a + default preference of 1000. Users may delete the rule if desired and add + with a different priority or install per-VRF rules. + + Prior to the v4.8 kernel iif and oif rules are needed for each VRF device:: + + ip ru add oif vrf-blue table 10 + ip ru add iif vrf-blue table 10 + +3. Set the default route for the table (and hence default route for the VRF):: + + ip route add table 10 unreachable default metric 4278198272 + + This high metric value ensures that the default unreachable route can + be overridden by a routing protocol suite. FRRouting interprets + kernel metrics as a combined admin distance (upper byte) and priority + (lower 3 bytes). Thus the above metric translates to [255/8192]. + +4. Enslave L3 interfaces to a VRF device:: + + ip link set dev eth1 master vrf-blue + + Local and connected routes for enslaved devices are automatically moved to + the table associated with VRF device. Any additional routes depending on + the enslaved device are dropped and will need to be reinserted to the VRF + FIB table following the enslavement. + + The IPv6 sysctl option keep_addr_on_down can be enabled to keep IPv6 global + addresses as VRF enslavement changes:: + + sysctl -w net.ipv6.conf.all.keep_addr_on_down=1 + +5. Additional VRF routes are added to associated table:: + + ip route add table 10 ... + + +Applications +------------ +Applications that are to work within a VRF need to bind their socket to the +VRF device:: + + setsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, dev, strlen(dev)+1); + +or to specify the output device using cmsg and IP_PKTINFO. + +By default the scope of the port bindings for unbound sockets is +limited to the default VRF. That is, it will not be matched by packets +arriving on interfaces enslaved to an l3mdev and processes may bind to +the same port if they bind to an l3mdev. + +TCP & UDP services running in the default VRF context (ie., not bound +to any VRF device) can work across all VRF domains by enabling the +tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:: + + sysctl -w net.ipv4.tcp_l3mdev_accept=1 + sysctl -w net.ipv4.udp_l3mdev_accept=1 + +These options are disabled by default so that a socket in a VRF is only +selected for packets in that VRF. There is a similar option for RAW +sockets, which is enabled by default for reasons of backwards compatibility. +This is so as to specify the output device with cmsg and IP_PKTINFO, but +using a socket not bound to the corresponding VRF. This allows e.g. older ping +implementations to be run with specifying the device but without executing it +in the VRF. This option can be disabled so that packets received in a VRF +context are only handled by a raw socket bound to the VRF, and packets in the +default VRF are only handled by a socket not bound to any VRF:: + + sysctl -w net.ipv4.raw_l3mdev_accept=0 + +netfilter rules on the VRF device can be used to limit access to services +running in the default VRF context as well. + +-------------------------------------------------------------------------------- + +Using iproute2 for VRFs +======================= +iproute2 supports the vrf keyword as of v4.7. For backwards compatibility this +section lists both commands where appropriate -- with the vrf keyword and the +older form without it. + +1. Create a VRF + + To instantiate a VRF device and associate it with a table:: + + $ ip link add dev NAME type vrf table ID + + As of v4.8 the kernel supports the l3mdev FIB rule where a single rule + covers all VRFs. The l3mdev rule is created for IPv4 and IPv6 on first + device create. + +2. List VRFs + + To list VRFs that have been created:: + + $ ip [-d] link show type vrf + NOTE: The -d option is needed to show the table id + + For example:: + + $ ip -d link show type vrf + 11: mgmt: mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 + link/ether 72:b3:ba:91:e2:24 brd ff:ff:ff:ff:ff:ff promiscuity 0 + vrf table 1 addrgenmode eui64 + 12: red: mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 + link/ether b6:6f:6e:f6:da:73 brd ff:ff:ff:ff:ff:ff promiscuity 0 + vrf table 10 addrgenmode eui64 + 13: blue: mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 + link/ether 36:62:e8:7d:bb:8c brd ff:ff:ff:ff:ff:ff promiscuity 0 + vrf table 66 addrgenmode eui64 + 14: green: mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 + link/ether e6:28:b8:63:70:bb brd ff:ff:ff:ff:ff:ff promiscuity 0 + vrf table 81 addrgenmode eui64 + + + Or in brief output:: + + $ ip -br link show type vrf + mgmt UP 72:b3:ba:91:e2:24 + red UP b6:6f:6e:f6:da:73 + blue UP 36:62:e8:7d:bb:8c + green UP e6:28:b8:63:70:bb + + +3. Assign a Network Interface to a VRF + + Network interfaces are assigned to a VRF by enslaving the netdevice to a + VRF device:: + + $ ip link set dev NAME master NAME + + On enslavement connected and local routes are automatically moved to the + table associated with the VRF device. + + For example:: + + $ ip link set dev eth0 master mgmt + + +4. Show Devices Assigned to a VRF + + To show devices that have been assigned to a specific VRF add the master + option to the ip command:: + + $ ip link show vrf NAME + $ ip link show master NAME + + For example:: + + $ ip link show vrf red + 3: eth1: mtu 1500 qdisc pfifo_fast master red state UP mode DEFAULT group default qlen 1000 + link/ether 02:00:00:00:02:02 brd ff:ff:ff:ff:ff:ff + 4: eth2: mtu 1500 qdisc pfifo_fast master red state UP mode DEFAULT group default qlen 1000 + link/ether 02:00:00:00:02:03 brd ff:ff:ff:ff:ff:ff + 7: eth5: mtu 1500 qdisc noop master red state DOWN mode DEFAULT group default qlen 1000 + link/ether 02:00:00:00:02:06 brd ff:ff:ff:ff:ff:ff + + + Or using the brief output:: + + $ ip -br link show vrf red + eth1 UP 02:00:00:00:02:02 + eth2 UP 02:00:00:00:02:03 + eth5 DOWN 02:00:00:00:02:06 + + +5. Show Neighbor Entries for a VRF + + To list neighbor entries associated with devices enslaved to a VRF device + add the master option to the ip command:: + + $ ip [-6] neigh show vrf NAME + $ ip [-6] neigh show master NAME + + For example:: + + $ ip neigh show vrf red + 10.2.1.254 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE + 10.2.2.254 dev eth2 lladdr 5e:54:01:6a:ee:80 REACHABLE + + $ ip -6 neigh show vrf red + 2002:1::64 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE + + +6. Show Addresses for a VRF + + To show addresses for interfaces associated with a VRF add the master + option to the ip command:: + + $ ip addr show vrf NAME + $ ip addr show master NAME + + For example:: + + $ ip addr show vrf red + 3: eth1: mtu 1500 qdisc pfifo_fast master red state UP group default qlen 1000 + link/ether 02:00:00:00:02:02 brd ff:ff:ff:ff:ff:ff + inet 10.2.1.2/24 brd 10.2.1.255 scope global eth1 + valid_lft forever preferred_lft forever + inet6 2002:1::2/120 scope global + valid_lft forever preferred_lft forever + inet6 fe80::ff:fe00:202/64 scope link + valid_lft forever preferred_lft forever + 4: eth2: mtu 1500 qdisc pfifo_fast master red state UP group default qlen 1000 + link/ether 02:00:00:00:02:03 brd ff:ff:ff:ff:ff:ff + inet 10.2.2.2/24 brd 10.2.2.255 scope global eth2 + valid_lft forever preferred_lft forever + inet6 2002:2::2/120 scope global + valid_lft forever preferred_lft forever + inet6 fe80::ff:fe00:203/64 scope link + valid_lft forever preferred_lft forever + 7: eth5: mtu 1500 qdisc noop master red state DOWN group default qlen 1000 + link/ether 02:00:00:00:02:06 brd ff:ff:ff:ff:ff:ff + + Or in brief format:: + + $ ip -br addr show vrf red + eth1 UP 10.2.1.2/24 2002:1::2/120 fe80::ff:fe00:202/64 + eth2 UP 10.2.2.2/24 2002:2::2/120 fe80::ff:fe00:203/64 + eth5 DOWN + + +7. Show Routes for a VRF + + To show routes for a VRF use the ip command to display the table associated + with the VRF device:: + + $ ip [-6] route show vrf NAME + $ ip [-6] route show table ID + + For example:: + + $ ip route show vrf red + unreachable default metric 4278198272 + broadcast 10.2.1.0 dev eth1 proto kernel scope link src 10.2.1.2 + 10.2.1.0/24 dev eth1 proto kernel scope link src 10.2.1.2 + local 10.2.1.2 dev eth1 proto kernel scope host src 10.2.1.2 + broadcast 10.2.1.255 dev eth1 proto kernel scope link src 10.2.1.2 + broadcast 10.2.2.0 dev eth2 proto kernel scope link src 10.2.2.2 + 10.2.2.0/24 dev eth2 proto kernel scope link src 10.2.2.2 + local 10.2.2.2 dev eth2 proto kernel scope host src 10.2.2.2 + broadcast 10.2.2.255 dev eth2 proto kernel scope link src 10.2.2.2 + + $ ip -6 route show vrf red + local 2002:1:: dev lo proto none metric 0 pref medium + local 2002:1::2 dev lo proto none metric 0 pref medium + 2002:1::/120 dev eth1 proto kernel metric 256 pref medium + local 2002:2:: dev lo proto none metric 0 pref medium + local 2002:2::2 dev lo proto none metric 0 pref medium + 2002:2::/120 dev eth2 proto kernel metric 256 pref medium + local fe80:: dev lo proto none metric 0 pref medium + local fe80:: dev lo proto none metric 0 pref medium + local fe80::ff:fe00:202 dev lo proto none metric 0 pref medium + local fe80::ff:fe00:203 dev lo proto none metric 0 pref medium + fe80::/64 dev eth1 proto kernel metric 256 pref medium + fe80::/64 dev eth2 proto kernel metric 256 pref medium + ff00::/8 dev red metric 256 pref medium + ff00::/8 dev eth1 metric 256 pref medium + ff00::/8 dev eth2 metric 256 pref medium + unreachable default dev lo metric 4278198272 error -101 pref medium + +8. Route Lookup for a VRF + + A test route lookup can be done for a VRF:: + + $ ip [-6] route get vrf NAME ADDRESS + $ ip [-6] route get oif NAME ADDRESS + + For example:: + + $ ip route get 10.2.1.40 vrf red + 10.2.1.40 dev eth1 table red src 10.2.1.2 + cache + + $ ip -6 route get 2002:1::32 vrf red + 2002:1::32 from :: dev eth1 table red proto kernel src 2002:1::2 metric 256 pref medium + + +9. Removing Network Interface from a VRF + + Network interfaces are removed from a VRF by breaking the enslavement to + the VRF device:: + + $ ip link set dev NAME nomaster + + Connected routes are moved back to the default table and local entries are + moved to the local table. + + For example:: + + $ ip link set dev eth0 nomaster + +-------------------------------------------------------------------------------- + +Commands used in this example:: + + cat >> /etc/iproute2/rt_tables.d/vrf.conf < route table 10 - +-----------------------------+ - | | | - +------+ +------+ +-------------+ - | eth1 | | eth2 | ... | bond1 | - +------+ +------+ +-------------+ - | | - +------+ +------+ - | eth8 | | eth9 | - +------+ +------+ - -Packets received on an enslaved device and are switched to the VRF device -in the IPv4 and IPv6 processing stacks giving the impression that packets -flow through the VRF device. Similarly on egress routing rules are used to -send packets to the VRF device driver before getting sent out the actual -interface. This allows tcpdump on a VRF device to capture all packets into -and out of the VRF as a whole.[1] Similarly, netfilter[2] and tc rules can be -applied using the VRF device to specify rules that apply to the VRF domain -as a whole. - -[1] Packets in the forwarded state do not flow through the device, so those - packets are not seen by tcpdump. Will revisit this limitation in a - future release. - -[2] Iptables on ingress supports PREROUTING with skb->dev set to the real - ingress device and both INPUT and PREROUTING rules with skb->dev set to - the VRF device. For egress POSTROUTING and OUTPUT rules can be written - using either the VRF device or real egress device. - -Setup ------ -1. VRF device is created with an association to a FIB table. - e.g, ip link add vrf-blue type vrf table 10 - ip link set dev vrf-blue up - -2. An l3mdev FIB rule directs lookups to the table associated with the device. - A single l3mdev rule is sufficient for all VRFs. The VRF device adds the - l3mdev rule for IPv4 and IPv6 when the first device is created with a - default preference of 1000. Users may delete the rule if desired and add - with a different priority or install per-VRF rules. - - Prior to the v4.8 kernel iif and oif rules are needed for each VRF device: - ip ru add oif vrf-blue table 10 - ip ru add iif vrf-blue table 10 - -3. Set the default route for the table (and hence default route for the VRF). - ip route add table 10 unreachable default metric 4278198272 - - This high metric value ensures that the default unreachable route can - be overridden by a routing protocol suite. FRRouting interprets - kernel metrics as a combined admin distance (upper byte) and priority - (lower 3 bytes). Thus the above metric translates to [255/8192]. - -4. Enslave L3 interfaces to a VRF device. - ip link set dev eth1 master vrf-blue - - Local and connected routes for enslaved devices are automatically moved to - the table associated with VRF device. Any additional routes depending on - the enslaved device are dropped and will need to be reinserted to the VRF - FIB table following the enslavement. - - The IPv6 sysctl option keep_addr_on_down can be enabled to keep IPv6 global - addresses as VRF enslavement changes. - sysctl -w net.ipv6.conf.all.keep_addr_on_down=1 - -5. Additional VRF routes are added to associated table. - ip route add table 10 ... - - -Applications ------------- -Applications that are to work within a VRF need to bind their socket to the -VRF device: - - setsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, dev, strlen(dev)+1); - -or to specify the output device using cmsg and IP_PKTINFO. - -By default the scope of the port bindings for unbound sockets is -limited to the default VRF. That is, it will not be matched by packets -arriving on interfaces enslaved to an l3mdev and processes may bind to -the same port if they bind to an l3mdev. - -TCP & UDP services running in the default VRF context (ie., not bound -to any VRF device) can work across all VRF domains by enabling the -tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: - - sysctl -w net.ipv4.tcp_l3mdev_accept=1 - sysctl -w net.ipv4.udp_l3mdev_accept=1 - -These options are disabled by default so that a socket in a VRF is only -selected for packets in that VRF. There is a similar option for RAW -sockets, which is enabled by default for reasons of backwards compatibility. -This is so as to specify the output device with cmsg and IP_PKTINFO, but -using a socket not bound to the corresponding VRF. This allows e.g. older ping -implementations to be run with specifying the device but without executing it -in the VRF. This option can be disabled so that packets received in a VRF -context are only handled by a raw socket bound to the VRF, and packets in the -default VRF are only handled by a socket not bound to any VRF: - - sysctl -w net.ipv4.raw_l3mdev_accept=0 - -netfilter rules on the VRF device can be used to limit access to services -running in the default VRF context as well. - -################################################################################ - -Using iproute2 for VRFs -======================= -iproute2 supports the vrf keyword as of v4.7. For backwards compatibility this -section lists both commands where appropriate -- with the vrf keyword and the -older form without it. - -1. Create a VRF - - To instantiate a VRF device and associate it with a table: - $ ip link add dev NAME type vrf table ID - - As of v4.8 the kernel supports the l3mdev FIB rule where a single rule - covers all VRFs. The l3mdev rule is created for IPv4 and IPv6 on first - device create. - -2. List VRFs - - To list VRFs that have been created: - $ ip [-d] link show type vrf - NOTE: The -d option is needed to show the table id - - For example: - $ ip -d link show type vrf - 11: mgmt: mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 - link/ether 72:b3:ba:91:e2:24 brd ff:ff:ff:ff:ff:ff promiscuity 0 - vrf table 1 addrgenmode eui64 - 12: red: mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 - link/ether b6:6f:6e:f6:da:73 brd ff:ff:ff:ff:ff:ff promiscuity 0 - vrf table 10 addrgenmode eui64 - 13: blue: mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 - link/ether 36:62:e8:7d:bb:8c brd ff:ff:ff:ff:ff:ff promiscuity 0 - vrf table 66 addrgenmode eui64 - 14: green: mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 - link/ether e6:28:b8:63:70:bb brd ff:ff:ff:ff:ff:ff promiscuity 0 - vrf table 81 addrgenmode eui64 - - - Or in brief output: - - $ ip -br link show type vrf - mgmt UP 72:b3:ba:91:e2:24 - red UP b6:6f:6e:f6:da:73 - blue UP 36:62:e8:7d:bb:8c - green UP e6:28:b8:63:70:bb - - -3. Assign a Network Interface to a VRF - - Network interfaces are assigned to a VRF by enslaving the netdevice to a - VRF device: - $ ip link set dev NAME master NAME - - On enslavement connected and local routes are automatically moved to the - table associated with the VRF device. - - For example: - $ ip link set dev eth0 master mgmt - - -4. Show Devices Assigned to a VRF - - To show devices that have been assigned to a specific VRF add the master - option to the ip command: - $ ip link show vrf NAME - $ ip link show master NAME - - For example: - $ ip link show vrf red - 3: eth1: mtu 1500 qdisc pfifo_fast master red state UP mode DEFAULT group default qlen 1000 - link/ether 02:00:00:00:02:02 brd ff:ff:ff:ff:ff:ff - 4: eth2: mtu 1500 qdisc pfifo_fast master red state UP mode DEFAULT group default qlen 1000 - link/ether 02:00:00:00:02:03 brd ff:ff:ff:ff:ff:ff - 7: eth5: mtu 1500 qdisc noop master red state DOWN mode DEFAULT group default qlen 1000 - link/ether 02:00:00:00:02:06 brd ff:ff:ff:ff:ff:ff - - - Or using the brief output: - $ ip -br link show vrf red - eth1 UP 02:00:00:00:02:02 - eth2 UP 02:00:00:00:02:03 - eth5 DOWN 02:00:00:00:02:06 - - -5. Show Neighbor Entries for a VRF - - To list neighbor entries associated with devices enslaved to a VRF device - add the master option to the ip command: - $ ip [-6] neigh show vrf NAME - $ ip [-6] neigh show master NAME - - For example: - $ ip neigh show vrf red - 10.2.1.254 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE - 10.2.2.254 dev eth2 lladdr 5e:54:01:6a:ee:80 REACHABLE - - $ ip -6 neigh show vrf red - 2002:1::64 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE - - -6. Show Addresses for a VRF - - To show addresses for interfaces associated with a VRF add the master - option to the ip command: - $ ip addr show vrf NAME - $ ip addr show master NAME - - For example: - $ ip addr show vrf red - 3: eth1: mtu 1500 qdisc pfifo_fast master red state UP group default qlen 1000 - link/ether 02:00:00:00:02:02 brd ff:ff:ff:ff:ff:ff - inet 10.2.1.2/24 brd 10.2.1.255 scope global eth1 - valid_lft forever preferred_lft forever - inet6 2002:1::2/120 scope global - valid_lft forever preferred_lft forever - inet6 fe80::ff:fe00:202/64 scope link - valid_lft forever preferred_lft forever - 4: eth2: mtu 1500 qdisc pfifo_fast master red state UP group default qlen 1000 - link/ether 02:00:00:00:02:03 brd ff:ff:ff:ff:ff:ff - inet 10.2.2.2/24 brd 10.2.2.255 scope global eth2 - valid_lft forever preferred_lft forever - inet6 2002:2::2/120 scope global - valid_lft forever preferred_lft forever - inet6 fe80::ff:fe00:203/64 scope link - valid_lft forever preferred_lft forever - 7: eth5: mtu 1500 qdisc noop master red state DOWN group default qlen 1000 - link/ether 02:00:00:00:02:06 brd ff:ff:ff:ff:ff:ff - - Or in brief format: - $ ip -br addr show vrf red - eth1 UP 10.2.1.2/24 2002:1::2/120 fe80::ff:fe00:202/64 - eth2 UP 10.2.2.2/24 2002:2::2/120 fe80::ff:fe00:203/64 - eth5 DOWN - - -7. Show Routes for a VRF - - To show routes for a VRF use the ip command to display the table associated - with the VRF device: - $ ip [-6] route show vrf NAME - $ ip [-6] route show table ID - - For example: - $ ip route show vrf red - unreachable default metric 4278198272 - broadcast 10.2.1.0 dev eth1 proto kernel scope link src 10.2.1.2 - 10.2.1.0/24 dev eth1 proto kernel scope link src 10.2.1.2 - local 10.2.1.2 dev eth1 proto kernel scope host src 10.2.1.2 - broadcast 10.2.1.255 dev eth1 proto kernel scope link src 10.2.1.2 - broadcast 10.2.2.0 dev eth2 proto kernel scope link src 10.2.2.2 - 10.2.2.0/24 dev eth2 proto kernel scope link src 10.2.2.2 - local 10.2.2.2 dev eth2 proto kernel scope host src 10.2.2.2 - broadcast 10.2.2.255 dev eth2 proto kernel scope link src 10.2.2.2 - - $ ip -6 route show vrf red - local 2002:1:: dev lo proto none metric 0 pref medium - local 2002:1::2 dev lo proto none metric 0 pref medium - 2002:1::/120 dev eth1 proto kernel metric 256 pref medium - local 2002:2:: dev lo proto none metric 0 pref medium - local 2002:2::2 dev lo proto none metric 0 pref medium - 2002:2::/120 dev eth2 proto kernel metric 256 pref medium - local fe80:: dev lo proto none metric 0 pref medium - local fe80:: dev lo proto none metric 0 pref medium - local fe80::ff:fe00:202 dev lo proto none metric 0 pref medium - local fe80::ff:fe00:203 dev lo proto none metric 0 pref medium - fe80::/64 dev eth1 proto kernel metric 256 pref medium - fe80::/64 dev eth2 proto kernel metric 256 pref medium - ff00::/8 dev red metric 256 pref medium - ff00::/8 dev eth1 metric 256 pref medium - ff00::/8 dev eth2 metric 256 pref medium - unreachable default dev lo metric 4278198272 error -101 pref medium - -8. Route Lookup for a VRF - - A test route lookup can be done for a VRF: - $ ip [-6] route get vrf NAME ADDRESS - $ ip [-6] route get oif NAME ADDRESS - - For example: - $ ip route get 10.2.1.40 vrf red - 10.2.1.40 dev eth1 table red src 10.2.1.2 - cache - - $ ip -6 route get 2002:1::32 vrf red - 2002:1::32 from :: dev eth1 table red proto kernel src 2002:1::2 metric 256 pref medium - - -9. Removing Network Interface from a VRF - - Network interfaces are removed from a VRF by breaking the enslavement to - the VRF device: - $ ip link set dev NAME nomaster - - Connected routes are moved back to the default table and local entries are - moved to the local table. - - For example: - $ ip link set dev eth0 nomaster - --------------------------------------------------------------------------------- - -Commands used in this example: - -cat >> /etc/iproute2/rt_tables.d/vrf.conf < M: Shrijeet Mukherjee L: netdev@vger.kernel.org S: Maintained -F: Documentation/networking/vrf.txt +F: Documentation/networking/vrf.rst F: drivers/net/vrf.c VSPRINTF -- cgit v1.2.3-59-g8ed1b From d2a85c184ac6e738daa5e42f89b1f353910d6a89 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:26 +0200 Subject: docs: networking: convert vxlan.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/vxlan.rst | 60 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/vxlan.txt | 51 -------------------------------- 3 files changed, 61 insertions(+), 51 deletions(-) create mode 100644 Documentation/networking/vxlan.rst delete mode 100644 Documentation/networking/vxlan.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 2227b9f4509d..a72fdfb391b6 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -114,6 +114,7 @@ Contents: tuntap udplite vrf + vxlan .. only:: subproject and html diff --git a/Documentation/networking/vxlan.rst b/Documentation/networking/vxlan.rst new file mode 100644 index 000000000000..ce239fa01848 --- /dev/null +++ b/Documentation/networking/vxlan.rst @@ -0,0 +1,60 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================================================== +Virtual eXtensible Local Area Networking documentation +====================================================== + +The VXLAN protocol is a tunnelling protocol designed to solve the +problem of limited VLAN IDs (4096) in IEEE 802.1q. With VXLAN the +size of the identifier is expanded to 24 bits (16777216). + +VXLAN is described by IETF RFC 7348, and has been implemented by a +number of vendors. The protocol runs over UDP using a single +destination port. This document describes the Linux kernel tunnel +device, there is also a separate implementation of VXLAN for +Openvswitch. + +Unlike most tunnels, a VXLAN is a 1 to N network, not just point to +point. A VXLAN device can learn the IP address of the other endpoint +either dynamically in a manner similar to a learning bridge, or make +use of statically-configured forwarding entries. + +The management of vxlan is done in a manner similar to its two closest +neighbors GRE and VLAN. Configuring VXLAN requires the version of +iproute2 that matches the kernel release where VXLAN was first merged +upstream. + +1. Create vxlan device:: + + # ip link add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1 dstport 4789 + +This creates a new device named vxlan0. The device uses the multicast +group 239.1.1.1 over eth1 to handle traffic for which there is no +entry in the forwarding table. The destination port number is set to +the IANA-assigned value of 4789. The Linux implementation of VXLAN +pre-dates the IANA's selection of a standard destination port number +and uses the Linux-selected value by default to maintain backwards +compatibility. + +2. Delete vxlan device:: + + # ip link delete vxlan0 + +3. Show vxlan info:: + + # ip -d link show vxlan0 + +It is possible to create, destroy and display the vxlan +forwarding table using the new bridge command. + +1. Create forwarding table entry:: + + # bridge fdb add to 00:17:42:8a:b4:05 dst 192.19.0.2 dev vxlan0 + +2. Delete forwarding table entry:: + + # bridge fdb delete 00:17:42:8a:b4:05 dev vxlan0 + +3. Show forwarding table:: + + # bridge fdb show dev vxlan0 diff --git a/Documentation/networking/vxlan.txt b/Documentation/networking/vxlan.txt deleted file mode 100644 index c28f4989c3f0..000000000000 --- a/Documentation/networking/vxlan.txt +++ /dev/null @@ -1,51 +0,0 @@ -Virtual eXtensible Local Area Networking documentation -====================================================== - -The VXLAN protocol is a tunnelling protocol designed to solve the -problem of limited VLAN IDs (4096) in IEEE 802.1q. With VXLAN the -size of the identifier is expanded to 24 bits (16777216). - -VXLAN is described by IETF RFC 7348, and has been implemented by a -number of vendors. The protocol runs over UDP using a single -destination port. This document describes the Linux kernel tunnel -device, there is also a separate implementation of VXLAN for -Openvswitch. - -Unlike most tunnels, a VXLAN is a 1 to N network, not just point to -point. A VXLAN device can learn the IP address of the other endpoint -either dynamically in a manner similar to a learning bridge, or make -use of statically-configured forwarding entries. - -The management of vxlan is done in a manner similar to its two closest -neighbors GRE and VLAN. Configuring VXLAN requires the version of -iproute2 that matches the kernel release where VXLAN was first merged -upstream. - -1. Create vxlan device - # ip link add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1 dstport 4789 - -This creates a new device named vxlan0. The device uses the multicast -group 239.1.1.1 over eth1 to handle traffic for which there is no -entry in the forwarding table. The destination port number is set to -the IANA-assigned value of 4789. The Linux implementation of VXLAN -pre-dates the IANA's selection of a standard destination port number -and uses the Linux-selected value by default to maintain backwards -compatibility. - -2. Delete vxlan device - # ip link delete vxlan0 - -3. Show vxlan info - # ip -d link show vxlan0 - -It is possible to create, destroy and display the vxlan -forwarding table using the new bridge command. - -1. Create forwarding table entry - # bridge fdb add to 00:17:42:8a:b4:05 dst 192.19.0.2 dev vxlan0 - -2. Delete forwarding table entry - # bridge fdb delete 00:17:42:8a:b4:05 dev vxlan0 - -3. Show forwarding table - # bridge fdb show dev vxlan0 -- cgit v1.2.3-59-g8ed1b From 883780af72090daf9ab53779a3085a6ddfc468ca Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:27 +0200 Subject: docs: networking: convert x25-iface.txt to ReST Not much to be done here: - add SPDX header; - adjust title markup; - remove a tail whitespace; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/x25-iface.rst | 129 +++++++++++++++++++++++++++++++++ Documentation/networking/x25-iface.txt | 123 ------------------------------- include/uapi/linux/if_x25.h | 2 +- net/x25/Kconfig | 2 +- 5 files changed, 132 insertions(+), 125 deletions(-) create mode 100644 Documentation/networking/x25-iface.rst delete mode 100644 Documentation/networking/x25-iface.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index a72fdfb391b6..7a4bdbc111b0 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -115,6 +115,7 @@ Contents: udplite vrf vxlan + x25-iface .. only:: subproject and html diff --git a/Documentation/networking/x25-iface.rst b/Documentation/networking/x25-iface.rst new file mode 100644 index 000000000000..df401891dce6 --- /dev/null +++ b/Documentation/networking/x25-iface.rst @@ -0,0 +1,129 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================- +X.25 Device Driver Interface +============================- + +Version 1.1 + + Jonathan Naylor 26.12.96 + +This is a description of the messages to be passed between the X.25 Packet +Layer and the X.25 device driver. They are designed to allow for the easy +setting of the LAPB mode from within the Packet Layer. + +The X.25 device driver will be coded normally as per the Linux device driver +standards. Most X.25 device drivers will be moderately similar to the +already existing Ethernet device drivers. However unlike those drivers, the +X.25 device driver has a state associated with it, and this information +needs to be passed to and from the Packet Layer for proper operation. + +All messages are held in sk_buff's just like real data to be transmitted +over the LAPB link. The first byte of the skbuff indicates the meaning of +the rest of the skbuff, if any more information does exist. + + +Packet Layer to Device Driver +----------------------------- + +First Byte = 0x00 (X25_IFACE_DATA) + +This indicates that the rest of the skbuff contains data to be transmitted +over the LAPB link. The LAPB link should already exist before any data is +passed down. + +First Byte = 0x01 (X25_IFACE_CONNECT) + +Establish the LAPB link. If the link is already established then the connect +confirmation message should be returned as soon as possible. + +First Byte = 0x02 (X25_IFACE_DISCONNECT) + +Terminate the LAPB link. If it is already disconnected then the disconnect +confirmation message should be returned as soon as possible. + +First Byte = 0x03 (X25_IFACE_PARAMS) + +LAPB parameters. To be defined. + + +Device Driver to Packet Layer +----------------------------- + +First Byte = 0x00 (X25_IFACE_DATA) + +This indicates that the rest of the skbuff contains data that has been +received over the LAPB link. + +First Byte = 0x01 (X25_IFACE_CONNECT) + +LAPB link has been established. The same message is used for both a LAPB +link connect_confirmation and a connect_indication. + +First Byte = 0x02 (X25_IFACE_DISCONNECT) + +LAPB link has been terminated. This same message is used for both a LAPB +link disconnect_confirmation and a disconnect_indication. + +First Byte = 0x03 (X25_IFACE_PARAMS) + +LAPB parameters. To be defined. + + + +Possible Problems +================= + +(Henner Eisen, 2000-10-28) + +The X.25 packet layer protocol depends on a reliable datalink service. +The LAPB protocol provides such reliable service. But this reliability +is not preserved by the Linux network device driver interface: + +- With Linux 2.4.x (and above) SMP kernels, packet ordering is not + preserved. Even if a device driver calls netif_rx(skb1) and later + netif_rx(skb2), skb2 might be delivered to the network layer + earlier that skb1. +- Data passed upstream by means of netif_rx() might be dropped by the + kernel if the backlog queue is congested. + +The X.25 packet layer protocol will detect this and reset the virtual +call in question. But many upper layer protocols are not designed to +handle such N-Reset events gracefully. And frequent N-Reset events +will always degrade performance. + +Thus, driver authors should make netif_rx() as reliable as possible: + +SMP re-ordering will not occur if the driver's interrupt handler is +always executed on the same CPU. Thus, + +- Driver authors should use irq affinity for the interrupt handler. + +The probability of packet loss due to backlog congestion can be +reduced by the following measures or a combination thereof: + +(1) Drivers for kernel versions 2.4.x and above should always check the + return value of netif_rx(). If it returns NET_RX_DROP, the + driver's LAPB protocol must not confirm reception of the frame + to the peer. + This will reliably suppress packet loss. The LAPB protocol will + automatically cause the peer to re-transmit the dropped packet + later. + The lapb module interface was modified to support this. Its + data_indication() method should now transparently pass the + netif_rx() return value to the (lapb module) caller. +(2) Drivers for kernel versions 2.2.x should always check the global + variable netdev_dropping when a new frame is received. The driver + should only call netif_rx() if netdev_dropping is zero. Otherwise + the driver should not confirm delivery of the frame and drop it. + Alternatively, the driver can queue the frame internally and call + netif_rx() later when netif_dropping is 0 again. In that case, delivery + confirmation should also be deferred such that the internal queue + cannot grow to much. + This will not reliably avoid packet loss, but the probability + of packet loss in netif_rx() path will be significantly reduced. +(3) Additionally, driver authors might consider to support + CONFIG_NET_HW_FLOWCONTROL. This allows the driver to be woken up + when a previously congested backlog queue becomes empty again. + The driver could uses this for flow-controlling the peer by means + of the LAPB protocol's flow-control service. diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt deleted file mode 100644 index 7f213b556e85..000000000000 --- a/Documentation/networking/x25-iface.txt +++ /dev/null @@ -1,123 +0,0 @@ - X.25 Device Driver Interface 1.1 - - Jonathan Naylor 26.12.96 - -This is a description of the messages to be passed between the X.25 Packet -Layer and the X.25 device driver. They are designed to allow for the easy -setting of the LAPB mode from within the Packet Layer. - -The X.25 device driver will be coded normally as per the Linux device driver -standards. Most X.25 device drivers will be moderately similar to the -already existing Ethernet device drivers. However unlike those drivers, the -X.25 device driver has a state associated with it, and this information -needs to be passed to and from the Packet Layer for proper operation. - -All messages are held in sk_buff's just like real data to be transmitted -over the LAPB link. The first byte of the skbuff indicates the meaning of -the rest of the skbuff, if any more information does exist. - - -Packet Layer to Device Driver ------------------------------ - -First Byte = 0x00 (X25_IFACE_DATA) - -This indicates that the rest of the skbuff contains data to be transmitted -over the LAPB link. The LAPB link should already exist before any data is -passed down. - -First Byte = 0x01 (X25_IFACE_CONNECT) - -Establish the LAPB link. If the link is already established then the connect -confirmation message should be returned as soon as possible. - -First Byte = 0x02 (X25_IFACE_DISCONNECT) - -Terminate the LAPB link. If it is already disconnected then the disconnect -confirmation message should be returned as soon as possible. - -First Byte = 0x03 (X25_IFACE_PARAMS) - -LAPB parameters. To be defined. - - -Device Driver to Packet Layer ------------------------------ - -First Byte = 0x00 (X25_IFACE_DATA) - -This indicates that the rest of the skbuff contains data that has been -received over the LAPB link. - -First Byte = 0x01 (X25_IFACE_CONNECT) - -LAPB link has been established. The same message is used for both a LAPB -link connect_confirmation and a connect_indication. - -First Byte = 0x02 (X25_IFACE_DISCONNECT) - -LAPB link has been terminated. This same message is used for both a LAPB -link disconnect_confirmation and a disconnect_indication. - -First Byte = 0x03 (X25_IFACE_PARAMS) - -LAPB parameters. To be defined. - - - -Possible Problems -================= - -(Henner Eisen, 2000-10-28) - -The X.25 packet layer protocol depends on a reliable datalink service. -The LAPB protocol provides such reliable service. But this reliability -is not preserved by the Linux network device driver interface: - -- With Linux 2.4.x (and above) SMP kernels, packet ordering is not - preserved. Even if a device driver calls netif_rx(skb1) and later - netif_rx(skb2), skb2 might be delivered to the network layer - earlier that skb1. -- Data passed upstream by means of netif_rx() might be dropped by the - kernel if the backlog queue is congested. - -The X.25 packet layer protocol will detect this and reset the virtual -call in question. But many upper layer protocols are not designed to -handle such N-Reset events gracefully. And frequent N-Reset events -will always degrade performance. - -Thus, driver authors should make netif_rx() as reliable as possible: - -SMP re-ordering will not occur if the driver's interrupt handler is -always executed on the same CPU. Thus, - -- Driver authors should use irq affinity for the interrupt handler. - -The probability of packet loss due to backlog congestion can be -reduced by the following measures or a combination thereof: - -(1) Drivers for kernel versions 2.4.x and above should always check the - return value of netif_rx(). If it returns NET_RX_DROP, the - driver's LAPB protocol must not confirm reception of the frame - to the peer. - This will reliably suppress packet loss. The LAPB protocol will - automatically cause the peer to re-transmit the dropped packet - later. - The lapb module interface was modified to support this. Its - data_indication() method should now transparently pass the - netif_rx() return value to the (lapb module) caller. -(2) Drivers for kernel versions 2.2.x should always check the global - variable netdev_dropping when a new frame is received. The driver - should only call netif_rx() if netdev_dropping is zero. Otherwise - the driver should not confirm delivery of the frame and drop it. - Alternatively, the driver can queue the frame internally and call - netif_rx() later when netif_dropping is 0 again. In that case, delivery - confirmation should also be deferred such that the internal queue - cannot grow to much. - This will not reliably avoid packet loss, but the probability - of packet loss in netif_rx() path will be significantly reduced. -(3) Additionally, driver authors might consider to support - CONFIG_NET_HW_FLOWCONTROL. This allows the driver to be woken up - when a previously congested backlog queue becomes empty again. - The driver could uses this for flow-controlling the peer by means - of the LAPB protocol's flow-control service. diff --git a/include/uapi/linux/if_x25.h b/include/uapi/linux/if_x25.h index 5d962448345f..3a5938e38370 100644 --- a/include/uapi/linux/if_x25.h +++ b/include/uapi/linux/if_x25.h @@ -18,7 +18,7 @@ #include -/* Documentation/networking/x25-iface.txt */ +/* Documentation/networking/x25-iface.rst */ #define X25_IFACE_DATA 0x00 #define X25_IFACE_CONNECT 0x01 #define X25_IFACE_DISCONNECT 0x02 diff --git a/net/x25/Kconfig b/net/x25/Kconfig index 2ecb2e5e241e..a328f79885d1 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -21,7 +21,7 @@ config X25 . Information about X.25 for Linux is contained in the files and - . + . One connects to an X.25 network either with a dedicated network card using the X.21 protocol (not yet supported by Linux) or one can do -- cgit v1.2.3-59-g8ed1b From c4ea03fdfd122b4ff293bff643c2369852e9cc1c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:28 +0200 Subject: docs: networking: convert x25.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/x25.rst | 48 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/x25.txt | 44 ---------------------------------- net/x25/Kconfig | 2 +- 4 files changed, 50 insertions(+), 45 deletions(-) create mode 100644 Documentation/networking/x25.rst delete mode 100644 Documentation/networking/x25.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 7a4bdbc111b0..75521e6c473b 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -116,6 +116,7 @@ Contents: vrf vxlan x25-iface + x25 .. only:: subproject and html diff --git a/Documentation/networking/x25.rst b/Documentation/networking/x25.rst new file mode 100644 index 000000000000..00e45d384ba0 --- /dev/null +++ b/Documentation/networking/x25.rst @@ -0,0 +1,48 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +Linux X.25 Project +================== + +As my third year dissertation at University I have taken it upon myself to +write an X.25 implementation for Linux. My aim is to provide a complete X.25 +Packet Layer and a LAPB module to allow for "normal" X.25 to be run using +Linux. There are two sorts of X.25 cards available, intelligent ones that +implement LAPB on the card itself, and unintelligent ones that simply do +framing, bit-stuffing and checksumming. These both need to be handled by the +system. + +I therefore decided to write the implementation such that as far as the +Packet Layer is concerned, the link layer was being performed by a lower +layer of the Linux kernel and therefore it did not concern itself with +implementation of LAPB. Therefore the LAPB modules would be called by +unintelligent X.25 card drivers and not by intelligent ones, this would +provide a uniform device driver interface, and simplify configuration. + +To confuse matters a little, an 802.2 LLC implementation for Linux is being +written which will allow X.25 to be run over an Ethernet (or Token Ring) and +conform with the JNT "Pink Book", this will have a different interface to +the Packet Layer but there will be no confusion since the class of device +being served by the LLC will be completely separate from LAPB. The LLC +implementation is being done as part of another protocol project (SNA) and +by a different author. + +Just when you thought that it could not become more confusing, another +option appeared, XOT. This allows X.25 Packet Layer frames to operate over +the Internet using TCP/IP as a reliable link layer. RFC1613 specifies the +format and behaviour of the protocol. If time permits this option will also +be actively considered. + +A linux-x25 mailing list has been created at vger.kernel.org to support the +development and use of Linux X.25. It is early days yet, but interested +parties are welcome to subscribe to it. Just send a message to +majordomo@vger.kernel.org with the following in the message body: + +subscribe linux-x25 +end + +The contents of the Subject line are ignored. + +Jonathan + +g4klx@g4klx.demon.co.uk diff --git a/Documentation/networking/x25.txt b/Documentation/networking/x25.txt deleted file mode 100644 index c91c6d7159ff..000000000000 --- a/Documentation/networking/x25.txt +++ /dev/null @@ -1,44 +0,0 @@ -Linux X.25 Project - -As my third year dissertation at University I have taken it upon myself to -write an X.25 implementation for Linux. My aim is to provide a complete X.25 -Packet Layer and a LAPB module to allow for "normal" X.25 to be run using -Linux. There are two sorts of X.25 cards available, intelligent ones that -implement LAPB on the card itself, and unintelligent ones that simply do -framing, bit-stuffing and checksumming. These both need to be handled by the -system. - -I therefore decided to write the implementation such that as far as the -Packet Layer is concerned, the link layer was being performed by a lower -layer of the Linux kernel and therefore it did not concern itself with -implementation of LAPB. Therefore the LAPB modules would be called by -unintelligent X.25 card drivers and not by intelligent ones, this would -provide a uniform device driver interface, and simplify configuration. - -To confuse matters a little, an 802.2 LLC implementation for Linux is being -written which will allow X.25 to be run over an Ethernet (or Token Ring) and -conform with the JNT "Pink Book", this will have a different interface to -the Packet Layer but there will be no confusion since the class of device -being served by the LLC will be completely separate from LAPB. The LLC -implementation is being done as part of another protocol project (SNA) and -by a different author. - -Just when you thought that it could not become more confusing, another -option appeared, XOT. This allows X.25 Packet Layer frames to operate over -the Internet using TCP/IP as a reliable link layer. RFC1613 specifies the -format and behaviour of the protocol. If time permits this option will also -be actively considered. - -A linux-x25 mailing list has been created at vger.kernel.org to support the -development and use of Linux X.25. It is early days yet, but interested -parties are welcome to subscribe to it. Just send a message to -majordomo@vger.kernel.org with the following in the message body: - -subscribe linux-x25 -end - -The contents of the Subject line are ignored. - -Jonathan - -g4klx@g4klx.demon.co.uk diff --git a/net/x25/Kconfig b/net/x25/Kconfig index a328f79885d1..9f0d58b0b90b 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -20,7 +20,7 @@ config X25 You can read more about X.25 at and . Information about X.25 for Linux is contained in the files - and + and . One connects to an X.25 network either with a dedicated network card -- cgit v1.2.3-59-g8ed1b From c4a0eb9350183d1d188793c534e4141bcf2ccea8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:29 +0200 Subject: docs: networking: convert xfrm_device.txt to ReST - add SPDX header; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/xfrm_device.rst | 151 +++++++++++++++++++++++++++++++ Documentation/networking/xfrm_device.txt | 140 ---------------------------- 3 files changed, 152 insertions(+), 140 deletions(-) create mode 100644 Documentation/networking/xfrm_device.rst delete mode 100644 Documentation/networking/xfrm_device.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 75521e6c473b..e31f6cb564b4 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -117,6 +117,7 @@ Contents: vxlan x25-iface x25 + xfrm_device .. only:: subproject and html diff --git a/Documentation/networking/xfrm_device.rst b/Documentation/networking/xfrm_device.rst new file mode 100644 index 000000000000..da1073acda96 --- /dev/null +++ b/Documentation/networking/xfrm_device.rst @@ -0,0 +1,151 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +XFRM device - offloading the IPsec computations +=============================================== + +Shannon Nelson + + +Overview +======== + +IPsec is a useful feature for securing network traffic, but the +computational cost is high: a 10Gbps link can easily be brought down +to under 1Gbps, depending on the traffic and link configuration. +Luckily, there are NICs that offer a hardware based IPsec offload which +can radically increase throughput and decrease CPU utilization. The XFRM +Device interface allows NIC drivers to offer to the stack access to the +hardware offload. + +Userland access to the offload is typically through a system such as +libreswan or KAME/raccoon, but the iproute2 'ip xfrm' command set can +be handy when experimenting. An example command might look something +like this:: + + ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \ + reqid 0x07 replay-window 32 \ + aead 'rfc4106(gcm(aes))' 0x44434241343332312423222114131211f4f3f2f1 128 \ + sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp \ + offload dev eth4 dir in + +Yes, that's ugly, but that's what shell scripts and/or libreswan are for. + + + +Callbacks to implement +====================== + +:: + + /* from include/linux/netdevice.h */ + struct xfrmdev_ops { + int (*xdo_dev_state_add) (struct xfrm_state *x); + void (*xdo_dev_state_delete) (struct xfrm_state *x); + void (*xdo_dev_state_free) (struct xfrm_state *x); + bool (*xdo_dev_offload_ok) (struct sk_buff *skb, + struct xfrm_state *x); + void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + }; + +The NIC driver offering ipsec offload will need to implement these +callbacks to make the offload available to the network stack's +XFRM subsytem. Additionally, the feature bits NETIF_F_HW_ESP and +NETIF_F_HW_ESP_TX_CSUM will signal the availability of the offload. + + + +Flow +==== + +At probe time and before the call to register_netdev(), the driver should +set up local data structures and XFRM callbacks, and set the feature bits. +The XFRM code's listener will finish the setup on NETDEV_REGISTER. + +:: + + adapter->netdev->xfrmdev_ops = &ixgbe_xfrmdev_ops; + adapter->netdev->features |= NETIF_F_HW_ESP; + adapter->netdev->hw_enc_features |= NETIF_F_HW_ESP; + +When new SAs are set up with a request for "offload" feature, the +driver's xdo_dev_state_add() will be given the new SA to be offloaded +and an indication of whether it is for Rx or Tx. The driver should + + - verify the algorithm is supported for offloads + - store the SA information (key, salt, target-ip, protocol, etc) + - enable the HW offload of the SA + - return status value: + + =========== =================================== + 0 success + -EOPNETSUPP offload not supported, try SW IPsec + other fail the request + =========== =================================== + +The driver can also set an offload_handle in the SA, an opaque void pointer +that can be used to convey context into the fast-path offload requests:: + + xs->xso.offload_handle = context; + + +When the network stack is preparing an IPsec packet for an SA that has +been setup for offload, it first calls into xdo_dev_offload_ok() with +the skb and the intended offload state to ask the driver if the offload +will serviceable. This can check the packet information to be sure the +offload can be supported (e.g. IPv4 or IPv6, no IPv4 options, etc) and +return true of false to signify its support. + +When ready to send, the driver needs to inspect the Tx packet for the +offload information, including the opaque context, and set up the packet +send accordingly:: + + xs = xfrm_input_state(skb); + context = xs->xso.offload_handle; + set up HW for send + +The stack has already inserted the appropriate IPsec headers in the +packet data, the offload just needs to do the encryption and fix up the +header values. + + +When a packet is received and the HW has indicated that it offloaded a +decryption, the driver needs to add a reference to the decoded SA into +the packet's skb. At this point the data should be decrypted but the +IPsec headers are still in the packet data; they are removed later up +the stack in xfrm_input(). + + find and hold the SA that was used to the Rx skb:: + + get spi, protocol, and destination IP from packet headers + xs = find xs from (spi, protocol, dest_IP) + xfrm_state_hold(xs); + + store the state information into the skb:: + + sp = secpath_set(skb); + if (!sp) return; + sp->xvec[sp->len++] = xs; + sp->olen++; + + indicate the success and/or error status of the offload:: + + xo = xfrm_offload(skb); + xo->flags = CRYPTO_DONE; + xo->status = crypto_status; + + hand the packet to napi_gro_receive() as usual + +In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn(). +Driver will check packet seq number and update HW ESN state machine if needed. + +When the SA is removed by the user, the driver's xdo_dev_state_delete() +is asked to disable the offload. Later, xdo_dev_state_free() is called +from a garbage collection routine after all reference counts to the state +have been removed and any remaining resources can be cleared for the +offload state. How these are used by the driver will depend on specific +hardware needs. + +As a netdev is set to DOWN the XFRM stack's netdev listener will call +xdo_dev_state_delete() and xdo_dev_state_free() on any remaining offloaded +states. diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt deleted file mode 100644 index a1c904dc70dc..000000000000 --- a/Documentation/networking/xfrm_device.txt +++ /dev/null @@ -1,140 +0,0 @@ - -=============================================== -XFRM device - offloading the IPsec computations -=============================================== -Shannon Nelson - - -Overview -======== - -IPsec is a useful feature for securing network traffic, but the -computational cost is high: a 10Gbps link can easily be brought down -to under 1Gbps, depending on the traffic and link configuration. -Luckily, there are NICs that offer a hardware based IPsec offload which -can radically increase throughput and decrease CPU utilization. The XFRM -Device interface allows NIC drivers to offer to the stack access to the -hardware offload. - -Userland access to the offload is typically through a system such as -libreswan or KAME/raccoon, but the iproute2 'ip xfrm' command set can -be handy when experimenting. An example command might look something -like this: - - ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \ - reqid 0x07 replay-window 32 \ - aead 'rfc4106(gcm(aes))' 0x44434241343332312423222114131211f4f3f2f1 128 \ - sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp \ - offload dev eth4 dir in - -Yes, that's ugly, but that's what shell scripts and/or libreswan are for. - - - -Callbacks to implement -====================== - -/* from include/linux/netdevice.h */ -struct xfrmdev_ops { - int (*xdo_dev_state_add) (struct xfrm_state *x); - void (*xdo_dev_state_delete) (struct xfrm_state *x); - void (*xdo_dev_state_free) (struct xfrm_state *x); - bool (*xdo_dev_offload_ok) (struct sk_buff *skb, - struct xfrm_state *x); - void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); -}; - -The NIC driver offering ipsec offload will need to implement these -callbacks to make the offload available to the network stack's -XFRM subsytem. Additionally, the feature bits NETIF_F_HW_ESP and -NETIF_F_HW_ESP_TX_CSUM will signal the availability of the offload. - - - -Flow -==== - -At probe time and before the call to register_netdev(), the driver should -set up local data structures and XFRM callbacks, and set the feature bits. -The XFRM code's listener will finish the setup on NETDEV_REGISTER. - - adapter->netdev->xfrmdev_ops = &ixgbe_xfrmdev_ops; - adapter->netdev->features |= NETIF_F_HW_ESP; - adapter->netdev->hw_enc_features |= NETIF_F_HW_ESP; - -When new SAs are set up with a request for "offload" feature, the -driver's xdo_dev_state_add() will be given the new SA to be offloaded -and an indication of whether it is for Rx or Tx. The driver should - - verify the algorithm is supported for offloads - - store the SA information (key, salt, target-ip, protocol, etc) - - enable the HW offload of the SA - - return status value: - 0 success - -EOPNETSUPP offload not supported, try SW IPsec - other fail the request - -The driver can also set an offload_handle in the SA, an opaque void pointer -that can be used to convey context into the fast-path offload requests. - - xs->xso.offload_handle = context; - - -When the network stack is preparing an IPsec packet for an SA that has -been setup for offload, it first calls into xdo_dev_offload_ok() with -the skb and the intended offload state to ask the driver if the offload -will serviceable. This can check the packet information to be sure the -offload can be supported (e.g. IPv4 or IPv6, no IPv4 options, etc) and -return true of false to signify its support. - -When ready to send, the driver needs to inspect the Tx packet for the -offload information, including the opaque context, and set up the packet -send accordingly. - - xs = xfrm_input_state(skb); - context = xs->xso.offload_handle; - set up HW for send - -The stack has already inserted the appropriate IPsec headers in the -packet data, the offload just needs to do the encryption and fix up the -header values. - - -When a packet is received and the HW has indicated that it offloaded a -decryption, the driver needs to add a reference to the decoded SA into -the packet's skb. At this point the data should be decrypted but the -IPsec headers are still in the packet data; they are removed later up -the stack in xfrm_input(). - - find and hold the SA that was used to the Rx skb - get spi, protocol, and destination IP from packet headers - xs = find xs from (spi, protocol, dest_IP) - xfrm_state_hold(xs); - - store the state information into the skb - sp = secpath_set(skb); - if (!sp) return; - sp->xvec[sp->len++] = xs; - sp->olen++; - - indicate the success and/or error status of the offload - xo = xfrm_offload(skb); - xo->flags = CRYPTO_DONE; - xo->status = crypto_status; - - hand the packet to napi_gro_receive() as usual - -In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn(). -Driver will check packet seq number and update HW ESN state machine if needed. - -When the SA is removed by the user, the driver's xdo_dev_state_delete() -is asked to disable the offload. Later, xdo_dev_state_free() is called -from a garbage collection routine after all reference counts to the state -have been removed and any remaining resources can be cleared for the -offload state. How these are used by the driver will depend on specific -hardware needs. - -As a netdev is set to DOWN the XFRM stack's netdev listener will call -xdo_dev_state_delete() and xdo_dev_state_free() on any remaining offloaded -states. - - -- cgit v1.2.3-59-g8ed1b From da62baada5cc94037ef91ed0c414a930a3a06520 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:30 +0200 Subject: docs: networking: convert xfrm_proc.txt to ReST - add SPDX header; - adjust title markup; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/xfrm_proc.rst | 113 +++++++++++++++++++++++++++++++++ Documentation/networking/xfrm_proc.txt | 82 ------------------------ 3 files changed, 114 insertions(+), 82 deletions(-) create mode 100644 Documentation/networking/xfrm_proc.rst delete mode 100644 Documentation/networking/xfrm_proc.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e31f6cb564b4..3fe70efb632e 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -118,6 +118,7 @@ Contents: x25-iface x25 xfrm_device + xfrm_proc .. only:: subproject and html diff --git a/Documentation/networking/xfrm_proc.rst b/Documentation/networking/xfrm_proc.rst new file mode 100644 index 000000000000..0a771c5a7399 --- /dev/null +++ b/Documentation/networking/xfrm_proc.rst @@ -0,0 +1,113 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +XFRM proc - /proc/net/xfrm_* files +================================== + +Masahide NAKAMURA + + +Transformation Statistics +------------------------- + +The xfrm_proc code is a set of statistics showing numbers of packets +dropped by the transformation code and why. These counters are defined +as part of the linux private MIB. These counters can be viewed in +/proc/net/xfrm_stat. + + +Inbound errors +~~~~~~~~~~~~~~ + +XfrmInError: + All errors which is not matched others + +XfrmInBufferError: + No buffer is left + +XfrmInHdrError: + Header error + +XfrmInNoStates: + No state is found + i.e. Either inbound SPI, address, or IPsec protocol at SA is wrong + +XfrmInStateProtoError: + Transformation protocol specific error + e.g. SA key is wrong + +XfrmInStateModeError: + Transformation mode specific error + +XfrmInStateSeqError: + Sequence error + i.e. Sequence number is out of window + +XfrmInStateExpired: + State is expired + +XfrmInStateMismatch: + State has mismatch option + e.g. UDP encapsulation type is mismatch + +XfrmInStateInvalid: + State is invalid + +XfrmInTmplMismatch: + No matching template for states + e.g. Inbound SAs are correct but SP rule is wrong + +XfrmInNoPols: + No policy is found for states + e.g. Inbound SAs are correct but no SP is found + +XfrmInPolBlock: + Policy discards + +XfrmInPolError: + Policy error + +XfrmAcquireError: + State hasn't been fully acquired before use + +XfrmFwdHdrError: + Forward routing of a packet is not allowed + +Outbound errors +~~~~~~~~~~~~~~~ +XfrmOutError: + All errors which is not matched others + +XfrmOutBundleGenError: + Bundle generation error + +XfrmOutBundleCheckError: + Bundle check error + +XfrmOutNoStates: + No state is found + +XfrmOutStateProtoError: + Transformation protocol specific error + +XfrmOutStateModeError: + Transformation mode specific error + +XfrmOutStateSeqError: + Sequence error + i.e. Sequence number overflow + +XfrmOutStateExpired: + State is expired + +XfrmOutPolBlock: + Policy discards + +XfrmOutPolDead: + Policy is dead + +XfrmOutPolError: + Policy error + +XfrmOutStateInvalid: + State is invalid, perhaps expired diff --git a/Documentation/networking/xfrm_proc.txt b/Documentation/networking/xfrm_proc.txt deleted file mode 100644 index 2eae619ab67b..000000000000 --- a/Documentation/networking/xfrm_proc.txt +++ /dev/null @@ -1,82 +0,0 @@ -XFRM proc - /proc/net/xfrm_* files -================================== -Masahide NAKAMURA - - -Transformation Statistics -------------------------- - -The xfrm_proc code is a set of statistics showing numbers of packets -dropped by the transformation code and why. These counters are defined -as part of the linux private MIB. These counters can be viewed in -/proc/net/xfrm_stat. - - -Inbound errors -~~~~~~~~~~~~~~ -XfrmInError: - All errors which is not matched others -XfrmInBufferError: - No buffer is left -XfrmInHdrError: - Header error -XfrmInNoStates: - No state is found - i.e. Either inbound SPI, address, or IPsec protocol at SA is wrong -XfrmInStateProtoError: - Transformation protocol specific error - e.g. SA key is wrong -XfrmInStateModeError: - Transformation mode specific error -XfrmInStateSeqError: - Sequence error - i.e. Sequence number is out of window -XfrmInStateExpired: - State is expired -XfrmInStateMismatch: - State has mismatch option - e.g. UDP encapsulation type is mismatch -XfrmInStateInvalid: - State is invalid -XfrmInTmplMismatch: - No matching template for states - e.g. Inbound SAs are correct but SP rule is wrong -XfrmInNoPols: - No policy is found for states - e.g. Inbound SAs are correct but no SP is found -XfrmInPolBlock: - Policy discards -XfrmInPolError: - Policy error -XfrmAcquireError: - State hasn't been fully acquired before use -XfrmFwdHdrError: - Forward routing of a packet is not allowed - -Outbound errors -~~~~~~~~~~~~~~~ -XfrmOutError: - All errors which is not matched others -XfrmOutBundleGenError: - Bundle generation error -XfrmOutBundleCheckError: - Bundle check error -XfrmOutNoStates: - No state is found -XfrmOutStateProtoError: - Transformation protocol specific error -XfrmOutStateModeError: - Transformation mode specific error -XfrmOutStateSeqError: - Sequence error - i.e. Sequence number overflow -XfrmOutStateExpired: - State is expired -XfrmOutPolBlock: - Policy discards -XfrmOutPolDead: - Policy is dead -XfrmOutPolError: - Policy error -XfrmOutStateInvalid: - State is invalid, perhaps expired -- cgit v1.2.3-59-g8ed1b From a5cfea33e5e54854fa541deb08b85b782f21bab5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:31 +0200 Subject: docs: networking: convert xfrm_sync.txt to ReST - add SPDX header; - add a document title; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/xfrm_sync.rst | 189 +++++++++++++++++++++++++++++++++ Documentation/networking/xfrm_sync.txt | 169 ----------------------------- 3 files changed, 190 insertions(+), 169 deletions(-) create mode 100644 Documentation/networking/xfrm_sync.rst delete mode 100644 Documentation/networking/xfrm_sync.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 3fe70efb632e..ec83bd95e4e9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -119,6 +119,7 @@ Contents: x25 xfrm_device xfrm_proc + xfrm_sync .. only:: subproject and html diff --git a/Documentation/networking/xfrm_sync.rst b/Documentation/networking/xfrm_sync.rst new file mode 100644 index 000000000000..6246503ceab2 --- /dev/null +++ b/Documentation/networking/xfrm_sync.rst @@ -0,0 +1,189 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +XFRM +==== + +The sync patches work is based on initial patches from +Krisztian and others and additional patches +from Jamal . + +The end goal for syncing is to be able to insert attributes + generate +events so that the SA can be safely moved from one machine to another +for HA purposes. +The idea is to synchronize the SA so that the takeover machine can do +the processing of the SA as accurate as possible if it has access to it. + +We already have the ability to generate SA add/del/upd events. +These patches add ability to sync and have accurate lifetime byte (to +ensure proper decay of SAs) and replay counters to avoid replay attacks +with as minimal loss at failover time. +This way a backup stays as closely up-to-date as an active member. + +Because the above items change for every packet the SA receives, +it is possible for a lot of the events to be generated. +For this reason, we also add a nagle-like algorithm to restrict +the events. i.e we are going to set thresholds to say "let me +know if the replay sequence threshold is reached or 10 secs have passed" +These thresholds are set system-wide via sysctls or can be updated +per SA. + +The identified items that need to be synchronized are: +- the lifetime byte counter +note that: lifetime time limit is not important if you assume the failover +machine is known ahead of time since the decay of the time countdown +is not driven by packet arrival. +- the replay sequence for both inbound and outbound + +1) Message Structure +---------------------- + +nlmsghdr:aevent_id:optional-TLVs. + +The netlink message types are: + +XFRM_MSG_NEWAE and XFRM_MSG_GETAE. + +A XFRM_MSG_GETAE does not have TLVs. + +A XFRM_MSG_NEWAE will have at least two TLVs (as is +discussed further below). + +aevent_id structure looks like:: + + struct xfrm_aevent_id { + struct xfrm_usersa_id sa_id; + xfrm_address_t saddr; + __u32 flags; + __u32 reqid; + }; + +The unique SA is identified by the combination of xfrm_usersa_id, +reqid and saddr. + +flags are used to indicate different things. The possible +flags are:: + + XFRM_AE_RTHR=1, /* replay threshold*/ + XFRM_AE_RVAL=2, /* replay value */ + XFRM_AE_LVAL=4, /* lifetime value */ + XFRM_AE_ETHR=8, /* expiry timer threshold */ + XFRM_AE_CR=16, /* Event cause is replay update */ + XFRM_AE_CE=32, /* Event cause is timer expiry */ + XFRM_AE_CU=64, /* Event cause is policy update */ + +How these flags are used is dependent on the direction of the +message (kernel<->user) as well the cause (config, query or event). +This is described below in the different messages. + +The pid will be set appropriately in netlink to recognize direction +(0 to the kernel and pid = processid that created the event +when going from kernel to user space) + +A program needs to subscribe to multicast group XFRMNLGRP_AEVENTS +to get notified of these events. + +2) TLVS reflect the different parameters: +----------------------------------------- + +a) byte value (XFRMA_LTIME_VAL) + +This TLV carries the running/current counter for byte lifetime since +last event. + +b)replay value (XFRMA_REPLAY_VAL) + +This TLV carries the running/current counter for replay sequence since +last event. + +c)replay threshold (XFRMA_REPLAY_THRESH) + +This TLV carries the threshold being used by the kernel to trigger events +when the replay sequence is exceeded. + +d) expiry timer (XFRMA_ETIMER_THRESH) + +This is a timer value in milliseconds which is used as the nagle +value to rate limit the events. + +3) Default configurations for the parameters: +--------------------------------------------- + +By default these events should be turned off unless there is +at least one listener registered to listen to the multicast +group XFRMNLGRP_AEVENTS. + +Programs installing SAs will need to specify the two thresholds, however, +in order to not change existing applications such as racoon +we also provide default threshold values for these different parameters +in case they are not specified. + +the two sysctls/proc entries are: + +a) /proc/sys/net/core/sysctl_xfrm_aevent_etime +used to provide default values for the XFRMA_ETIMER_THRESH in incremental +units of time of 100ms. The default is 10 (1 second) + +b) /proc/sys/net/core/sysctl_xfrm_aevent_rseqth +used to provide default values for XFRMA_REPLAY_THRESH parameter +in incremental packet count. The default is two packets. + +4) Message types +---------------- + +a) XFRM_MSG_GETAE issued by user-->kernel. + XFRM_MSG_GETAE does not carry any TLVs. + +The response is a XFRM_MSG_NEWAE which is formatted based on what +XFRM_MSG_GETAE queried for. + +The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. +* if XFRM_AE_RTHR flag is set, then XFRMA_REPLAY_THRESH is also retrieved +* if XFRM_AE_ETHR flag is set, then XFRMA_ETIMER_THRESH is also retrieved + +b) XFRM_MSG_NEWAE is issued by either user space to configure + or kernel to announce events or respond to a XFRM_MSG_GETAE. + +i) user --> kernel to configure a specific SA. + +any of the values or threshold parameters can be updated by passing the +appropriate TLV. + +A response is issued back to the sender in user space to indicate success +or failure. + +In the case of success, additionally an event with +XFRM_MSG_NEWAE is also issued to any listeners as described in iii). + +ii) kernel->user direction as a response to XFRM_MSG_GETAE + +The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. + +The threshold TLVs will be included if explicitly requested in +the XFRM_MSG_GETAE message. + +iii) kernel->user to report as event if someone sets any values or + thresholds for an SA using XFRM_MSG_NEWAE (as described in #i above). + In such a case XFRM_AE_CU flag is set to inform the user that + the change happened as a result of an update. + The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. + +iv) kernel->user to report event when replay threshold or a timeout + is exceeded. + +In such a case either XFRM_AE_CR (replay exceeded) or XFRM_AE_CE (timeout +happened) is set to inform the user what happened. +Note the two flags are mutually exclusive. +The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. + +Exceptions to threshold settings +-------------------------------- + +If you have an SA that is getting hit by traffic in bursts such that +there is a period where the timer threshold expires with no packets +seen, then an odd behavior is seen as follows: +The first packet arrival after a timer expiry will trigger a timeout +event; i.e we don't wait for a timeout period or a packet threshold +to be reached. This is done for simplicity and efficiency reasons. + +-JHS diff --git a/Documentation/networking/xfrm_sync.txt b/Documentation/networking/xfrm_sync.txt deleted file mode 100644 index 8d88e0f2ec49..000000000000 --- a/Documentation/networking/xfrm_sync.txt +++ /dev/null @@ -1,169 +0,0 @@ - -The sync patches work is based on initial patches from -Krisztian and others and additional patches -from Jamal . - -The end goal for syncing is to be able to insert attributes + generate -events so that the SA can be safely moved from one machine to another -for HA purposes. -The idea is to synchronize the SA so that the takeover machine can do -the processing of the SA as accurate as possible if it has access to it. - -We already have the ability to generate SA add/del/upd events. -These patches add ability to sync and have accurate lifetime byte (to -ensure proper decay of SAs) and replay counters to avoid replay attacks -with as minimal loss at failover time. -This way a backup stays as closely up-to-date as an active member. - -Because the above items change for every packet the SA receives, -it is possible for a lot of the events to be generated. -For this reason, we also add a nagle-like algorithm to restrict -the events. i.e we are going to set thresholds to say "let me -know if the replay sequence threshold is reached or 10 secs have passed" -These thresholds are set system-wide via sysctls or can be updated -per SA. - -The identified items that need to be synchronized are: -- the lifetime byte counter -note that: lifetime time limit is not important if you assume the failover -machine is known ahead of time since the decay of the time countdown -is not driven by packet arrival. -- the replay sequence for both inbound and outbound - -1) Message Structure ----------------------- - -nlmsghdr:aevent_id:optional-TLVs. - -The netlink message types are: - -XFRM_MSG_NEWAE and XFRM_MSG_GETAE. - -A XFRM_MSG_GETAE does not have TLVs. -A XFRM_MSG_NEWAE will have at least two TLVs (as is -discussed further below). - -aevent_id structure looks like: - - struct xfrm_aevent_id { - struct xfrm_usersa_id sa_id; - xfrm_address_t saddr; - __u32 flags; - __u32 reqid; - }; - -The unique SA is identified by the combination of xfrm_usersa_id, -reqid and saddr. - -flags are used to indicate different things. The possible -flags are: - XFRM_AE_RTHR=1, /* replay threshold*/ - XFRM_AE_RVAL=2, /* replay value */ - XFRM_AE_LVAL=4, /* lifetime value */ - XFRM_AE_ETHR=8, /* expiry timer threshold */ - XFRM_AE_CR=16, /* Event cause is replay update */ - XFRM_AE_CE=32, /* Event cause is timer expiry */ - XFRM_AE_CU=64, /* Event cause is policy update */ - -How these flags are used is dependent on the direction of the -message (kernel<->user) as well the cause (config, query or event). -This is described below in the different messages. - -The pid will be set appropriately in netlink to recognize direction -(0 to the kernel and pid = processid that created the event -when going from kernel to user space) - -A program needs to subscribe to multicast group XFRMNLGRP_AEVENTS -to get notified of these events. - -2) TLVS reflect the different parameters: ------------------------------------------ - -a) byte value (XFRMA_LTIME_VAL) -This TLV carries the running/current counter for byte lifetime since -last event. - -b)replay value (XFRMA_REPLAY_VAL) -This TLV carries the running/current counter for replay sequence since -last event. - -c)replay threshold (XFRMA_REPLAY_THRESH) -This TLV carries the threshold being used by the kernel to trigger events -when the replay sequence is exceeded. - -d) expiry timer (XFRMA_ETIMER_THRESH) -This is a timer value in milliseconds which is used as the nagle -value to rate limit the events. - -3) Default configurations for the parameters: ----------------------------------------------- - -By default these events should be turned off unless there is -at least one listener registered to listen to the multicast -group XFRMNLGRP_AEVENTS. - -Programs installing SAs will need to specify the two thresholds, however, -in order to not change existing applications such as racoon -we also provide default threshold values for these different parameters -in case they are not specified. - -the two sysctls/proc entries are: -a) /proc/sys/net/core/sysctl_xfrm_aevent_etime -used to provide default values for the XFRMA_ETIMER_THRESH in incremental -units of time of 100ms. The default is 10 (1 second) - -b) /proc/sys/net/core/sysctl_xfrm_aevent_rseqth -used to provide default values for XFRMA_REPLAY_THRESH parameter -in incremental packet count. The default is two packets. - -4) Message types ----------------- - -a) XFRM_MSG_GETAE issued by user-->kernel. -XFRM_MSG_GETAE does not carry any TLVs. -The response is a XFRM_MSG_NEWAE which is formatted based on what -XFRM_MSG_GETAE queried for. -The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. -*if XFRM_AE_RTHR flag is set, then XFRMA_REPLAY_THRESH is also retrieved -*if XFRM_AE_ETHR flag is set, then XFRMA_ETIMER_THRESH is also retrieved - -b) XFRM_MSG_NEWAE is issued by either user space to configure -or kernel to announce events or respond to a XFRM_MSG_GETAE. - -i) user --> kernel to configure a specific SA. -any of the values or threshold parameters can be updated by passing the -appropriate TLV. -A response is issued back to the sender in user space to indicate success -or failure. -In the case of success, additionally an event with -XFRM_MSG_NEWAE is also issued to any listeners as described in iii). - -ii) kernel->user direction as a response to XFRM_MSG_GETAE -The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. -The threshold TLVs will be included if explicitly requested in -the XFRM_MSG_GETAE message. - -iii) kernel->user to report as event if someone sets any values or -thresholds for an SA using XFRM_MSG_NEWAE (as described in #i above). -In such a case XFRM_AE_CU flag is set to inform the user that -the change happened as a result of an update. -The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. - -iv) kernel->user to report event when replay threshold or a timeout -is exceeded. -In such a case either XFRM_AE_CR (replay exceeded) or XFRM_AE_CE (timeout -happened) is set to inform the user what happened. -Note the two flags are mutually exclusive. -The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. - -Exceptions to threshold settings --------------------------------- - -If you have an SA that is getting hit by traffic in bursts such that -there is a period where the timer threshold expires with no packets -seen, then an odd behavior is seen as follows: -The first packet arrival after a timer expiry will trigger a timeout -event; i.e we don't wait for a timeout period or a packet threshold -to be reached. This is done for simplicity and efficiency reasons. - --JHS -- cgit v1.2.3-59-g8ed1b From a6c34b476ca27d0e5c14e58aefdbbdc4c509dd5f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:32 +0200 Subject: docs: networking: convert xfrm_sysctl.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - add a chapter's markup; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/xfrm_sysctl.rst | 11 +++++++++++ Documentation/networking/xfrm_sysctl.txt | 4 ---- 3 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 Documentation/networking/xfrm_sysctl.rst delete mode 100644 Documentation/networking/xfrm_sysctl.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index ec83bd95e4e9..1630801cec19 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -120,6 +120,7 @@ Contents: xfrm_device xfrm_proc xfrm_sync + xfrm_sysctl .. only:: subproject and html diff --git a/Documentation/networking/xfrm_sysctl.rst b/Documentation/networking/xfrm_sysctl.rst new file mode 100644 index 000000000000..47b9bbdd0179 --- /dev/null +++ b/Documentation/networking/xfrm_sysctl.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +XFRM Syscall +============ + +/proc/sys/net/core/xfrm_* Variables: +==================================== + +xfrm_acq_expires - INTEGER + default 30 - hard timeout in seconds for acquire requests diff --git a/Documentation/networking/xfrm_sysctl.txt b/Documentation/networking/xfrm_sysctl.txt deleted file mode 100644 index 5bbd16792fe1..000000000000 --- a/Documentation/networking/xfrm_sysctl.txt +++ /dev/null @@ -1,4 +0,0 @@ -/proc/sys/net/core/xfrm_* Variables: - -xfrm_acq_expires - INTEGER - default 30 - hard timeout in seconds for acquire requests -- cgit v1.2.3-59-g8ed1b From 0046db09d539523ef1470bcad2f2614cc3ef7ddf Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:33 +0200 Subject: docs: networking: convert z8530drv.txt to ReST - add SPDX header; - use copyright symbol; - adjust titles and chapters, adding proper markups; - mark tables as such; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/z8530drv.rst | 686 ++++++++++++++++++++++++++++++++++ Documentation/networking/z8530drv.txt | 657 -------------------------------- MAINTAINERS | 2 +- drivers/net/hamradio/Kconfig | 4 +- drivers/net/hamradio/scc.c | 2 +- 6 files changed, 691 insertions(+), 661 deletions(-) create mode 100644 Documentation/networking/z8530drv.rst delete mode 100644 Documentation/networking/z8530drv.txt diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 1630801cec19..f5733ca4fbcb 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -121,6 +121,7 @@ Contents: xfrm_proc xfrm_sync xfrm_sysctl + z8530drv .. only:: subproject and html diff --git a/Documentation/networking/z8530drv.rst b/Documentation/networking/z8530drv.rst new file mode 100644 index 000000000000..d2942760f167 --- /dev/null +++ b/Documentation/networking/z8530drv.rst @@ -0,0 +1,686 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +========================================================= +SCC.C - Linux driver for Z8530 based HDLC cards for AX.25 +========================================================= + + +This is a subset of the documentation. To use this driver you MUST have the +full package from: + +Internet: + + 1. ftp://ftp.ccac.rwth-aachen.de/pub/jr/z8530drv-utils_3.0-3.tar.gz + + 2. ftp://ftp.pspt.fi/pub/ham/linux/ax25/z8530drv-utils_3.0-3.tar.gz + +Please note that the information in this document may be hopelessly outdated. +A new version of the documentation, along with links to other important +Linux Kernel AX.25 documentation and programs, is available on +http://yaina.de/jreuter + +Copyright |copy| 1993,2000 by Joerg Reuter DL1BKE + +portions Copyright |copy| 1993 Guido ten Dolle PE1NNZ + +for the complete copyright notice see >> Copying.Z8530DRV << + +1. Initialization of the driver +=============================== + +To use the driver, 3 steps must be performed: + + 1. if compiled as module: loading the module + 2. Setup of hardware, MODEM and KISS parameters with sccinit + 3. Attach each channel to the Linux kernel AX.25 with "ifconfig" + +Unlike the versions below 2.4 this driver is a real network device +driver. If you want to run xNOS instead of our fine kernel AX.25 +use a 2.x version (available from above sites) or read the +AX.25-HOWTO on how to emulate a KISS TNC on network device drivers. + + +1.1 Loading the module +====================== + +(If you're going to compile the driver as a part of the kernel image, + skip this chapter and continue with 1.2) + +Before you can use a module, you'll have to load it with:: + + insmod scc.o + +please read 'man insmod' that comes with module-init-tools. + +You should include the insmod in one of the /etc/rc.d/rc.* files, +and don't forget to insert a call of sccinit after that. It +will read your /etc/z8530drv.conf. + +1.2. /etc/z8530drv.conf +======================= + +To setup all parameters you must run /sbin/sccinit from one +of your rc.*-files. This has to be done BEFORE you can +"ifconfig" an interface. Sccinit reads the file /etc/z8530drv.conf +and sets the hardware, MODEM and KISS parameters. A sample file is +delivered with this package. Change it to your needs. + +The file itself consists of two main sections. + +1.2.1 configuration of hardware parameters +========================================== + +The hardware setup section defines the following parameters for each +Z8530:: + + chip 1 + data_a 0x300 # data port A + ctrl_a 0x304 # control port A + data_b 0x301 # data port B + ctrl_b 0x305 # control port B + irq 5 # IRQ No. 5 + pclock 4915200 # clock + board BAYCOM # hardware type + escc no # enhanced SCC chip? (8580/85180/85280) + vector 0 # latch for interrupt vector + special no # address of special function register + option 0 # option to set via sfr + + +chip + - this is just a delimiter to make sccinit a bit simpler to + program. A parameter has no effect. + +data_a + - the address of the data port A of this Z8530 (needed) +ctrl_a + - the address of the control port A (needed) +data_b + - the address of the data port B (needed) +ctrl_b + - the address of the control port B (needed) + +irq + - the used IRQ for this chip. Different chips can use different + IRQs or the same. If they share an interrupt, it needs to be + specified within one chip-definition only. + +pclock - the clock at the PCLK pin of the Z8530 (option, 4915200 is + default), measured in Hertz + +board + - the "type" of the board: + + ======================= ======== + SCC type value + ======================= ======== + PA0HZP SCC card PA0HZP + EAGLE card EAGLE + PC100 card PC100 + PRIMUS-PC (DG9BL) card PRIMUS + BayCom (U)SCC card BAYCOM + ======================= ======== + +escc + - if you want support for ESCC chips (8580, 85180, 85280), set + this to "yes" (option, defaults to "no") + +vector + - address of the vector latch (aka "intack port") for PA0HZP + cards. There can be only one vector latch for all chips! + (option, defaults to 0) + +special + - address of the special function register on several cards. + (option, defaults to 0) + +option - The value you write into that register (option, default is 0) + +You can specify up to four chips (8 channels). If this is not enough, +just change:: + + #define MAXSCC 4 + +to a higher value. + +Example for the BAYCOM USCC: +---------------------------- + +:: + + chip 1 + data_a 0x300 # data port A + ctrl_a 0x304 # control port A + data_b 0x301 # data port B + ctrl_b 0x305 # control port B + irq 5 # IRQ No. 5 (#) + board BAYCOM # hardware type (*) + # + # SCC chip 2 + # + chip 2 + data_a 0x302 + ctrl_a 0x306 + data_b 0x303 + ctrl_b 0x307 + board BAYCOM + +An example for a PA0HZP card: +----------------------------- + +:: + + chip 1 + data_a 0x153 + data_b 0x151 + ctrl_a 0x152 + ctrl_b 0x150 + irq 9 + pclock 4915200 + board PA0HZP + vector 0x168 + escc no + # + # + # + chip 2 + data_a 0x157 + data_b 0x155 + ctrl_a 0x156 + ctrl_b 0x154 + irq 9 + pclock 4915200 + board PA0HZP + vector 0x168 + escc no + +A DRSI would should probably work with this: +-------------------------------------------- +(actually: two DRSI cards...) + +:: + + chip 1 + data_a 0x303 + data_b 0x301 + ctrl_a 0x302 + ctrl_b 0x300 + irq 7 + pclock 4915200 + board DRSI + escc no + # + # + # + chip 2 + data_a 0x313 + data_b 0x311 + ctrl_a 0x312 + ctrl_b 0x310 + irq 7 + pclock 4915200 + board DRSI + escc no + +Note that you cannot use the on-board baudrate generator off DRSI +cards. Use "mode dpll" for clock source (see below). + +This is based on information provided by Mike Bilow (and verified +by Paul Helay) + +The utility "gencfg" +-------------------- + +If you only know the parameters for the PE1CHL driver for DOS, +run gencfg. It will generate the correct port addresses (I hope). +Its parameters are exactly the same as the ones you use with +the "attach scc" command in net, except that the string "init" must +not appear. Example:: + + gencfg 2 0x150 4 2 0 1 0x168 9 4915200 + +will print a skeleton z8530drv.conf for the OptoSCC to stdout. + +:: + + gencfg 2 0x300 2 4 5 -4 0 7 4915200 0x10 + +does the same for the BAYCOM USCC card. In my opinion it is much easier +to edit scc_config.h... + + +1.2.2 channel configuration +=========================== + +The channel definition is divided into three sub sections for each +channel: + +An example for scc0:: + + # DEVICE + + device scc0 # the device for the following params + + # MODEM / BUFFERS + + speed 1200 # the default baudrate + clock dpll # clock source: + # dpll = normal half duplex operation + # external = MODEM provides own Rx/Tx clock + # divider = use full duplex divider if + # installed (1) + mode nrzi # HDLC encoding mode + # nrzi = 1k2 MODEM, G3RUH 9k6 MODEM + # nrz = DF9IC 9k6 MODEM + # + bufsize 384 # size of buffers. Note that this must include + # the AX.25 header, not only the data field! + # (optional, defaults to 384) + + # KISS (Layer 1) + + txdelay 36 # (see chapter 1.4) + persist 64 + slot 8 + tail 8 + fulldup 0 + wait 12 + min 3 + maxkey 7 + idle 3 + maxdef 120 + group 0 + txoff off + softdcd on + slip off + +The order WITHIN these sections is unimportant. The order OF these +sections IS important. The MODEM parameters are set with the first +recognized KISS parameter... + +Please note that you can initialize the board only once after boot +(or insmod). You can change all parameters but "mode" and "clock" +later with the Sccparam program or through KISS. Just to avoid +security holes... + +(1) this divider is usually mounted on the SCC-PBC (PA0HZP) or not + present at all (BayCom). It feeds back the output of the DPLL + (digital pll) as transmit clock. Using this mode without a divider + installed will normally result in keying the transceiver until + maxkey expires --- of course without sending anything (useful). + +2. Attachment of a channel by your AX.25 software +================================================= + +2.1 Kernel AX.25 +================ + +To set up an AX.25 device you can simply type:: + + ifconfig scc0 44.128.1.1 hw ax25 dl0tha-7 + +This will create a network interface with the IP number 44.128.20.107 +and the callsign "dl0tha". If you do not have any IP number (yet) you +can use any of the 44.128.0.0 network. Note that you do not need +axattach. The purpose of axattach (like slattach) is to create a KISS +network device linked to a TTY. Please read the documentation of the +ax25-utils and the AX.25-HOWTO to learn how to set the parameters of +the kernel AX.25. + +2.2 NOS, NET and TFKISS +======================= + +Since the TTY driver (aka KISS TNC emulation) is gone you need +to emulate the old behaviour. The cost of using these programs is +that you probably need to compile the kernel AX.25, regardless of whether +you actually use it or not. First setup your /etc/ax25/axports, +for example:: + + 9k6 dl0tha-9 9600 255 4 9600 baud port (scc3) + axlink dl0tha-15 38400 255 4 Link to NOS + +Now "ifconfig" the scc device:: + + ifconfig scc3 44.128.1.1 hw ax25 dl0tha-9 + +You can now axattach a pseudo-TTY:: + + axattach /dev/ptys0 axlink + +and start your NOS and attach /dev/ptys0 there. The problem is that +NOS is reachable only via digipeating through the kernel AX.25 +(disastrous on a DAMA controlled channel). To solve this problem, +configure "rxecho" to echo the incoming frames from "9k6" to "axlink" +and outgoing frames from "axlink" to "9k6" and start:: + + rxecho + +Or simply use "kissbridge" coming with z8530drv-utils:: + + ifconfig scc3 hw ax25 dl0tha-9 + kissbridge scc3 /dev/ptys0 + + +3. Adjustment and Display of parameters +======================================= + +3.1 Displaying SCC Parameters: +============================== + +Once a SCC channel has been attached, the parameter settings and +some statistic information can be shown using the param program:: + + dl1bke-u:~$ sccstat scc0 + + Parameters: + + speed : 1200 baud + txdelay : 36 + persist : 255 + slottime : 0 + txtail : 8 + fulldup : 1 + waittime : 12 + mintime : 3 sec + maxkeyup : 7 sec + idletime : 3 sec + maxdefer : 120 sec + group : 0x00 + txoff : off + softdcd : on + SLIP : off + + Status: + + HDLC Z8530 Interrupts Buffers + ----------------------------------------------------------------------- + Sent : 273 RxOver : 0 RxInts : 125074 Size : 384 + Received : 1095 TxUnder: 0 TxInts : 4684 NoSpace : 0 + RxErrors : 1591 ExInts : 11776 + TxErrors : 0 SpInts : 1503 + Tx State : idle + + +The status info shown is: + +============== ============================================================== +Sent number of frames transmitted +Received number of frames received +RxErrors number of receive errors (CRC, ABORT) +TxErrors number of discarded Tx frames (due to various reasons) +Tx State status of the Tx interrupt handler: idle/busy/active/tail (2) +RxOver number of receiver overruns +TxUnder number of transmitter underruns +RxInts number of receiver interrupts +TxInts number of transmitter interrupts +EpInts number of receiver special condition interrupts +SpInts number of external/status interrupts +Size maximum size of an AX.25 frame (*with* AX.25 headers!) +NoSpace number of times a buffer could not get allocated +============== ============================================================== + +An overrun is abnormal. If lots of these occur, the product of +baudrate and number of interfaces is too high for the processing +power of your computer. NoSpace errors are unlikely to be caused by the +driver or the kernel AX.25. + + +3.2 Setting Parameters +====================== + + +The setting of parameters of the emulated KISS TNC is done in the +same way in the SCC driver. You can change parameters by using +the kissparms program from the ax25-utils package or use the program +"sccparam":: + + sccparam + +You can change the following parameters: + +=========== ===== +param value +=========== ===== +speed 1200 +txdelay 36 +persist 255 +slottime 0 +txtail 8 +fulldup 1 +waittime 12 +mintime 3 +maxkeyup 7 +idletime 3 +maxdefer 120 +group 0x00 +txoff off +softdcd on +SLIP off +=========== ===== + + +The parameters have the following meaning: + +speed: + The baudrate on this channel in bits/sec + + Example: sccparam /dev/scc3 speed 9600 + +txdelay: + The delay (in units of 10 ms) after keying of the + transmitter, until the first byte is sent. This is usually + called "TXDELAY" in a TNC. When 0 is specified, the driver + will just wait until the CTS signal is asserted. This + assumes the presence of a timer or other circuitry in the + MODEM and/or transmitter, that asserts CTS when the + transmitter is ready for data. + A normal value of this parameter is 30-36. + + Example: sccparam /dev/scc0 txd 20 + +persist: + This is the probability that the transmitter will be keyed + when the channel is found to be free. It is a value from 0 + to 255, and the probability is (value+1)/256. The value + should be somewhere near 50-60, and should be lowered when + the channel is used more heavily. + + Example: sccparam /dev/scc2 persist 20 + +slottime: + This is the time between samples of the channel. It is + expressed in units of 10 ms. About 200-300 ms (value 20-30) + seems to be a good value. + + Example: sccparam /dev/scc0 slot 20 + +tail: + The time the transmitter will remain keyed after the last + byte of a packet has been transferred to the SCC. This is + necessary because the CRC and a flag still have to leave the + SCC before the transmitter is keyed down. The value depends + on the baudrate selected. A few character times should be + sufficient, e.g. 40ms at 1200 baud. (value 4) + The value of this parameter is in 10 ms units. + + Example: sccparam /dev/scc2 4 + +full: + The full-duplex mode switch. This can be one of the following + values: + + 0: The interface will operate in CSMA mode (the normal + half-duplex packet radio operation) + 1: Fullduplex mode, i.e. the transmitter will be keyed at + any time, without checking the received carrier. It + will be unkeyed when there are no packets to be sent. + 2: Like 1, but the transmitter will remain keyed, also + when there are no packets to be sent. Flags will be + sent in that case, until a timeout (parameter 10) + occurs. + + Example: sccparam /dev/scc0 fulldup off + +wait: + The initial waittime before any transmit attempt, after the + frame has been queue for transmit. This is the length of + the first slot in CSMA mode. In full duplex modes it is + set to 0 for maximum performance. + The value of this parameter is in 10 ms units. + + Example: sccparam /dev/scc1 wait 4 + +maxkey: + The maximal time the transmitter will be keyed to send + packets, in seconds. This can be useful on busy CSMA + channels, to avoid "getting a bad reputation" when you are + generating a lot of traffic. After the specified time has + elapsed, no new frame will be started. Instead, the trans- + mitter will be switched off for a specified time (parameter + min), and then the selected algorithm for keyup will be + started again. + The value 0 as well as "off" will disable this feature, + and allow infinite transmission time. + + Example: sccparam /dev/scc0 maxk 20 + +min: + This is the time the transmitter will be switched off when + the maximum transmission time is exceeded. + + Example: sccparam /dev/scc3 min 10 + +idle: + This parameter specifies the maximum idle time in full duplex + 2 mode, in seconds. When no frames have been sent for this + time, the transmitter will be keyed down. A value of 0 is + has same result as the fullduplex mode 1. This parameter + can be disabled. + + Example: sccparam /dev/scc2 idle off # transmit forever + +maxdefer + This is the maximum time (in seconds) to wait for a free channel + to send. When this timer expires the transmitter will be keyed + IMMEDIATELY. If you love to get trouble with other users you + should set this to a very low value ;-) + + Example: sccparam /dev/scc0 maxdefer 240 # 2 minutes + + +txoff: + When this parameter has the value 0, the transmission of packets + is enable. Otherwise it is disabled. + + Example: sccparam /dev/scc2 txoff on + +group: + It is possible to build special radio equipment to use more than + one frequency on the same band, e.g. using several receivers and + only one transmitter that can be switched between frequencies. + Also, you can connect several radios that are active on the same + band. In these cases, it is not possible, or not a good idea, to + transmit on more than one frequency. The SCC driver provides a + method to lock transmitters on different interfaces, using the + "param group " command. This will only work when + you are using CSMA mode (parameter full = 0). + + The number must be 0 if you want no group restrictions, and + can be computed as follows to create restricted groups: + is the sum of some OCTAL numbers: + + + === ======================================================= + 200 This transmitter will only be keyed when all other + transmitters in the group are off. + 100 This transmitter will only be keyed when the carrier + detect of all other interfaces in the group is off. + 0xx A byte that can be used to define different groups. + Interfaces are in the same group, when the logical AND + between their xx values is nonzero. + === ======================================================= + + Examples: + + When 2 interfaces use group 201, their transmitters will never be + keyed at the same time. + + When 2 interfaces use group 101, the transmitters will only key + when both channels are clear at the same time. When group 301, + the transmitters will not be keyed at the same time. + + Don't forget to convert the octal numbers into decimal before + you set the parameter. + + Example: (to be written) + +softdcd: + use a software dcd instead of the real one... Useful for a very + slow squelch. + + Example: sccparam /dev/scc0 soft on + + +4. Problems +=========== + +If you have tx-problems with your BayCom USCC card please check +the manufacturer of the 8530. SGS chips have a slightly +different timing. Try Zilog... A solution is to write to register 8 +instead to the data port, but this won't work with the ESCC chips. +*SIGH!* + +A very common problem is that the PTT locks until the maxkeyup timer +expires, although interrupts and clock source are correct. In most +cases compiling the driver with CONFIG_SCC_DELAY (set with +make config) solves the problems. For more hints read the (pseudo) FAQ +and the documentation coming with z8530drv-utils. + +I got reports that the driver has problems on some 386-based systems. +(i.e. Amstrad) Those systems have a bogus AT bus timing which will +lead to delayed answers on interrupts. You can recognize these +problems by looking at the output of Sccstat for the suspected +port. If it shows under- and overruns you own such a system. + +Delayed processing of received data: This depends on + +- the kernel version + +- kernel profiling compiled or not + +- a high interrupt load + +- a high load of the machine --- running X, Xmorph, XV and Povray, + while compiling the kernel... hmm ... even with 32 MB RAM ... ;-) + Or running a named for the whole .ampr.org domain on an 8 MB + box... + +- using information from rxecho or kissbridge. + +Kernel panics: please read /linux/README and find out if it +really occurred within the scc driver. + +If you cannot solve a problem, send me + +- a description of the problem, +- information on your hardware (computer system, scc board, modem) +- your kernel version +- the output of cat /proc/net/z8530 + +4. Thor RLC100 +============== + +Mysteriously this board seems not to work with the driver. Anyone +got it up-and-running? + + +Many thanks to Linus Torvalds and Alan Cox for including the driver +in the Linux standard distribution and their support. + +:: + + Joerg Reuter ampr-net: dl1bke@db0pra.ampr.org + AX-25 : DL1BKE @ DB0ABH.#BAY.DEU.EU + Internet: jreuter@yaina.de + WWW : http://yaina.de/jreuter diff --git a/Documentation/networking/z8530drv.txt b/Documentation/networking/z8530drv.txt deleted file mode 100644 index 2206abbc3e1b..000000000000 --- a/Documentation/networking/z8530drv.txt +++ /dev/null @@ -1,657 +0,0 @@ -This is a subset of the documentation. To use this driver you MUST have the -full package from: - -Internet: -========= - -1. ftp://ftp.ccac.rwth-aachen.de/pub/jr/z8530drv-utils_3.0-3.tar.gz - -2. ftp://ftp.pspt.fi/pub/ham/linux/ax25/z8530drv-utils_3.0-3.tar.gz - -Please note that the information in this document may be hopelessly outdated. -A new version of the documentation, along with links to other important -Linux Kernel AX.25 documentation and programs, is available on -http://yaina.de/jreuter - ------------------------------------------------------------------------------ - - - SCC.C - Linux driver for Z8530 based HDLC cards for AX.25 - - ******************************************************************** - - (c) 1993,2000 by Joerg Reuter DL1BKE - - portions (c) 1993 Guido ten Dolle PE1NNZ - - for the complete copyright notice see >> Copying.Z8530DRV << - - ******************************************************************** - - -1. Initialization of the driver -=============================== - -To use the driver, 3 steps must be performed: - - 1. if compiled as module: loading the module - 2. Setup of hardware, MODEM and KISS parameters with sccinit - 3. Attach each channel to the Linux kernel AX.25 with "ifconfig" - -Unlike the versions below 2.4 this driver is a real network device -driver. If you want to run xNOS instead of our fine kernel AX.25 -use a 2.x version (available from above sites) or read the -AX.25-HOWTO on how to emulate a KISS TNC on network device drivers. - - -1.1 Loading the module -====================== - -(If you're going to compile the driver as a part of the kernel image, - skip this chapter and continue with 1.2) - -Before you can use a module, you'll have to load it with - - insmod scc.o - -please read 'man insmod' that comes with module-init-tools. - -You should include the insmod in one of the /etc/rc.d/rc.* files, -and don't forget to insert a call of sccinit after that. It -will read your /etc/z8530drv.conf. - -1.2. /etc/z8530drv.conf -======================= - -To setup all parameters you must run /sbin/sccinit from one -of your rc.*-files. This has to be done BEFORE you can -"ifconfig" an interface. Sccinit reads the file /etc/z8530drv.conf -and sets the hardware, MODEM and KISS parameters. A sample file is -delivered with this package. Change it to your needs. - -The file itself consists of two main sections. - -1.2.1 configuration of hardware parameters -========================================== - -The hardware setup section defines the following parameters for each -Z8530: - -chip 1 -data_a 0x300 # data port A -ctrl_a 0x304 # control port A -data_b 0x301 # data port B -ctrl_b 0x305 # control port B -irq 5 # IRQ No. 5 -pclock 4915200 # clock -board BAYCOM # hardware type -escc no # enhanced SCC chip? (8580/85180/85280) -vector 0 # latch for interrupt vector -special no # address of special function register -option 0 # option to set via sfr - - -chip - this is just a delimiter to make sccinit a bit simpler to - program. A parameter has no effect. - -data_a - the address of the data port A of this Z8530 (needed) -ctrl_a - the address of the control port A (needed) -data_b - the address of the data port B (needed) -ctrl_b - the address of the control port B (needed) - -irq - the used IRQ for this chip. Different chips can use different - IRQs or the same. If they share an interrupt, it needs to be - specified within one chip-definition only. - -pclock - the clock at the PCLK pin of the Z8530 (option, 4915200 is - default), measured in Hertz - -board - the "type" of the board: - - SCC type value - --------------------------------- - PA0HZP SCC card PA0HZP - EAGLE card EAGLE - PC100 card PC100 - PRIMUS-PC (DG9BL) card PRIMUS - BayCom (U)SCC card BAYCOM - -escc - if you want support for ESCC chips (8580, 85180, 85280), set - this to "yes" (option, defaults to "no") - -vector - address of the vector latch (aka "intack port") for PA0HZP - cards. There can be only one vector latch for all chips! - (option, defaults to 0) - -special - address of the special function register on several cards. - (option, defaults to 0) - -option - The value you write into that register (option, default is 0) - -You can specify up to four chips (8 channels). If this is not enough, -just change - - #define MAXSCC 4 - -to a higher value. - -Example for the BAYCOM USCC: ----------------------------- - -chip 1 -data_a 0x300 # data port A -ctrl_a 0x304 # control port A -data_b 0x301 # data port B -ctrl_b 0x305 # control port B -irq 5 # IRQ No. 5 (#) -board BAYCOM # hardware type (*) -# -# SCC chip 2 -# -chip 2 -data_a 0x302 -ctrl_a 0x306 -data_b 0x303 -ctrl_b 0x307 -board BAYCOM - -An example for a PA0HZP card: ------------------------------ - -chip 1 -data_a 0x153 -data_b 0x151 -ctrl_a 0x152 -ctrl_b 0x150 -irq 9 -pclock 4915200 -board PA0HZP -vector 0x168 -escc no -# -# -# -chip 2 -data_a 0x157 -data_b 0x155 -ctrl_a 0x156 -ctrl_b 0x154 -irq 9 -pclock 4915200 -board PA0HZP -vector 0x168 -escc no - -A DRSI would should probably work with this: --------------------------------------------- -(actually: two DRSI cards...) - -chip 1 -data_a 0x303 -data_b 0x301 -ctrl_a 0x302 -ctrl_b 0x300 -irq 7 -pclock 4915200 -board DRSI -escc no -# -# -# -chip 2 -data_a 0x313 -data_b 0x311 -ctrl_a 0x312 -ctrl_b 0x310 -irq 7 -pclock 4915200 -board DRSI -escc no - -Note that you cannot use the on-board baudrate generator off DRSI -cards. Use "mode dpll" for clock source (see below). - -This is based on information provided by Mike Bilow (and verified -by Paul Helay) - -The utility "gencfg" --------------------- - -If you only know the parameters for the PE1CHL driver for DOS, -run gencfg. It will generate the correct port addresses (I hope). -Its parameters are exactly the same as the ones you use with -the "attach scc" command in net, except that the string "init" must -not appear. Example: - -gencfg 2 0x150 4 2 0 1 0x168 9 4915200 - -will print a skeleton z8530drv.conf for the OptoSCC to stdout. - -gencfg 2 0x300 2 4 5 -4 0 7 4915200 0x10 - -does the same for the BAYCOM USCC card. In my opinion it is much easier -to edit scc_config.h... - - -1.2.2 channel configuration -=========================== - -The channel definition is divided into three sub sections for each -channel: - -An example for scc0: - -# DEVICE - -device scc0 # the device for the following params - -# MODEM / BUFFERS - -speed 1200 # the default baudrate -clock dpll # clock source: - # dpll = normal half duplex operation - # external = MODEM provides own Rx/Tx clock - # divider = use full duplex divider if - # installed (1) -mode nrzi # HDLC encoding mode - # nrzi = 1k2 MODEM, G3RUH 9k6 MODEM - # nrz = DF9IC 9k6 MODEM - # -bufsize 384 # size of buffers. Note that this must include - # the AX.25 header, not only the data field! - # (optional, defaults to 384) - -# KISS (Layer 1) - -txdelay 36 # (see chapter 1.4) -persist 64 -slot 8 -tail 8 -fulldup 0 -wait 12 -min 3 -maxkey 7 -idle 3 -maxdef 120 -group 0 -txoff off -softdcd on -slip off - -The order WITHIN these sections is unimportant. The order OF these -sections IS important. The MODEM parameters are set with the first -recognized KISS parameter... - -Please note that you can initialize the board only once after boot -(or insmod). You can change all parameters but "mode" and "clock" -later with the Sccparam program or through KISS. Just to avoid -security holes... - -(1) this divider is usually mounted on the SCC-PBC (PA0HZP) or not - present at all (BayCom). It feeds back the output of the DPLL - (digital pll) as transmit clock. Using this mode without a divider - installed will normally result in keying the transceiver until - maxkey expires --- of course without sending anything (useful). - -2. Attachment of a channel by your AX.25 software -================================================= - -2.1 Kernel AX.25 -================ - -To set up an AX.25 device you can simply type: - - ifconfig scc0 44.128.1.1 hw ax25 dl0tha-7 - -This will create a network interface with the IP number 44.128.20.107 -and the callsign "dl0tha". If you do not have any IP number (yet) you -can use any of the 44.128.0.0 network. Note that you do not need -axattach. The purpose of axattach (like slattach) is to create a KISS -network device linked to a TTY. Please read the documentation of the -ax25-utils and the AX.25-HOWTO to learn how to set the parameters of -the kernel AX.25. - -2.2 NOS, NET and TFKISS -======================= - -Since the TTY driver (aka KISS TNC emulation) is gone you need -to emulate the old behaviour. The cost of using these programs is -that you probably need to compile the kernel AX.25, regardless of whether -you actually use it or not. First setup your /etc/ax25/axports, -for example: - - 9k6 dl0tha-9 9600 255 4 9600 baud port (scc3) - axlink dl0tha-15 38400 255 4 Link to NOS - -Now "ifconfig" the scc device: - - ifconfig scc3 44.128.1.1 hw ax25 dl0tha-9 - -You can now axattach a pseudo-TTY: - - axattach /dev/ptys0 axlink - -and start your NOS and attach /dev/ptys0 there. The problem is that -NOS is reachable only via digipeating through the kernel AX.25 -(disastrous on a DAMA controlled channel). To solve this problem, -configure "rxecho" to echo the incoming frames from "9k6" to "axlink" -and outgoing frames from "axlink" to "9k6" and start: - - rxecho - -Or simply use "kissbridge" coming with z8530drv-utils: - - ifconfig scc3 hw ax25 dl0tha-9 - kissbridge scc3 /dev/ptys0 - - -3. Adjustment and Display of parameters -======================================= - -3.1 Displaying SCC Parameters: -============================== - -Once a SCC channel has been attached, the parameter settings and -some statistic information can be shown using the param program: - -dl1bke-u:~$ sccstat scc0 - -Parameters: - -speed : 1200 baud -txdelay : 36 -persist : 255 -slottime : 0 -txtail : 8 -fulldup : 1 -waittime : 12 -mintime : 3 sec -maxkeyup : 7 sec -idletime : 3 sec -maxdefer : 120 sec -group : 0x00 -txoff : off -softdcd : on -SLIP : off - -Status: - -HDLC Z8530 Interrupts Buffers ------------------------------------------------------------------------ -Sent : 273 RxOver : 0 RxInts : 125074 Size : 384 -Received : 1095 TxUnder: 0 TxInts : 4684 NoSpace : 0 -RxErrors : 1591 ExInts : 11776 -TxErrors : 0 SpInts : 1503 -Tx State : idle - - -The status info shown is: - -Sent - number of frames transmitted -Received - number of frames received -RxErrors - number of receive errors (CRC, ABORT) -TxErrors - number of discarded Tx frames (due to various reasons) -Tx State - status of the Tx interrupt handler: idle/busy/active/tail (2) -RxOver - number of receiver overruns -TxUnder - number of transmitter underruns -RxInts - number of receiver interrupts -TxInts - number of transmitter interrupts -EpInts - number of receiver special condition interrupts -SpInts - number of external/status interrupts -Size - maximum size of an AX.25 frame (*with* AX.25 headers!) -NoSpace - number of times a buffer could not get allocated - -An overrun is abnormal. If lots of these occur, the product of -baudrate and number of interfaces is too high for the processing -power of your computer. NoSpace errors are unlikely to be caused by the -driver or the kernel AX.25. - - -3.2 Setting Parameters -====================== - - -The setting of parameters of the emulated KISS TNC is done in the -same way in the SCC driver. You can change parameters by using -the kissparms program from the ax25-utils package or use the program -"sccparam": - - sccparam - -You can change the following parameters: - -param : value ------------------------- -speed : 1200 -txdelay : 36 -persist : 255 -slottime : 0 -txtail : 8 -fulldup : 1 -waittime : 12 -mintime : 3 -maxkeyup : 7 -idletime : 3 -maxdefer : 120 -group : 0x00 -txoff : off -softdcd : on -SLIP : off - - -The parameters have the following meaning: - -speed: - The baudrate on this channel in bits/sec - - Example: sccparam /dev/scc3 speed 9600 - -txdelay: - The delay (in units of 10 ms) after keying of the - transmitter, until the first byte is sent. This is usually - called "TXDELAY" in a TNC. When 0 is specified, the driver - will just wait until the CTS signal is asserted. This - assumes the presence of a timer or other circuitry in the - MODEM and/or transmitter, that asserts CTS when the - transmitter is ready for data. - A normal value of this parameter is 30-36. - - Example: sccparam /dev/scc0 txd 20 - -persist: - This is the probability that the transmitter will be keyed - when the channel is found to be free. It is a value from 0 - to 255, and the probability is (value+1)/256. The value - should be somewhere near 50-60, and should be lowered when - the channel is used more heavily. - - Example: sccparam /dev/scc2 persist 20 - -slottime: - This is the time between samples of the channel. It is - expressed in units of 10 ms. About 200-300 ms (value 20-30) - seems to be a good value. - - Example: sccparam /dev/scc0 slot 20 - -tail: - The time the transmitter will remain keyed after the last - byte of a packet has been transferred to the SCC. This is - necessary because the CRC and a flag still have to leave the - SCC before the transmitter is keyed down. The value depends - on the baudrate selected. A few character times should be - sufficient, e.g. 40ms at 1200 baud. (value 4) - The value of this parameter is in 10 ms units. - - Example: sccparam /dev/scc2 4 - -full: - The full-duplex mode switch. This can be one of the following - values: - - 0: The interface will operate in CSMA mode (the normal - half-duplex packet radio operation) - 1: Fullduplex mode, i.e. the transmitter will be keyed at - any time, without checking the received carrier. It - will be unkeyed when there are no packets to be sent. - 2: Like 1, but the transmitter will remain keyed, also - when there are no packets to be sent. Flags will be - sent in that case, until a timeout (parameter 10) - occurs. - - Example: sccparam /dev/scc0 fulldup off - -wait: - The initial waittime before any transmit attempt, after the - frame has been queue for transmit. This is the length of - the first slot in CSMA mode. In full duplex modes it is - set to 0 for maximum performance. - The value of this parameter is in 10 ms units. - - Example: sccparam /dev/scc1 wait 4 - -maxkey: - The maximal time the transmitter will be keyed to send - packets, in seconds. This can be useful on busy CSMA - channels, to avoid "getting a bad reputation" when you are - generating a lot of traffic. After the specified time has - elapsed, no new frame will be started. Instead, the trans- - mitter will be switched off for a specified time (parameter - min), and then the selected algorithm for keyup will be - started again. - The value 0 as well as "off" will disable this feature, - and allow infinite transmission time. - - Example: sccparam /dev/scc0 maxk 20 - -min: - This is the time the transmitter will be switched off when - the maximum transmission time is exceeded. - - Example: sccparam /dev/scc3 min 10 - -idle - This parameter specifies the maximum idle time in full duplex - 2 mode, in seconds. When no frames have been sent for this - time, the transmitter will be keyed down. A value of 0 is - has same result as the fullduplex mode 1. This parameter - can be disabled. - - Example: sccparam /dev/scc2 idle off # transmit forever - -maxdefer - This is the maximum time (in seconds) to wait for a free channel - to send. When this timer expires the transmitter will be keyed - IMMEDIATELY. If you love to get trouble with other users you - should set this to a very low value ;-) - - Example: sccparam /dev/scc0 maxdefer 240 # 2 minutes - - -txoff: - When this parameter has the value 0, the transmission of packets - is enable. Otherwise it is disabled. - - Example: sccparam /dev/scc2 txoff on - -group: - It is possible to build special radio equipment to use more than - one frequency on the same band, e.g. using several receivers and - only one transmitter that can be switched between frequencies. - Also, you can connect several radios that are active on the same - band. In these cases, it is not possible, or not a good idea, to - transmit on more than one frequency. The SCC driver provides a - method to lock transmitters on different interfaces, using the - "param group " command. This will only work when - you are using CSMA mode (parameter full = 0). - The number must be 0 if you want no group restrictions, and - can be computed as follows to create restricted groups: - is the sum of some OCTAL numbers: - - 200 This transmitter will only be keyed when all other - transmitters in the group are off. - 100 This transmitter will only be keyed when the carrier - detect of all other interfaces in the group is off. - 0xx A byte that can be used to define different groups. - Interfaces are in the same group, when the logical AND - between their xx values is nonzero. - - Examples: - When 2 interfaces use group 201, their transmitters will never be - keyed at the same time. - When 2 interfaces use group 101, the transmitters will only key - when both channels are clear at the same time. When group 301, - the transmitters will not be keyed at the same time. - - Don't forget to convert the octal numbers into decimal before - you set the parameter. - - Example: (to be written) - -softdcd: - use a software dcd instead of the real one... Useful for a very - slow squelch. - - Example: sccparam /dev/scc0 soft on - - -4. Problems -=========== - -If you have tx-problems with your BayCom USCC card please check -the manufacturer of the 8530. SGS chips have a slightly -different timing. Try Zilog... A solution is to write to register 8 -instead to the data port, but this won't work with the ESCC chips. -*SIGH!* - -A very common problem is that the PTT locks until the maxkeyup timer -expires, although interrupts and clock source are correct. In most -cases compiling the driver with CONFIG_SCC_DELAY (set with -make config) solves the problems. For more hints read the (pseudo) FAQ -and the documentation coming with z8530drv-utils. - -I got reports that the driver has problems on some 386-based systems. -(i.e. Amstrad) Those systems have a bogus AT bus timing which will -lead to delayed answers on interrupts. You can recognize these -problems by looking at the output of Sccstat for the suspected -port. If it shows under- and overruns you own such a system. - -Delayed processing of received data: This depends on - -- the kernel version - -- kernel profiling compiled or not - -- a high interrupt load - -- a high load of the machine --- running X, Xmorph, XV and Povray, - while compiling the kernel... hmm ... even with 32 MB RAM ... ;-) - Or running a named for the whole .ampr.org domain on an 8 MB - box... - -- using information from rxecho or kissbridge. - -Kernel panics: please read /linux/README and find out if it -really occurred within the scc driver. - -If you cannot solve a problem, send me - -- a description of the problem, -- information on your hardware (computer system, scc board, modem) -- your kernel version -- the output of cat /proc/net/z8530 - -4. Thor RLC100 -============== - -Mysteriously this board seems not to work with the driver. Anyone -got it up-and-running? - - -Many thanks to Linus Torvalds and Alan Cox for including the driver -in the Linux standard distribution and their support. - -Joerg Reuter ampr-net: dl1bke@db0pra.ampr.org - AX-25 : DL1BKE @ DB0ABH.#BAY.DEU.EU - Internet: jreuter@yaina.de - WWW : http://yaina.de/jreuter diff --git a/MAINTAINERS b/MAINTAINERS index d59455c27c42..bee65ebdc67e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18644,7 +18644,7 @@ L: linux-hams@vger.kernel.org S: Maintained W: http://yaina.de/jreuter/ W: http://www.qsl.net/dl1bke/ -F: Documentation/networking/z8530drv.txt +F: Documentation/networking/z8530drv.rst F: drivers/net/hamradio/*scc.c F: drivers/net/hamradio/z8530.h diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig index fe409819b56d..f4500f04147d 100644 --- a/drivers/net/hamradio/Kconfig +++ b/drivers/net/hamradio/Kconfig @@ -84,7 +84,7 @@ config SCC ---help--- These cards are used to connect your Linux box to an amateur radio in order to communicate with other computers. If you want to use - this, read and the + this, read and the AX25-HOWTO, available from . Also make sure to say Y to "Amateur Radio AX.25 Level 2" support. @@ -98,7 +98,7 @@ config SCC_DELAY help Say Y here if you experience problems with the SCC driver not working properly; please read - for details. + for details. If unsure, say N. diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c index 6c03932d8a6b..33fdd55c6122 100644 --- a/drivers/net/hamradio/scc.c +++ b/drivers/net/hamradio/scc.c @@ -7,7 +7,7 @@ * ------------------ * * You can find a subset of the documentation in - * Documentation/networking/z8530drv.txt. + * Documentation/networking/z8530drv.rst. */ /* -- cgit v1.2.3-59-g8ed1b From c79773e83e66fb2c22627eda3cd768f9e2bc10b5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:34 +0200 Subject: docs: networking: device drivers: convert 3com/3c509.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - add notes markups; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/3com/3c509.rst | 249 +++++++++++++++++++++ .../networking/device_drivers/3com/3c509.txt | 213 ------------------ Documentation/networking/device_drivers/index.rst | 1 + 3 files changed, 250 insertions(+), 213 deletions(-) create mode 100644 Documentation/networking/device_drivers/3com/3c509.rst delete mode 100644 Documentation/networking/device_drivers/3com/3c509.txt diff --git a/Documentation/networking/device_drivers/3com/3c509.rst b/Documentation/networking/device_drivers/3com/3c509.rst new file mode 100644 index 000000000000..47f706bacdd9 --- /dev/null +++ b/Documentation/networking/device_drivers/3com/3c509.rst @@ -0,0 +1,249 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================================================= +Linux and the 3Com EtherLink III Series Ethercards (driver v1.18c and higher) +============================================================================= + +This file contains the instructions and caveats for v1.18c and higher versions +of the 3c509 driver. You should not use the driver without reading this file. + +release 1.0 + +28 February 2002 + +Current maintainer (corrections to): + David Ruggiero + +Introduction +============ + +The following are notes and information on using the 3Com EtherLink III series +ethercards in Linux. These cards are commonly known by the most widely-used +card's 3Com model number, 3c509. They are all 10mb/s ISA-bus cards and shouldn't +be (but sometimes are) confused with the similarly-numbered PCI-bus "3c905" +(aka "Vortex" or "Boomerang") series. Kernel support for the 3c509 family is +provided by the module 3c509.c, which has code to support all of the following +models: + + - 3c509 (original ISA card) + - 3c509B (later revision of the ISA card; supports full-duplex) + - 3c589 (PCMCIA) + - 3c589B (later revision of the 3c589; supports full-duplex) + - 3c579 (EISA) + +Large portions of this documentation were heavily borrowed from the guide +written the original author of the 3c509 driver, Donald Becker. The master +copy of that document, which contains notes on older versions of the driver, +currently resides on Scyld web server: http://www.scyld.com/. + + +Special Driver Features +======================= + +Overriding card settings + +The driver allows boot- or load-time overriding of the card's detected IOADDR, +IRQ, and transceiver settings, although this capability shouldn't generally be +needed except to enable full-duplex mode (see below). An example of the syntax +for LILO parameters for doing this:: + + ether=10,0x310,3,0x3c509,eth0 + +This configures the first found 3c509 card for IRQ 10, base I/O 0x310, and +transceiver type 3 (10base2). The flag "0x3c509" must be set to avoid conflicts +with other card types when overriding the I/O address. When the driver is +loaded as a module, only the IRQ may be overridden. For example, +setting two cards to IRQ10 and IRQ11 is done by using the irq module +option:: + + options 3c509 irq=10,11 + + +Full-duplex mode +================ + +The v1.18c driver added support for the 3c509B's full-duplex capabilities. +In order to enable and successfully use full-duplex mode, three conditions +must be met: + +(a) You must have a Etherlink III card model whose hardware supports full- +duplex operations. Currently, the only members of the 3c509 family that are +positively known to support full-duplex are the 3c509B (ISA bus) and 3c589B +(PCMCIA) cards. Cards without the "B" model designation do *not* support +full-duplex mode; these include the original 3c509 (no "B"), the original +3c589, the 3c529 (MCA bus), and the 3c579 (EISA bus). + +(b) You must be using your card's 10baseT transceiver (i.e., the RJ-45 +connector), not its AUI (thick-net) or 10base2 (thin-net/coax) interfaces. +AUI and 10base2 network cabling is physically incapable of full-duplex +operation. + +(c) Most importantly, your 3c509B must be connected to a link partner that is +itself full-duplex capable. This is almost certainly one of two things: a full- +duplex-capable Ethernet switch (*not* a hub), or a full-duplex-capable NIC on +another system that's connected directly to the 3c509B via a crossover cable. + +Full-duplex mode can be enabled using 'ethtool'. + +.. warning:: + + Extremely important caution concerning full-duplex mode + + Understand that the 3c509B's hardware's full-duplex support is much more + limited than that provide by more modern network interface cards. Although + at the physical layer of the network it fully supports full-duplex operation, + the card was designed before the current Ethernet auto-negotiation (N-way) + spec was written. This means that the 3c509B family ***cannot and will not + auto-negotiate a full-duplex connection with its link partner under any + circumstances, no matter how it is initialized***. If the full-duplex mode + of the 3c509B is enabled, its link partner will very likely need to be + independently _forced_ into full-duplex mode as well; otherwise various nasty + failures will occur - at the very least, you'll see massive numbers of packet + collisions. This is one of very rare circumstances where disabling auto- + negotiation and forcing the duplex mode of a network interface card or switch + would ever be necessary or desirable. + + +Available Transceiver Types +=========================== + +For versions of the driver v1.18c and above, the available transceiver types are: + +== ========================================================================= +0 transceiver type from EEPROM config (normally 10baseT); force half-duplex +1 AUI (thick-net / DB15 connector) +2 (undefined) +3 10base2 (thin-net == coax / BNC connector) +4 10baseT (RJ-45 connector); force half-duplex mode +8 transceiver type and duplex mode taken from card's EEPROM config settings +12 10baseT (RJ-45 connector); force full-duplex mode +== ========================================================================= + +Prior to driver version 1.18c, only transceiver codes 0-4 were supported. Note +that the new transceiver codes 8 and 12 are the *only* ones that will enable +full-duplex mode, no matter what the card's detected EEPROM settings might be. +This insured that merely upgrading the driver from an earlier version would +never automatically enable full-duplex mode in an existing installation; +it must always be explicitly enabled via one of these code in order to be +activated. + +The transceiver type can be changed using 'ethtool'. + + +Interpretation of error messages and common problems +---------------------------------------------------- + +Error Messages +^^^^^^^^^^^^^^ + +eth0: Infinite loop in interrupt, status 2011. +These are "mostly harmless" message indicating that the driver had too much +work during that interrupt cycle. With a status of 0x2011 you are receiving +packets faster than they can be removed from the card. This should be rare +or impossible in normal operation. Possible causes of this error report are: + + - a "green" mode enabled that slows the processor down when there is no + keyboard activity. + + - some other device or device driver hogging the bus or disabling interrupts. + Check /proc/interrupts for excessive interrupt counts. The timer tick + interrupt should always be incrementing faster than the others. + +No received packets +^^^^^^^^^^^^^^^^^^^ + +If a 3c509, 3c562 or 3c589 can successfully transmit packets, but never +receives packets (as reported by /proc/net/dev or 'ifconfig') you likely +have an interrupt line problem. Check /proc/interrupts to verify that the +card is actually generating interrupts. If the interrupt count is not +increasing you likely have a physical conflict with two devices trying to +use the same ISA IRQ line. The common conflict is with a sound card on IRQ10 +or IRQ5, and the easiest solution is to move the 3c509 to a different +interrupt line. If the device is receiving packets but 'ping' doesn't work, +you have a routing problem. + +Tx Carrier Errors Reported in /proc/net/dev +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +If an EtherLink III appears to transmit packets, but the "Tx carrier errors" +field in /proc/net/dev increments as quickly as the Tx packet count, you +likely have an unterminated network or the incorrect media transceiver selected. + +3c509B card is not detected on machines with an ISA PnP BIOS. +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +While the updated driver works with most PnP BIOS programs, it does not work +with all. This can be fixed by disabling PnP support using the 3Com-supplied +setup program. + +3c509 card is not detected on overclocked machines +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Increase the delay time in id_read_eeprom() from the current value, 500, +to an absurdly high value, such as 5000. + + +Decoding Status and Error Messages +---------------------------------- + + +The bits in the main status register are: + +===== ====================================== +value description +===== ====================================== +0x01 Interrupt latch +0x02 Tx overrun, or Rx underrun +0x04 Tx complete +0x08 Tx FIFO room available +0x10 A complete Rx packet has arrived +0x20 A Rx packet has started to arrive +0x40 The driver has requested an interrupt +0x80 Statistics counter nearly full +===== ====================================== + +The bits in the transmit (Tx) status word are: + +===== ============================================ +value description +===== ============================================ +0x02 Out-of-window collision. +0x04 Status stack overflow (normally impossible). +0x08 16 collisions. +0x10 Tx underrun (not enough PCI bus bandwidth). +0x20 Tx jabber. +0x40 Tx interrupt requested. +0x80 Status is valid (this should always be set). +===== ============================================ + + +When a transmit error occurs the driver produces a status message such as:: + + eth0: Transmit error, Tx status register 82 + +The two values typically seen here are: + +0x82 +^^^^ + +Out of window collision. This typically occurs when some other Ethernet +host is incorrectly set to full duplex on a half duplex network. + +0x88 +^^^^ + +16 collisions. This typically occurs when the network is exceptionally busy +or when another host doesn't correctly back off after a collision. If this +error is mixed with 0x82 errors it is the result of a host incorrectly set +to full duplex (see above). + +Both of these errors are the result of network problems that should be +corrected. They do not represent driver malfunction. + + +Revision history (this file) +============================ + +28Feb02 v1.0 DR New; major portions based on Becker original 3c509 docs + diff --git a/Documentation/networking/device_drivers/3com/3c509.txt b/Documentation/networking/device_drivers/3com/3c509.txt deleted file mode 100644 index fbf722e15ac3..000000000000 --- a/Documentation/networking/device_drivers/3com/3c509.txt +++ /dev/null @@ -1,213 +0,0 @@ -Linux and the 3Com EtherLink III Series Ethercards (driver v1.18c and higher) ----------------------------------------------------------------------------- - -This file contains the instructions and caveats for v1.18c and higher versions -of the 3c509 driver. You should not use the driver without reading this file. - -release 1.0 -28 February 2002 -Current maintainer (corrections to): - David Ruggiero - ----------------------------------------------------------------------------- - -(0) Introduction - -The following are notes and information on using the 3Com EtherLink III series -ethercards in Linux. These cards are commonly known by the most widely-used -card's 3Com model number, 3c509. They are all 10mb/s ISA-bus cards and shouldn't -be (but sometimes are) confused with the similarly-numbered PCI-bus "3c905" -(aka "Vortex" or "Boomerang") series. Kernel support for the 3c509 family is -provided by the module 3c509.c, which has code to support all of the following -models: - - 3c509 (original ISA card) - 3c509B (later revision of the ISA card; supports full-duplex) - 3c589 (PCMCIA) - 3c589B (later revision of the 3c589; supports full-duplex) - 3c579 (EISA) - -Large portions of this documentation were heavily borrowed from the guide -written the original author of the 3c509 driver, Donald Becker. The master -copy of that document, which contains notes on older versions of the driver, -currently resides on Scyld web server: http://www.scyld.com/. - - -(1) Special Driver Features - -Overriding card settings - -The driver allows boot- or load-time overriding of the card's detected IOADDR, -IRQ, and transceiver settings, although this capability shouldn't generally be -needed except to enable full-duplex mode (see below). An example of the syntax -for LILO parameters for doing this: - - ether=10,0x310,3,0x3c509,eth0 - -This configures the first found 3c509 card for IRQ 10, base I/O 0x310, and -transceiver type 3 (10base2). The flag "0x3c509" must be set to avoid conflicts -with other card types when overriding the I/O address. When the driver is -loaded as a module, only the IRQ may be overridden. For example, -setting two cards to IRQ10 and IRQ11 is done by using the irq module -option: - - options 3c509 irq=10,11 - - -(2) Full-duplex mode - -The v1.18c driver added support for the 3c509B's full-duplex capabilities. -In order to enable and successfully use full-duplex mode, three conditions -must be met: - -(a) You must have a Etherlink III card model whose hardware supports full- -duplex operations. Currently, the only members of the 3c509 family that are -positively known to support full-duplex are the 3c509B (ISA bus) and 3c589B -(PCMCIA) cards. Cards without the "B" model designation do *not* support -full-duplex mode; these include the original 3c509 (no "B"), the original -3c589, the 3c529 (MCA bus), and the 3c579 (EISA bus). - -(b) You must be using your card's 10baseT transceiver (i.e., the RJ-45 -connector), not its AUI (thick-net) or 10base2 (thin-net/coax) interfaces. -AUI and 10base2 network cabling is physically incapable of full-duplex -operation. - -(c) Most importantly, your 3c509B must be connected to a link partner that is -itself full-duplex capable. This is almost certainly one of two things: a full- -duplex-capable Ethernet switch (*not* a hub), or a full-duplex-capable NIC on -another system that's connected directly to the 3c509B via a crossover cable. - -Full-duplex mode can be enabled using 'ethtool'. - -/////Extremely important caution concerning full-duplex mode///// -Understand that the 3c509B's hardware's full-duplex support is much more -limited than that provide by more modern network interface cards. Although -at the physical layer of the network it fully supports full-duplex operation, -the card was designed before the current Ethernet auto-negotiation (N-way) -spec was written. This means that the 3c509B family ***cannot and will not -auto-negotiate a full-duplex connection with its link partner under any -circumstances, no matter how it is initialized***. If the full-duplex mode -of the 3c509B is enabled, its link partner will very likely need to be -independently _forced_ into full-duplex mode as well; otherwise various nasty -failures will occur - at the very least, you'll see massive numbers of packet -collisions. This is one of very rare circumstances where disabling auto- -negotiation and forcing the duplex mode of a network interface card or switch -would ever be necessary or desirable. - - -(3) Available Transceiver Types - -For versions of the driver v1.18c and above, the available transceiver types are: - -0 transceiver type from EEPROM config (normally 10baseT); force half-duplex -1 AUI (thick-net / DB15 connector) -2 (undefined) -3 10base2 (thin-net == coax / BNC connector) -4 10baseT (RJ-45 connector); force half-duplex mode -8 transceiver type and duplex mode taken from card's EEPROM config settings -12 10baseT (RJ-45 connector); force full-duplex mode - -Prior to driver version 1.18c, only transceiver codes 0-4 were supported. Note -that the new transceiver codes 8 and 12 are the *only* ones that will enable -full-duplex mode, no matter what the card's detected EEPROM settings might be. -This insured that merely upgrading the driver from an earlier version would -never automatically enable full-duplex mode in an existing installation; -it must always be explicitly enabled via one of these code in order to be -activated. - -The transceiver type can be changed using 'ethtool'. - - -(4a) Interpretation of error messages and common problems - -Error Messages - -eth0: Infinite loop in interrupt, status 2011. -These are "mostly harmless" message indicating that the driver had too much -work during that interrupt cycle. With a status of 0x2011 you are receiving -packets faster than they can be removed from the card. This should be rare -or impossible in normal operation. Possible causes of this error report are: - - - a "green" mode enabled that slows the processor down when there is no - keyboard activity. - - - some other device or device driver hogging the bus or disabling interrupts. - Check /proc/interrupts for excessive interrupt counts. The timer tick - interrupt should always be incrementing faster than the others. - -No received packets -If a 3c509, 3c562 or 3c589 can successfully transmit packets, but never -receives packets (as reported by /proc/net/dev or 'ifconfig') you likely -have an interrupt line problem. Check /proc/interrupts to verify that the -card is actually generating interrupts. If the interrupt count is not -increasing you likely have a physical conflict with two devices trying to -use the same ISA IRQ line. The common conflict is with a sound card on IRQ10 -or IRQ5, and the easiest solution is to move the 3c509 to a different -interrupt line. If the device is receiving packets but 'ping' doesn't work, -you have a routing problem. - -Tx Carrier Errors Reported in /proc/net/dev -If an EtherLink III appears to transmit packets, but the "Tx carrier errors" -field in /proc/net/dev increments as quickly as the Tx packet count, you -likely have an unterminated network or the incorrect media transceiver selected. - -3c509B card is not detected on machines with an ISA PnP BIOS. -While the updated driver works with most PnP BIOS programs, it does not work -with all. This can be fixed by disabling PnP support using the 3Com-supplied -setup program. - -3c509 card is not detected on overclocked machines -Increase the delay time in id_read_eeprom() from the current value, 500, -to an absurdly high value, such as 5000. - - -(4b) Decoding Status and Error Messages - -The bits in the main status register are: - -value description -0x01 Interrupt latch -0x02 Tx overrun, or Rx underrun -0x04 Tx complete -0x08 Tx FIFO room available -0x10 A complete Rx packet has arrived -0x20 A Rx packet has started to arrive -0x40 The driver has requested an interrupt -0x80 Statistics counter nearly full - -The bits in the transmit (Tx) status word are: - -value description -0x02 Out-of-window collision. -0x04 Status stack overflow (normally impossible). -0x08 16 collisions. -0x10 Tx underrun (not enough PCI bus bandwidth). -0x20 Tx jabber. -0x40 Tx interrupt requested. -0x80 Status is valid (this should always be set). - - -When a transmit error occurs the driver produces a status message such as - - eth0: Transmit error, Tx status register 82 - -The two values typically seen here are: - -0x82 -Out of window collision. This typically occurs when some other Ethernet -host is incorrectly set to full duplex on a half duplex network. - -0x88 -16 collisions. This typically occurs when the network is exceptionally busy -or when another host doesn't correctly back off after a collision. If this -error is mixed with 0x82 errors it is the result of a host incorrectly set -to full duplex (see above). - -Both of these errors are the result of network problems that should be -corrected. They do not represent driver malfunction. - - -(5) Revision history (this file) - -28Feb02 v1.0 DR New; major portions based on Becker original 3c509 docs - diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index a191faaf97de..402a9188f446 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -27,6 +27,7 @@ Contents: netronome/nfp pensando/ionic stmicro/stmmac + 3com/3c509 .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From 9ea2af8d16f5612168ed52cb0ec6752bac0877a9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:35 +0200 Subject: docs: networking: device drivers: convert 3com/vortex.txt to ReST - add SPDX header; - add a document title; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/3com/vortex.rst | 461 +++++++++++++++++++++ .../networking/device_drivers/3com/vortex.txt | 448 -------------------- Documentation/networking/device_drivers/index.rst | 1 + MAINTAINERS | 2 +- drivers/net/ethernet/3com/3c59x.c | 4 +- drivers/net/ethernet/3com/Kconfig | 2 +- 6 files changed, 466 insertions(+), 452 deletions(-) create mode 100644 Documentation/networking/device_drivers/3com/vortex.rst delete mode 100644 Documentation/networking/device_drivers/3com/vortex.txt diff --git a/Documentation/networking/device_drivers/3com/vortex.rst b/Documentation/networking/device_drivers/3com/vortex.rst new file mode 100644 index 000000000000..800add5be338 --- /dev/null +++ b/Documentation/networking/device_drivers/3com/vortex.rst @@ -0,0 +1,461 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +3Com Vortex device driver +========================= + +Documentation/networking/device_drivers/3com/vortex.rst + +Andrew Morton + +30 April 2000 + + +This document describes the usage and errata of the 3Com "Vortex" device +driver for Linux, 3c59x.c. + +The driver was written by Donald Becker + +Don is no longer the prime maintainer of this version of the driver. +Please report problems to one or more of: + +- Andrew Morton +- Netdev mailing list +- Linux kernel mailing list + +Please note the 'Reporting and Diagnosing Problems' section at the end +of this file. + + +Since kernel 2.3.99-pre6, this driver incorporates the support for the +3c575-series Cardbus cards which used to be handled by 3c575_cb.c. + +This driver supports the following hardware: + + - 3c590 Vortex 10Mbps + - 3c592 EISA 10Mbps Demon/Vortex + - 3c597 EISA Fast Demon/Vortex + - 3c595 Vortex 100baseTx + - 3c595 Vortex 100baseT4 + - 3c595 Vortex 100base-MII + - 3c900 Boomerang 10baseT + - 3c900 Boomerang 10Mbps Combo + - 3c900 Cyclone 10Mbps TPO + - 3c900 Cyclone 10Mbps Combo + - 3c900 Cyclone 10Mbps TPC + - 3c900B-FL Cyclone 10base-FL + - 3c905 Boomerang 100baseTx + - 3c905 Boomerang 100baseT4 + - 3c905B Cyclone 100baseTx + - 3c905B Cyclone 10/100/BNC + - 3c905B-FX Cyclone 100baseFx + - 3c905C Tornado + - 3c920B-EMB-WNM (ATI Radeon 9100 IGP) + - 3c980 Cyclone + - 3c980C Python-T + - 3cSOHO100-TX Hurricane + - 3c555 Laptop Hurricane + - 3c556 Laptop Tornado + - 3c556B Laptop Hurricane + - 3c575 [Megahertz] 10/100 LAN CardBus + - 3c575 Boomerang CardBus + - 3CCFE575BT Cyclone CardBus + - 3CCFE575CT Tornado CardBus + - 3CCFE656 Cyclone CardBus + - 3CCFEM656B Cyclone+Winmodem CardBus + - 3CXFEM656C Tornado+Winmodem CardBus + - 3c450 HomePNA Tornado + - 3c920 Tornado + - 3c982 Hydra Dual Port A + - 3c982 Hydra Dual Port B + - 3c905B-T4 + - 3c920B-EMB-WNM Tornado + +Module parameters +================= + +There are several parameters which may be provided to the driver when +its module is loaded. These are usually placed in ``/etc/modprobe.d/*.conf`` +configuration files. Example:: + + options 3c59x debug=3 rx_copybreak=300 + +If you are using the PCMCIA tools (cardmgr) then the options may be +placed in /etc/pcmcia/config.opts:: + + module "3c59x" opts "debug=3 rx_copybreak=300" + + +The supported parameters are: + +debug=N + + Where N is a number from 0 to 7. Anything above 3 produces a lot + of output in your system logs. debug=1 is default. + +options=N1,N2,N3,... + + Each number in the list provides an option to the corresponding + network card. So if you have two 3c905's and you wish to provide + them with option 0x204 you would use:: + + options=0x204,0x204 + + The individual options are composed of a number of bitfields which + have the following meanings: + + Possible media type settings + + == ================================= + 0 10baseT + 1 10Mbs AUI + 2 undefined + 3 10base2 (BNC) + 4 100base-TX + 5 100base-FX + 6 MII (Media Independent Interface) + 7 Use default setting from EEPROM + 8 Autonegotiate + 9 External MII + 10 Use default setting from EEPROM + == ================================= + + When generating a value for the 'options' setting, the above media + selection values may be OR'ed (or added to) the following: + + ====== ============================================= + 0x8000 Set driver debugging level to 7 + 0x4000 Set driver debugging level to 2 + 0x0400 Enable Wake-on-LAN + 0x0200 Force full duplex mode. + 0x0010 Bus-master enable bit (Old Vortex cards only) + ====== ============================================= + + For example:: + + insmod 3c59x options=0x204 + + will force full-duplex 100base-TX, rather than allowing the usual + autonegotiation. + +global_options=N + + Sets the ``options`` parameter for all 3c59x NICs in the machine. + Entries in the ``options`` array above will override any setting of + this. + +full_duplex=N1,N2,N3... + + Similar to bit 9 of 'options'. Forces the corresponding card into + full-duplex mode. Please use this in preference to the ``options`` + parameter. + + In fact, please don't use this at all! You're better off getting + autonegotiation working properly. + +global_full_duplex=N1 + + Sets full duplex mode for all 3c59x NICs in the machine. Entries + in the ``full_duplex`` array above will override any setting of this. + +flow_ctrl=N1,N2,N3... + + Use 802.3x MAC-layer flow control. The 3com cards only support the + PAUSE command, which means that they will stop sending packets for a + short period if they receive a PAUSE frame from the link partner. + + The driver only allows flow control on a link which is operating in + full duplex mode. + + This feature does not appear to work on the 3c905 - only 3c905B and + 3c905C have been tested. + + The 3com cards appear to only respond to PAUSE frames which are + sent to the reserved destination address of 01:80:c2:00:00:01. They + do not honour PAUSE frames which are sent to the station MAC address. + +rx_copybreak=M + + The driver preallocates 32 full-sized (1536 byte) network buffers + for receiving. When a packet arrives, the driver has to decide + whether to leave the packet in its full-sized buffer, or to allocate + a smaller buffer and copy the packet across into it. + + This is a speed/space tradeoff. + + The value of rx_copybreak is used to decide when to make the copy. + If the packet size is less than rx_copybreak, the packet is copied. + The default value for rx_copybreak is 200 bytes. + +max_interrupt_work=N + + The driver's interrupt service routine can handle many receive and + transmit packets in a single invocation. It does this in a loop. + The value of max_interrupt_work governs how many times the interrupt + service routine will loop. The default value is 32 loops. If this + is exceeded the interrupt service routine gives up and generates a + warning message "eth0: Too much work in interrupt". + +hw_checksums=N1,N2,N3,... + + Recent 3com NICs are able to generate IPv4, TCP and UDP checksums + in hardware. Linux has used the Rx checksumming for a long time. + The "zero copy" patch which is planned for the 2.4 kernel series + allows you to make use of the NIC's DMA scatter/gather and transmit + checksumming as well. + + The driver is set up so that, when the zerocopy patch is applied, + all Tornado and Cyclone devices will use S/G and Tx checksums. + + This module parameter has been provided so you can override this + decision. If you think that Tx checksums are causing a problem, you + may disable the feature with ``hw_checksums=0``. + + If you think your NIC should be performing Tx checksumming and the + driver isn't enabling it, you can force the use of hardware Tx + checksumming with ``hw_checksums=1``. + + The driver drops a message in the logfiles to indicate whether or + not it is using hardware scatter/gather and hardware Tx checksums. + + Scatter/gather and hardware checksums provide considerable + performance improvement for the sendfile() system call, but a small + decrease in throughput for send(). There is no effect upon receive + efficiency. + +compaq_ioaddr=N, +compaq_irq=N, +compaq_device_id=N + + "Variables to work-around the Compaq PCI BIOS32 problem".... + +watchdog=N + + Sets the time duration (in milliseconds) after which the kernel + decides that the transmitter has become stuck and needs to be reset. + This is mainly for debugging purposes, although it may be advantageous + to increase this value on LANs which have very high collision rates. + The default value is 5000 (5.0 seconds). + +enable_wol=N1,N2,N3,... + + Enable Wake-on-LAN support for the relevant interface. Donald + Becker's ``ether-wake`` application may be used to wake suspended + machines. + + Also enables the NIC's power management support. + +global_enable_wol=N + + Sets enable_wol mode for all 3c59x NICs in the machine. Entries in + the ``enable_wol`` array above will override any setting of this. + +Media selection +--------------- + +A number of the older NICs such as the 3c590 and 3c900 series have +10base2 and AUI interfaces. + +Prior to January, 2001 this driver would autoeselect the 10base2 or AUI +port if it didn't detect activity on the 10baseT port. It would then +get stuck on the 10base2 port and a driver reload was necessary to +switch back to 10baseT. This behaviour could not be prevented with a +module option override. + +Later (current) versions of the driver _do_ support locking of the +media type. So if you load the driver module with + + modprobe 3c59x options=0 + +it will permanently select the 10baseT port. Automatic selection of +other media types does not occur. + + +Transmit error, Tx status register 82 +------------------------------------- + +This is a common error which is almost always caused by another host on +the same network being in full-duplex mode, while this host is in +half-duplex mode. You need to find that other host and make it run in +half-duplex mode or fix this host to run in full-duplex mode. + +As a last resort, you can force the 3c59x driver into full-duplex mode +with + + options 3c59x full_duplex=1 + +but this has to be viewed as a workaround for broken network gear and +should only really be used for equipment which cannot autonegotiate. + + +Additional resources +-------------------- + +Details of the device driver implementation are at the top of the source file. + +Additional documentation is available at Don Becker's Linux Drivers site: + + http://www.scyld.com/vortex.html + +Donald Becker's driver development site: + + http://www.scyld.com/network.html + +Donald's vortex-diag program is useful for inspecting the NIC's state: + + http://www.scyld.com/ethercard_diag.html + +Donald's mii-diag program may be used for inspecting and manipulating +the NIC's Media Independent Interface subsystem: + + http://www.scyld.com/ethercard_diag.html#mii-diag + +Donald's wake-on-LAN page: + + http://www.scyld.com/wakeonlan.html + +3Com's DOS-based application for setting up the NICs EEPROMs: + + ftp://ftp.3com.com/pub/nic/3c90x/3c90xx2.exe + + +Autonegotiation notes +--------------------- + + The driver uses a one-minute heartbeat for adapting to changes in + the external LAN environment if link is up and 5 seconds if link is down. + This means that when, for example, a machine is unplugged from a hubbed + 10baseT LAN plugged into a switched 100baseT LAN, the throughput + will be quite dreadful for up to sixty seconds. Be patient. + + Cisco interoperability note from Walter Wong : + + On a side note, adding HAS_NWAY seems to share a problem with the + Cisco 6509 switch. Specifically, you need to change the spanning + tree parameter for the port the machine is plugged into to 'portfast' + mode. Otherwise, the negotiation fails. This has been an issue + we've noticed for a while but haven't had the time to track down. + + Cisco switches (Jeff Busch ) + + My "standard config" for ports to which PC's/servers connect directly:: + + interface FastEthernet0/N + description machinename + load-interval 30 + spanning-tree portfast + + If autonegotiation is a problem, you may need to specify "speed + 100" and "duplex full" as well (or "speed 10" and "duplex half"). + + WARNING: DO NOT hook up hubs/switches/bridges to these + specially-configured ports! The switch will become very confused. + + +Reporting and diagnosing problems +--------------------------------- + +Maintainers find that accurate and complete problem reports are +invaluable in resolving driver problems. We are frequently not able to +reproduce problems and must rely on your patience and efforts to get to +the bottom of the problem. + +If you believe you have a driver problem here are some of the +steps you should take: + +- Is it really a driver problem? + + Eliminate some variables: try different cards, different + computers, different cables, different ports on the switch/hub, + different versions of the kernel or of the driver, etc. + +- OK, it's a driver problem. + + You need to generate a report. Typically this is an email to the + maintainer and/or netdev@vger.kernel.org. The maintainer's + email address will be in the driver source or in the MAINTAINERS file. + +- The contents of your report will vary a lot depending upon the + problem. If it's a kernel crash then you should refer to the + admin-guide/reporting-bugs.rst file. + + But for most problems it is useful to provide the following: + + - Kernel version, driver version + + - A copy of the banner message which the driver generates when + it is initialised. For example: + + eth0: 3Com PCI 3c905C Tornado at 0xa400, 00:50:da:6a:88:f0, IRQ 19 + 8K byte-wide RAM 5:3 Rx:Tx split, autoselect/Autonegotiate interface. + MII transceiver found at address 24, status 782d. + Enabling bus-master transmits and whole-frame receives. + + NOTE: You must provide the ``debug=2`` modprobe option to generate + a full detection message. Please do this:: + + modprobe 3c59x debug=2 + + - If it is a PCI device, the relevant output from 'lspci -vx', eg:: + + 00:09.0 Ethernet controller: 3Com Corporation 3c905C-TX [Fast Etherlink] (rev 74) + Subsystem: 3Com Corporation: Unknown device 9200 + Flags: bus master, medium devsel, latency 32, IRQ 19 + I/O ports at a400 [size=128] + Memory at db000000 (32-bit, non-prefetchable) [size=128] + Expansion ROM at [disabled] [size=128K] + Capabilities: [dc] Power Management version 2 + 00: b7 10 00 92 07 00 10 02 74 00 00 02 08 20 00 00 + 10: 01 a4 00 00 00 00 00 db 00 00 00 00 00 00 00 00 + 20: 00 00 00 00 00 00 00 00 00 00 00 00 b7 10 00 10 + 30: 00 00 00 00 dc 00 00 00 00 00 00 00 05 01 0a 0a + + - A description of the environment: 10baseT? 100baseT? + full/half duplex? switched or hubbed? + + - Any additional module parameters which you may be providing to the driver. + + - Any kernel logs which are produced. The more the merrier. + If this is a large file and you are sending your report to a + mailing list, mention that you have the logfile, but don't send + it. If you're reporting direct to the maintainer then just send + it. + + To ensure that all kernel logs are available, add the + following line to /etc/syslog.conf:: + + kern.* /var/log/messages + + Then restart syslogd with:: + + /etc/rc.d/init.d/syslog restart + + (The above may vary, depending upon which Linux distribution you use). + + - If your problem is reproducible then that's great. Try the + following: + + 1) Increase the debug level. Usually this is done via: + + a) modprobe driver debug=7 + b) In /etc/modprobe.d/driver.conf: + options driver debug=7 + + 2) Recreate the problem with the higher debug level, + send all logs to the maintainer. + + 3) Download you card's diagnostic tool from Donald + Becker's website . + Download mii-diag.c as well. Build these. + + a) Run 'vortex-diag -aaee' and 'mii-diag -v' when the card is + working correctly. Save the output. + + b) Run the above commands when the card is malfunctioning. Send + both sets of output. + +Finally, please be patient and be prepared to do some work. You may +end up working on this problem for a week or more as the maintainer +asks more questions, asks for more tests, asks for patches to be +applied, etc. At the end of it all, the problem may even remain +unresolved. diff --git a/Documentation/networking/device_drivers/3com/vortex.txt b/Documentation/networking/device_drivers/3com/vortex.txt deleted file mode 100644 index 587f3fcfbcae..000000000000 --- a/Documentation/networking/device_drivers/3com/vortex.txt +++ /dev/null @@ -1,448 +0,0 @@ -Documentation/networking/device_drivers/3com/vortex.txt -Andrew Morton -30 April 2000 - - -This document describes the usage and errata of the 3Com "Vortex" device -driver for Linux, 3c59x.c. - -The driver was written by Donald Becker - -Don is no longer the prime maintainer of this version of the driver. -Please report problems to one or more of: - - Andrew Morton - Netdev mailing list - Linux kernel mailing list - -Please note the 'Reporting and Diagnosing Problems' section at the end -of this file. - - -Since kernel 2.3.99-pre6, this driver incorporates the support for the -3c575-series Cardbus cards which used to be handled by 3c575_cb.c. - -This driver supports the following hardware: - - 3c590 Vortex 10Mbps - 3c592 EISA 10Mbps Demon/Vortex - 3c597 EISA Fast Demon/Vortex - 3c595 Vortex 100baseTx - 3c595 Vortex 100baseT4 - 3c595 Vortex 100base-MII - 3c900 Boomerang 10baseT - 3c900 Boomerang 10Mbps Combo - 3c900 Cyclone 10Mbps TPO - 3c900 Cyclone 10Mbps Combo - 3c900 Cyclone 10Mbps TPC - 3c900B-FL Cyclone 10base-FL - 3c905 Boomerang 100baseTx - 3c905 Boomerang 100baseT4 - 3c905B Cyclone 100baseTx - 3c905B Cyclone 10/100/BNC - 3c905B-FX Cyclone 100baseFx - 3c905C Tornado - 3c920B-EMB-WNM (ATI Radeon 9100 IGP) - 3c980 Cyclone - 3c980C Python-T - 3cSOHO100-TX Hurricane - 3c555 Laptop Hurricane - 3c556 Laptop Tornado - 3c556B Laptop Hurricane - 3c575 [Megahertz] 10/100 LAN CardBus - 3c575 Boomerang CardBus - 3CCFE575BT Cyclone CardBus - 3CCFE575CT Tornado CardBus - 3CCFE656 Cyclone CardBus - 3CCFEM656B Cyclone+Winmodem CardBus - 3CXFEM656C Tornado+Winmodem CardBus - 3c450 HomePNA Tornado - 3c920 Tornado - 3c982 Hydra Dual Port A - 3c982 Hydra Dual Port B - 3c905B-T4 - 3c920B-EMB-WNM Tornado - -Module parameters -================= - -There are several parameters which may be provided to the driver when -its module is loaded. These are usually placed in /etc/modprobe.d/*.conf -configuration files. Example: - -options 3c59x debug=3 rx_copybreak=300 - -If you are using the PCMCIA tools (cardmgr) then the options may be -placed in /etc/pcmcia/config.opts: - -module "3c59x" opts "debug=3 rx_copybreak=300" - - -The supported parameters are: - -debug=N - - Where N is a number from 0 to 7. Anything above 3 produces a lot - of output in your system logs. debug=1 is default. - -options=N1,N2,N3,... - - Each number in the list provides an option to the corresponding - network card. So if you have two 3c905's and you wish to provide - them with option 0x204 you would use: - - options=0x204,0x204 - - The individual options are composed of a number of bitfields which - have the following meanings: - - Possible media type settings - 0 10baseT - 1 10Mbs AUI - 2 undefined - 3 10base2 (BNC) - 4 100base-TX - 5 100base-FX - 6 MII (Media Independent Interface) - 7 Use default setting from EEPROM - 8 Autonegotiate - 9 External MII - 10 Use default setting from EEPROM - - When generating a value for the 'options' setting, the above media - selection values may be OR'ed (or added to) the following: - - 0x8000 Set driver debugging level to 7 - 0x4000 Set driver debugging level to 2 - 0x0400 Enable Wake-on-LAN - 0x0200 Force full duplex mode. - 0x0010 Bus-master enable bit (Old Vortex cards only) - - For example: - - insmod 3c59x options=0x204 - - will force full-duplex 100base-TX, rather than allowing the usual - autonegotiation. - -global_options=N - - Sets the `options' parameter for all 3c59x NICs in the machine. - Entries in the `options' array above will override any setting of - this. - -full_duplex=N1,N2,N3... - - Similar to bit 9 of 'options'. Forces the corresponding card into - full-duplex mode. Please use this in preference to the `options' - parameter. - - In fact, please don't use this at all! You're better off getting - autonegotiation working properly. - -global_full_duplex=N1 - - Sets full duplex mode for all 3c59x NICs in the machine. Entries - in the `full_duplex' array above will override any setting of this. - -flow_ctrl=N1,N2,N3... - - Use 802.3x MAC-layer flow control. The 3com cards only support the - PAUSE command, which means that they will stop sending packets for a - short period if they receive a PAUSE frame from the link partner. - - The driver only allows flow control on a link which is operating in - full duplex mode. - - This feature does not appear to work on the 3c905 - only 3c905B and - 3c905C have been tested. - - The 3com cards appear to only respond to PAUSE frames which are - sent to the reserved destination address of 01:80:c2:00:00:01. They - do not honour PAUSE frames which are sent to the station MAC address. - -rx_copybreak=M - - The driver preallocates 32 full-sized (1536 byte) network buffers - for receiving. When a packet arrives, the driver has to decide - whether to leave the packet in its full-sized buffer, or to allocate - a smaller buffer and copy the packet across into it. - - This is a speed/space tradeoff. - - The value of rx_copybreak is used to decide when to make the copy. - If the packet size is less than rx_copybreak, the packet is copied. - The default value for rx_copybreak is 200 bytes. - -max_interrupt_work=N - - The driver's interrupt service routine can handle many receive and - transmit packets in a single invocation. It does this in a loop. - The value of max_interrupt_work governs how many times the interrupt - service routine will loop. The default value is 32 loops. If this - is exceeded the interrupt service routine gives up and generates a - warning message "eth0: Too much work in interrupt". - -hw_checksums=N1,N2,N3,... - - Recent 3com NICs are able to generate IPv4, TCP and UDP checksums - in hardware. Linux has used the Rx checksumming for a long time. - The "zero copy" patch which is planned for the 2.4 kernel series - allows you to make use of the NIC's DMA scatter/gather and transmit - checksumming as well. - - The driver is set up so that, when the zerocopy patch is applied, - all Tornado and Cyclone devices will use S/G and Tx checksums. - - This module parameter has been provided so you can override this - decision. If you think that Tx checksums are causing a problem, you - may disable the feature with `hw_checksums=0'. - - If you think your NIC should be performing Tx checksumming and the - driver isn't enabling it, you can force the use of hardware Tx - checksumming with `hw_checksums=1'. - - The driver drops a message in the logfiles to indicate whether or - not it is using hardware scatter/gather and hardware Tx checksums. - - Scatter/gather and hardware checksums provide considerable - performance improvement for the sendfile() system call, but a small - decrease in throughput for send(). There is no effect upon receive - efficiency. - -compaq_ioaddr=N -compaq_irq=N -compaq_device_id=N - - "Variables to work-around the Compaq PCI BIOS32 problem".... - -watchdog=N - - Sets the time duration (in milliseconds) after which the kernel - decides that the transmitter has become stuck and needs to be reset. - This is mainly for debugging purposes, although it may be advantageous - to increase this value on LANs which have very high collision rates. - The default value is 5000 (5.0 seconds). - -enable_wol=N1,N2,N3,... - - Enable Wake-on-LAN support for the relevant interface. Donald - Becker's `ether-wake' application may be used to wake suspended - machines. - - Also enables the NIC's power management support. - -global_enable_wol=N - - Sets enable_wol mode for all 3c59x NICs in the machine. Entries in - the `enable_wol' array above will override any setting of this. - -Media selection ---------------- - -A number of the older NICs such as the 3c590 and 3c900 series have -10base2 and AUI interfaces. - -Prior to January, 2001 this driver would autoeselect the 10base2 or AUI -port if it didn't detect activity on the 10baseT port. It would then -get stuck on the 10base2 port and a driver reload was necessary to -switch back to 10baseT. This behaviour could not be prevented with a -module option override. - -Later (current) versions of the driver _do_ support locking of the -media type. So if you load the driver module with - - modprobe 3c59x options=0 - -it will permanently select the 10baseT port. Automatic selection of -other media types does not occur. - - -Transmit error, Tx status register 82 -------------------------------------- - -This is a common error which is almost always caused by another host on -the same network being in full-duplex mode, while this host is in -half-duplex mode. You need to find that other host and make it run in -half-duplex mode or fix this host to run in full-duplex mode. - -As a last resort, you can force the 3c59x driver into full-duplex mode -with - - options 3c59x full_duplex=1 - -but this has to be viewed as a workaround for broken network gear and -should only really be used for equipment which cannot autonegotiate. - - -Additional resources --------------------- - -Details of the device driver implementation are at the top of the source file. - -Additional documentation is available at Don Becker's Linux Drivers site: - - http://www.scyld.com/vortex.html - -Donald Becker's driver development site: - - http://www.scyld.com/network.html - -Donald's vortex-diag program is useful for inspecting the NIC's state: - - http://www.scyld.com/ethercard_diag.html - -Donald's mii-diag program may be used for inspecting and manipulating -the NIC's Media Independent Interface subsystem: - - http://www.scyld.com/ethercard_diag.html#mii-diag - -Donald's wake-on-LAN page: - - http://www.scyld.com/wakeonlan.html - -3Com's DOS-based application for setting up the NICs EEPROMs: - - ftp://ftp.3com.com/pub/nic/3c90x/3c90xx2.exe - - -Autonegotiation notes ---------------------- - - The driver uses a one-minute heartbeat for adapting to changes in - the external LAN environment if link is up and 5 seconds if link is down. - This means that when, for example, a machine is unplugged from a hubbed - 10baseT LAN plugged into a switched 100baseT LAN, the throughput - will be quite dreadful for up to sixty seconds. Be patient. - - Cisco interoperability note from Walter Wong : - - On a side note, adding HAS_NWAY seems to share a problem with the - Cisco 6509 switch. Specifically, you need to change the spanning - tree parameter for the port the machine is plugged into to 'portfast' - mode. Otherwise, the negotiation fails. This has been an issue - we've noticed for a while but haven't had the time to track down. - - Cisco switches (Jeff Busch ) - - My "standard config" for ports to which PC's/servers connect directly: - - interface FastEthernet0/N - description machinename - load-interval 30 - spanning-tree portfast - - If autonegotiation is a problem, you may need to specify "speed - 100" and "duplex full" as well (or "speed 10" and "duplex half"). - - WARNING: DO NOT hook up hubs/switches/bridges to these - specially-configured ports! The switch will become very confused. - - -Reporting and diagnosing problems ---------------------------------- - -Maintainers find that accurate and complete problem reports are -invaluable in resolving driver problems. We are frequently not able to -reproduce problems and must rely on your patience and efforts to get to -the bottom of the problem. - -If you believe you have a driver problem here are some of the -steps you should take: - -- Is it really a driver problem? - - Eliminate some variables: try different cards, different - computers, different cables, different ports on the switch/hub, - different versions of the kernel or of the driver, etc. - -- OK, it's a driver problem. - - You need to generate a report. Typically this is an email to the - maintainer and/or netdev@vger.kernel.org. The maintainer's - email address will be in the driver source or in the MAINTAINERS file. - -- The contents of your report will vary a lot depending upon the - problem. If it's a kernel crash then you should refer to the - admin-guide/reporting-bugs.rst file. - - But for most problems it is useful to provide the following: - - o Kernel version, driver version - - o A copy of the banner message which the driver generates when - it is initialised. For example: - - eth0: 3Com PCI 3c905C Tornado at 0xa400, 00:50:da:6a:88:f0, IRQ 19 - 8K byte-wide RAM 5:3 Rx:Tx split, autoselect/Autonegotiate interface. - MII transceiver found at address 24, status 782d. - Enabling bus-master transmits and whole-frame receives. - - NOTE: You must provide the `debug=2' modprobe option to generate - a full detection message. Please do this: - - modprobe 3c59x debug=2 - - o If it is a PCI device, the relevant output from 'lspci -vx', eg: - - 00:09.0 Ethernet controller: 3Com Corporation 3c905C-TX [Fast Etherlink] (rev 74) - Subsystem: 3Com Corporation: Unknown device 9200 - Flags: bus master, medium devsel, latency 32, IRQ 19 - I/O ports at a400 [size=128] - Memory at db000000 (32-bit, non-prefetchable) [size=128] - Expansion ROM at [disabled] [size=128K] - Capabilities: [dc] Power Management version 2 - 00: b7 10 00 92 07 00 10 02 74 00 00 02 08 20 00 00 - 10: 01 a4 00 00 00 00 00 db 00 00 00 00 00 00 00 00 - 20: 00 00 00 00 00 00 00 00 00 00 00 00 b7 10 00 10 - 30: 00 00 00 00 dc 00 00 00 00 00 00 00 05 01 0a 0a - - o A description of the environment: 10baseT? 100baseT? - full/half duplex? switched or hubbed? - - o Any additional module parameters which you may be providing to the driver. - - o Any kernel logs which are produced. The more the merrier. - If this is a large file and you are sending your report to a - mailing list, mention that you have the logfile, but don't send - it. If you're reporting direct to the maintainer then just send - it. - - To ensure that all kernel logs are available, add the - following line to /etc/syslog.conf: - - kern.* /var/log/messages - - Then restart syslogd with: - - /etc/rc.d/init.d/syslog restart - - (The above may vary, depending upon which Linux distribution you use). - - o If your problem is reproducible then that's great. Try the - following: - - 1) Increase the debug level. Usually this is done via: - - a) modprobe driver debug=7 - b) In /etc/modprobe.d/driver.conf: - options driver debug=7 - - 2) Recreate the problem with the higher debug level, - send all logs to the maintainer. - - 3) Download you card's diagnostic tool from Donald - Becker's website . - Download mii-diag.c as well. Build these. - - a) Run 'vortex-diag -aaee' and 'mii-diag -v' when the card is - working correctly. Save the output. - - b) Run the above commands when the card is malfunctioning. Send - both sets of output. - -Finally, please be patient and be prepared to do some work. You may -end up working on this problem for a week or more as the maintainer -asks more questions, asks for more tests, asks for patches to be -applied, etc. At the end of it all, the problem may even remain -unresolved. diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 402a9188f446..aaac502b81ea 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -28,6 +28,7 @@ Contents: pensando/ionic stmicro/stmmac 3com/3c509 + 3com/vortex .. only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index bee65ebdc67e..eaea5f1994c9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -147,7 +147,7 @@ Maintainers List M: Steffen Klassert L: netdev@vger.kernel.org S: Odd Fixes -F: Documentation/networking/device_drivers/3com/vortex.txt +F: Documentation/networking/device_drivers/3com/vortex.rst F: drivers/net/ethernet/3com/3c59x.c 3CR990 NETWORK DRIVER diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index a2b7f7ab8170..5984b7033999 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -1149,7 +1149,7 @@ static int vortex_probe1(struct device *gendev, void __iomem *ioaddr, int irq, print_info = (vortex_debug > 1); if (print_info) - pr_info("See Documentation/networking/device_drivers/3com/vortex.txt\n"); + pr_info("See Documentation/networking/device_drivers/3com/vortex.rst\n"); pr_info("%s: 3Com %s %s at %p.\n", print_name, @@ -1954,7 +1954,7 @@ vortex_error(struct net_device *dev, int status) dev->name, tx_status); if (tx_status == 0x82) { pr_err("Probably a duplex mismatch. See " - "Documentation/networking/device_drivers/3com/vortex.txt\n"); + "Documentation/networking/device_drivers/3com/vortex.rst\n"); } dump_tx_ring(dev); } diff --git a/drivers/net/ethernet/3com/Kconfig b/drivers/net/ethernet/3com/Kconfig index 3a6fc99c6f32..7cc259893cb9 100644 --- a/drivers/net/ethernet/3com/Kconfig +++ b/drivers/net/ethernet/3com/Kconfig @@ -76,7 +76,7 @@ config VORTEX "Hurricane" (3c555/3cSOHO) PCI If you have such a card, say Y here. More specific information is in - and + and in the comments at the beginning of . -- cgit v1.2.3-59-g8ed1b From 8d299c7e912bd8ebb88b9ac2b8e336c9878783aa Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:36 +0200 Subject: docs: networking: device drivers: convert amazon/ena.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/amazon/ena.rst | 344 +++++++++++++++++++++ .../networking/device_drivers/amazon/ena.txt | 308 ------------------ Documentation/networking/device_drivers/index.rst | 1 + MAINTAINERS | 2 +- 4 files changed, 346 insertions(+), 309 deletions(-) create mode 100644 Documentation/networking/device_drivers/amazon/ena.rst delete mode 100644 Documentation/networking/device_drivers/amazon/ena.txt diff --git a/Documentation/networking/device_drivers/amazon/ena.rst b/Documentation/networking/device_drivers/amazon/ena.rst new file mode 100644 index 000000000000..11af6388ea87 --- /dev/null +++ b/Documentation/networking/device_drivers/amazon/ena.rst @@ -0,0 +1,344 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================================ +Linux kernel driver for Elastic Network Adapter (ENA) family +============================================================ + +Overview +======== + +ENA is a networking interface designed to make good use of modern CPU +features and system architectures. + +The ENA device exposes a lightweight management interface with a +minimal set of memory mapped registers and extendable command set +through an Admin Queue. + +The driver supports a range of ENA devices, is link-speed independent +(i.e., the same driver is used for 10GbE, 25GbE, 40GbE, etc.), and has +a negotiated and extendable feature set. + +Some ENA devices support SR-IOV. This driver is used for both the +SR-IOV Physical Function (PF) and Virtual Function (VF) devices. + +ENA devices enable high speed and low overhead network traffic +processing by providing multiple Tx/Rx queue pairs (the maximum number +is advertised by the device via the Admin Queue), a dedicated MSI-X +interrupt vector per Tx/Rx queue pair, adaptive interrupt moderation, +and CPU cacheline optimized data placement. + +The ENA driver supports industry standard TCP/IP offload features such +as checksum offload and TCP transmit segmentation offload (TSO). +Receive-side scaling (RSS) is supported for multi-core scaling. + +The ENA driver and its corresponding devices implement health +monitoring mechanisms such as watchdog, enabling the device and driver +to recover in a manner transparent to the application, as well as +debug logs. + +Some of the ENA devices support a working mode called Low-latency +Queue (LLQ), which saves several more microseconds. + +Supported PCI vendor ID/device IDs +================================== + +========= ======================= +1d0f:0ec2 ENA PF +1d0f:1ec2 ENA PF with LLQ support +1d0f:ec20 ENA VF +1d0f:ec21 ENA VF with LLQ support +========= ======================= + +ENA Source Code Directory Structure +=================================== + +================= ====================================================== +ena_com.[ch] Management communication layer. This layer is + responsible for the handling all the management + (admin) communication between the device and the + driver. +ena_eth_com.[ch] Tx/Rx data path. +ena_admin_defs.h Definition of ENA management interface. +ena_eth_io_defs.h Definition of ENA data path interface. +ena_common_defs.h Common definitions for ena_com layer. +ena_regs_defs.h Definition of ENA PCI memory-mapped (MMIO) registers. +ena_netdev.[ch] Main Linux kernel driver. +ena_syfsfs.[ch] Sysfs files. +ena_ethtool.c ethtool callbacks. +ena_pci_id_tbl.h Supported device IDs. +================= ====================================================== + +Management Interface: +===================== + +ENA management interface is exposed by means of: + +- PCIe Configuration Space +- Device Registers +- Admin Queue (AQ) and Admin Completion Queue (ACQ) +- Asynchronous Event Notification Queue (AENQ) + +ENA device MMIO Registers are accessed only during driver +initialization and are not involved in further normal device +operation. + +AQ is used for submitting management commands, and the +results/responses are reported asynchronously through ACQ. + +ENA introduces a small set of management commands with room for +vendor-specific extensions. Most of the management operations are +framed in a generic Get/Set feature command. + +The following admin queue commands are supported: + +- Create I/O submission queue +- Create I/O completion queue +- Destroy I/O submission queue +- Destroy I/O completion queue +- Get feature +- Set feature +- Configure AENQ +- Get statistics + +Refer to ena_admin_defs.h for the list of supported Get/Set Feature +properties. + +The Asynchronous Event Notification Queue (AENQ) is a uni-directional +queue used by the ENA device to send to the driver events that cannot +be reported using ACQ. AENQ events are subdivided into groups. Each +group may have multiple syndromes, as shown below + +The events are: + + ==================== =============== + Group Syndrome + ==================== =============== + Link state change **X** + Fatal error **X** + Notification Suspend traffic + Notification Resume traffic + Keep-Alive **X** + ==================== =============== + +ACQ and AENQ share the same MSI-X vector. + +Keep-Alive is a special mechanism that allows monitoring of the +device's health. The driver maintains a watchdog (WD) handler which, +if fired, logs the current state and statistics then resets and +restarts the ENA device and driver. A Keep-Alive event is delivered by +the device every second. The driver re-arms the WD upon reception of a +Keep-Alive event. A missed Keep-Alive event causes the WD handler to +fire. + +Data Path Interface +=================== +I/O operations are based on Tx and Rx Submission Queues (Tx SQ and Rx +SQ correspondingly). Each SQ has a completion queue (CQ) associated +with it. + +The SQs and CQs are implemented as descriptor rings in contiguous +physical memory. + +The ENA driver supports two Queue Operation modes for Tx SQs: + +- Regular mode + + * In this mode the Tx SQs reside in the host's memory. The ENA + device fetches the ENA Tx descriptors and packet data from host + memory. + +- Low Latency Queue (LLQ) mode or "push-mode". + + * In this mode the driver pushes the transmit descriptors and the + first 128 bytes of the packet directly to the ENA device memory + space. The rest of the packet payload is fetched by the + device. For this operation mode, the driver uses a dedicated PCI + device memory BAR, which is mapped with write-combine capability. + +The Rx SQs support only the regular mode. + +Note: Not all ENA devices support LLQ, and this feature is negotiated + with the device upon initialization. If the ENA device does not + support LLQ mode, the driver falls back to the regular mode. + +The driver supports multi-queue for both Tx and Rx. This has various +benefits: + +- Reduced CPU/thread/process contention on a given Ethernet interface. +- Cache miss rate on completion is reduced, particularly for data + cache lines that hold the sk_buff structures. +- Increased process-level parallelism when handling received packets. +- Increased data cache hit rate, by steering kernel processing of + packets to the CPU, where the application thread consuming the + packet is running. +- In hardware interrupt re-direction. + +Interrupt Modes +=============== +The driver assigns a single MSI-X vector per queue pair (for both Tx +and Rx directions). The driver assigns an additional dedicated MSI-X vector +for management (for ACQ and AENQ). + +Management interrupt registration is performed when the Linux kernel +probes the adapter, and it is de-registered when the adapter is +removed. I/O queue interrupt registration is performed when the Linux +interface of the adapter is opened, and it is de-registered when the +interface is closed. + +The management interrupt is named:: + + ena-mgmnt@pci: + +and for each queue pair, an interrupt is named:: + + -Tx-Rx- + +The ENA device operates in auto-mask and auto-clear interrupt +modes. That is, once MSI-X is delivered to the host, its Cause bit is +automatically cleared and the interrupt is masked. The interrupt is +unmasked by the driver after NAPI processing is complete. + +Interrupt Moderation +==================== +ENA driver and device can operate in conventional or adaptive interrupt +moderation mode. + +In conventional mode the driver instructs device to postpone interrupt +posting according to static interrupt delay value. The interrupt delay +value can be configured through ethtool(8). The following ethtool +parameters are supported by the driver: tx-usecs, rx-usecs + +In adaptive interrupt moderation mode the interrupt delay value is +updated by the driver dynamically and adjusted every NAPI cycle +according to the traffic nature. + +By default ENA driver applies adaptive coalescing on Rx traffic and +conventional coalescing on Tx traffic. + +Adaptive coalescing can be switched on/off through ethtool(8) +adaptive_rx on|off parameter. + +The driver chooses interrupt delay value according to the number of +bytes and packets received between interrupt unmasking and interrupt +posting. The driver uses interrupt delay table that subdivides the +range of received bytes/packets into 5 levels and assigns interrupt +delay value to each level. + +The user can enable/disable adaptive moderation, modify the interrupt +delay table and restore its default values through sysfs. + +RX copybreak +============ +The rx_copybreak is initialized by default to ENA_DEFAULT_RX_COPYBREAK +and can be configured by the ETHTOOL_STUNABLE command of the +SIOCETHTOOL ioctl. + +SKB +=== +The driver-allocated SKB for frames received from Rx handling using +NAPI context. The allocation method depends on the size of the packet. +If the frame length is larger than rx_copybreak, napi_get_frags() +is used, otherwise netdev_alloc_skb_ip_align() is used, the buffer +content is copied (by CPU) to the SKB, and the buffer is recycled. + +Statistics +========== +The user can obtain ENA device and driver statistics using ethtool. +The driver can collect regular or extended statistics (including +per-queue stats) from the device. + +In addition the driver logs the stats to syslog upon device reset. + +MTU +=== +The driver supports an arbitrarily large MTU with a maximum that is +negotiated with the device. The driver configures MTU using the +SetFeature command (ENA_ADMIN_MTU property). The user can change MTU +via ip(8) and similar legacy tools. + +Stateless Offloads +================== +The ENA driver supports: + +- TSO over IPv4/IPv6 +- TSO with ECN +- IPv4 header checksum offload +- TCP/UDP over IPv4/IPv6 checksum offloads + +RSS +=== +- The ENA device supports RSS that allows flexible Rx traffic + steering. +- Toeplitz and CRC32 hash functions are supported. +- Different combinations of L2/L3/L4 fields can be configured as + inputs for hash functions. +- The driver configures RSS settings using the AQ SetFeature command + (ENA_ADMIN_RSS_HASH_FUNCTION, ENA_ADMIN_RSS_HASH_INPUT and + ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG properties). +- If the NETIF_F_RXHASH flag is set, the 32-bit result of the hash + function delivered in the Rx CQ descriptor is set in the received + SKB. +- The user can provide a hash key, hash function, and configure the + indirection table through ethtool(8). + +DATA PATH +========= +Tx +-- + +end_start_xmit() is called by the stack. This function does the following: + +- Maps data buffers (skb->data and frags). +- Populates ena_buf for the push buffer (if the driver and device are + in push mode.) +- Prepares ENA bufs for the remaining frags. +- Allocates a new request ID from the empty req_id ring. The request + ID is the index of the packet in the Tx info. This is used for + out-of-order TX completions. +- Adds the packet to the proper place in the Tx ring. +- Calls ena_com_prepare_tx(), an ENA communication layer that converts + the ena_bufs to ENA descriptors (and adds meta ENA descriptors as + needed.) + + * This function also copies the ENA descriptors and the push buffer + to the Device memory space (if in push mode.) + +- Writes doorbell to the ENA device. +- When the ENA device finishes sending the packet, a completion + interrupt is raised. +- The interrupt handler schedules NAPI. +- The ena_clean_tx_irq() function is called. This function handles the + completion descriptors generated by the ENA, with a single + completion descriptor per completed packet. + + * req_id is retrieved from the completion descriptor. The tx_info of + the packet is retrieved via the req_id. The data buffers are + unmapped and req_id is returned to the empty req_id ring. + * The function stops when the completion descriptors are completed or + the budget is reached. + +Rx +-- + +- When a packet is received from the ENA device. +- The interrupt handler schedules NAPI. +- The ena_clean_rx_irq() function is called. This function calls + ena_rx_pkt(), an ENA communication layer function, which returns the + number of descriptors used for a new unhandled packet, and zero if + no new packet is found. +- Then it calls the ena_clean_rx_irq() function. +- ena_eth_rx_skb() checks packet length: + + * If the packet is small (len < rx_copybreak), the driver allocates + a SKB for the new packet, and copies the packet payload into the + SKB data buffer. + + - In this way the original data buffer is not passed to the stack + and is reused for future Rx packets. + + * Otherwise the function unmaps the Rx buffer, then allocates the + new SKB structure and hooks the Rx buffer to the SKB frags. + +- The new SKB is updated with the necessary information (protocol, + checksum hw verify result, etc.), and then passed to the network + stack, using the NAPI interface function napi_gro_receive(). diff --git a/Documentation/networking/device_drivers/amazon/ena.txt b/Documentation/networking/device_drivers/amazon/ena.txt deleted file mode 100644 index 1bb55c7b604c..000000000000 --- a/Documentation/networking/device_drivers/amazon/ena.txt +++ /dev/null @@ -1,308 +0,0 @@ -Linux kernel driver for Elastic Network Adapter (ENA) family: -============================================================= - -Overview: -========= -ENA is a networking interface designed to make good use of modern CPU -features and system architectures. - -The ENA device exposes a lightweight management interface with a -minimal set of memory mapped registers and extendable command set -through an Admin Queue. - -The driver supports a range of ENA devices, is link-speed independent -(i.e., the same driver is used for 10GbE, 25GbE, 40GbE, etc.), and has -a negotiated and extendable feature set. - -Some ENA devices support SR-IOV. This driver is used for both the -SR-IOV Physical Function (PF) and Virtual Function (VF) devices. - -ENA devices enable high speed and low overhead network traffic -processing by providing multiple Tx/Rx queue pairs (the maximum number -is advertised by the device via the Admin Queue), a dedicated MSI-X -interrupt vector per Tx/Rx queue pair, adaptive interrupt moderation, -and CPU cacheline optimized data placement. - -The ENA driver supports industry standard TCP/IP offload features such -as checksum offload and TCP transmit segmentation offload (TSO). -Receive-side scaling (RSS) is supported for multi-core scaling. - -The ENA driver and its corresponding devices implement health -monitoring mechanisms such as watchdog, enabling the device and driver -to recover in a manner transparent to the application, as well as -debug logs. - -Some of the ENA devices support a working mode called Low-latency -Queue (LLQ), which saves several more microseconds. - -Supported PCI vendor ID/device IDs: -=================================== -1d0f:0ec2 - ENA PF -1d0f:1ec2 - ENA PF with LLQ support -1d0f:ec20 - ENA VF -1d0f:ec21 - ENA VF with LLQ support - -ENA Source Code Directory Structure: -==================================== -ena_com.[ch] - Management communication layer. This layer is - responsible for the handling all the management - (admin) communication between the device and the - driver. -ena_eth_com.[ch] - Tx/Rx data path. -ena_admin_defs.h - Definition of ENA management interface. -ena_eth_io_defs.h - Definition of ENA data path interface. -ena_common_defs.h - Common definitions for ena_com layer. -ena_regs_defs.h - Definition of ENA PCI memory-mapped (MMIO) registers. -ena_netdev.[ch] - Main Linux kernel driver. -ena_syfsfs.[ch] - Sysfs files. -ena_ethtool.c - ethtool callbacks. -ena_pci_id_tbl.h - Supported device IDs. - -Management Interface: -===================== -ENA management interface is exposed by means of: -- PCIe Configuration Space -- Device Registers -- Admin Queue (AQ) and Admin Completion Queue (ACQ) -- Asynchronous Event Notification Queue (AENQ) - -ENA device MMIO Registers are accessed only during driver -initialization and are not involved in further normal device -operation. - -AQ is used for submitting management commands, and the -results/responses are reported asynchronously through ACQ. - -ENA introduces a small set of management commands with room for -vendor-specific extensions. Most of the management operations are -framed in a generic Get/Set feature command. - -The following admin queue commands are supported: -- Create I/O submission queue -- Create I/O completion queue -- Destroy I/O submission queue -- Destroy I/O completion queue -- Get feature -- Set feature -- Configure AENQ -- Get statistics - -Refer to ena_admin_defs.h for the list of supported Get/Set Feature -properties. - -The Asynchronous Event Notification Queue (AENQ) is a uni-directional -queue used by the ENA device to send to the driver events that cannot -be reported using ACQ. AENQ events are subdivided into groups. Each -group may have multiple syndromes, as shown below - -The events are: - Group Syndrome - Link state change - X - - Fatal error - X - - Notification Suspend traffic - Notification Resume traffic - Keep-Alive - X - - -ACQ and AENQ share the same MSI-X vector. - -Keep-Alive is a special mechanism that allows monitoring of the -device's health. The driver maintains a watchdog (WD) handler which, -if fired, logs the current state and statistics then resets and -restarts the ENA device and driver. A Keep-Alive event is delivered by -the device every second. The driver re-arms the WD upon reception of a -Keep-Alive event. A missed Keep-Alive event causes the WD handler to -fire. - -Data Path Interface: -==================== -I/O operations are based on Tx and Rx Submission Queues (Tx SQ and Rx -SQ correspondingly). Each SQ has a completion queue (CQ) associated -with it. - -The SQs and CQs are implemented as descriptor rings in contiguous -physical memory. - -The ENA driver supports two Queue Operation modes for Tx SQs: -- Regular mode - * In this mode the Tx SQs reside in the host's memory. The ENA - device fetches the ENA Tx descriptors and packet data from host - memory. -- Low Latency Queue (LLQ) mode or "push-mode". - * In this mode the driver pushes the transmit descriptors and the - first 128 bytes of the packet directly to the ENA device memory - space. The rest of the packet payload is fetched by the - device. For this operation mode, the driver uses a dedicated PCI - device memory BAR, which is mapped with write-combine capability. - -The Rx SQs support only the regular mode. - -Note: Not all ENA devices support LLQ, and this feature is negotiated - with the device upon initialization. If the ENA device does not - support LLQ mode, the driver falls back to the regular mode. - -The driver supports multi-queue for both Tx and Rx. This has various -benefits: -- Reduced CPU/thread/process contention on a given Ethernet interface. -- Cache miss rate on completion is reduced, particularly for data - cache lines that hold the sk_buff structures. -- Increased process-level parallelism when handling received packets. -- Increased data cache hit rate, by steering kernel processing of - packets to the CPU, where the application thread consuming the - packet is running. -- In hardware interrupt re-direction. - -Interrupt Modes: -================ -The driver assigns a single MSI-X vector per queue pair (for both Tx -and Rx directions). The driver assigns an additional dedicated MSI-X vector -for management (for ACQ and AENQ). - -Management interrupt registration is performed when the Linux kernel -probes the adapter, and it is de-registered when the adapter is -removed. I/O queue interrupt registration is performed when the Linux -interface of the adapter is opened, and it is de-registered when the -interface is closed. - -The management interrupt is named: - ena-mgmnt@pci: -and for each queue pair, an interrupt is named: - -Tx-Rx- - -The ENA device operates in auto-mask and auto-clear interrupt -modes. That is, once MSI-X is delivered to the host, its Cause bit is -automatically cleared and the interrupt is masked. The interrupt is -unmasked by the driver after NAPI processing is complete. - -Interrupt Moderation: -===================== -ENA driver and device can operate in conventional or adaptive interrupt -moderation mode. - -In conventional mode the driver instructs device to postpone interrupt -posting according to static interrupt delay value. The interrupt delay -value can be configured through ethtool(8). The following ethtool -parameters are supported by the driver: tx-usecs, rx-usecs - -In adaptive interrupt moderation mode the interrupt delay value is -updated by the driver dynamically and adjusted every NAPI cycle -according to the traffic nature. - -By default ENA driver applies adaptive coalescing on Rx traffic and -conventional coalescing on Tx traffic. - -Adaptive coalescing can be switched on/off through ethtool(8) -adaptive_rx on|off parameter. - -The driver chooses interrupt delay value according to the number of -bytes and packets received between interrupt unmasking and interrupt -posting. The driver uses interrupt delay table that subdivides the -range of received bytes/packets into 5 levels and assigns interrupt -delay value to each level. - -The user can enable/disable adaptive moderation, modify the interrupt -delay table and restore its default values through sysfs. - -RX copybreak: -============= -The rx_copybreak is initialized by default to ENA_DEFAULT_RX_COPYBREAK -and can be configured by the ETHTOOL_STUNABLE command of the -SIOCETHTOOL ioctl. - -SKB: -==== -The driver-allocated SKB for frames received from Rx handling using -NAPI context. The allocation method depends on the size of the packet. -If the frame length is larger than rx_copybreak, napi_get_frags() -is used, otherwise netdev_alloc_skb_ip_align() is used, the buffer -content is copied (by CPU) to the SKB, and the buffer is recycled. - -Statistics: -=========== -The user can obtain ENA device and driver statistics using ethtool. -The driver can collect regular or extended statistics (including -per-queue stats) from the device. - -In addition the driver logs the stats to syslog upon device reset. - -MTU: -==== -The driver supports an arbitrarily large MTU with a maximum that is -negotiated with the device. The driver configures MTU using the -SetFeature command (ENA_ADMIN_MTU property). The user can change MTU -via ip(8) and similar legacy tools. - -Stateless Offloads: -=================== -The ENA driver supports: -- TSO over IPv4/IPv6 -- TSO with ECN -- IPv4 header checksum offload -- TCP/UDP over IPv4/IPv6 checksum offloads - -RSS: -==== -- The ENA device supports RSS that allows flexible Rx traffic - steering. -- Toeplitz and CRC32 hash functions are supported. -- Different combinations of L2/L3/L4 fields can be configured as - inputs for hash functions. -- The driver configures RSS settings using the AQ SetFeature command - (ENA_ADMIN_RSS_HASH_FUNCTION, ENA_ADMIN_RSS_HASH_INPUT and - ENA_ADMIN_RSS_REDIRECTION_TABLE_CONFIG properties). -- If the NETIF_F_RXHASH flag is set, the 32-bit result of the hash - function delivered in the Rx CQ descriptor is set in the received - SKB. -- The user can provide a hash key, hash function, and configure the - indirection table through ethtool(8). - -DATA PATH: -========== -Tx: ---- -end_start_xmit() is called by the stack. This function does the following: -- Maps data buffers (skb->data and frags). -- Populates ena_buf for the push buffer (if the driver and device are - in push mode.) -- Prepares ENA bufs for the remaining frags. -- Allocates a new request ID from the empty req_id ring. The request - ID is the index of the packet in the Tx info. This is used for - out-of-order TX completions. -- Adds the packet to the proper place in the Tx ring. -- Calls ena_com_prepare_tx(), an ENA communication layer that converts - the ena_bufs to ENA descriptors (and adds meta ENA descriptors as - needed.) - * This function also copies the ENA descriptors and the push buffer - to the Device memory space (if in push mode.) -- Writes doorbell to the ENA device. -- When the ENA device finishes sending the packet, a completion - interrupt is raised. -- The interrupt handler schedules NAPI. -- The ena_clean_tx_irq() function is called. This function handles the - completion descriptors generated by the ENA, with a single - completion descriptor per completed packet. - * req_id is retrieved from the completion descriptor. The tx_info of - the packet is retrieved via the req_id. The data buffers are - unmapped and req_id is returned to the empty req_id ring. - * The function stops when the completion descriptors are completed or - the budget is reached. - -Rx: ---- -- When a packet is received from the ENA device. -- The interrupt handler schedules NAPI. -- The ena_clean_rx_irq() function is called. This function calls - ena_rx_pkt(), an ENA communication layer function, which returns the - number of descriptors used for a new unhandled packet, and zero if - no new packet is found. -- Then it calls the ena_clean_rx_irq() function. -- ena_eth_rx_skb() checks packet length: - * If the packet is small (len < rx_copybreak), the driver allocates - a SKB for the new packet, and copies the packet payload into the - SKB data buffer. - - In this way the original data buffer is not passed to the stack - and is reused for future Rx packets. - * Otherwise the function unmaps the Rx buffer, then allocates the - new SKB structure and hooks the Rx buffer to the SKB frags. -- The new SKB is updated with the necessary information (protocol, - checksum hw verify result, etc.), and then passed to the network - stack, using the NAPI interface function napi_gro_receive(). diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index aaac502b81ea..019a0d2efe67 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -29,6 +29,7 @@ Contents: stmicro/stmmac 3com/3c509 3com/vortex + amazon/ena .. only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index eaea5f1994c9..7b6c13cc832f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -815,7 +815,7 @@ R: Saeed Bishara R: Zorik Machulsky L: netdev@vger.kernel.org S: Supported -F: Documentation/networking/device_drivers/amazon/ena.txt +F: Documentation/networking/device_drivers/amazon/ena.rst F: drivers/net/ethernet/amazon/ AMAZON RDMA EFA DRIVER -- cgit v1.2.3-59-g8ed1b From c958119a487ec4578f50b352f45e965a30daa020 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:37 +0200 Subject: docs: networking: device drivers: convert aquantia/atlantic.txt to ReST - add SPDX header; - use copyright symbol; - adjust title and its markup; - comment out text-only TOC from html/pdf output; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../device_drivers/aquantia/atlantic.rst | 556 +++++++++++++++++++++ .../device_drivers/aquantia/atlantic.txt | 479 ------------------ Documentation/networking/device_drivers/index.rst | 1 + MAINTAINERS | 2 +- 4 files changed, 558 insertions(+), 480 deletions(-) create mode 100644 Documentation/networking/device_drivers/aquantia/atlantic.rst delete mode 100644 Documentation/networking/device_drivers/aquantia/atlantic.txt diff --git a/Documentation/networking/device_drivers/aquantia/atlantic.rst b/Documentation/networking/device_drivers/aquantia/atlantic.rst new file mode 100644 index 000000000000..595ddef1c8b3 --- /dev/null +++ b/Documentation/networking/device_drivers/aquantia/atlantic.rst @@ -0,0 +1,556 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=============================== +Marvell(Aquantia) AQtion Driver +=============================== + +For the aQuantia Multi-Gigabit PCI Express Family of Ethernet Adapters + +.. Contents + + - Identifying Your Adapter + - Configuration + - Supported ethtool options + - Command Line Parameters + - Config file parameters + - Support + - License + +Identifying Your Adapter +======================== + +The driver in this release is compatible with AQC-100, AQC-107, AQC-108 +based ethernet adapters. + + +SFP+ Devices (for AQC-100 based adapters) +----------------------------------------- + +This release tested with passive Direct Attach Cables (DAC) and SFP+/LC +Optical Transceiver. + +Configuration +============= + +Viewing Link Messages +--------------------- + Link messages will not be displayed to the console if the distribution is + restricting system messages. In order to see network driver link messages on + your console, set dmesg to eight by entering the following:: + + dmesg -n 8 + + .. note:: + + This setting is not saved across reboots. + +Jumbo Frames +------------ + The driver supports Jumbo Frames for all adapters. Jumbo Frames support is + enabled by changing the MTU to a value larger than the default of 1500. + The maximum value for the MTU is 16000. Use the `ip` command to + increase the MTU size. For example:: + + ip link set mtu 16000 dev enp1s0 + +ethtool +------- + The driver utilizes the ethtool interface for driver configuration and + diagnostics, as well as displaying statistical information. The latest + ethtool version is required for this functionality. + +NAPI +---- + NAPI (Rx polling mode) is supported in the atlantic driver. + +Supported ethtool options +========================= + +Viewing adapter settings +------------------------ + + :: + + ethtool + + Output example:: + + Settings for enp1s0: + Supported ports: [ TP ] + Supported link modes: 100baseT/Full + 1000baseT/Full + 10000baseT/Full + 2500baseT/Full + 5000baseT/Full + Supported pause frame use: Symmetric + Supports auto-negotiation: Yes + Supported FEC modes: Not reported + Advertised link modes: 100baseT/Full + 1000baseT/Full + 10000baseT/Full + 2500baseT/Full + 5000baseT/Full + Advertised pause frame use: Symmetric + Advertised auto-negotiation: Yes + Advertised FEC modes: Not reported + Speed: 10000Mb/s + Duplex: Full + Port: Twisted Pair + PHYAD: 0 + Transceiver: internal + Auto-negotiation: on + MDI-X: Unknown + Supports Wake-on: g + Wake-on: d + Link detected: yes + + + .. note:: + + AQrate speeds (2.5/5 Gb/s) will be displayed only with linux kernels > 4.10. + But you can still use these speeds:: + + ethtool -s eth0 autoneg off speed 2500 + +Viewing adapter information +--------------------------- + + :: + + ethtool -i + + Output example:: + + driver: atlantic + version: 5.2.0-050200rc5-generic-kern + firmware-version: 3.1.78 + expansion-rom-version: + bus-info: 0000:01:00.0 + supports-statistics: yes + supports-test: no + supports-eeprom-access: no + supports-register-dump: yes + supports-priv-flags: no + + +Viewing Ethernet adapter statistics +----------------------------------- + + :: + + ethtool -S + + Output example:: + + NIC statistics: + InPackets: 13238607 + InUCast: 13293852 + InMCast: 52 + InBCast: 3 + InErrors: 0 + OutPackets: 23703019 + OutUCast: 23704941 + OutMCast: 67 + OutBCast: 11 + InUCastOctects: 213182760 + OutUCastOctects: 22698443 + InMCastOctects: 6600 + OutMCastOctects: 8776 + InBCastOctects: 192 + OutBCastOctects: 704 + InOctects: 2131839552 + OutOctects: 226938073 + InPacketsDma: 95532300 + OutPacketsDma: 59503397 + InOctetsDma: 1137102462 + OutOctetsDma: 2394339518 + InDroppedDma: 0 + Queue[0] InPackets: 23567131 + Queue[0] OutPackets: 20070028 + Queue[0] InJumboPackets: 0 + Queue[0] InLroPackets: 0 + Queue[0] InErrors: 0 + Queue[1] InPackets: 45428967 + Queue[1] OutPackets: 11306178 + Queue[1] InJumboPackets: 0 + Queue[1] InLroPackets: 0 + Queue[1] InErrors: 0 + Queue[2] InPackets: 3187011 + Queue[2] OutPackets: 13080381 + Queue[2] InJumboPackets: 0 + Queue[2] InLroPackets: 0 + Queue[2] InErrors: 0 + Queue[3] InPackets: 23349136 + Queue[3] OutPackets: 15046810 + Queue[3] InJumboPackets: 0 + Queue[3] InLroPackets: 0 + Queue[3] InErrors: 0 + +Interrupt coalescing support +---------------------------- + + ITR mode, TX/RX coalescing timings could be viewed with:: + + ethtool -c + + and changed with:: + + ethtool -C tx-usecs rx-usecs + + To disable coalescing:: + + ethtool -C tx-usecs 0 rx-usecs 0 tx-max-frames 1 tx-max-frames 1 + +Wake on LAN support +------------------- + + WOL support by magic packet:: + + ethtool -s wol g + + To disable WOL:: + + ethtool -s wol d + +Set and check the driver message level +-------------------------------------- + + Set message level + + :: + + ethtool -s msglvl + + Level values: + + ====== ============================= + 0x0001 general driver status. + 0x0002 hardware probing. + 0x0004 link state. + 0x0008 periodic status check. + 0x0010 interface being brought down. + 0x0020 interface being brought up. + 0x0040 receive error. + 0x0080 transmit error. + 0x0200 interrupt handling. + 0x0400 transmit completion. + 0x0800 receive completion. + 0x1000 packet contents. + 0x2000 hardware status. + 0x4000 Wake-on-LAN status. + ====== ============================= + + By default, the level of debugging messages is set 0x0001(general driver status). + + Check message level + + :: + + ethtool | grep "Current message level" + + If you want to disable the output of messages:: + + ethtool -s msglvl 0 + +RX flow rules (ntuple filters) +------------------------------ + + There are separate rules supported, that applies in that order: + + 1. 16 VLAN ID rules + 2. 16 L2 EtherType rules + 3. 8 L3/L4 5-Tuple rules + + + The driver utilizes the ethtool interface for configuring ntuple filters, + via ``ethtool -N ``. + + To enable or disable the RX flow rules:: + + ethtool -K ethX ntuple + + When disabling ntuple filters, all the user programed filters are + flushed from the driver cache and hardware. All needed filters must + be re-added when ntuple is re-enabled. + + Because of the fixed order of the rules, the location of filters is also fixed: + + - Locations 0 - 15 for VLAN ID filters + - Locations 16 - 31 for L2 EtherType filters + - Locations 32 - 39 for L3/L4 5-tuple filters (locations 32, 36 for IPv6) + + The L3/L4 5-tuple (protocol, source and destination IP address, source and + destination TCP/UDP/SCTP port) is compared against 8 filters. For IPv4, up to + 8 source and destination addresses can be matched. For IPv6, up to 2 pairs of + addresses can be supported. Source and destination ports are only compared for + TCP/UDP/SCTP packets. + + To add a filter that directs packet to queue 5, use + ``<-N|-U|--config-nfc|--config-ntuple>`` switch:: + + ethtool -N flow-type udp4 src-ip 10.0.0.1 dst-ip 10.0.0.2 src-port 2000 dst-port 2001 action 5 + + - action is the queue number. + - loc is the rule number. + + For ``flow-type ip4|udp4|tcp4|sctp4|ip6|udp6|tcp6|sctp6`` you must set the loc + number within 32 - 39. + For ``flow-type ip4|udp4|tcp4|sctp4|ip6|udp6|tcp6|sctp6`` you can set 8 rules + for traffic IPv4 or you can set 2 rules for traffic IPv6. Loc number traffic + IPv6 is 32 and 36. + At the moment you can not use IPv4 and IPv6 filters at the same time. + + Example filter for IPv6 filter traffic:: + + sudo ethtool -N flow-type tcp6 src-ip 2001:db8:0:f101::1 dst-ip 2001:db8:0:f101::2 action 1 loc 32 + sudo ethtool -N flow-type ip6 src-ip 2001:db8:0:f101::2 dst-ip 2001:db8:0:f101::5 action -1 loc 36 + + Example filter for IPv4 filter traffic:: + + sudo ethtool -N flow-type udp4 src-ip 10.0.0.4 dst-ip 10.0.0.7 src-port 2000 dst-port 2001 loc 32 + sudo ethtool -N flow-type tcp4 src-ip 10.0.0.3 dst-ip 10.0.0.9 src-port 2000 dst-port 2001 loc 33 + sudo ethtool -N flow-type ip4 src-ip 10.0.0.6 dst-ip 10.0.0.4 loc 34 + + If you set action -1, then all traffic corresponding to the filter will be discarded. + + The maximum value action is 31. + + + The VLAN filter (VLAN id) is compared against 16 filters. + VLAN id must be accompanied by mask 0xF000. That is to distinguish VLAN filter + from L2 Ethertype filter with UserPriority since both User Priority and VLAN ID + are passed in the same 'vlan' parameter. + + To add a filter that directs packets from VLAN 2001 to queue 5:: + + ethtool -N flow-type ip4 vlan 2001 m 0xF000 action 1 loc 0 + + + L2 EtherType filters allows filter packet by EtherType field or both EtherType + and User Priority (PCP) field of 802.1Q. + UserPriority (vlan) parameter must be accompanied by mask 0x1FFF. That is to + distinguish VLAN filter from L2 Ethertype filter with UserPriority since both + User Priority and VLAN ID are passed in the same 'vlan' parameter. + + To add a filter that directs IP4 packess of priority 3 to queue 3:: + + ethtool -N flow-type ether proto 0x800 vlan 0x600 m 0x1FFF action 3 loc 16 + + To see the list of filters currently present:: + + ethtool <-u|-n|--show-nfc|--show-ntuple> + + Rules may be deleted from the table itself. This is done using:: + + sudo ethtool <-N|-U|--config-nfc|--config-ntuple> delete + + - loc is the rule number to be deleted. + + Rx filters is an interface to load the filter table that funnels all flow + into queue 0 unless an alternative queue is specified using "action". In that + case, any flow that matches the filter criteria will be directed to the + appropriate queue. RX filters is supported on all kernels 2.6.30 and later. + +RSS for UDP +----------- + + Currently, NIC does not support RSS for fragmented IP packets, which leads to + incorrect working of RSS for fragmented UDP traffic. To disable RSS for UDP the + RX Flow L3/L4 rule may be used. + + Example:: + + ethtool -N eth0 flow-type udp4 action 0 loc 32 + +UDP GSO hardware offload +------------------------ + + UDP GSO allows to boost UDP tx rates by offloading UDP headers allocation + into hardware. A special userspace socket option is required for this, + could be validated with /kernel/tools/testing/selftests/net/:: + + udpgso_bench_tx -u -4 -D 10.0.1.1 -s 6300 -S 100 + + Will cause sending out of 100 byte sized UDP packets formed from single + 6300 bytes user buffer. + + UDP GSO is configured by:: + + ethtool -K eth0 tx-udp-segmentation on + +Private flags (testing) +----------------------- + + Atlantic driver supports private flags for hardware custom features:: + + $ ethtool --show-priv-flags ethX + + Private flags for ethX: + DMASystemLoopback : off + PKTSystemLoopback : off + DMANetworkLoopback : off + PHYInternalLoopback: off + PHYExternalLoopback: off + + Example:: + + $ ethtool --set-priv-flags ethX DMASystemLoopback on + + DMASystemLoopback: DMA Host loopback. + PKTSystemLoopback: Packet buffer host loopback. + DMANetworkLoopback: Network side loopback on DMA block. + PHYInternalLoopback: Internal loopback on Phy. + PHYExternalLoopback: External loopback on Phy (with loopback ethernet cable). + + +Command Line Parameters +======================= +The following command line parameters are available on atlantic driver: + +aq_itr -Interrupt throttling mode +--------------------------------- +Accepted values: 0, 1, 0xFFFF + +Default value: 0xFFFF + +====== ============================================================== +0 Disable interrupt throttling. +1 Enable interrupt throttling and use specified tx and rx rates. +0xFFFF Auto throttling mode. Driver will choose the best RX and TX + interrupt throtting settings based on link speed. +====== ============================================================== + +aq_itr_tx - TX interrupt throttle rate +-------------------------------------- + +Accepted values: 0 - 0x1FF + +Default value: 0 + +TX side throttling in microseconds. Adapter will setup maximum interrupt delay +to this value. Minimum interrupt delay will be a half of this value + +aq_itr_rx - RX interrupt throttle rate +-------------------------------------- + +Accepted values: 0 - 0x1FF + +Default value: 0 + +RX side throttling in microseconds. Adapter will setup maximum interrupt delay +to this value. Minimum interrupt delay will be a half of this value + +.. note:: + + ITR settings could be changed in runtime by ethtool -c means (see below) + +Config file parameters +====================== + +For some fine tuning and performance optimizations, +some parameters can be changed in the {source_dir}/aq_cfg.h file. + +AQ_CFG_RX_PAGEORDER +------------------- + +Default value: 0 + +RX page order override. Thats a power of 2 number of RX pages allocated for +each descriptor. Received descriptor size is still limited by +AQ_CFG_RX_FRAME_MAX. + +Increasing pageorder makes page reuse better (actual on iommu enabled systems). + +AQ_CFG_RX_REFILL_THRES +---------------------- + +Default value: 32 + +RX refill threshold. RX path will not refill freed descriptors until the +specified number of free descriptors is observed. Larger values may help +better page reuse but may lead to packet drops as well. + +AQ_CFG_VECS_DEF +--------------- + +Number of queues + +Valid Range: 0 - 8 (up to AQ_CFG_VECS_MAX) + +Default value: 8 + +Notice this value will be capped by the number of cores available on the system. + +AQ_CFG_IS_RSS_DEF +----------------- + +Enable/disable Receive Side Scaling + +This feature allows the adapter to distribute receive processing +across multiple CPU-cores and to prevent from overloading a single CPU core. + +Valid values + +== ======== +0 disabled +1 enabled +== ======== + +Default value: 1 + +AQ_CFG_NUM_RSS_QUEUES_DEF +------------------------- + +Number of queues for Receive Side Scaling + +Valid Range: 0 - 8 (up to AQ_CFG_VECS_DEF) + +Default value: AQ_CFG_VECS_DEF + +AQ_CFG_IS_LRO_DEF +----------------- + +Enable/disable Large Receive Offload + +This offload enables the adapter to coalesce multiple TCP segments and indicate +them as a single coalesced unit to the OS networking subsystem. + +The system consumes less energy but it also introduces more latency in packets +processing. + +Valid values + +== ======== +0 disabled +1 enabled +== ======== + +Default value: 1 + +AQ_CFG_TX_CLEAN_BUDGET +---------------------- + +Maximum descriptors to cleanup on TX at once. + +Default value: 256 + +After the aq_cfg.h file changed the driver must be rebuilt to take effect. + +Support +======= + +If an issue is identified with the released source code on the supported +kernel with a supported adapter, email the specific information related +to the issue to aqn_support@marvell.com + +License +======= + +aQuantia Corporation Network Driver + +Copyright |copy| 2014 - 2019 aQuantia Corporation. + +This program is free software; you can redistribute it and/or modify it +under the terms and conditions of the GNU General Public License, +version 2, as published by the Free Software Foundation. diff --git a/Documentation/networking/device_drivers/aquantia/atlantic.txt b/Documentation/networking/device_drivers/aquantia/atlantic.txt deleted file mode 100644 index 2013fcedc2da..000000000000 --- a/Documentation/networking/device_drivers/aquantia/atlantic.txt +++ /dev/null @@ -1,479 +0,0 @@ -Marvell(Aquantia) AQtion Driver for the aQuantia Multi-Gigabit PCI Express -Family of Ethernet Adapters -============================================================================= - -Contents -======== - -- Identifying Your Adapter -- Configuration -- Supported ethtool options -- Command Line Parameters -- Config file parameters -- Support -- License - -Identifying Your Adapter -======================== - -The driver in this release is compatible with AQC-100, AQC-107, AQC-108 based ethernet adapters. - - -SFP+ Devices (for AQC-100 based adapters) ----------------------------------- - -This release tested with passive Direct Attach Cables (DAC) and SFP+/LC Optical Transceiver. - -Configuration -========================= - Viewing Link Messages - --------------------- - Link messages will not be displayed to the console if the distribution is - restricting system messages. In order to see network driver link messages on - your console, set dmesg to eight by entering the following: - - dmesg -n 8 - - NOTE: This setting is not saved across reboots. - - Jumbo Frames - ------------ - The driver supports Jumbo Frames for all adapters. Jumbo Frames support is - enabled by changing the MTU to a value larger than the default of 1500. - The maximum value for the MTU is 16000. Use the `ip` command to - increase the MTU size. For example: - - ip link set mtu 16000 dev enp1s0 - - ethtool - ------- - The driver utilizes the ethtool interface for driver configuration and - diagnostics, as well as displaying statistical information. The latest - ethtool version is required for this functionality. - - NAPI - ---- - NAPI (Rx polling mode) is supported in the atlantic driver. - -Supported ethtool options -============================ - Viewing adapter settings - --------------------- - ethtool - - Output example: - - Settings for enp1s0: - Supported ports: [ TP ] - Supported link modes: 100baseT/Full - 1000baseT/Full - 10000baseT/Full - 2500baseT/Full - 5000baseT/Full - Supported pause frame use: Symmetric - Supports auto-negotiation: Yes - Supported FEC modes: Not reported - Advertised link modes: 100baseT/Full - 1000baseT/Full - 10000baseT/Full - 2500baseT/Full - 5000baseT/Full - Advertised pause frame use: Symmetric - Advertised auto-negotiation: Yes - Advertised FEC modes: Not reported - Speed: 10000Mb/s - Duplex: Full - Port: Twisted Pair - PHYAD: 0 - Transceiver: internal - Auto-negotiation: on - MDI-X: Unknown - Supports Wake-on: g - Wake-on: d - Link detected: yes - - --- - Note: AQrate speeds (2.5/5 Gb/s) will be displayed only with linux kernels > 4.10. - But you can still use these speeds: - ethtool -s eth0 autoneg off speed 2500 - - Viewing adapter information - --------------------- - ethtool -i - - Output example: - - driver: atlantic - version: 5.2.0-050200rc5-generic-kern - firmware-version: 3.1.78 - expansion-rom-version: - bus-info: 0000:01:00.0 - supports-statistics: yes - supports-test: no - supports-eeprom-access: no - supports-register-dump: yes - supports-priv-flags: no - - - Viewing Ethernet adapter statistics: - --------------------- - ethtool -S - - Output example: - NIC statistics: - InPackets: 13238607 - InUCast: 13293852 - InMCast: 52 - InBCast: 3 - InErrors: 0 - OutPackets: 23703019 - OutUCast: 23704941 - OutMCast: 67 - OutBCast: 11 - InUCastOctects: 213182760 - OutUCastOctects: 22698443 - InMCastOctects: 6600 - OutMCastOctects: 8776 - InBCastOctects: 192 - OutBCastOctects: 704 - InOctects: 2131839552 - OutOctects: 226938073 - InPacketsDma: 95532300 - OutPacketsDma: 59503397 - InOctetsDma: 1137102462 - OutOctetsDma: 2394339518 - InDroppedDma: 0 - Queue[0] InPackets: 23567131 - Queue[0] OutPackets: 20070028 - Queue[0] InJumboPackets: 0 - Queue[0] InLroPackets: 0 - Queue[0] InErrors: 0 - Queue[1] InPackets: 45428967 - Queue[1] OutPackets: 11306178 - Queue[1] InJumboPackets: 0 - Queue[1] InLroPackets: 0 - Queue[1] InErrors: 0 - Queue[2] InPackets: 3187011 - Queue[2] OutPackets: 13080381 - Queue[2] InJumboPackets: 0 - Queue[2] InLroPackets: 0 - Queue[2] InErrors: 0 - Queue[3] InPackets: 23349136 - Queue[3] OutPackets: 15046810 - Queue[3] InJumboPackets: 0 - Queue[3] InLroPackets: 0 - Queue[3] InErrors: 0 - - Interrupt coalescing support - --------------------------------- - ITR mode, TX/RX coalescing timings could be viewed with: - - ethtool -c - - and changed with: - - ethtool -C tx-usecs rx-usecs - - To disable coalescing: - - ethtool -C tx-usecs 0 rx-usecs 0 tx-max-frames 1 tx-max-frames 1 - - Wake on LAN support - --------------------------------- - - WOL support by magic packet: - - ethtool -s wol g - - To disable WOL: - - ethtool -s wol d - - Set and check the driver message level - --------------------------------- - - Set message level - - ethtool -s msglvl - - Level values: - - 0x0001 - general driver status. - 0x0002 - hardware probing. - 0x0004 - link state. - 0x0008 - periodic status check. - 0x0010 - interface being brought down. - 0x0020 - interface being brought up. - 0x0040 - receive error. - 0x0080 - transmit error. - 0x0200 - interrupt handling. - 0x0400 - transmit completion. - 0x0800 - receive completion. - 0x1000 - packet contents. - 0x2000 - hardware status. - 0x4000 - Wake-on-LAN status. - - By default, the level of debugging messages is set 0x0001(general driver status). - - Check message level - - ethtool | grep "Current message level" - - If you want to disable the output of messages - - ethtool -s msglvl 0 - - RX flow rules (ntuple filters) - --------------------------------- - There are separate rules supported, that applies in that order: - 1. 16 VLAN ID rules - 2. 16 L2 EtherType rules - 3. 8 L3/L4 5-Tuple rules - - - The driver utilizes the ethtool interface for configuring ntuple filters, - via "ethtool -N ". - - To enable or disable the RX flow rules: - - ethtool -K ethX ntuple - - When disabling ntuple filters, all the user programed filters are - flushed from the driver cache and hardware. All needed filters must - be re-added when ntuple is re-enabled. - - Because of the fixed order of the rules, the location of filters is also fixed: - - Locations 0 - 15 for VLAN ID filters - - Locations 16 - 31 for L2 EtherType filters - - Locations 32 - 39 for L3/L4 5-tuple filters (locations 32, 36 for IPv6) - - The L3/L4 5-tuple (protocol, source and destination IP address, source and - destination TCP/UDP/SCTP port) is compared against 8 filters. For IPv4, up to - 8 source and destination addresses can be matched. For IPv6, up to 2 pairs of - addresses can be supported. Source and destination ports are only compared for - TCP/UDP/SCTP packets. - - To add a filter that directs packet to queue 5, use <-N|-U|--config-nfc|--config-ntuple> switch: - - ethtool -N flow-type udp4 src-ip 10.0.0.1 dst-ip 10.0.0.2 src-port 2000 dst-port 2001 action 5 - - - action is the queue number. - - loc is the rule number. - - For "flow-type ip4|udp4|tcp4|sctp4|ip6|udp6|tcp6|sctp6" you must set the loc - number within 32 - 39. - For "flow-type ip4|udp4|tcp4|sctp4|ip6|udp6|tcp6|sctp6" you can set 8 rules - for traffic IPv4 or you can set 2 rules for traffic IPv6. Loc number traffic - IPv6 is 32 and 36. - At the moment you can not use IPv4 and IPv6 filters at the same time. - - Example filter for IPv6 filter traffic: - - sudo ethtool -N flow-type tcp6 src-ip 2001:db8:0:f101::1 dst-ip 2001:db8:0:f101::2 action 1 loc 32 - sudo ethtool -N flow-type ip6 src-ip 2001:db8:0:f101::2 dst-ip 2001:db8:0:f101::5 action -1 loc 36 - - Example filter for IPv4 filter traffic: - - sudo ethtool -N flow-type udp4 src-ip 10.0.0.4 dst-ip 10.0.0.7 src-port 2000 dst-port 2001 loc 32 - sudo ethtool -N flow-type tcp4 src-ip 10.0.0.3 dst-ip 10.0.0.9 src-port 2000 dst-port 2001 loc 33 - sudo ethtool -N flow-type ip4 src-ip 10.0.0.6 dst-ip 10.0.0.4 loc 34 - - If you set action -1, then all traffic corresponding to the filter will be discarded. - The maximum value action is 31. - - - The VLAN filter (VLAN id) is compared against 16 filters. - VLAN id must be accompanied by mask 0xF000. That is to distinguish VLAN filter - from L2 Ethertype filter with UserPriority since both User Priority and VLAN ID - are passed in the same 'vlan' parameter. - - To add a filter that directs packets from VLAN 2001 to queue 5: - ethtool -N flow-type ip4 vlan 2001 m 0xF000 action 1 loc 0 - - - L2 EtherType filters allows filter packet by EtherType field or both EtherType - and User Priority (PCP) field of 802.1Q. - UserPriority (vlan) parameter must be accompanied by mask 0x1FFF. That is to - distinguish VLAN filter from L2 Ethertype filter with UserPriority since both - User Priority and VLAN ID are passed in the same 'vlan' parameter. - - To add a filter that directs IP4 packess of priority 3 to queue 3: - ethtool -N flow-type ether proto 0x800 vlan 0x600 m 0x1FFF action 3 loc 16 - - - To see the list of filters currently present: - - ethtool <-u|-n|--show-nfc|--show-ntuple> - - Rules may be deleted from the table itself. This is done using: - - sudo ethtool <-N|-U|--config-nfc|--config-ntuple> delete - - - loc is the rule number to be deleted. - - Rx filters is an interface to load the filter table that funnels all flow - into queue 0 unless an alternative queue is specified using "action". In that - case, any flow that matches the filter criteria will be directed to the - appropriate queue. RX filters is supported on all kernels 2.6.30 and later. - - RSS for UDP - --------------------------------- - Currently, NIC does not support RSS for fragmented IP packets, which leads to - incorrect working of RSS for fragmented UDP traffic. To disable RSS for UDP the - RX Flow L3/L4 rule may be used. - - Example: - ethtool -N eth0 flow-type udp4 action 0 loc 32 - - UDP GSO hardware offload - --------------------------------- - UDP GSO allows to boost UDP tx rates by offloading UDP headers allocation - into hardware. A special userspace socket option is required for this, - could be validated with /kernel/tools/testing/selftests/net/ - - udpgso_bench_tx -u -4 -D 10.0.1.1 -s 6300 -S 100 - - Will cause sending out of 100 byte sized UDP packets formed from single - 6300 bytes user buffer. - - UDP GSO is configured by: - - ethtool -K eth0 tx-udp-segmentation on - - Private flags (testing) - --------------------------------- - - Atlantic driver supports private flags for hardware custom features: - - $ ethtool --show-priv-flags ethX - - Private flags for ethX: - DMASystemLoopback : off - PKTSystemLoopback : off - DMANetworkLoopback : off - PHYInternalLoopback: off - PHYExternalLoopback: off - - Example: - - $ ethtool --set-priv-flags ethX DMASystemLoopback on - - DMASystemLoopback: DMA Host loopback. - PKTSystemLoopback: Packet buffer host loopback. - DMANetworkLoopback: Network side loopback on DMA block. - PHYInternalLoopback: Internal loopback on Phy. - PHYExternalLoopback: External loopback on Phy (with loopback ethernet cable). - - -Command Line Parameters -======================= -The following command line parameters are available on atlantic driver: - -aq_itr -Interrupt throttling mode ----------------------------------------- -Accepted values: 0, 1, 0xFFFF -Default value: 0xFFFF -0 - Disable interrupt throttling. -1 - Enable interrupt throttling and use specified tx and rx rates. -0xFFFF - Auto throttling mode. Driver will choose the best RX and TX - interrupt throtting settings based on link speed. - -aq_itr_tx - TX interrupt throttle rate ----------------------------------------- -Accepted values: 0 - 0x1FF -Default value: 0 -TX side throttling in microseconds. Adapter will setup maximum interrupt delay -to this value. Minimum interrupt delay will be a half of this value - -aq_itr_rx - RX interrupt throttle rate ----------------------------------------- -Accepted values: 0 - 0x1FF -Default value: 0 -RX side throttling in microseconds. Adapter will setup maximum interrupt delay -to this value. Minimum interrupt delay will be a half of this value - -Note: ITR settings could be changed in runtime by ethtool -c means (see below) - -Config file parameters -======================= -For some fine tuning and performance optimizations, -some parameters can be changed in the {source_dir}/aq_cfg.h file. - -AQ_CFG_RX_PAGEORDER ----------------------------------------- -Default value: 0 -RX page order override. Thats a power of 2 number of RX pages allocated for -each descriptor. Received descriptor size is still limited by AQ_CFG_RX_FRAME_MAX. -Increasing pageorder makes page reuse better (actual on iommu enabled systems). - -AQ_CFG_RX_REFILL_THRES ----------------------------------------- -Default value: 32 -RX refill threshold. RX path will not refill freed descriptors until the -specified number of free descriptors is observed. Larger values may help -better page reuse but may lead to packet drops as well. - -AQ_CFG_VECS_DEF ------------------------------------------------------------- -Number of queues -Valid Range: 0 - 8 (up to AQ_CFG_VECS_MAX) -Default value: 8 -Notice this value will be capped by the number of cores available on the system. - -AQ_CFG_IS_RSS_DEF ------------------------------------------------------------- -Enable/disable Receive Side Scaling - -This feature allows the adapter to distribute receive processing -across multiple CPU-cores and to prevent from overloading a single CPU core. - -Valid values -0 - disabled -1 - enabled - -Default value: 1 - -AQ_CFG_NUM_RSS_QUEUES_DEF ------------------------------------------------------------- -Number of queues for Receive Side Scaling -Valid Range: 0 - 8 (up to AQ_CFG_VECS_DEF) - -Default value: AQ_CFG_VECS_DEF - -AQ_CFG_IS_LRO_DEF ------------------------------------------------------------- -Enable/disable Large Receive Offload - -This offload enables the adapter to coalesce multiple TCP segments and indicate -them as a single coalesced unit to the OS networking subsystem. -The system consumes less energy but it also introduces more latency in packets processing. - -Valid values -0 - disabled -1 - enabled - -Default value: 1 - -AQ_CFG_TX_CLEAN_BUDGET ----------------------------------------- -Maximum descriptors to cleanup on TX at once. -Default value: 256 - -After the aq_cfg.h file changed the driver must be rebuilt to take effect. - -Support -======= - -If an issue is identified with the released source code on the supported -kernel with a supported adapter, email the specific information related -to the issue to aqn_support@marvell.com - -License -======= - -aQuantia Corporation Network Driver -Copyright(c) 2014 - 2019 aQuantia Corporation. - -This program is free software; you can redistribute it and/or modify it -under the terms and conditions of the GNU General Public License, -version 2, as published by the Free Software Foundation. diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 019a0d2efe67..7dde314fc957 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -30,6 +30,7 @@ Contents: 3com/3c509 3com/vortex amazon/ena + aquantia/atlantic .. only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index 7b6c13cc832f..b5cfee17635e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1275,7 +1275,7 @@ L: netdev@vger.kernel.org S: Supported W: https://www.marvell.com/ Q: http://patchwork.ozlabs.org/project/netdev/list/ -F: Documentation/networking/device_drivers/aquantia/atlantic.txt +F: Documentation/networking/device_drivers/aquantia/atlantic.rst F: drivers/net/ethernet/aquantia/atlantic/ AQUANTIA ETHERNET DRIVER PTP SUBSYSTEM -- cgit v1.2.3-59-g8ed1b From c839ce557b35de084d06f91c4e37948bdcef9709 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:38 +0200 Subject: docs: networking: device drivers: convert chelsio/cxgb.txt to ReST - add SPDX header; - use copyright symbol; - adjust titles and chapters, adding proper markups; - comment out text-only TOC from html/pdf output; - mark code blocks and literals as such; - add notes markups; - mark tables as such; - mark lists as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/chelsio/cxgb.rst | 393 +++++++++++++++++++++ .../networking/device_drivers/chelsio/cxgb.txt | 352 ------------------ Documentation/networking/device_drivers/index.rst | 1 + drivers/net/ethernet/chelsio/Kconfig | 2 +- 4 files changed, 395 insertions(+), 353 deletions(-) create mode 100644 Documentation/networking/device_drivers/chelsio/cxgb.rst delete mode 100644 Documentation/networking/device_drivers/chelsio/cxgb.txt diff --git a/Documentation/networking/device_drivers/chelsio/cxgb.rst b/Documentation/networking/device_drivers/chelsio/cxgb.rst new file mode 100644 index 000000000000..435dce5fa2c7 --- /dev/null +++ b/Documentation/networking/device_drivers/chelsio/cxgb.rst @@ -0,0 +1,393 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +============================================= +Chelsio N210 10Gb Ethernet Network Controller +============================================= + +Driver Release Notes for Linux + +Version 2.1.1 + +June 20, 2005 + +.. Contents + + INTRODUCTION + FEATURES + PERFORMANCE + DRIVER MESSAGES + KNOWN ISSUES + SUPPORT + + +Introduction +============ + + This document describes the Linux driver for Chelsio 10Gb Ethernet Network + Controller. This driver supports the Chelsio N210 NIC and is backward + compatible with the Chelsio N110 model 10Gb NICs. + + +Features +======== + +Adaptive Interrupts (adaptive-rx) +--------------------------------- + + This feature provides an adaptive algorithm that adjusts the interrupt + coalescing parameters, allowing the driver to dynamically adapt the latency + settings to achieve the highest performance during various types of network + load. + + The interface used to control this feature is ethtool. Please see the + ethtool manpage for additional usage information. + + By default, adaptive-rx is disabled. + To enable adaptive-rx:: + + ethtool -C adaptive-rx on + + To disable adaptive-rx, use ethtool:: + + ethtool -C adaptive-rx off + + After disabling adaptive-rx, the timer latency value will be set to 50us. + You may set the timer latency after disabling adaptive-rx:: + + ethtool -C rx-usecs + + An example to set the timer latency value to 100us on eth0:: + + ethtool -C eth0 rx-usecs 100 + + You may also provide a timer latency value while disabling adaptive-rx:: + + ethtool -C adaptive-rx off rx-usecs + + If adaptive-rx is disabled and a timer latency value is specified, the timer + will be set to the specified value until changed by the user or until + adaptive-rx is enabled. + + To view the status of the adaptive-rx and timer latency values:: + + ethtool -c + + +TCP Segmentation Offloading (TSO) Support +----------------------------------------- + + This feature, also known as "large send", enables a system's protocol stack + to offload portions of outbound TCP processing to a network interface card + thereby reducing system CPU utilization and enhancing performance. + + The interface used to control this feature is ethtool version 1.8 or higher. + Please see the ethtool manpage for additional usage information. + + By default, TSO is enabled. + To disable TSO:: + + ethtool -K tso off + + To enable TSO:: + + ethtool -K tso on + + To view the status of TSO:: + + ethtool -k + + +Performance +=========== + + The following information is provided as an example of how to change system + parameters for "performance tuning" an what value to use. You may or may not + want to change these system parameters, depending on your server/workstation + application. Doing so is not warranted in any way by Chelsio Communications, + and is done at "YOUR OWN RISK". Chelsio will not be held responsible for loss + of data or damage to equipment. + + Your distribution may have a different way of doing things, or you may prefer + a different method. These commands are shown only to provide an example of + what to do and are by no means definitive. + + Making any of the following system changes will only last until you reboot + your system. You may want to write a script that runs at boot-up which + includes the optimal settings for your system. + + Setting PCI Latency Timer:: + + setpci -d 1425:: + +* 0x0c.l=0x0000F800 + + Disabling TCP timestamp:: + + sysctl -w net.ipv4.tcp_timestamps=0 + + Disabling SACK:: + + sysctl -w net.ipv4.tcp_sack=0 + + Setting large number of incoming connection requests:: + + sysctl -w net.ipv4.tcp_max_syn_backlog=3000 + + Setting maximum receive socket buffer size:: + + sysctl -w net.core.rmem_max=1024000 + + Setting maximum send socket buffer size:: + + sysctl -w net.core.wmem_max=1024000 + + Set smp_affinity (on a multiprocessor system) to a single CPU:: + + echo 1 > /proc/irq//smp_affinity + + Setting default receive socket buffer size:: + + sysctl -w net.core.rmem_default=524287 + + Setting default send socket buffer size:: + + sysctl -w net.core.wmem_default=524287 + + Setting maximum option memory buffers:: + + sysctl -w net.core.optmem_max=524287 + + Setting maximum backlog (# of unprocessed packets before kernel drops):: + + sysctl -w net.core.netdev_max_backlog=300000 + + Setting TCP read buffers (min/default/max):: + + sysctl -w net.ipv4.tcp_rmem="10000000 10000000 10000000" + + Setting TCP write buffers (min/pressure/max):: + + sysctl -w net.ipv4.tcp_wmem="10000000 10000000 10000000" + + Setting TCP buffer space (min/pressure/max):: + + sysctl -w net.ipv4.tcp_mem="10000000 10000000 10000000" + + TCP window size for single connections: + + The receive buffer (RX_WINDOW) size must be at least as large as the + Bandwidth-Delay Product of the communication link between the sender and + receiver. Due to the variations of RTT, you may want to increase the buffer + size up to 2 times the Bandwidth-Delay Product. Reference page 289 of + "TCP/IP Illustrated, Volume 1, The Protocols" by W. Richard Stevens. + + At 10Gb speeds, use the following formula:: + + RX_WINDOW >= 1.25MBytes * RTT(in milliseconds) + Example for RTT with 100us: RX_WINDOW = (1,250,000 * 0.1) = 125,000 + + RX_WINDOW sizes of 256KB - 512KB should be sufficient. + + Setting the min, max, and default receive buffer (RX_WINDOW) size:: + + sysctl -w net.ipv4.tcp_rmem=" " + + TCP window size for multiple connections: + The receive buffer (RX_WINDOW) size may be calculated the same as single + connections, but should be divided by the number of connections. The + smaller window prevents congestion and facilitates better pacing, + especially if/when MAC level flow control does not work well or when it is + not supported on the machine. Experimentation may be necessary to attain + the correct value. This method is provided as a starting point for the + correct receive buffer size. + + Setting the min, max, and default receive buffer (RX_WINDOW) size is + performed in the same manner as single connection. + + +Driver Messages +=============== + + The following messages are the most common messages logged by syslog. These + may be found in /var/log/messages. + + Driver up:: + + Chelsio Network Driver - version 2.1.1 + + NIC detected:: + + eth#: Chelsio N210 1x10GBaseX NIC (rev #), PCIX 133MHz/64-bit + + Link up:: + + eth#: link is up at 10 Gbps, full duplex + + Link down:: + + eth#: link is down + + +Known Issues +============ + + These issues have been identified during testing. The following information + is provided as a workaround to the problem. In some cases, this problem is + inherent to Linux or to a particular Linux Distribution and/or hardware + platform. + + 1. Large number of TCP retransmits on a multiprocessor (SMP) system. + + On a system with multiple CPUs, the interrupt (IRQ) for the network + controller may be bound to more than one CPU. This will cause TCP + retransmits if the packet data were to be split across different CPUs + and re-assembled in a different order than expected. + + To eliminate the TCP retransmits, set smp_affinity on the particular + interrupt to a single CPU. You can locate the interrupt (IRQ) used on + the N110/N210 by using ifconfig:: + + ifconfig | grep Interrupt + + Set the smp_affinity to a single CPU:: + + echo 1 > /proc/irq//smp_affinity + + It is highly suggested that you do not run the irqbalance daemon on your + system, as this will change any smp_affinity setting you have applied. + The irqbalance daemon runs on a 10 second interval and binds interrupts + to the least loaded CPU determined by the daemon. To disable this daemon:: + + chkconfig --level 2345 irqbalance off + + By default, some Linux distributions enable the kernel feature, + irqbalance, which performs the same function as the daemon. To disable + this feature, add the following line to your bootloader:: + + noirqbalance + + Example using the Grub bootloader:: + + title Red Hat Enterprise Linux AS (2.4.21-27.ELsmp) + root (hd0,0) + kernel /vmlinuz-2.4.21-27.ELsmp ro root=/dev/hda3 noirqbalance + initrd /initrd-2.4.21-27.ELsmp.img + + 2. After running insmod, the driver is loaded and the incorrect network + interface is brought up without running ifup. + + When using 2.4.x kernels, including RHEL kernels, the Linux kernel + invokes a script named "hotplug". This script is primarily used to + automatically bring up USB devices when they are plugged in, however, + the script also attempts to automatically bring up a network interface + after loading the kernel module. The hotplug script does this by scanning + the ifcfg-eth# config files in /etc/sysconfig/network-scripts, looking + for HWADDR=. + + If the hotplug script does not find the HWADDRR within any of the + ifcfg-eth# files, it will bring up the device with the next available + interface name. If this interface is already configured for a different + network card, your new interface will have incorrect IP address and + network settings. + + To solve this issue, you can add the HWADDR= key to the + interface config file of your network controller. + + To disable this "hotplug" feature, you may add the driver (module name) + to the "blacklist" file located in /etc/hotplug. It has been noted that + this does not work for network devices because the net.agent script + does not use the blacklist file. Simply remove, or rename, the net.agent + script located in /etc/hotplug to disable this feature. + + 3. Transport Protocol (TP) hangs when running heavy multi-connection traffic + on an AMD Opteron system with HyperTransport PCI-X Tunnel chipset. + + If your AMD Opteron system uses the AMD-8131 HyperTransport PCI-X Tunnel + chipset, you may experience the "133-Mhz Mode Split Completion Data + Corruption" bug identified by AMD while using a 133Mhz PCI-X card on the + bus PCI-X bus. + + AMD states, "Under highly specific conditions, the AMD-8131 PCI-X Tunnel + can provide stale data via split completion cycles to a PCI-X card that + is operating at 133 Mhz", causing data corruption. + + AMD's provides three workarounds for this problem, however, Chelsio + recommends the first option for best performance with this bug: + + For 133Mhz secondary bus operation, limit the transaction length and + the number of outstanding transactions, via BIOS configuration + programming of the PCI-X card, to the following: + + Data Length (bytes): 1k + + Total allowed outstanding transactions: 2 + + Please refer to AMD 8131-HT/PCI-X Errata 26310 Rev 3.08 August 2004, + section 56, "133-MHz Mode Split Completion Data Corruption" for more + details with this bug and workarounds suggested by AMD. + + It may be possible to work outside AMD's recommended PCI-X settings, try + increasing the Data Length to 2k bytes for increased performance. If you + have issues with these settings, please revert to the "safe" settings + and duplicate the problem before submitting a bug or asking for support. + + .. note:: + + The default setting on most systems is 8 outstanding transactions + and 2k bytes data length. + + 4. On multiprocessor systems, it has been noted that an application which + is handling 10Gb networking can switch between CPUs causing degraded + and/or unstable performance. + + If running on an SMP system and taking performance measurements, it + is suggested you either run the latest netperf-2.4.0+ or use a binding + tool such as Tim Hockin's procstate utilities (runon) + . + + Binding netserver and netperf (or other applications) to particular + CPUs will have a significant difference in performance measurements. + You may need to experiment which CPU to bind the application to in + order to achieve the best performance for your system. + + If you are developing an application designed for 10Gb networking, + please keep in mind you may want to look at kernel functions + sched_setaffinity & sched_getaffinity to bind your application. + + If you are just running user-space applications such as ftp, telnet, + etc., you may want to try the runon tool provided by Tim Hockin's + procstate utility. You could also try binding the interface to a + particular CPU: runon 0 ifup eth0 + + +Support +======= + + If you have problems with the software or hardware, please contact our + customer support team via email at support@chelsio.com or check our website + at http://www.chelsio.com + +------------------------------------------------------------------------------- + +:: + + Chelsio Communications + 370 San Aleso Ave. + Suite 100 + Sunnyvale, CA 94085 + http://www.chelsio.com + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License, version 2, as +published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +THIS SOFTWARE IS PROVIDED ``AS IS`` AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + +Copyright |copy| 2003-2005 Chelsio Communications. All rights reserved. diff --git a/Documentation/networking/device_drivers/chelsio/cxgb.txt b/Documentation/networking/device_drivers/chelsio/cxgb.txt deleted file mode 100644 index 20a887615c4a..000000000000 --- a/Documentation/networking/device_drivers/chelsio/cxgb.txt +++ /dev/null @@ -1,352 +0,0 @@ - Chelsio N210 10Gb Ethernet Network Controller - - Driver Release Notes for Linux - - Version 2.1.1 - - June 20, 2005 - -CONTENTS -======== - INTRODUCTION - FEATURES - PERFORMANCE - DRIVER MESSAGES - KNOWN ISSUES - SUPPORT - - -INTRODUCTION -============ - - This document describes the Linux driver for Chelsio 10Gb Ethernet Network - Controller. This driver supports the Chelsio N210 NIC and is backward - compatible with the Chelsio N110 model 10Gb NICs. - - -FEATURES -======== - - Adaptive Interrupts (adaptive-rx) - --------------------------------- - - This feature provides an adaptive algorithm that adjusts the interrupt - coalescing parameters, allowing the driver to dynamically adapt the latency - settings to achieve the highest performance during various types of network - load. - - The interface used to control this feature is ethtool. Please see the - ethtool manpage for additional usage information. - - By default, adaptive-rx is disabled. - To enable adaptive-rx: - - ethtool -C adaptive-rx on - - To disable adaptive-rx, use ethtool: - - ethtool -C adaptive-rx off - - After disabling adaptive-rx, the timer latency value will be set to 50us. - You may set the timer latency after disabling adaptive-rx: - - ethtool -C rx-usecs - - An example to set the timer latency value to 100us on eth0: - - ethtool -C eth0 rx-usecs 100 - - You may also provide a timer latency value while disabling adaptive-rx: - - ethtool -C adaptive-rx off rx-usecs - - If adaptive-rx is disabled and a timer latency value is specified, the timer - will be set to the specified value until changed by the user or until - adaptive-rx is enabled. - - To view the status of the adaptive-rx and timer latency values: - - ethtool -c - - - TCP Segmentation Offloading (TSO) Support - ----------------------------------------- - - This feature, also known as "large send", enables a system's protocol stack - to offload portions of outbound TCP processing to a network interface card - thereby reducing system CPU utilization and enhancing performance. - - The interface used to control this feature is ethtool version 1.8 or higher. - Please see the ethtool manpage for additional usage information. - - By default, TSO is enabled. - To disable TSO: - - ethtool -K tso off - - To enable TSO: - - ethtool -K tso on - - To view the status of TSO: - - ethtool -k - - -PERFORMANCE -=========== - - The following information is provided as an example of how to change system - parameters for "performance tuning" an what value to use. You may or may not - want to change these system parameters, depending on your server/workstation - application. Doing so is not warranted in any way by Chelsio Communications, - and is done at "YOUR OWN RISK". Chelsio will not be held responsible for loss - of data or damage to equipment. - - Your distribution may have a different way of doing things, or you may prefer - a different method. These commands are shown only to provide an example of - what to do and are by no means definitive. - - Making any of the following system changes will only last until you reboot - your system. You may want to write a script that runs at boot-up which - includes the optimal settings for your system. - - Setting PCI Latency Timer: - setpci -d 1425:* 0x0c.l=0x0000F800 - - Disabling TCP timestamp: - sysctl -w net.ipv4.tcp_timestamps=0 - - Disabling SACK: - sysctl -w net.ipv4.tcp_sack=0 - - Setting large number of incoming connection requests: - sysctl -w net.ipv4.tcp_max_syn_backlog=3000 - - Setting maximum receive socket buffer size: - sysctl -w net.core.rmem_max=1024000 - - Setting maximum send socket buffer size: - sysctl -w net.core.wmem_max=1024000 - - Set smp_affinity (on a multiprocessor system) to a single CPU: - echo 1 > /proc/irq//smp_affinity - - Setting default receive socket buffer size: - sysctl -w net.core.rmem_default=524287 - - Setting default send socket buffer size: - sysctl -w net.core.wmem_default=524287 - - Setting maximum option memory buffers: - sysctl -w net.core.optmem_max=524287 - - Setting maximum backlog (# of unprocessed packets before kernel drops): - sysctl -w net.core.netdev_max_backlog=300000 - - Setting TCP read buffers (min/default/max): - sysctl -w net.ipv4.tcp_rmem="10000000 10000000 10000000" - - Setting TCP write buffers (min/pressure/max): - sysctl -w net.ipv4.tcp_wmem="10000000 10000000 10000000" - - Setting TCP buffer space (min/pressure/max): - sysctl -w net.ipv4.tcp_mem="10000000 10000000 10000000" - - TCP window size for single connections: - The receive buffer (RX_WINDOW) size must be at least as large as the - Bandwidth-Delay Product of the communication link between the sender and - receiver. Due to the variations of RTT, you may want to increase the buffer - size up to 2 times the Bandwidth-Delay Product. Reference page 289 of - "TCP/IP Illustrated, Volume 1, The Protocols" by W. Richard Stevens. - At 10Gb speeds, use the following formula: - RX_WINDOW >= 1.25MBytes * RTT(in milliseconds) - Example for RTT with 100us: RX_WINDOW = (1,250,000 * 0.1) = 125,000 - RX_WINDOW sizes of 256KB - 512KB should be sufficient. - Setting the min, max, and default receive buffer (RX_WINDOW) size: - sysctl -w net.ipv4.tcp_rmem=" " - - TCP window size for multiple connections: - The receive buffer (RX_WINDOW) size may be calculated the same as single - connections, but should be divided by the number of connections. The - smaller window prevents congestion and facilitates better pacing, - especially if/when MAC level flow control does not work well or when it is - not supported on the machine. Experimentation may be necessary to attain - the correct value. This method is provided as a starting point for the - correct receive buffer size. - Setting the min, max, and default receive buffer (RX_WINDOW) size is - performed in the same manner as single connection. - - -DRIVER MESSAGES -=============== - - The following messages are the most common messages logged by syslog. These - may be found in /var/log/messages. - - Driver up: - Chelsio Network Driver - version 2.1.1 - - NIC detected: - eth#: Chelsio N210 1x10GBaseX NIC (rev #), PCIX 133MHz/64-bit - - Link up: - eth#: link is up at 10 Gbps, full duplex - - Link down: - eth#: link is down - - -KNOWN ISSUES -============ - - These issues have been identified during testing. The following information - is provided as a workaround to the problem. In some cases, this problem is - inherent to Linux or to a particular Linux Distribution and/or hardware - platform. - - 1. Large number of TCP retransmits on a multiprocessor (SMP) system. - - On a system with multiple CPUs, the interrupt (IRQ) for the network - controller may be bound to more than one CPU. This will cause TCP - retransmits if the packet data were to be split across different CPUs - and re-assembled in a different order than expected. - - To eliminate the TCP retransmits, set smp_affinity on the particular - interrupt to a single CPU. You can locate the interrupt (IRQ) used on - the N110/N210 by using ifconfig: - ifconfig | grep Interrupt - Set the smp_affinity to a single CPU: - echo 1 > /proc/irq//smp_affinity - - It is highly suggested that you do not run the irqbalance daemon on your - system, as this will change any smp_affinity setting you have applied. - The irqbalance daemon runs on a 10 second interval and binds interrupts - to the least loaded CPU determined by the daemon. To disable this daemon: - chkconfig --level 2345 irqbalance off - - By default, some Linux distributions enable the kernel feature, - irqbalance, which performs the same function as the daemon. To disable - this feature, add the following line to your bootloader: - noirqbalance - - Example using the Grub bootloader: - title Red Hat Enterprise Linux AS (2.4.21-27.ELsmp) - root (hd0,0) - kernel /vmlinuz-2.4.21-27.ELsmp ro root=/dev/hda3 noirqbalance - initrd /initrd-2.4.21-27.ELsmp.img - - 2. After running insmod, the driver is loaded and the incorrect network - interface is brought up without running ifup. - - When using 2.4.x kernels, including RHEL kernels, the Linux kernel - invokes a script named "hotplug". This script is primarily used to - automatically bring up USB devices when they are plugged in, however, - the script also attempts to automatically bring up a network interface - after loading the kernel module. The hotplug script does this by scanning - the ifcfg-eth# config files in /etc/sysconfig/network-scripts, looking - for HWADDR=. - - If the hotplug script does not find the HWADDRR within any of the - ifcfg-eth# files, it will bring up the device with the next available - interface name. If this interface is already configured for a different - network card, your new interface will have incorrect IP address and - network settings. - - To solve this issue, you can add the HWADDR= key to the - interface config file of your network controller. - - To disable this "hotplug" feature, you may add the driver (module name) - to the "blacklist" file located in /etc/hotplug. It has been noted that - this does not work for network devices because the net.agent script - does not use the blacklist file. Simply remove, or rename, the net.agent - script located in /etc/hotplug to disable this feature. - - 3. Transport Protocol (TP) hangs when running heavy multi-connection traffic - on an AMD Opteron system with HyperTransport PCI-X Tunnel chipset. - - If your AMD Opteron system uses the AMD-8131 HyperTransport PCI-X Tunnel - chipset, you may experience the "133-Mhz Mode Split Completion Data - Corruption" bug identified by AMD while using a 133Mhz PCI-X card on the - bus PCI-X bus. - - AMD states, "Under highly specific conditions, the AMD-8131 PCI-X Tunnel - can provide stale data via split completion cycles to a PCI-X card that - is operating at 133 Mhz", causing data corruption. - - AMD's provides three workarounds for this problem, however, Chelsio - recommends the first option for best performance with this bug: - - For 133Mhz secondary bus operation, limit the transaction length and - the number of outstanding transactions, via BIOS configuration - programming of the PCI-X card, to the following: - - Data Length (bytes): 1k - Total allowed outstanding transactions: 2 - - Please refer to AMD 8131-HT/PCI-X Errata 26310 Rev 3.08 August 2004, - section 56, "133-MHz Mode Split Completion Data Corruption" for more - details with this bug and workarounds suggested by AMD. - - It may be possible to work outside AMD's recommended PCI-X settings, try - increasing the Data Length to 2k bytes for increased performance. If you - have issues with these settings, please revert to the "safe" settings - and duplicate the problem before submitting a bug or asking for support. - - NOTE: The default setting on most systems is 8 outstanding transactions - and 2k bytes data length. - - 4. On multiprocessor systems, it has been noted that an application which - is handling 10Gb networking can switch between CPUs causing degraded - and/or unstable performance. - - If running on an SMP system and taking performance measurements, it - is suggested you either run the latest netperf-2.4.0+ or use a binding - tool such as Tim Hockin's procstate utilities (runon) - . - - Binding netserver and netperf (or other applications) to particular - CPUs will have a significant difference in performance measurements. - You may need to experiment which CPU to bind the application to in - order to achieve the best performance for your system. - - If you are developing an application designed for 10Gb networking, - please keep in mind you may want to look at kernel functions - sched_setaffinity & sched_getaffinity to bind your application. - - If you are just running user-space applications such as ftp, telnet, - etc., you may want to try the runon tool provided by Tim Hockin's - procstate utility. You could also try binding the interface to a - particular CPU: runon 0 ifup eth0 - - -SUPPORT -======= - - If you have problems with the software or hardware, please contact our - customer support team via email at support@chelsio.com or check our website - at http://www.chelsio.com - -=============================================================================== - - Chelsio Communications - 370 San Aleso Ave. - Suite 100 - Sunnyvale, CA 94085 - http://www.chelsio.com - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License, version 2, as -published by the Free Software Foundation. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED -WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. - - Copyright (c) 2003-2005 Chelsio Communications. All rights reserved. - -=============================================================================== diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 7dde314fc957..23c4ec9c9125 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -31,6 +31,7 @@ Contents: 3com/vortex amazon/ena aquantia/atlantic + chelsio/cxgb .. only:: subproject and html diff --git a/drivers/net/ethernet/chelsio/Kconfig b/drivers/net/ethernet/chelsio/Kconfig index 9909bfda167e..82cdfa51ce37 100644 --- a/drivers/net/ethernet/chelsio/Kconfig +++ b/drivers/net/ethernet/chelsio/Kconfig @@ -26,7 +26,7 @@ config CHELSIO_T1 This driver supports Chelsio gigabit and 10-gigabit Ethernet cards. More information about adapter features and performance tuning is in - . + . For general information about Chelsio and our products, visit our website at . -- cgit v1.2.3-59-g8ed1b From 714a4da450c03bdc53a0d5fa6a4b3192b30c5cda Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:39 +0200 Subject: docs: networking: device drivers: convert cirrus/cs89x0.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/cirrus/cs89x0.rst | 647 +++++++++++++++++++++ .../networking/device_drivers/cirrus/cs89x0.txt | 624 -------------------- Documentation/networking/device_drivers/index.rst | 1 + drivers/net/ethernet/cirrus/Kconfig | 2 +- 4 files changed, 649 insertions(+), 625 deletions(-) create mode 100644 Documentation/networking/device_drivers/cirrus/cs89x0.rst delete mode 100644 Documentation/networking/device_drivers/cirrus/cs89x0.txt diff --git a/Documentation/networking/device_drivers/cirrus/cs89x0.rst b/Documentation/networking/device_drivers/cirrus/cs89x0.rst new file mode 100644 index 000000000000..e5c283940ac5 --- /dev/null +++ b/Documentation/networking/device_drivers/cirrus/cs89x0.rst @@ -0,0 +1,647 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================================ +Cirrus Logic LAN CS8900/CS8920 Ethernet Adapters +================================================ + +.. note:: + + This document was contributed by Cirrus Logic for kernel 2.2.5. This version + has been updated for 2.3.48 by Andrew Morton. + + Still, this is too outdated! A major cleanup is needed here. + +Cirrus make a copy of this driver available at their website, as +described below. In general, you should use the driver version which +comes with your Linux distribution. + + +Linux Network Interface Driver ver. 2.00 + + +.. TABLE OF CONTENTS + + 1.0 CIRRUS LOGIC LAN CS8900/CS8920 ETHERNET ADAPTERS + 1.1 Product Overview + 1.2 Driver Description + 1.2.1 Driver Name + 1.2.2 File in the Driver Package + 1.3 System Requirements + 1.4 Licensing Information + + 2.0 ADAPTER INSTALLATION and CONFIGURATION + 2.1 CS8900-based Adapter Configuration + 2.2 CS8920-based Adapter Configuration + + 3.0 LOADING THE DRIVER AS A MODULE + + 4.0 COMPILING THE DRIVER + 4.1 Compiling the Driver as a Loadable Module + 4.2 Compiling the driver to support memory mode + 4.3 Compiling the driver to support Rx DMA + + 5.0 TESTING AND TROUBLESHOOTING + 5.1 Known Defects and Limitations + 5.2 Testing the Adapter + 5.2.1 Diagnostic Self-Test + 5.2.2 Diagnostic Network Test + 5.3 Using the Adapter's LEDs + 5.4 Resolving I/O Conflicts + + 6.0 TECHNICAL SUPPORT + 6.1 Contacting Cirrus Logic's Technical Support + 6.2 Information Required Before Contacting Technical Support + 6.3 Obtaining the Latest Driver Version + 6.4 Current maintainer + 6.5 Kernel boot parameters + + +1. Cirrus Logic LAN CS8900/CS8920 Ethernet Adapters +=================================================== + + +1.1. Product Overview +===================== + +The CS8900-based ISA Ethernet Adapters from Cirrus Logic follow +IEEE 802.3 standards and support half or full-duplex operation in ISA bus +computers on 10 Mbps Ethernet networks. The adapters are designed for operation +in 16-bit ISA or EISA bus expansion slots and are available in +10BaseT-only or 3-media configurations (10BaseT, 10Base2, and AUI for 10Base-5 +or fiber networks). + +CS8920-based adapters are similar to the CS8900-based adapter with additional +features for Plug and Play (PnP) support and Wakeup Frame recognition. As +such, the configuration procedures differ somewhat between the two types of +adapters. Refer to the "Adapter Configuration" section for details on +configuring both types of adapters. + + +1.2. Driver Description +======================= + +The CS8900/CS8920 Ethernet Adapter driver for Linux supports the Linux +v2.3.48 or greater kernel. It can be compiled directly into the kernel +or loaded at run-time as a device driver module. + +1.2.1 Driver Name: cs89x0 + +1.2.2 Files in the Driver Archive: + +The files in the driver at Cirrus' website include: + + =================== ==================================================== + readme.txt this file + build batch file to compile cs89x0.c. + cs89x0.c driver C code + cs89x0.h driver header file + cs89x0.o pre-compiled module (for v2.2.5 kernel) + config/Config.in sample file to include cs89x0 driver in the kernel. + config/Makefile sample file to include cs89x0 driver in the kernel. + config/Space.c sample file to include cs89x0 driver in the kernel. + =================== ==================================================== + + + +1.3. System Requirements +------------------------ + +The following hardware is required: + + * Cirrus Logic LAN (CS8900/20-based) Ethernet ISA Adapter + + * IBM or IBM-compatible PC with: + * An 80386 or higher processor + * 16 bytes of contiguous IO space available between 210h - 370h + * One available IRQ (5,10,11,or 12 for the CS8900, 3-7,9-15 for CS8920). + + * Appropriate cable (and connector for AUI, 10BASE-2) for your network + topology. + +The following software is required: + +* LINUX kernel version 2.3.48 or higher + + * CS8900/20 Setup Utility (DOS-based) + + * LINUX kernel sources for your kernel (if compiling into kernel) + + * GNU Toolkit (gcc and make) v2.6 or above (if compiling into kernel + or a module) + + + +1.4. Licensing Information +-------------------------- + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation, version 1. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +For a full copy of the GNU General Public License, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + +2. Adapter Installation and Configuration +========================================= + +Both the CS8900 and CS8920-based adapters can be configured using parameters +stored in an on-board EEPROM. You must use the DOS-based CS8900/20 Setup +Utility if you want to change the adapter's configuration in EEPROM. + +When loading the driver as a module, you can specify many of the adapter's +configuration parameters on the command-line to override the EEPROM's settings +or for interface configuration when an EEPROM is not used. (CS8920-based +adapters must use an EEPROM.) See Section 3.0 LOADING THE DRIVER AS A MODULE. + +Since the CS8900/20 Setup Utility is a DOS-based application, you must install +and configure the adapter in a DOS-based system using the CS8900/20 Setup +Utility before installation in the target LINUX system. (Not required if +installing a CS8900-based adapter and the default configuration is acceptable.) + + +2.1. CS8900-based Adapter Configuration +--------------------------------------- + +CS8900-based adapters shipped from Cirrus Logic have been configured +with the following "default" settings:: + + Operation Mode: Memory Mode + IRQ: 10 + Base I/O Address: 300 + Memory Base Address: D0000 + Optimization: DOS Client + Transmission Mode: Half-duplex + BootProm: None + Media Type: Autodetect (3-media cards) or + 10BASE-T (10BASE-T only adapter) + +You should only change the default configuration settings if conflicts with +another adapter exists. To change the adapter's configuration, run the +CS8900/20 Setup Utility. + + +2.2. CS8920-based Adapter Configuration +--------------------------------------- + +CS8920-based adapters are shipped from Cirrus Logic configured as Plug +and Play (PnP) enabled. However, since the cs89x0 driver does NOT +support PnP, you must install the CS8920 adapter in a DOS-based PC and +run the CS8900/20 Setup Utility to disable PnP and configure the +adapter before installation in the target Linux system. Failure to do +this will leave the adapter inactive and the driver will be unable to +communicate with the adapter. + +:: + + **************************************************************** + * CS8920-BASED ADAPTERS: * + * * + * CS8920-BASED ADAPTERS ARE PLUG and PLAY ENABLED BY DEFAULT. * + * THE CS89X0 DRIVER DOES NOT SUPPORT PnP. THEREFORE, YOU MUST * + * RUN THE CS8900/20 SETUP UTILITY TO DISABLE PnP SUPPORT AND * + * TO ACTIVATE THE ADAPTER. * + **************************************************************** + + + + +3. Loading the Driver as a Module +================================= + +If the driver is compiled as a loadable module, you can load the driver module +with the 'modprobe' command. Many of the adapter's configuration parameters can +be specified as command-line arguments to the load command. This facility +provides a means to override the EEPROM's settings or for interface +configuration when an EEPROM is not used. + +Example:: + + insmod cs89x0.o io=0x200 irq=0xA media=aui + +This example loads the module and configures the adapter to use an IO port base +address of 200h, interrupt 10, and use the AUI media connection. The following +configuration options are available on the command line:: + + io=### - specify IO address (200h-360h) + irq=## - specify interrupt level + use_dma=1 - Enable DMA + dma=# - specify dma channel (Driver is compiled to support + Rx DMA only) + dmasize=# (16 or 64) - DMA size 16K or 64K. Default value is set to 16. + media=rj45 - specify media type + or media=bnc + or media=aui + or media=auto + duplex=full - specify forced half/full/autonegotiate duplex + or duplex=half + or duplex=auto + debug=# - debug level (only available if the driver was compiled + for debugging) + +**Notes:** + +a) If an EEPROM is present, any specified command-line parameter + will override the corresponding configuration value stored in + EEPROM. + +b) The "io" parameter must be specified on the command-line. + +c) The driver's hardware probe routine is designed to avoid + writing to I/O space until it knows that there is a cs89x0 + card at the written addresses. This could cause problems + with device probing. To avoid this behaviour, add one + to the ``io=`` module parameter. This doesn't actually change + the I/O address, but it is a flag to tell the driver + to partially initialise the hardware before trying to + identify the card. This could be dangerous if you are + not sure that there is a cs89x0 card at the provided address. + + For example, to scan for an adapter located at IO base 0x300, + specify an IO address of 0x301. + +d) The "duplex=auto" parameter is only supported for the CS8920. + +e) The minimum command-line configuration required if an EEPROM is + not present is: + + io + irq + media type (no autodetect) + +f) The following additional parameters are CS89XX defaults (values + used with no EEPROM or command-line argument). + + * DMA Burst = enabled + * IOCHRDY Enabled = enabled + * UseSA = enabled + * CS8900 defaults to half-duplex if not specified on command-line + * CS8920 defaults to autoneg if not specified on command-line + * Use reset defaults for other config parameters + * dma_mode = 0 + +g) You can use ifconfig to set the adapter's Ethernet address. + +h) Many Linux distributions use the 'modprobe' command to load + modules. This program uses the '/etc/conf.modules' file to + determine configuration information which is passed to a driver + module when it is loaded. All the configuration options which are + described above may be placed within /etc/conf.modules. + + For example:: + + > cat /etc/conf.modules + ... + alias eth0 cs89x0 + options cs89x0 io=0x0200 dma=5 use_dma=1 + ... + + In this example we are telling the module system that the + ethernet driver for this machine should use the cs89x0 driver. We + are asking 'modprobe' to pass the 'io', 'dma' and 'use_dma' + arguments to the driver when it is loaded. + +i) Cirrus recommend that the cs89x0 use the ISA DMA channels 5, 6 or + 7. You will probably find that other DMA channels will not work. + +j) The cs89x0 supports DMA for receiving only. DMA mode is + significantly more efficient. Flooding a 400 MHz Celeron machine + with large ping packets consumes 82% of its CPU capacity in non-DMA + mode. With DMA this is reduced to 45%. + +k) If your Linux kernel was compiled with inbuilt plug-and-play + support you will be able to find information about the cs89x0 card + with the command:: + + cat /proc/isapnp + +l) If during DMA operation you find erratic behavior or network data + corruption you should use your PC's BIOS to slow the EISA bus clock. + +m) If the cs89x0 driver is compiled directly into the kernel + (non-modular) then its I/O address is automatically determined by + ISA bus probing. The IRQ number, media options, etc are determined + from the card's EEPROM. + +n) If the cs89x0 driver is compiled directly into the kernel, DMA + mode may be selected by providing the kernel with a boot option + 'cs89x0_dma=N' where 'N' is the desired DMA channel number (5, 6 or 7). + + Kernel boot options may be provided on the LILO command line:: + + LILO boot: linux cs89x0_dma=5 + + or they may be placed in /etc/lilo.conf:: + + image=/boot/bzImage-2.3.48 + append="cs89x0_dma=5" + label=linux + root=/dev/hda5 + read-only + + The DMA Rx buffer size is hardwired to 16 kbytes in this mode. + (64k mode is not available). + + +4. Compiling the Driver +======================= + +The cs89x0 driver can be compiled directly into the kernel or compiled into +a loadable device driver module. + +Just use the standard way to configure the driver and compile the Kernel. + + +4.1. Compiling the Driver to Support Rx DMA +------------------------------------------- + +The compile-time optionality for DMA was removed in the 2.3 kernel +series. DMA support is now unconditionally part of the driver. It is +enabled by the 'use_dma=1' module option. + + +5. Testing and Troubleshooting +============================== + +5.1. Known Defects and Limitations +---------------------------------- + +Refer to the RELEASE.TXT file distributed as part of this archive for a list of +known defects, driver limitations, and work arounds. + + +5.2. Testing the Adapter +------------------------ + +Once the adapter has been installed and configured, the diagnostic option of +the CS8900/20 Setup Utility can be used to test the functionality of the +adapter and its network connection. Use the diagnostics 'Self Test' option to +test the functionality of the adapter with the hardware configuration you have +assigned. You can use the diagnostics 'Network Test' to test the ability of the +adapter to communicate across the Ethernet with another PC equipped with a +CS8900/20-based adapter card (it must also be running the CS8900/20 Setup +Utility). + +.. note:: + + The Setup Utility's diagnostics are designed to run in a + DOS-only operating system environment. DO NOT run the diagnostics + from a DOS or command prompt session under Windows 95, Windows NT, + OS/2, or other operating system. + +To run the diagnostics tests on the CS8900/20 adapter: + + 1. Boot DOS on the PC and start the CS8900/20 Setup Utility. + + 2. The adapter's current configuration is displayed. Hit the ENTER key to + get to the main menu. + + 4. Select 'Diagnostics' (ALT-G) from the main menu. + * Select 'Self-Test' to test the adapter's basic functionality. + * Select 'Network Test' to test the network connection and cabling. + + +5.2.1. Diagnostic Self-test +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The diagnostic self-test checks the adapter's basic functionality as well as +its ability to communicate across the ISA bus based on the system resources +assigned during hardware configuration. The following tests are performed: + + * IO Register Read/Write Test + + The IO Register Read/Write test insures that the CS8900/20 can be + accessed in IO mode, and that the IO base address is correct. + + * Shared Memory Test + + The Shared Memory test insures the CS8900/20 can be accessed in memory + mode and that the range of memory addresses assigned does not conflict + with other devices in the system. + + * Interrupt Test + + The Interrupt test insures there are no conflicts with the assigned IRQ + signal. + + * EEPROM Test + + The EEPROM test insures the EEPROM can be read. + + * Chip RAM Test + + The Chip RAM test insures the 4K of memory internal to the CS8900/20 is + working properly. + + * Internal Loop-back Test + + The Internal Loop Back test insures the adapter's transmitter and + receiver are operating properly. If this test fails, make sure the + adapter's cable is connected to the network (check for LED activity for + example). + + * Boot PROM Test + + The Boot PROM test insures the Boot PROM is present, and can be read. + Failure indicates the Boot PROM was not successfully read due to a + hardware problem or due to a conflicts on the Boot PROM address + assignment. (Test only applies if the adapter is configured to use the + Boot PROM option.) + +Failure of a test item indicates a possible system resource conflict with +another device on the ISA bus. In this case, you should use the Manual Setup +option to reconfigure the adapter by selecting a different value for the system +resource that failed. + + +5.2.2. Diagnostic Network Test +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Diagnostic Network Test verifies a working network connection by +transferring data between two CS8900/20 adapters installed in different PCs +on the same network. (Note: the diagnostic network test should not be run +between two nodes across a router.) + +This test requires that each of the two PCs have a CS8900/20-based adapter +installed and have the CS8900/20 Setup Utility running. The first PC is +configured as a Responder and the other PC is configured as an Initiator. +Once the Initiator is started, it sends data frames to the Responder which +returns the frames to the Initiator. + +The total number of frames received and transmitted are displayed on the +Initiator's display, along with a count of the number of frames received and +transmitted OK or in error. The test can be terminated anytime by the user at +either PC. + +To setup the Diagnostic Network Test: + + 1. Select a PC with a CS8900/20-based adapter and a known working network + connection to act as the Responder. Run the CS8900/20 Setup Utility + and select 'Diagnostics -> Network Test -> Responder' from the main + menu. Hit ENTER to start the Responder. + + 2. Return to the PC with the CS8900/20-based adapter you want to test and + start the CS8900/20 Setup Utility. + + 3. From the main menu, Select 'Diagnostic -> Network Test -> Initiator'. + Hit ENTER to start the test. + +You may stop the test on the Initiator at any time while allowing the Responder +to continue running. In this manner, you can move to additional PCs and test +them by starting the Initiator on another PC without having to stop/start the +Responder. + + + +5.3. Using the Adapter's LEDs +----------------------------- + +The 2 and 3-media adapters have two LEDs visible on the back end of the board +located near the 10Base-T connector. + +Link Integrity LED: A "steady" ON of the green LED indicates a valid 10Base-T +connection. (Only applies to 10Base-T. The green LED has no significance for +a 10Base-2 or AUI connection.) + +TX/RX LED: The yellow LED lights briefly each time the adapter transmits or +receives data. (The yellow LED will appear to "flicker" on a typical network.) + + +5.4. Resolving I/O Conflicts +---------------------------- + +An IO conflict occurs when two or more adapter use the same ISA resource (IO +address, memory address or IRQ). You can usually detect an IO conflict in one +of four ways after installing and or configuring the CS8900/20-based adapter: + + 1. The system does not boot properly (or at all). + + 2. The driver cannot communicate with the adapter, reporting an "Adapter + not found" error message. + + 3. You cannot connect to the network or the driver will not load. + + 4. If you have configured the adapter to run in memory mode but the driver + reports it is using IO mode when loading, this is an indication of a + memory address conflict. + +If an IO conflict occurs, run the CS8900/20 Setup Utility and perform a +diagnostic self-test. Normally, the ISA resource in conflict will fail the +self-test. If so, reconfigure the adapter selecting another choice for the +resource in conflict. Run the diagnostics again to check for further IO +conflicts. + +In some cases, such as when the PC will not boot, it may be necessary to remove +the adapter and reconfigure it by installing it in another PC to run the +CS8900/20 Setup Utility. Once reinstalled in the target system, run the +diagnostics self-test to ensure the new configuration is free of conflicts +before loading the driver again. + +When manually configuring the adapter, keep in mind the typical ISA system +resource usage as indicated in the tables below. + +:: + + I/O Address Device IRQ Device + ----------- -------- --- -------- + 200-20F Game I/O adapter 3 COM2, Bus Mouse + 230-23F Bus Mouse 4 COM1 + 270-27F LPT3: third parallel port 5 LPT2 + 2F0-2FF COM2: second serial port 6 Floppy Disk controller + 320-32F Fixed disk controller 7 LPT1 + 8 Real-time Clock + 9 EGA/VGA display adapter + 12 Mouse (PS/2) + Memory Address Device 13 Math Coprocessor + -------------- --------------------- 14 Hard Disk controller + A000-BFFF EGA Graphics Adapter + A000-C7FF VGA Graphics Adapter + B000-BFFF Mono Graphics Adapter + B800-BFFF Color Graphics Adapter + E000-FFFF AT BIOS + + + + +6. Technical Support +==================== + +6.1. Contacting Cirrus Logic's Technical Support +------------------------------------------------ + +Cirrus Logic's CS89XX Technical Application Support can be reached at:: + + Telephone :(800) 888-5016 (from inside U.S. and Canada) + :(512) 442-7555 (from outside the U.S. and Canada) + Fax :(512) 912-3871 + Email :ethernet@crystal.cirrus.com + WWW :http://www.cirrus.com + + +6.2. Information Required before Contacting Technical Support +------------------------------------------------------------- + +Before contacting Cirrus Logic for technical support, be prepared to provide as +Much of the following information as possible. + +1.) Adapter type (CRD8900, CDB8900, CDB8920, etc.) + +2.) Adapter configuration + + * IO Base, Memory Base, IO or memory mode enabled, IRQ, DMA channel + * Plug and Play enabled/disabled (CS8920-based adapters only) + * Configured for media auto-detect or specific media type (which type). + +3.) PC System's Configuration + + * Plug and Play system (yes/no) + * BIOS (make and version) + * System make and model + * CPU (type and speed) + * System RAM + * SCSI Adapter + +4.) Software + + * CS89XX driver and version + * Your network operating system and version + * Your system's OS version + * Version of all protocol support files + +5.) Any Error Message displayed. + + + +6.3 Obtaining the Latest Driver Version +--------------------------------------- + +You can obtain the latest CS89XX drivers and support software from Cirrus Logic's +Web site. You can also contact Cirrus Logic's Technical Support (email: +ethernet@crystal.cirrus.com) and request that you be registered for automatic +software-update notification. + +Cirrus Logic maintains a web page at http://www.cirrus.com with the +latest drivers and technical publications. + + +6.4. Current maintainer +----------------------- + +In February 2000 the maintenance of this driver was assumed by Andrew +Morton. + +6.5 Kernel module parameters +---------------------------- + +For use in embedded environments with no cs89x0 EEPROM, the kernel boot +parameter ``cs89x0_media=`` has been implemented. Usage is:: + + cs89x0_media=rj45 or + cs89x0_media=aui or + cs89x0_media=bnc diff --git a/Documentation/networking/device_drivers/cirrus/cs89x0.txt b/Documentation/networking/device_drivers/cirrus/cs89x0.txt deleted file mode 100644 index 0e190180eec8..000000000000 --- a/Documentation/networking/device_drivers/cirrus/cs89x0.txt +++ /dev/null @@ -1,624 +0,0 @@ - -NOTE ----- - -This document was contributed by Cirrus Logic for kernel 2.2.5. This version -has been updated for 2.3.48 by Andrew Morton. - -Cirrus make a copy of this driver available at their website, as -described below. In general, you should use the driver version which -comes with your Linux distribution. - - - -CIRRUS LOGIC LAN CS8900/CS8920 ETHERNET ADAPTERS -Linux Network Interface Driver ver. 2.00 -=============================================================================== - - -TABLE OF CONTENTS - -1.0 CIRRUS LOGIC LAN CS8900/CS8920 ETHERNET ADAPTERS - 1.1 Product Overview - 1.2 Driver Description - 1.2.1 Driver Name - 1.2.2 File in the Driver Package - 1.3 System Requirements - 1.4 Licensing Information - -2.0 ADAPTER INSTALLATION and CONFIGURATION - 2.1 CS8900-based Adapter Configuration - 2.2 CS8920-based Adapter Configuration - -3.0 LOADING THE DRIVER AS A MODULE - -4.0 COMPILING THE DRIVER - 4.1 Compiling the Driver as a Loadable Module - 4.2 Compiling the driver to support memory mode - 4.3 Compiling the driver to support Rx DMA - -5.0 TESTING AND TROUBLESHOOTING - 5.1 Known Defects and Limitations - 5.2 Testing the Adapter - 5.2.1 Diagnostic Self-Test - 5.2.2 Diagnostic Network Test - 5.3 Using the Adapter's LEDs - 5.4 Resolving I/O Conflicts - -6.0 TECHNICAL SUPPORT - 6.1 Contacting Cirrus Logic's Technical Support - 6.2 Information Required Before Contacting Technical Support - 6.3 Obtaining the Latest Driver Version - 6.4 Current maintainer - 6.5 Kernel boot parameters - - -1.0 CIRRUS LOGIC LAN CS8900/CS8920 ETHERNET ADAPTERS -=============================================================================== - - -1.1 PRODUCT OVERVIEW - -The CS8900-based ISA Ethernet Adapters from Cirrus Logic follow -IEEE 802.3 standards and support half or full-duplex operation in ISA bus -computers on 10 Mbps Ethernet networks. The adapters are designed for operation -in 16-bit ISA or EISA bus expansion slots and are available in -10BaseT-only or 3-media configurations (10BaseT, 10Base2, and AUI for 10Base-5 -or fiber networks). - -CS8920-based adapters are similar to the CS8900-based adapter with additional -features for Plug and Play (PnP) support and Wakeup Frame recognition. As -such, the configuration procedures differ somewhat between the two types of -adapters. Refer to the "Adapter Configuration" section for details on -configuring both types of adapters. - - -1.2 DRIVER DESCRIPTION - -The CS8900/CS8920 Ethernet Adapter driver for Linux supports the Linux -v2.3.48 or greater kernel. It can be compiled directly into the kernel -or loaded at run-time as a device driver module. - -1.2.1 Driver Name: cs89x0 - -1.2.2 Files in the Driver Archive: - -The files in the driver at Cirrus' website include: - - readme.txt - this file - build - batch file to compile cs89x0.c. - cs89x0.c - driver C code - cs89x0.h - driver header file - cs89x0.o - pre-compiled module (for v2.2.5 kernel) - config/Config.in - sample file to include cs89x0 driver in the kernel. - config/Makefile - sample file to include cs89x0 driver in the kernel. - config/Space.c - sample file to include cs89x0 driver in the kernel. - - - -1.3 SYSTEM REQUIREMENTS - -The following hardware is required: - - * Cirrus Logic LAN (CS8900/20-based) Ethernet ISA Adapter - - * IBM or IBM-compatible PC with: - * An 80386 or higher processor - * 16 bytes of contiguous IO space available between 210h - 370h - * One available IRQ (5,10,11,or 12 for the CS8900, 3-7,9-15 for CS8920). - - * Appropriate cable (and connector for AUI, 10BASE-2) for your network - topology. - -The following software is required: - -* LINUX kernel version 2.3.48 or higher - - * CS8900/20 Setup Utility (DOS-based) - - * LINUX kernel sources for your kernel (if compiling into kernel) - - * GNU Toolkit (gcc and make) v2.6 or above (if compiling into kernel - or a module) - - - -1.4 LICENSING INFORMATION - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation, version 1. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. - -For a full copy of the GNU General Public License, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - - - -2.0 ADAPTER INSTALLATION and CONFIGURATION -=============================================================================== - -Both the CS8900 and CS8920-based adapters can be configured using parameters -stored in an on-board EEPROM. You must use the DOS-based CS8900/20 Setup -Utility if you want to change the adapter's configuration in EEPROM. - -When loading the driver as a module, you can specify many of the adapter's -configuration parameters on the command-line to override the EEPROM's settings -or for interface configuration when an EEPROM is not used. (CS8920-based -adapters must use an EEPROM.) See Section 3.0 LOADING THE DRIVER AS A MODULE. - -Since the CS8900/20 Setup Utility is a DOS-based application, you must install -and configure the adapter in a DOS-based system using the CS8900/20 Setup -Utility before installation in the target LINUX system. (Not required if -installing a CS8900-based adapter and the default configuration is acceptable.) - - -2.1 CS8900-BASED ADAPTER CONFIGURATION - -CS8900-based adapters shipped from Cirrus Logic have been configured -with the following "default" settings: - - Operation Mode: Memory Mode - IRQ: 10 - Base I/O Address: 300 - Memory Base Address: D0000 - Optimization: DOS Client - Transmission Mode: Half-duplex - BootProm: None - Media Type: Autodetect (3-media cards) or - 10BASE-T (10BASE-T only adapter) - -You should only change the default configuration settings if conflicts with -another adapter exists. To change the adapter's configuration, run the -CS8900/20 Setup Utility. - - -2.2 CS8920-BASED ADAPTER CONFIGURATION - -CS8920-based adapters are shipped from Cirrus Logic configured as Plug -and Play (PnP) enabled. However, since the cs89x0 driver does NOT -support PnP, you must install the CS8920 adapter in a DOS-based PC and -run the CS8900/20 Setup Utility to disable PnP and configure the -adapter before installation in the target Linux system. Failure to do -this will leave the adapter inactive and the driver will be unable to -communicate with the adapter. - - - **************************************************************** - * CS8920-BASED ADAPTERS: * - * * - * CS8920-BASED ADAPTERS ARE PLUG and PLAY ENABLED BY DEFAULT. * - * THE CS89X0 DRIVER DOES NOT SUPPORT PnP. THEREFORE, YOU MUST * - * RUN THE CS8900/20 SETUP UTILITY TO DISABLE PnP SUPPORT AND * - * TO ACTIVATE THE ADAPTER. * - **************************************************************** - - - - -3.0 LOADING THE DRIVER AS A MODULE -=============================================================================== - -If the driver is compiled as a loadable module, you can load the driver module -with the 'modprobe' command. Many of the adapter's configuration parameters can -be specified as command-line arguments to the load command. This facility -provides a means to override the EEPROM's settings or for interface -configuration when an EEPROM is not used. - -Example: - - insmod cs89x0.o io=0x200 irq=0xA media=aui - -This example loads the module and configures the adapter to use an IO port base -address of 200h, interrupt 10, and use the AUI media connection. The following -configuration options are available on the command line: - -* io=### - specify IO address (200h-360h) -* irq=## - specify interrupt level -* use_dma=1 - Enable DMA -* dma=# - specify dma channel (Driver is compiled to support - Rx DMA only) -* dmasize=# (16 or 64) - DMA size 16K or 64K. Default value is set to 16. -* media=rj45 - specify media type - or media=bnc - or media=aui - or media=auto -* duplex=full - specify forced half/full/autonegotiate duplex - or duplex=half - or duplex=auto -* debug=# - debug level (only available if the driver was compiled - for debugging) - -NOTES: - -a) If an EEPROM is present, any specified command-line parameter - will override the corresponding configuration value stored in - EEPROM. - -b) The "io" parameter must be specified on the command-line. - -c) The driver's hardware probe routine is designed to avoid - writing to I/O space until it knows that there is a cs89x0 - card at the written addresses. This could cause problems - with device probing. To avoid this behaviour, add one - to the `io=' module parameter. This doesn't actually change - the I/O address, but it is a flag to tell the driver - to partially initialise the hardware before trying to - identify the card. This could be dangerous if you are - not sure that there is a cs89x0 card at the provided address. - - For example, to scan for an adapter located at IO base 0x300, - specify an IO address of 0x301. - -d) The "duplex=auto" parameter is only supported for the CS8920. - -e) The minimum command-line configuration required if an EEPROM is - not present is: - - io - irq - media type (no autodetect) - -f) The following additional parameters are CS89XX defaults (values - used with no EEPROM or command-line argument). - - * DMA Burst = enabled - * IOCHRDY Enabled = enabled - * UseSA = enabled - * CS8900 defaults to half-duplex if not specified on command-line - * CS8920 defaults to autoneg if not specified on command-line - * Use reset defaults for other config parameters - * dma_mode = 0 - -g) You can use ifconfig to set the adapter's Ethernet address. - -h) Many Linux distributions use the 'modprobe' command to load - modules. This program uses the '/etc/conf.modules' file to - determine configuration information which is passed to a driver - module when it is loaded. All the configuration options which are - described above may be placed within /etc/conf.modules. - - For example: - - > cat /etc/conf.modules - ... - alias eth0 cs89x0 - options cs89x0 io=0x0200 dma=5 use_dma=1 - ... - - In this example we are telling the module system that the - ethernet driver for this machine should use the cs89x0 driver. We - are asking 'modprobe' to pass the 'io', 'dma' and 'use_dma' - arguments to the driver when it is loaded. - -i) Cirrus recommend that the cs89x0 use the ISA DMA channels 5, 6 or - 7. You will probably find that other DMA channels will not work. - -j) The cs89x0 supports DMA for receiving only. DMA mode is - significantly more efficient. Flooding a 400 MHz Celeron machine - with large ping packets consumes 82% of its CPU capacity in non-DMA - mode. With DMA this is reduced to 45%. - -k) If your Linux kernel was compiled with inbuilt plug-and-play - support you will be able to find information about the cs89x0 card - with the command - - cat /proc/isapnp - -l) If during DMA operation you find erratic behavior or network data - corruption you should use your PC's BIOS to slow the EISA bus clock. - -m) If the cs89x0 driver is compiled directly into the kernel - (non-modular) then its I/O address is automatically determined by - ISA bus probing. The IRQ number, media options, etc are determined - from the card's EEPROM. - -n) If the cs89x0 driver is compiled directly into the kernel, DMA - mode may be selected by providing the kernel with a boot option - 'cs89x0_dma=N' where 'N' is the desired DMA channel number (5, 6 or 7). - - Kernel boot options may be provided on the LILO command line: - - LILO boot: linux cs89x0_dma=5 - - or they may be placed in /etc/lilo.conf: - - image=/boot/bzImage-2.3.48 - append="cs89x0_dma=5" - label=linux - root=/dev/hda5 - read-only - - The DMA Rx buffer size is hardwired to 16 kbytes in this mode. - (64k mode is not available). - - -4.0 COMPILING THE DRIVER -=============================================================================== - -The cs89x0 driver can be compiled directly into the kernel or compiled into -a loadable device driver module. - - -4.1 COMPILING THE DRIVER AS A LOADABLE MODULE - -To compile the driver into a loadable module, use the following command -(single command line, without quotes): - -"gcc -D__KERNEL__ -I/usr/src/linux/include -I/usr/src/linux/net/inet -Wall --Wstrict-prototypes -O2 -fomit-frame-pointer -DMODULE -DCONFIG_MODVERSIONS --c cs89x0.c" - -4.2 COMPILING THE DRIVER TO SUPPORT MEMORY MODE - -Support for memory mode was not carried over into the 2.3 series kernels. - -4.3 COMPILING THE DRIVER TO SUPPORT Rx DMA - -The compile-time optionality for DMA was removed in the 2.3 kernel -series. DMA support is now unconditionally part of the driver. It is -enabled by the 'use_dma=1' module option. - - -5.0 TESTING AND TROUBLESHOOTING -=============================================================================== - -5.1 KNOWN DEFECTS and LIMITATIONS - -Refer to the RELEASE.TXT file distributed as part of this archive for a list of -known defects, driver limitations, and work arounds. - - -5.2 TESTING THE ADAPTER - -Once the adapter has been installed and configured, the diagnostic option of -the CS8900/20 Setup Utility can be used to test the functionality of the -adapter and its network connection. Use the diagnostics 'Self Test' option to -test the functionality of the adapter with the hardware configuration you have -assigned. You can use the diagnostics 'Network Test' to test the ability of the -adapter to communicate across the Ethernet with another PC equipped with a -CS8900/20-based adapter card (it must also be running the CS8900/20 Setup -Utility). - - NOTE: The Setup Utility's diagnostics are designed to run in a - DOS-only operating system environment. DO NOT run the diagnostics - from a DOS or command prompt session under Windows 95, Windows NT, - OS/2, or other operating system. - -To run the diagnostics tests on the CS8900/20 adapter: - - 1.) Boot DOS on the PC and start the CS8900/20 Setup Utility. - - 2.) The adapter's current configuration is displayed. Hit the ENTER key to - get to the main menu. - - 4.) Select 'Diagnostics' (ALT-G) from the main menu. - * Select 'Self-Test' to test the adapter's basic functionality. - * Select 'Network Test' to test the network connection and cabling. - - -5.2.1 DIAGNOSTIC SELF-TEST - -The diagnostic self-test checks the adapter's basic functionality as well as -its ability to communicate across the ISA bus based on the system resources -assigned during hardware configuration. The following tests are performed: - - * IO Register Read/Write Test - The IO Register Read/Write test insures that the CS8900/20 can be - accessed in IO mode, and that the IO base address is correct. - - * Shared Memory Test - The Shared Memory test insures the CS8900/20 can be accessed in memory - mode and that the range of memory addresses assigned does not conflict - with other devices in the system. - - * Interrupt Test - The Interrupt test insures there are no conflicts with the assigned IRQ - signal. - - * EEPROM Test - The EEPROM test insures the EEPROM can be read. - - * Chip RAM Test - The Chip RAM test insures the 4K of memory internal to the CS8900/20 is - working properly. - - * Internal Loop-back Test - The Internal Loop Back test insures the adapter's transmitter and - receiver are operating properly. If this test fails, make sure the - adapter's cable is connected to the network (check for LED activity for - example). - - * Boot PROM Test - The Boot PROM test insures the Boot PROM is present, and can be read. - Failure indicates the Boot PROM was not successfully read due to a - hardware problem or due to a conflicts on the Boot PROM address - assignment. (Test only applies if the adapter is configured to use the - Boot PROM option.) - -Failure of a test item indicates a possible system resource conflict with -another device on the ISA bus. In this case, you should use the Manual Setup -option to reconfigure the adapter by selecting a different value for the system -resource that failed. - - -5.2.2 DIAGNOSTIC NETWORK TEST - -The Diagnostic Network Test verifies a working network connection by -transferring data between two CS8900/20 adapters installed in different PCs -on the same network. (Note: the diagnostic network test should not be run -between two nodes across a router.) - -This test requires that each of the two PCs have a CS8900/20-based adapter -installed and have the CS8900/20 Setup Utility running. The first PC is -configured as a Responder and the other PC is configured as an Initiator. -Once the Initiator is started, it sends data frames to the Responder which -returns the frames to the Initiator. - -The total number of frames received and transmitted are displayed on the -Initiator's display, along with a count of the number of frames received and -transmitted OK or in error. The test can be terminated anytime by the user at -either PC. - -To setup the Diagnostic Network Test: - - 1.) Select a PC with a CS8900/20-based adapter and a known working network - connection to act as the Responder. Run the CS8900/20 Setup Utility - and select 'Diagnostics -> Network Test -> Responder' from the main - menu. Hit ENTER to start the Responder. - - 2.) Return to the PC with the CS8900/20-based adapter you want to test and - start the CS8900/20 Setup Utility. - - 3.) From the main menu, Select 'Diagnostic -> Network Test -> Initiator'. - Hit ENTER to start the test. - -You may stop the test on the Initiator at any time while allowing the Responder -to continue running. In this manner, you can move to additional PCs and test -them by starting the Initiator on another PC without having to stop/start the -Responder. - - - -5.3 USING THE ADAPTER'S LEDs - -The 2 and 3-media adapters have two LEDs visible on the back end of the board -located near the 10Base-T connector. - -Link Integrity LED: A "steady" ON of the green LED indicates a valid 10Base-T -connection. (Only applies to 10Base-T. The green LED has no significance for -a 10Base-2 or AUI connection.) - -TX/RX LED: The yellow LED lights briefly each time the adapter transmits or -receives data. (The yellow LED will appear to "flicker" on a typical network.) - - -5.4 RESOLVING I/O CONFLICTS - -An IO conflict occurs when two or more adapter use the same ISA resource (IO -address, memory address or IRQ). You can usually detect an IO conflict in one -of four ways after installing and or configuring the CS8900/20-based adapter: - - 1.) The system does not boot properly (or at all). - - 2.) The driver cannot communicate with the adapter, reporting an "Adapter - not found" error message. - - 3.) You cannot connect to the network or the driver will not load. - - 4.) If you have configured the adapter to run in memory mode but the driver - reports it is using IO mode when loading, this is an indication of a - memory address conflict. - -If an IO conflict occurs, run the CS8900/20 Setup Utility and perform a -diagnostic self-test. Normally, the ISA resource in conflict will fail the -self-test. If so, reconfigure the adapter selecting another choice for the -resource in conflict. Run the diagnostics again to check for further IO -conflicts. - -In some cases, such as when the PC will not boot, it may be necessary to remove -the adapter and reconfigure it by installing it in another PC to run the -CS8900/20 Setup Utility. Once reinstalled in the target system, run the -diagnostics self-test to ensure the new configuration is free of conflicts -before loading the driver again. - -When manually configuring the adapter, keep in mind the typical ISA system -resource usage as indicated in the tables below. - -I/O Address Device IRQ Device ------------ -------- --- -------- - 200-20F Game I/O adapter 3 COM2, Bus Mouse - 230-23F Bus Mouse 4 COM1 - 270-27F LPT3: third parallel port 5 LPT2 - 2F0-2FF COM2: second serial port 6 Floppy Disk controller - 320-32F Fixed disk controller 7 LPT1 - 8 Real-time Clock - 9 EGA/VGA display adapter - 12 Mouse (PS/2) -Memory Address Device 13 Math Coprocessor --------------- --------------------- 14 Hard Disk controller -A000-BFFF EGA Graphics Adapter -A000-C7FF VGA Graphics Adapter -B000-BFFF Mono Graphics Adapter -B800-BFFF Color Graphics Adapter -E000-FFFF AT BIOS - - - - -6.0 TECHNICAL SUPPORT -=============================================================================== - -6.1 CONTACTING CIRRUS LOGIC'S TECHNICAL SUPPORT - -Cirrus Logic's CS89XX Technical Application Support can be reached at: - -Telephone :(800) 888-5016 (from inside U.S. and Canada) - :(512) 442-7555 (from outside the U.S. and Canada) -Fax :(512) 912-3871 -Email :ethernet@crystal.cirrus.com -WWW :http://www.cirrus.com - - -6.2 INFORMATION REQUIRED BEFORE CONTACTING TECHNICAL SUPPORT - -Before contacting Cirrus Logic for technical support, be prepared to provide as -Much of the following information as possible. - -1.) Adapter type (CRD8900, CDB8900, CDB8920, etc.) - -2.) Adapter configuration - - * IO Base, Memory Base, IO or memory mode enabled, IRQ, DMA channel - * Plug and Play enabled/disabled (CS8920-based adapters only) - * Configured for media auto-detect or specific media type (which type). - -3.) PC System's Configuration - - * Plug and Play system (yes/no) - * BIOS (make and version) - * System make and model - * CPU (type and speed) - * System RAM - * SCSI Adapter - -4.) Software - - * CS89XX driver and version - * Your network operating system and version - * Your system's OS version - * Version of all protocol support files - -5.) Any Error Message displayed. - - - -6.3 OBTAINING THE LATEST DRIVER VERSION - -You can obtain the latest CS89XX drivers and support software from Cirrus Logic's -Web site. You can also contact Cirrus Logic's Technical Support (email: -ethernet@crystal.cirrus.com) and request that you be registered for automatic -software-update notification. - -Cirrus Logic maintains a web page at http://www.cirrus.com with the -latest drivers and technical publications. - - -6.4 Current maintainer - -In February 2000 the maintenance of this driver was assumed by Andrew -Morton. - -6.5 Kernel module parameters - -For use in embedded environments with no cs89x0 EEPROM, the kernel boot -parameter `cs89x0_media=' has been implemented. Usage is: - - cs89x0_media=rj45 or - cs89x0_media=aui or - cs89x0_media=bnc - diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 23c4ec9c9125..0b39342e2a1f 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -32,6 +32,7 @@ Contents: amazon/ena aquantia/atlantic chelsio/cxgb + cirrus/cs89x0 .. only:: subproject and html diff --git a/drivers/net/ethernet/cirrus/Kconfig b/drivers/net/ethernet/cirrus/Kconfig index 48f3198381bc..8d845f5ee0c5 100644 --- a/drivers/net/ethernet/cirrus/Kconfig +++ b/drivers/net/ethernet/cirrus/Kconfig @@ -24,7 +24,7 @@ config CS89x0 ---help--- Support for CS89x0 chipset based Ethernet cards. If you have a network (Ethernet) card of this type, say Y and read the file - . + . To compile this driver as a module, choose M here. The module will be called cs89x0. -- cgit v1.2.3-59-g8ed1b From e1ddedb5cbd6f8ec2f41874bc06e03023fbd9d99 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:40 +0200 Subject: docs: networking: device drivers: convert davicom/dm9000.txt to ReST - add SPDX header; - add a document title; - mark lists as such; - mark tables as such; - mark code blocks and literals as such; - use the right horizontal tag markup; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/davicom/dm9000.rst | 171 +++++++++++++++++++++ .../networking/device_drivers/davicom/dm9000.txt | 167 -------------------- Documentation/networking/device_drivers/index.rst | 1 + 3 files changed, 172 insertions(+), 167 deletions(-) create mode 100644 Documentation/networking/device_drivers/davicom/dm9000.rst delete mode 100644 Documentation/networking/device_drivers/davicom/dm9000.txt diff --git a/Documentation/networking/device_drivers/davicom/dm9000.rst b/Documentation/networking/device_drivers/davicom/dm9000.rst new file mode 100644 index 000000000000..d5458da01083 --- /dev/null +++ b/Documentation/networking/device_drivers/davicom/dm9000.rst @@ -0,0 +1,171 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +DM9000 Network driver +===================== + +Copyright 2008 Simtec Electronics, + + Ben Dooks + + +Introduction +------------ + +This file describes how to use the DM9000 platform-device based network driver +that is contained in the files drivers/net/dm9000.c and drivers/net/dm9000.h. + +The driver supports three DM9000 variants, the DM9000E which is the first chip +supported as well as the newer DM9000A and DM9000B devices. It is currently +maintained and tested by Ben Dooks, who should be CC: to any patches for this +driver. + + +Defining the platform device +---------------------------- + +The minimum set of resources attached to the platform device are as follows: + + 1) The physical address of the address register + 2) The physical address of the data register + 3) The IRQ line the device's interrupt pin is connected to. + +These resources should be specified in that order, as the ordering of the +two address regions is important (the driver expects these to be address +and then data). + +An example from arch/arm/mach-s3c2410/mach-bast.c is:: + + static struct resource bast_dm9k_resource[] = { + [0] = { + .start = S3C2410_CS5 + BAST_PA_DM9000, + .end = S3C2410_CS5 + BAST_PA_DM9000 + 3, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = S3C2410_CS5 + BAST_PA_DM9000 + 0x40, + .end = S3C2410_CS5 + BAST_PA_DM9000 + 0x40 + 0x3f, + .flags = IORESOURCE_MEM, + }, + [2] = { + .start = IRQ_DM9000, + .end = IRQ_DM9000, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, + } + }; + + static struct platform_device bast_device_dm9k = { + .name = "dm9000", + .id = 0, + .num_resources = ARRAY_SIZE(bast_dm9k_resource), + .resource = bast_dm9k_resource, + }; + +Note the setting of the IRQ trigger flag in bast_dm9k_resource[2].flags, +as this will generate a warning if it is not present. The trigger from +the flags field will be passed to request_irq() when registering the IRQ +handler to ensure that the IRQ is setup correctly. + +This shows a typical platform device, without the optional configuration +platform data supplied. The next example uses the same resources, but adds +the optional platform data to pass extra configuration data:: + + static struct dm9000_plat_data bast_dm9k_platdata = { + .flags = DM9000_PLATF_16BITONLY, + }; + + static struct platform_device bast_device_dm9k = { + .name = "dm9000", + .id = 0, + .num_resources = ARRAY_SIZE(bast_dm9k_resource), + .resource = bast_dm9k_resource, + .dev = { + .platform_data = &bast_dm9k_platdata, + } + }; + +The platform data is defined in include/linux/dm9000.h and described below. + + +Platform data +------------- + +Extra platform data for the DM9000 can describe the IO bus width to the +device, whether or not an external PHY is attached to the device and +the availability of an external configuration EEPROM. + +The flags for the platform data .flags field are as follows: + +DM9000_PLATF_8BITONLY + + The IO should be done with 8bit operations. + +DM9000_PLATF_16BITONLY + + The IO should be done with 16bit operations. + +DM9000_PLATF_32BITONLY + + The IO should be done with 32bit operations. + +DM9000_PLATF_EXT_PHY + + The chip is connected to an external PHY. + +DM9000_PLATF_NO_EEPROM + + This can be used to signify that the board does not have an + EEPROM, or that the EEPROM should be hidden from the user. + +DM9000_PLATF_SIMPLE_PHY + + Switch to using the simpler PHY polling method which does not + try and read the MII PHY state regularly. This is only available + when using the internal PHY. See the section on link state polling + for more information. + + The config symbol DM9000_FORCE_SIMPLE_PHY_POLL, Kconfig entry + "Force simple NSR based PHY polling" allows this flag to be + forced on at build time. + + +PHY Link state polling +---------------------- + +The driver keeps track of the link state and informs the network core +about link (carrier) availability. This is managed by several methods +depending on the version of the chip and on which PHY is being used. + +For the internal PHY, the original (and currently default) method is +to read the MII state, either when the status changes if we have the +necessary interrupt support in the chip or every two seconds via a +periodic timer. + +To reduce the overhead for the internal PHY, there is now the option +of using the DM9000_FORCE_SIMPLE_PHY_POLL config, or DM9000_PLATF_SIMPLE_PHY +platform data option to read the summary information without the +expensive MII accesses. This method is faster, but does not print +as much information. + +When using an external PHY, the driver currently has to poll the MII +link status as there is no method for getting an interrupt on link change. + + +DM9000A / DM9000B +----------------- + +These chips are functionally similar to the DM9000E and are supported easily +by the same driver. The features are: + + 1) Interrupt on internal PHY state change. This means that the periodic + polling of the PHY status may be disabled on these devices when using + the internal PHY. + + 2) TCP/UDP checksum offloading, which the driver does not currently support. + + +ethtool +------- + +The driver supports the ethtool interface for access to the driver +state information, the PHY state and the EEPROM. diff --git a/Documentation/networking/device_drivers/davicom/dm9000.txt b/Documentation/networking/device_drivers/davicom/dm9000.txt deleted file mode 100644 index 5552e2e575c5..000000000000 --- a/Documentation/networking/device_drivers/davicom/dm9000.txt +++ /dev/null @@ -1,167 +0,0 @@ -DM9000 Network driver -===================== - -Copyright 2008 Simtec Electronics, - Ben Dooks - - -Introduction ------------- - -This file describes how to use the DM9000 platform-device based network driver -that is contained in the files drivers/net/dm9000.c and drivers/net/dm9000.h. - -The driver supports three DM9000 variants, the DM9000E which is the first chip -supported as well as the newer DM9000A and DM9000B devices. It is currently -maintained and tested by Ben Dooks, who should be CC: to any patches for this -driver. - - -Defining the platform device ----------------------------- - -The minimum set of resources attached to the platform device are as follows: - - 1) The physical address of the address register - 2) The physical address of the data register - 3) The IRQ line the device's interrupt pin is connected to. - -These resources should be specified in that order, as the ordering of the -two address regions is important (the driver expects these to be address -and then data). - -An example from arch/arm/mach-s3c2410/mach-bast.c is: - -static struct resource bast_dm9k_resource[] = { - [0] = { - .start = S3C2410_CS5 + BAST_PA_DM9000, - .end = S3C2410_CS5 + BAST_PA_DM9000 + 3, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = S3C2410_CS5 + BAST_PA_DM9000 + 0x40, - .end = S3C2410_CS5 + BAST_PA_DM9000 + 0x40 + 0x3f, - .flags = IORESOURCE_MEM, - }, - [2] = { - .start = IRQ_DM9000, - .end = IRQ_DM9000, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, - } -}; - -static struct platform_device bast_device_dm9k = { - .name = "dm9000", - .id = 0, - .num_resources = ARRAY_SIZE(bast_dm9k_resource), - .resource = bast_dm9k_resource, -}; - -Note the setting of the IRQ trigger flag in bast_dm9k_resource[2].flags, -as this will generate a warning if it is not present. The trigger from -the flags field will be passed to request_irq() when registering the IRQ -handler to ensure that the IRQ is setup correctly. - -This shows a typical platform device, without the optional configuration -platform data supplied. The next example uses the same resources, but adds -the optional platform data to pass extra configuration data: - -static struct dm9000_plat_data bast_dm9k_platdata = { - .flags = DM9000_PLATF_16BITONLY, -}; - -static struct platform_device bast_device_dm9k = { - .name = "dm9000", - .id = 0, - .num_resources = ARRAY_SIZE(bast_dm9k_resource), - .resource = bast_dm9k_resource, - .dev = { - .platform_data = &bast_dm9k_platdata, - } -}; - -The platform data is defined in include/linux/dm9000.h and described below. - - -Platform data -------------- - -Extra platform data for the DM9000 can describe the IO bus width to the -device, whether or not an external PHY is attached to the device and -the availability of an external configuration EEPROM. - -The flags for the platform data .flags field are as follows: - -DM9000_PLATF_8BITONLY - - The IO should be done with 8bit operations. - -DM9000_PLATF_16BITONLY - - The IO should be done with 16bit operations. - -DM9000_PLATF_32BITONLY - - The IO should be done with 32bit operations. - -DM9000_PLATF_EXT_PHY - - The chip is connected to an external PHY. - -DM9000_PLATF_NO_EEPROM - - This can be used to signify that the board does not have an - EEPROM, or that the EEPROM should be hidden from the user. - -DM9000_PLATF_SIMPLE_PHY - - Switch to using the simpler PHY polling method which does not - try and read the MII PHY state regularly. This is only available - when using the internal PHY. See the section on link state polling - for more information. - - The config symbol DM9000_FORCE_SIMPLE_PHY_POLL, Kconfig entry - "Force simple NSR based PHY polling" allows this flag to be - forced on at build time. - - -PHY Link state polling ----------------------- - -The driver keeps track of the link state and informs the network core -about link (carrier) availability. This is managed by several methods -depending on the version of the chip and on which PHY is being used. - -For the internal PHY, the original (and currently default) method is -to read the MII state, either when the status changes if we have the -necessary interrupt support in the chip or every two seconds via a -periodic timer. - -To reduce the overhead for the internal PHY, there is now the option -of using the DM9000_FORCE_SIMPLE_PHY_POLL config, or DM9000_PLATF_SIMPLE_PHY -platform data option to read the summary information without the -expensive MII accesses. This method is faster, but does not print -as much information. - -When using an external PHY, the driver currently has to poll the MII -link status as there is no method for getting an interrupt on link change. - - -DM9000A / DM9000B ------------------ - -These chips are functionally similar to the DM9000E and are supported easily -by the same driver. The features are: - - 1) Interrupt on internal PHY state change. This means that the periodic - polling of the PHY status may be disabled on these devices when using - the internal PHY. - - 2) TCP/UDP checksum offloading, which the driver does not currently support. - - -ethtool -------- - -The driver supports the ethtool interface for access to the driver -state information, the PHY state and the EEPROM. diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 0b39342e2a1f..e8db57fef2e9 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -33,6 +33,7 @@ Contents: aquantia/atlantic chelsio/cxgb cirrus/cs89x0 + davicom/dm9000 .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From b6671d71ca811aed02f136a6cd812a542f88c483 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:41 +0200 Subject: docs: networking: device drivers: convert dec/de4x5.txt to ReST - add SPDX header; - add a document title; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/dec/de4x5.rst | 189 +++++++++++++++++++++ .../networking/device_drivers/dec/de4x5.txt | 178 ------------------- Documentation/networking/device_drivers/index.rst | 1 + drivers/net/ethernet/dec/tulip/Kconfig | 2 +- 4 files changed, 191 insertions(+), 179 deletions(-) create mode 100644 Documentation/networking/device_drivers/dec/de4x5.rst delete mode 100644 Documentation/networking/device_drivers/dec/de4x5.txt diff --git a/Documentation/networking/device_drivers/dec/de4x5.rst b/Documentation/networking/device_drivers/dec/de4x5.rst new file mode 100644 index 000000000000..e03e9c631879 --- /dev/null +++ b/Documentation/networking/device_drivers/dec/de4x5.rst @@ -0,0 +1,189 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== +DEC EtherWORKS Ethernet De4x5 cards +=================================== + + Originally, this driver was written for the Digital Equipment + Corporation series of EtherWORKS Ethernet cards: + + - DE425 TP/COAX EISA + - DE434 TP PCI + - DE435 TP/COAX/AUI PCI + - DE450 TP/COAX/AUI PCI + - DE500 10/100 PCI Fasternet + + but it will now attempt to support all cards which conform to the + Digital Semiconductor SROM Specification. The driver currently + recognises the following chips: + + - DC21040 (no SROM) + - DC21041[A] + - DC21140[A] + - DC21142 + - DC21143 + + So far the driver is known to work with the following cards: + + - KINGSTON + - Linksys + - ZNYX342 + - SMC8432 + - SMC9332 (w/new SROM) + - ZNYX31[45] + - ZNYX346 10/100 4 port (can act as a 10/100 bridge!) + + The driver has been tested on a relatively busy network using the DE425, + DE434, DE435 and DE500 cards and benchmarked with 'ttcp': it transferred + 16M of data to a DECstation 5000/200 as follows:: + + TCP UDP + TX RX TX RX + DE425 1030k 997k 1170k 1128k + DE434 1063k 995k 1170k 1125k + DE435 1063k 995k 1170k 1125k + DE500 1063k 998k 1170k 1125k in 10Mb/s mode + + All values are typical (in kBytes/sec) from a sample of 4 for each + measurement. Their error is +/-20k on a quiet (private) network and also + depend on what load the CPU has. + +---------------------------------------------------------------------------- + + The ability to load this driver as a loadable module has been included + and used extensively during the driver development (to save those long + reboot sequences). Loadable module support under PCI and EISA has been + achieved by letting the driver autoprobe as if it were compiled into the + kernel. Do make sure you're not sharing interrupts with anything that + cannot accommodate interrupt sharing! + + To utilise this ability, you have to do 8 things: + + 0) have a copy of the loadable modules code installed on your system. + 1) copy de4x5.c from the /linux/drivers/net directory to your favourite + temporary directory. + 2) for fixed autoprobes (not recommended), edit the source code near + line 5594 to reflect the I/O address you're using, or assign these when + loading by:: + + insmod de4x5 io=0xghh where g = bus number + hh = device number + + .. note:: + + autoprobing for modules is now supported by default. You may just + use:: + + insmod de4x5 + + to load all available boards. For a specific board, still use + the 'io=?' above. + 3) compile de4x5.c, but include -DMODULE in the command line to ensure + that the correct bits are compiled (see end of source code). + 4) if you are wanting to add a new card, goto 5. Otherwise, recompile a + kernel with the de4x5 configuration turned off and reboot. + 5) insmod de4x5 [io=0xghh] + 6) run the net startup bits for your new eth?? interface(s) manually + (usually /etc/rc.inet[12] at boot time). + 7) enjoy! + + To unload a module, turn off the associated interface(s) + 'ifconfig eth?? down' then 'rmmod de4x5'. + + Automedia detection is included so that in principle you can disconnect + from, e.g. TP, reconnect to BNC and things will still work (after a + pause while the driver figures out where its media went). My tests + using ping showed that it appears to work.... + + By default, the driver will now autodetect any DECchip based card. + Should you have a need to restrict the driver to DIGITAL only cards, you + can compile with a DEC_ONLY define, or if loading as a module, use the + 'dec_only=1' parameter. + + I've changed the timing routines to use the kernel timer and scheduling + functions so that the hangs and other assorted problems that occurred + while autosensing the media should be gone. A bonus for the DC21040 + auto media sense algorithm is that it can now use one that is more in + line with the rest (the DC21040 chip doesn't have a hardware timer). + The downside is the 1 'jiffies' (10ms) resolution. + + IEEE 802.3u MII interface code has been added in anticipation that some + products may use it in the future. + + The SMC9332 card has a non-compliant SROM which needs fixing - I have + patched this driver to detect it because the SROM format used complies + to a previous DEC-STD format. + + I have removed the buffer copies needed for receive on Intels. I cannot + remove them for Alphas since the Tulip hardware only does longword + aligned DMA transfers and the Alphas get alignment traps with non + longword aligned data copies (which makes them really slow). No comment. + + I have added SROM decoding routines to make this driver work with any + card that supports the Digital Semiconductor SROM spec. This will help + all cards running the dc2114x series chips in particular. Cards using + the dc2104x chips should run correctly with the basic driver. I'm in + debt to for the testing and feedback that helped get + this feature working. So far we have tested KINGSTON, SMC8432, SMC9332 + (with the latest SROM complying with the SROM spec V3: their first was + broken), ZNYX342 and LinkSys. ZNYX314 (dual 21041 MAC) and ZNYX 315 + (quad 21041 MAC) cards also appear to work despite their incorrectly + wired IRQs. + + I have added a temporary fix for interrupt problems when some SCSI cards + share the same interrupt as the DECchip based cards. The problem occurs + because the SCSI card wants to grab the interrupt as a fast interrupt + (runs the service routine with interrupts turned off) vs. this card + which really needs to run the service routine with interrupts turned on. + This driver will now add the interrupt service routine as a fast + interrupt if it is bounced from the slow interrupt. THIS IS NOT A + RECOMMENDED WAY TO RUN THE DRIVER and has been done for a limited time + until people sort out their compatibility issues and the kernel + interrupt service code is fixed. YOU SHOULD SEPARATE OUT THE FAST + INTERRUPT CARDS FROM THE SLOW INTERRUPT CARDS to ensure that they do not + run on the same interrupt. PCMCIA/CardBus is another can of worms... + + Finally, I think I have really fixed the module loading problem with + more than one DECchip based card. As a side effect, I don't mess with + the device structure any more which means that if more than 1 card in + 2.0.x is installed (4 in 2.1.x), the user will have to edit + linux/drivers/net/Space.c to make room for them. Hence, module loading + is the preferred way to use this driver, since it doesn't have this + limitation. + + Where SROM media detection is used and full duplex is specified in the + SROM, the feature is ignored unless lp->params.fdx is set at compile + time OR during a module load (insmod de4x5 args='eth??:fdx' [see + below]). This is because there is no way to automatically detect full + duplex links except through autonegotiation. When I include the + autonegotiation feature in the SROM autoconf code, this detection will + occur automatically for that case. + + Command line arguments are now allowed, similar to passing arguments + through LILO. This will allow a per adapter board set up of full duplex + and media. The only lexical constraints are: the board name (dev->name) + appears in the list before its parameters. The list of parameters ends + either at the end of the parameter list or with another board name. The + following parameters are allowed: + + ========= =============================================== + fdx for full duplex + autosense to set the media/speed; with the following + sub-parameters: + TP, TP_NW, BNC, AUI, BNC_AUI, 100Mb, 10Mb, AUTO + ========= =============================================== + + Case sensitivity is important for the sub-parameters. They *must* be + upper case. Examples:: + + insmod de4x5 args='eth1:fdx autosense=BNC eth0:autosense=100Mb'. + + For a compiled in driver, in linux/drivers/net/CONFIG, place e.g.:: + + DE4X5_OPTS = -DDE4X5_PARM='"eth0:fdx autosense=AUI eth2:autosense=TP"' + + Yes, I know full duplex isn't permissible on BNC or AUI; they're just + examples. By default, full duplex is turned off and AUTO is the default + autosense setting. In reality, I expect only the full duplex option to + be used. Note the use of single quotes in the two examples above and the + lack of commas to separate items. diff --git a/Documentation/networking/device_drivers/dec/de4x5.txt b/Documentation/networking/device_drivers/dec/de4x5.txt deleted file mode 100644 index 452aac58341d..000000000000 --- a/Documentation/networking/device_drivers/dec/de4x5.txt +++ /dev/null @@ -1,178 +0,0 @@ - Originally, this driver was written for the Digital Equipment - Corporation series of EtherWORKS Ethernet cards: - - DE425 TP/COAX EISA - DE434 TP PCI - DE435 TP/COAX/AUI PCI - DE450 TP/COAX/AUI PCI - DE500 10/100 PCI Fasternet - - but it will now attempt to support all cards which conform to the - Digital Semiconductor SROM Specification. The driver currently - recognises the following chips: - - DC21040 (no SROM) - DC21041[A] - DC21140[A] - DC21142 - DC21143 - - So far the driver is known to work with the following cards: - - KINGSTON - Linksys - ZNYX342 - SMC8432 - SMC9332 (w/new SROM) - ZNYX31[45] - ZNYX346 10/100 4 port (can act as a 10/100 bridge!) - - The driver has been tested on a relatively busy network using the DE425, - DE434, DE435 and DE500 cards and benchmarked with 'ttcp': it transferred - 16M of data to a DECstation 5000/200 as follows: - - TCP UDP - TX RX TX RX - DE425 1030k 997k 1170k 1128k - DE434 1063k 995k 1170k 1125k - DE435 1063k 995k 1170k 1125k - DE500 1063k 998k 1170k 1125k in 10Mb/s mode - - All values are typical (in kBytes/sec) from a sample of 4 for each - measurement. Their error is +/-20k on a quiet (private) network and also - depend on what load the CPU has. - - ========================================================================= - - The ability to load this driver as a loadable module has been included - and used extensively during the driver development (to save those long - reboot sequences). Loadable module support under PCI and EISA has been - achieved by letting the driver autoprobe as if it were compiled into the - kernel. Do make sure you're not sharing interrupts with anything that - cannot accommodate interrupt sharing! - - To utilise this ability, you have to do 8 things: - - 0) have a copy of the loadable modules code installed on your system. - 1) copy de4x5.c from the /linux/drivers/net directory to your favourite - temporary directory. - 2) for fixed autoprobes (not recommended), edit the source code near - line 5594 to reflect the I/O address you're using, or assign these when - loading by: - - insmod de4x5 io=0xghh where g = bus number - hh = device number - - NB: autoprobing for modules is now supported by default. You may just - use: - - insmod de4x5 - - to load all available boards. For a specific board, still use - the 'io=?' above. - 3) compile de4x5.c, but include -DMODULE in the command line to ensure - that the correct bits are compiled (see end of source code). - 4) if you are wanting to add a new card, goto 5. Otherwise, recompile a - kernel with the de4x5 configuration turned off and reboot. - 5) insmod de4x5 [io=0xghh] - 6) run the net startup bits for your new eth?? interface(s) manually - (usually /etc/rc.inet[12] at boot time). - 7) enjoy! - - To unload a module, turn off the associated interface(s) - 'ifconfig eth?? down' then 'rmmod de4x5'. - - Automedia detection is included so that in principle you can disconnect - from, e.g. TP, reconnect to BNC and things will still work (after a - pause while the driver figures out where its media went). My tests - using ping showed that it appears to work.... - - By default, the driver will now autodetect any DECchip based card. - Should you have a need to restrict the driver to DIGITAL only cards, you - can compile with a DEC_ONLY define, or if loading as a module, use the - 'dec_only=1' parameter. - - I've changed the timing routines to use the kernel timer and scheduling - functions so that the hangs and other assorted problems that occurred - while autosensing the media should be gone. A bonus for the DC21040 - auto media sense algorithm is that it can now use one that is more in - line with the rest (the DC21040 chip doesn't have a hardware timer). - The downside is the 1 'jiffies' (10ms) resolution. - - IEEE 802.3u MII interface code has been added in anticipation that some - products may use it in the future. - - The SMC9332 card has a non-compliant SROM which needs fixing - I have - patched this driver to detect it because the SROM format used complies - to a previous DEC-STD format. - - I have removed the buffer copies needed for receive on Intels. I cannot - remove them for Alphas since the Tulip hardware only does longword - aligned DMA transfers and the Alphas get alignment traps with non - longword aligned data copies (which makes them really slow). No comment. - - I have added SROM decoding routines to make this driver work with any - card that supports the Digital Semiconductor SROM spec. This will help - all cards running the dc2114x series chips in particular. Cards using - the dc2104x chips should run correctly with the basic driver. I'm in - debt to for the testing and feedback that helped get - this feature working. So far we have tested KINGSTON, SMC8432, SMC9332 - (with the latest SROM complying with the SROM spec V3: their first was - broken), ZNYX342 and LinkSys. ZNYX314 (dual 21041 MAC) and ZNYX 315 - (quad 21041 MAC) cards also appear to work despite their incorrectly - wired IRQs. - - I have added a temporary fix for interrupt problems when some SCSI cards - share the same interrupt as the DECchip based cards. The problem occurs - because the SCSI card wants to grab the interrupt as a fast interrupt - (runs the service routine with interrupts turned off) vs. this card - which really needs to run the service routine with interrupts turned on. - This driver will now add the interrupt service routine as a fast - interrupt if it is bounced from the slow interrupt. THIS IS NOT A - RECOMMENDED WAY TO RUN THE DRIVER and has been done for a limited time - until people sort out their compatibility issues and the kernel - interrupt service code is fixed. YOU SHOULD SEPARATE OUT THE FAST - INTERRUPT CARDS FROM THE SLOW INTERRUPT CARDS to ensure that they do not - run on the same interrupt. PCMCIA/CardBus is another can of worms... - - Finally, I think I have really fixed the module loading problem with - more than one DECchip based card. As a side effect, I don't mess with - the device structure any more which means that if more than 1 card in - 2.0.x is installed (4 in 2.1.x), the user will have to edit - linux/drivers/net/Space.c to make room for them. Hence, module loading - is the preferred way to use this driver, since it doesn't have this - limitation. - - Where SROM media detection is used and full duplex is specified in the - SROM, the feature is ignored unless lp->params.fdx is set at compile - time OR during a module load (insmod de4x5 args='eth??:fdx' [see - below]). This is because there is no way to automatically detect full - duplex links except through autonegotiation. When I include the - autonegotiation feature in the SROM autoconf code, this detection will - occur automatically for that case. - - Command line arguments are now allowed, similar to passing arguments - through LILO. This will allow a per adapter board set up of full duplex - and media. The only lexical constraints are: the board name (dev->name) - appears in the list before its parameters. The list of parameters ends - either at the end of the parameter list or with another board name. The - following parameters are allowed: - - fdx for full duplex - autosense to set the media/speed; with the following - sub-parameters: - TP, TP_NW, BNC, AUI, BNC_AUI, 100Mb, 10Mb, AUTO - - Case sensitivity is important for the sub-parameters. They *must* be - upper case. Examples: - - insmod de4x5 args='eth1:fdx autosense=BNC eth0:autosense=100Mb'. - - For a compiled in driver, in linux/drivers/net/CONFIG, place e.g. - DE4X5_OPTS = -DDE4X5_PARM='"eth0:fdx autosense=AUI eth2:autosense=TP"' - - Yes, I know full duplex isn't permissible on BNC or AUI; they're just - examples. By default, full duplex is turned off and AUTO is the default - autosense setting. In reality, I expect only the full duplex option to - be used. Note the use of single quotes in the two examples above and the - lack of commas to separate items. diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index e8db57fef2e9..4ad13ffb5800 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -34,6 +34,7 @@ Contents: chelsio/cxgb cirrus/cs89x0 davicom/dm9000 + dec/de4x5 .. only:: subproject and html diff --git a/drivers/net/ethernet/dec/tulip/Kconfig b/drivers/net/ethernet/dec/tulip/Kconfig index 8ce6888ea722..8c4245d94bb2 100644 --- a/drivers/net/ethernet/dec/tulip/Kconfig +++ b/drivers/net/ethernet/dec/tulip/Kconfig @@ -114,7 +114,7 @@ config DE4X5 These include the DE425, DE434, DE435, DE450 and DE500 models. If you have a network card of this type, say Y. More specific information is contained in - . + . To compile this driver as a module, choose M here. The module will be called de4x5. -- cgit v1.2.3-59-g8ed1b From c981977d3a5ce55c96b1b77f42d0a9df0a79244e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:42 +0200 Subject: docs: networking: device drivers: convert dec/dmfe.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - comment out text-only TOC from html/pdf output; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/dec/dmfe.rst | 71 ++++++++++++++++++++++ .../networking/device_drivers/dec/dmfe.txt | 66 -------------------- Documentation/networking/device_drivers/index.rst | 1 + MAINTAINERS | 2 +- drivers/net/ethernet/dec/tulip/Kconfig | 2 +- 5 files changed, 74 insertions(+), 68 deletions(-) create mode 100644 Documentation/networking/device_drivers/dec/dmfe.rst delete mode 100644 Documentation/networking/device_drivers/dec/dmfe.txt diff --git a/Documentation/networking/device_drivers/dec/dmfe.rst b/Documentation/networking/device_drivers/dec/dmfe.rst new file mode 100644 index 000000000000..c4cf809cad84 --- /dev/null +++ b/Documentation/networking/device_drivers/dec/dmfe.rst @@ -0,0 +1,71 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================================== +Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver for Linux +============================================================== + +Note: This driver doesn't have a maintainer. + + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + + +This driver provides kernel support for Davicom DM9102(A)/DM9132/DM9801 ethernet cards ( CNET +10/100 ethernet cards uses Davicom chipset too, so this driver supports CNET cards too ).If you +didn't compile this driver as a module, it will automatically load itself on boot and print a +line similar to:: + + dmfe: Davicom DM9xxx net driver, version 1.36.4 (2002-01-17) + +If you compiled this driver as a module, you have to load it on boot.You can load it with command:: + + insmod dmfe + +This way it will autodetect the device mode.This is the suggested way to load the module.Or you can pass +a mode= setting to module while loading, like:: + + insmod dmfe mode=0 # Force 10M Half Duplex + insmod dmfe mode=1 # Force 100M Half Duplex + insmod dmfe mode=4 # Force 10M Full Duplex + insmod dmfe mode=5 # Force 100M Full Duplex + +Next you should configure your network interface with a command similar to:: + + ifconfig eth0 172.22.3.18 + ^^^^^^^^^^^ + Your IP Address + +Then you may have to modify the default routing table with command:: + + route add default eth0 + + +Now your ethernet card should be up and running. + + +TODO: + +- Implement pci_driver::suspend() and pci_driver::resume() power management methods. +- Check on 64 bit boxes. +- Check and fix on big endian boxes. +- Test and make sure PCI latency is now correct for all cases. + + +Authors: + +Sten Wang : Original Author + +Contributors: + +- Marcelo Tosatti +- Alan Cox +- Jeff Garzik +- Vojtech Pavlik diff --git a/Documentation/networking/device_drivers/dec/dmfe.txt b/Documentation/networking/device_drivers/dec/dmfe.txt deleted file mode 100644 index 25320bf19c86..000000000000 --- a/Documentation/networking/device_drivers/dec/dmfe.txt +++ /dev/null @@ -1,66 +0,0 @@ -Note: This driver doesn't have a maintainer. - -Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver for Linux. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - - -This driver provides kernel support for Davicom DM9102(A)/DM9132/DM9801 ethernet cards ( CNET -10/100 ethernet cards uses Davicom chipset too, so this driver supports CNET cards too ).If you -didn't compile this driver as a module, it will automatically load itself on boot and print a -line similar to : - - dmfe: Davicom DM9xxx net driver, version 1.36.4 (2002-01-17) - -If you compiled this driver as a module, you have to load it on boot.You can load it with command : - - insmod dmfe - -This way it will autodetect the device mode.This is the suggested way to load the module.Or you can pass -a mode= setting to module while loading, like : - - insmod dmfe mode=0 # Force 10M Half Duplex - insmod dmfe mode=1 # Force 100M Half Duplex - insmod dmfe mode=4 # Force 10M Full Duplex - insmod dmfe mode=5 # Force 100M Full Duplex - -Next you should configure your network interface with a command similar to : - - ifconfig eth0 172.22.3.18 - ^^^^^^^^^^^ - Your IP Address - -Then you may have to modify the default routing table with command : - - route add default eth0 - - -Now your ethernet card should be up and running. - - -TODO: - -Implement pci_driver::suspend() and pci_driver::resume() power management methods. -Check on 64 bit boxes. -Check and fix on big endian boxes. -Test and make sure PCI latency is now correct for all cases. - - -Authors: - -Sten Wang : Original Author - -Contributors: - -Marcelo Tosatti -Alan Cox -Jeff Garzik -Vojtech Pavlik diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 4ad13ffb5800..09728e964ce1 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -35,6 +35,7 @@ Contents: cirrus/cs89x0 davicom/dm9000 dec/de4x5 + dec/dmfe .. only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index b5cfee17635e..f0b18c156176 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4694,7 +4694,7 @@ F: net/ax25/sysctl_net_ax25.c DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER L: netdev@vger.kernel.org S: Orphan -F: Documentation/networking/device_drivers/dec/dmfe.txt +F: Documentation/networking/device_drivers/dec/dmfe.rst F: drivers/net/ethernet/dec/tulip/dmfe.c DC390/AM53C974 SCSI driver diff --git a/drivers/net/ethernet/dec/tulip/Kconfig b/drivers/net/ethernet/dec/tulip/Kconfig index 8c4245d94bb2..177f36f4b89d 100644 --- a/drivers/net/ethernet/dec/tulip/Kconfig +++ b/drivers/net/ethernet/dec/tulip/Kconfig @@ -138,7 +138,7 @@ config DM9102 This driver is for DM9102(A)/DM9132/DM9801 compatible PCI cards from Davicom (). If you have such a network (Ethernet) card, say Y. Some information is contained in the file - . + . To compile this driver as a module, choose M here. The module will be called dmfe. -- cgit v1.2.3-59-g8ed1b From ca705e4793f024afb8e86030e08b1e0f16dcc07c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:43 +0200 Subject: docs: networking: device drivers: convert dlink/dl2k.txt to ReST - add SPDX header; - mark code blocks and literals as such; - mark lists as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/dlink/dl2k.rst | 314 +++++++++++++++++++++ .../networking/device_drivers/dlink/dl2k.txt | 282 ------------------ Documentation/networking/device_drivers/index.rst | 1 + drivers/net/ethernet/dlink/dl2k.c | 2 +- 4 files changed, 316 insertions(+), 283 deletions(-) create mode 100644 Documentation/networking/device_drivers/dlink/dl2k.rst delete mode 100644 Documentation/networking/device_drivers/dlink/dl2k.txt diff --git a/Documentation/networking/device_drivers/dlink/dl2k.rst b/Documentation/networking/device_drivers/dlink/dl2k.rst new file mode 100644 index 000000000000..ccdb5d0d7460 --- /dev/null +++ b/Documentation/networking/device_drivers/dlink/dl2k.rst @@ -0,0 +1,314 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================================= +D-Link DL2000-based Gigabit Ethernet Adapter Installation +========================================================= + +May 23, 2002 + +.. Contents + + - Compatibility List + - Quick Install + - Compiling the Driver + - Installing the Driver + - Option parameter + - Configuration Script Sample + - Troubleshooting + + +Compatibility List +================== + +Adapter Support: + +- D-Link DGE-550T Gigabit Ethernet Adapter. +- D-Link DGE-550SX Gigabit Ethernet Adapter. +- D-Link DL2000-based Gigabit Ethernet Adapter. + + +The driver support Linux kernel 2.4.7 later. We had tested it +on the environments below. + + . Red Hat v6.2 (update kernel to 2.4.7) + . Red Hat v7.0 (update kernel to 2.4.7) + . Red Hat v7.1 (kernel 2.4.7) + . Red Hat v7.2 (kernel 2.4.7-10) + + +Quick Install +============= +Install linux driver as following command:: + + 1. make all + 2. insmod dl2k.ko + 3. ifconfig eth0 up 10.xxx.xxx.xxx netmask 255.0.0.0 + ^^^^^^^^^^^^^^^\ ^^^^^^^^\ + IP NETMASK + +Now eth0 should active, you can test it by "ping" or get more information by +"ifconfig". If tested ok, continue the next step. + +4. ``cp dl2k.ko /lib/modules/`uname -r`/kernel/drivers/net`` +5. Add the following line to /etc/modprobe.d/dl2k.conf:: + + alias eth0 dl2k + +6. Run ``depmod`` to updated module indexes. +7. Run ``netconfig`` or ``netconf`` to create configuration script ifcfg-eth0 + located at /etc/sysconfig/network-scripts or create it manually. + + [see - Configuration Script Sample] +8. Driver will automatically load and configure at next boot time. + +Compiling the Driver +==================== +In Linux, NIC drivers are most commonly configured as loadable modules. +The approach of building a monolithic kernel has become obsolete. The driver +can be compiled as part of a monolithic kernel, but is strongly discouraged. +The remainder of this section assumes the driver is built as a loadable module. +In the Linux environment, it is a good idea to rebuild the driver from the +source instead of relying on a precompiled version. This approach provides +better reliability since a precompiled driver might depend on libraries or +kernel features that are not present in a given Linux installation. + +The 3 files necessary to build Linux device driver are dl2k.c, dl2k.h and +Makefile. To compile, the Linux installation must include the gcc compiler, +the kernel source, and the kernel headers. The Linux driver supports Linux +Kernels 2.4.7. Copy the files to a directory and enter the following command +to compile and link the driver: + +CD-ROM drive +------------ + +:: + + [root@XXX /] mkdir cdrom + [root@XXX /] mount -r -t iso9660 -o conv=auto /dev/cdrom /cdrom + [root@XXX /] cd root + [root@XXX /root] mkdir dl2k + [root@XXX /root] cd dl2k + [root@XXX dl2k] cp /cdrom/linux/dl2k.tgz /root/dl2k + [root@XXX dl2k] tar xfvz dl2k.tgz + [root@XXX dl2k] make all + +Floppy disc drive +----------------- + +:: + + [root@XXX /] cd root + [root@XXX /root] mkdir dl2k + [root@XXX /root] cd dl2k + [root@XXX dl2k] mcopy a:/linux/dl2k.tgz /root/dl2k + [root@XXX dl2k] tar xfvz dl2k.tgz + [root@XXX dl2k] make all + +Installing the Driver +===================== + +Manual Installation +------------------- + + Once the driver has been compiled, it must be loaded, enabled, and bound + to a protocol stack in order to establish network connectivity. To load a + module enter the command:: + + insmod dl2k.o + + or:: + + insmod dl2k.o ; add parameter + +--------------------------------------------------------- + + example:: + + insmod dl2k.o media=100mbps_hd + + or:: + + insmod dl2k.o media=3 + + or:: + + insmod dl2k.o media=3,2 ; for 2 cards + +--------------------------------------------------------- + + Please reference the list of the command line parameters supported by + the Linux device driver below. + + The insmod command only loads the driver and gives it a name of the form + eth0, eth1, etc. To bring the NIC into an operational state, + it is necessary to issue the following command:: + + ifconfig eth0 up + + Finally, to bind the driver to the active protocol (e.g., TCP/IP with + Linux), enter the following command:: + + ifup eth0 + + Note that this is meaningful only if the system can find a configuration + script that contains the necessary network information. A sample will be + given in the next paragraph. + + The commands to unload a driver are as follows:: + + ifdown eth0 + ifconfig eth0 down + rmmod dl2k.o + + The following are the commands to list the currently loaded modules and + to see the current network configuration:: + + lsmod + ifconfig + + +Automated Installation +---------------------- + This section describes how to install the driver such that it is + automatically loaded and configured at boot time. The following description + is based on a Red Hat 6.0/7.0 distribution, but it can easily be ported to + other distributions as well. + +Red Hat v6.x/v7.x +----------------- + 1. Copy dl2k.o to the network modules directory, typically + /lib/modules/2.x.x-xx/net or /lib/modules/2.x.x/kernel/drivers/net. + 2. Locate the boot module configuration file, most commonly in the + /etc/modprobe.d/ directory. Add the following lines:: + + alias ethx dl2k + options dl2k + + where ethx will be eth0 if the NIC is the only ethernet adapter, eth1 if + one other ethernet adapter is installed, etc. Refer to the table in the + previous section for the list of optional parameters. + 3. Locate the network configuration scripts, normally the + /etc/sysconfig/network-scripts directory, and create a configuration + script named ifcfg-ethx that contains network information. + 4. Note that for most Linux distributions, Red Hat included, a configuration + utility with a graphical user interface is provided to perform steps 2 + and 3 above. + + +Parameter Description +===================== +You can install this driver without any additional parameter. However, if you +are going to have extensive functions then it is necessary to set extra +parameter. Below is a list of the command line parameters supported by the +Linux device +driver. + + +=============================== ============================================== +mtu=packet_size Specifies the maximum packet size. default + is 1500. + +media=media_type Specifies the media type the NIC operates at. + autosense Autosensing active media. + + =========== ========================= + 10mbps_hd 10Mbps half duplex. + 10mbps_fd 10Mbps full duplex. + 100mbps_hd 100Mbps half duplex. + 100mbps_fd 100Mbps full duplex. + 1000mbps_fd 1000Mbps full duplex. + 1000mbps_hd 1000Mbps half duplex. + 0 Autosensing active media. + 1 10Mbps half duplex. + 2 10Mbps full duplex. + 3 100Mbps half duplex. + 4 100Mbps full duplex. + 5 1000Mbps half duplex. + 6 1000Mbps full duplex. + =========== ========================= + + By default, the NIC operates at autosense. + 1000mbps_fd and 1000mbps_hd types are only + available for fiber adapter. + +vlan=n Specifies the VLAN ID. If vlan=0, the + Virtual Local Area Network (VLAN) function is + disable. + +jumbo=[0|1] Specifies the jumbo frame support. If jumbo=1, + the NIC accept jumbo frames. By default, this + function is disabled. + Jumbo frame usually improve the performance + int gigabit. + This feature need jumbo frame compatible + remote. + +rx_coalesce=m Number of rx frame handled each interrupt. +rx_timeout=n Rx DMA wait time for an interrupt. + If set rx_coalesce > 0, hardware only assert + an interrupt for m frames. Hardware won't + assert rx interrupt until m frames received or + reach timeout of n * 640 nano seconds. + Set proper rx_coalesce and rx_timeout can + reduce congestion collapse and overload which + has been a bottleneck for high speed network. + + For example, rx_coalesce=10 rx_timeout=800. + that is, hardware assert only 1 interrupt + for 10 frames received or timeout of 512 us. + +tx_coalesce=n Number of tx frame handled each interrupt. + Set n > 1 can reduce the interrupts + congestion usually lower performance of + high speed network card. Default is 16. + +tx_flow=[1|0] Specifies the Tx flow control. If tx_flow=0, + the Tx flow control disable else driver + autodetect. +rx_flow=[1|0] Specifies the Rx flow control. If rx_flow=0, + the Rx flow control enable else driver + autodetect. +=============================== ============================================== + + +Configuration Script Sample +=========================== +Here is a sample of a simple configuration script:: + + DEVICE=eth0 + USERCTL=no + ONBOOT=yes + POOTPROTO=none + BROADCAST=207.200.5.255 + NETWORK=207.200.5.0 + NETMASK=255.255.255.0 + IPADDR=207.200.5.2 + + +Troubleshooting +=============== +Q1. Source files contain ^ M behind every line. + + Make sure all files are Unix file format (no LF). Try the following + shell command to convert files:: + + cat dl2k.c | col -b > dl2k.tmp + mv dl2k.tmp dl2k.c + + OR:: + + cat dl2k.c | tr -d "\r" > dl2k.tmp + mv dl2k.tmp dl2k.c + +Q2: Could not find header files (``*.h``)? + + To compile the driver, you need kernel header files. After + installing the kernel source, the header files are usually located in + /usr/src/linux/include, which is the default include directory configured + in Makefile. For some distributions, there is a copy of header files in + /usr/src/include/linux and /usr/src/include/asm, that you can change the + INCLUDEDIR in Makefile to /usr/include without installing kernel source. + + Note that RH 7.0 didn't provide correct header files in /usr/include, + including those files will make a wrong version driver. + diff --git a/Documentation/networking/device_drivers/dlink/dl2k.txt b/Documentation/networking/device_drivers/dlink/dl2k.txt deleted file mode 100644 index cba74f7a3abc..000000000000 --- a/Documentation/networking/device_drivers/dlink/dl2k.txt +++ /dev/null @@ -1,282 +0,0 @@ - - D-Link DL2000-based Gigabit Ethernet Adapter Installation - for Linux - May 23, 2002 - -Contents -======== - - Compatibility List - - Quick Install - - Compiling the Driver - - Installing the Driver - - Option parameter - - Configuration Script Sample - - Troubleshooting - - -Compatibility List -================= -Adapter Support: - -D-Link DGE-550T Gigabit Ethernet Adapter. -D-Link DGE-550SX Gigabit Ethernet Adapter. -D-Link DL2000-based Gigabit Ethernet Adapter. - - -The driver support Linux kernel 2.4.7 later. We had tested it -on the environments below. - - . Red Hat v6.2 (update kernel to 2.4.7) - . Red Hat v7.0 (update kernel to 2.4.7) - . Red Hat v7.1 (kernel 2.4.7) - . Red Hat v7.2 (kernel 2.4.7-10) - - -Quick Install -============= -Install linux driver as following command: - -1. make all -2. insmod dl2k.ko -3. ifconfig eth0 up 10.xxx.xxx.xxx netmask 255.0.0.0 - ^^^^^^^^^^^^^^^\ ^^^^^^^^\ - IP NETMASK -Now eth0 should active, you can test it by "ping" or get more information by -"ifconfig". If tested ok, continue the next step. - -4. cp dl2k.ko /lib/modules/`uname -r`/kernel/drivers/net -5. Add the following line to /etc/modprobe.d/dl2k.conf: - alias eth0 dl2k -6. Run depmod to updated module indexes. -7. Run "netconfig" or "netconf" to create configuration script ifcfg-eth0 - located at /etc/sysconfig/network-scripts or create it manually. - [see - Configuration Script Sample] -8. Driver will automatically load and configure at next boot time. - -Compiling the Driver -==================== - In Linux, NIC drivers are most commonly configured as loadable modules. -The approach of building a monolithic kernel has become obsolete. The driver -can be compiled as part of a monolithic kernel, but is strongly discouraged. -The remainder of this section assumes the driver is built as a loadable module. -In the Linux environment, it is a good idea to rebuild the driver from the -source instead of relying on a precompiled version. This approach provides -better reliability since a precompiled driver might depend on libraries or -kernel features that are not present in a given Linux installation. - -The 3 files necessary to build Linux device driver are dl2k.c, dl2k.h and -Makefile. To compile, the Linux installation must include the gcc compiler, -the kernel source, and the kernel headers. The Linux driver supports Linux -Kernels 2.4.7. Copy the files to a directory and enter the following command -to compile and link the driver: - -CD-ROM drive ------------- - -[root@XXX /] mkdir cdrom -[root@XXX /] mount -r -t iso9660 -o conv=auto /dev/cdrom /cdrom -[root@XXX /] cd root -[root@XXX /root] mkdir dl2k -[root@XXX /root] cd dl2k -[root@XXX dl2k] cp /cdrom/linux/dl2k.tgz /root/dl2k -[root@XXX dl2k] tar xfvz dl2k.tgz -[root@XXX dl2k] make all - -Floppy disc drive ------------------ - -[root@XXX /] cd root -[root@XXX /root] mkdir dl2k -[root@XXX /root] cd dl2k -[root@XXX dl2k] mcopy a:/linux/dl2k.tgz /root/dl2k -[root@XXX dl2k] tar xfvz dl2k.tgz -[root@XXX dl2k] make all - -Installing the Driver -===================== - - Manual Installation - ------------------- - Once the driver has been compiled, it must be loaded, enabled, and bound - to a protocol stack in order to establish network connectivity. To load a - module enter the command: - - insmod dl2k.o - - or - - insmod dl2k.o ; add parameter - - =============================================================== - example: insmod dl2k.o media=100mbps_hd - or insmod dl2k.o media=3 - or insmod dl2k.o media=3,2 ; for 2 cards - =============================================================== - - Please reference the list of the command line parameters supported by - the Linux device driver below. - - The insmod command only loads the driver and gives it a name of the form - eth0, eth1, etc. To bring the NIC into an operational state, - it is necessary to issue the following command: - - ifconfig eth0 up - - Finally, to bind the driver to the active protocol (e.g., TCP/IP with - Linux), enter the following command: - - ifup eth0 - - Note that this is meaningful only if the system can find a configuration - script that contains the necessary network information. A sample will be - given in the next paragraph. - - The commands to unload a driver are as follows: - - ifdown eth0 - ifconfig eth0 down - rmmod dl2k.o - - The following are the commands to list the currently loaded modules and - to see the current network configuration. - - lsmod - ifconfig - - - Automated Installation - ---------------------- - This section describes how to install the driver such that it is - automatically loaded and configured at boot time. The following description - is based on a Red Hat 6.0/7.0 distribution, but it can easily be ported to - other distributions as well. - - Red Hat v6.x/v7.x - ----------------- - 1. Copy dl2k.o to the network modules directory, typically - /lib/modules/2.x.x-xx/net or /lib/modules/2.x.x/kernel/drivers/net. - 2. Locate the boot module configuration file, most commonly in the - /etc/modprobe.d/ directory. Add the following lines: - - alias ethx dl2k - options dl2k - - where ethx will be eth0 if the NIC is the only ethernet adapter, eth1 if - one other ethernet adapter is installed, etc. Refer to the table in the - previous section for the list of optional parameters. - 3. Locate the network configuration scripts, normally the - /etc/sysconfig/network-scripts directory, and create a configuration - script named ifcfg-ethx that contains network information. - 4. Note that for most Linux distributions, Red Hat included, a configuration - utility with a graphical user interface is provided to perform steps 2 - and 3 above. - - -Parameter Description -===================== -You can install this driver without any additional parameter. However, if you -are going to have extensive functions then it is necessary to set extra -parameter. Below is a list of the command line parameters supported by the -Linux device -driver. - -mtu=packet_size - Specifies the maximum packet size. default - is 1500. - -media=media_type - Specifies the media type the NIC operates at. - autosense Autosensing active media. - 10mbps_hd 10Mbps half duplex. - 10mbps_fd 10Mbps full duplex. - 100mbps_hd 100Mbps half duplex. - 100mbps_fd 100Mbps full duplex. - 1000mbps_fd 1000Mbps full duplex. - 1000mbps_hd 1000Mbps half duplex. - 0 Autosensing active media. - 1 10Mbps half duplex. - 2 10Mbps full duplex. - 3 100Mbps half duplex. - 4 100Mbps full duplex. - 5 1000Mbps half duplex. - 6 1000Mbps full duplex. - - By default, the NIC operates at autosense. - 1000mbps_fd and 1000mbps_hd types are only - available for fiber adapter. - -vlan=n - Specifies the VLAN ID. If vlan=0, the - Virtual Local Area Network (VLAN) function is - disable. - -jumbo=[0|1] - Specifies the jumbo frame support. If jumbo=1, - the NIC accept jumbo frames. By default, this - function is disabled. - Jumbo frame usually improve the performance - int gigabit. - This feature need jumbo frame compatible - remote. - -rx_coalesce=m - Number of rx frame handled each interrupt. -rx_timeout=n - Rx DMA wait time for an interrupt. - If set rx_coalesce > 0, hardware only assert - an interrupt for m frames. Hardware won't - assert rx interrupt until m frames received or - reach timeout of n * 640 nano seconds. - Set proper rx_coalesce and rx_timeout can - reduce congestion collapse and overload which - has been a bottleneck for high speed network. - - For example, rx_coalesce=10 rx_timeout=800. - that is, hardware assert only 1 interrupt - for 10 frames received or timeout of 512 us. - -tx_coalesce=n - Number of tx frame handled each interrupt. - Set n > 1 can reduce the interrupts - congestion usually lower performance of - high speed network card. Default is 16. - -tx_flow=[1|0] - Specifies the Tx flow control. If tx_flow=0, - the Tx flow control disable else driver - autodetect. -rx_flow=[1|0] - Specifies the Rx flow control. If rx_flow=0, - the Rx flow control enable else driver - autodetect. - - -Configuration Script Sample -=========================== -Here is a sample of a simple configuration script: - -DEVICE=eth0 -USERCTL=no -ONBOOT=yes -POOTPROTO=none -BROADCAST=207.200.5.255 -NETWORK=207.200.5.0 -NETMASK=255.255.255.0 -IPADDR=207.200.5.2 - - -Troubleshooting -=============== -Q1. Source files contain ^ M behind every line. - Make sure all files are Unix file format (no LF). Try the following - shell command to convert files. - - cat dl2k.c | col -b > dl2k.tmp - mv dl2k.tmp dl2k.c - - OR - - cat dl2k.c | tr -d "\r" > dl2k.tmp - mv dl2k.tmp dl2k.c - -Q2: Could not find header files (*.h) ? - To compile the driver, you need kernel header files. After - installing the kernel source, the header files are usually located in - /usr/src/linux/include, which is the default include directory configured - in Makefile. For some distributions, there is a copy of header files in - /usr/src/include/linux and /usr/src/include/asm, that you can change the - INCLUDEDIR in Makefile to /usr/include without installing kernel source. - Note that RH 7.0 didn't provide correct header files in /usr/include, - including those files will make a wrong version driver. - diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 09728e964ce1..e5d1863379cb 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -36,6 +36,7 @@ Contents: davicom/dm9000 dec/de4x5 dec/dmfe + dlink/dl2k .. only:: subproject and html diff --git a/drivers/net/ethernet/dlink/dl2k.c b/drivers/net/ethernet/dlink/dl2k.c index 643090555cc7..5143722c4419 100644 --- a/drivers/net/ethernet/dlink/dl2k.c +++ b/drivers/net/ethernet/dlink/dl2k.c @@ -1869,7 +1869,7 @@ Compile command: gcc -D__KERNEL__ -DMODULE -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -c dl2k.c -Read Documentation/networking/device_drivers/dlink/dl2k.txt for details. +Read Documentation/networking/device_drivers/dlink/dl2k.rst for details. */ -- cgit v1.2.3-59-g8ed1b From 0d0d976f59a57e5536ff01825f0bc8a0dbb0fe6b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:44 +0200 Subject: docs: networking: device drivers: convert freescale/dpaa.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - use :field: markup; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../networking/device_drivers/freescale/dpaa.rst | 269 +++++++++++++++++++++ .../networking/device_drivers/freescale/dpaa.txt | 260 -------------------- Documentation/networking/device_drivers/index.rst | 1 + 3 files changed, 270 insertions(+), 260 deletions(-) create mode 100644 Documentation/networking/device_drivers/freescale/dpaa.rst delete mode 100644 Documentation/networking/device_drivers/freescale/dpaa.txt diff --git a/Documentation/networking/device_drivers/freescale/dpaa.rst b/Documentation/networking/device_drivers/freescale/dpaa.rst new file mode 100644 index 000000000000..241c6c6f6e68 --- /dev/null +++ b/Documentation/networking/device_drivers/freescale/dpaa.rst @@ -0,0 +1,269 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +The QorIQ DPAA Ethernet Driver +============================== + +Authors: +- Madalin Bucur +- Camelia Groza + +.. Contents + + - DPAA Ethernet Overview + - DPAA Ethernet Supported SoCs + - Configuring DPAA Ethernet in your kernel + - DPAA Ethernet Frame Processing + - DPAA Ethernet Features + - DPAA IRQ Affinity and Receive Side Scaling + - Debugging + +DPAA Ethernet Overview +====================== + +DPAA stands for Data Path Acceleration Architecture and it is a +set of networking acceleration IPs that are available on several +generations of SoCs, both on PowerPC and ARM64. + +The Freescale DPAA architecture consists of a series of hardware blocks +that support Ethernet connectivity. The Ethernet driver depends upon the +following drivers in the Linux kernel: + + - Peripheral Access Memory Unit (PAMU) (* needed only for PPC platforms) + drivers/iommu/fsl_* + - Frame Manager (FMan) + drivers/net/ethernet/freescale/fman + - Queue Manager (QMan), Buffer Manager (BMan) + drivers/soc/fsl/qbman + +A simplified view of the dpaa_eth interfaces mapped to FMan MACs:: + + dpaa_eth /eth0\ ... /ethN\ + driver | | | | + ------------- ---- ----------- ---- ------------- + -Ports / Tx Rx \ ... / Tx Rx \ + FMan | | | | + -MACs | MAC0 | | MACN | + / dtsec0 \ ... / dtsecN \ (or tgec) + / \ / \(or memac) + --------- -------------- --- -------------- --------- + FMan, FMan Port, FMan SP, FMan MURAM drivers + --------------------------------------------------------- + FMan HW blocks: MURAM, MACs, Ports, SP + --------------------------------------------------------- + +The dpaa_eth relation to the QMan, BMan and FMan:: + + ________________________________ + dpaa_eth / eth0 \ + driver / \ + --------- -^- -^- -^- --- --------- + QMan driver / \ / \ / \ \ / | BMan | + |Rx | |Rx | |Tx | |Tx | | driver | + --------- |Dfl| |Err| |Cnf| |FQs| | | + QMan HW |FQ | |FQ | |FQs| | | | | + / \ / \ / \ \ / | | + --------- --- --- --- -v- --------- + | FMan QMI | | + | FMan HW FMan BMI | BMan HW | + ----------------------- -------- + +where the acronyms used above (and in the code) are: + +=============== =========================================================== +DPAA Data Path Acceleration Architecture +FMan DPAA Frame Manager +QMan DPAA Queue Manager +BMan DPAA Buffers Manager +QMI QMan interface in FMan +BMI BMan interface in FMan +FMan SP FMan Storage Profiles +MURAM Multi-user RAM in FMan +FQ QMan Frame Queue +Rx Dfl FQ default reception FQ +Rx Err FQ Rx error frames FQ +Tx Cnf FQ Tx confirmation FQs +Tx FQs transmission frame queues +dtsec datapath three speed Ethernet controller (10/100/1000 Mbps) +tgec ten gigabit Ethernet controller (10 Gbps) +memac multirate Ethernet MAC (10/100/1000/10000) +=============== =========================================================== + +DPAA Ethernet Supported SoCs +============================ + +The DPAA drivers enable the Ethernet controllers present on the following SoCs: + +PPC +- P1023 +- P2041 +- P3041 +- P4080 +- P5020 +- P5040 +- T1023 +- T1024 +- T1040 +- T1042 +- T2080 +- T4240 +- B4860 + +ARM +- LS1043A +- LS1046A + +Configuring DPAA Ethernet in your kernel +======================================== + +To enable the DPAA Ethernet driver, the following Kconfig options are required:: + + # common for arch/arm64 and arch/powerpc platforms + CONFIG_FSL_DPAA=y + CONFIG_FSL_FMAN=y + CONFIG_FSL_DPAA_ETH=y + CONFIG_FSL_XGMAC_MDIO=y + + # for arch/powerpc only + CONFIG_FSL_PAMU=y + + # common options needed for the PHYs used on the RDBs + CONFIG_VITESSE_PHY=y + CONFIG_REALTEK_PHY=y + CONFIG_AQUANTIA_PHY=y + +DPAA Ethernet Frame Processing +============================== + +On Rx, buffers for the incoming frames are retrieved from the buffers found +in the dedicated interface buffer pool. The driver initializes and seeds these +with one page buffers. + +On Tx, all transmitted frames are returned to the driver through Tx +confirmation frame queues. The driver is then responsible for freeing the +buffers. In order to do this properly, a backpointer is added to the buffer +before transmission that points to the skb. When the buffer returns to the +driver on a confirmation FQ, the skb can be correctly consumed. + +DPAA Ethernet Features +====================== + +Currently the DPAA Ethernet driver enables the basic features required for +a Linux Ethernet driver. The support for advanced features will be added +gradually. + +The driver has Rx and Tx checksum offloading for UDP and TCP. Currently the Rx +checksum offload feature is enabled by default and cannot be controlled through +ethtool. Also, rx-flow-hash and rx-hashing was added. The addition of RSS +provides a big performance boost for the forwarding scenarios, allowing +different traffic flows received by one interface to be processed by different +CPUs in parallel. + +The driver has support for multiple prioritized Tx traffic classes. Priorities +range from 0 (lowest) to 3 (highest). These are mapped to HW workqueues with +strict priority levels. Each traffic class contains NR_CPU TX queues. By +default, only one traffic class is enabled and the lowest priority Tx queues +are used. Higher priority traffic classes can be enabled with the mqprio +qdisc. For example, all four traffic classes are enabled on an interface with +the following command. Furthermore, skb priority levels are mapped to traffic +classes as follows: + + * priorities 0 to 3 - traffic class 0 (low priority) + * priorities 4 to 7 - traffic class 1 (medium-low priority) + * priorities 8 to 11 - traffic class 2 (medium-high priority) + * priorities 12 to 15 - traffic class 3 (high priority) + +:: + + tc qdisc add dev root handle 1: \ + mqprio num_tc 4 map 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 hw 1 + +DPAA IRQ Affinity and Receive Side Scaling +========================================== + +Traffic coming on the DPAA Rx queues or on the DPAA Tx confirmation +queues is seen by the CPU as ingress traffic on a certain portal. +The DPAA QMan portal interrupts are affined each to a certain CPU. +The same portal interrupt services all the QMan portal consumers. + +By default the DPAA Ethernet driver enables RSS, making use of the +DPAA FMan Parser and Keygen blocks to distribute traffic on 128 +hardware frame queues using a hash on IP v4/v6 source and destination +and L4 source and destination ports, in present in the received frame. +When RSS is disabled, all traffic received by a certain interface is +received on the default Rx frame queue. The default DPAA Rx frame +queues are configured to put the received traffic into a pool channel +that allows any available CPU portal to dequeue the ingress traffic. +The default frame queues have the HOLDACTIVE option set, ensuring that +traffic bursts from a certain queue are serviced by the same CPU. +This ensures a very low rate of frame reordering. A drawback of this +is that only one CPU at a time can service the traffic received by a +certain interface when RSS is not enabled. + +To implement RSS, the DPAA Ethernet driver allocates an extra set of +128 Rx frame queues that are configured to dedicated channels, in a +round-robin manner. The mapping of the frame queues to CPUs is now +hardcoded, there is no indirection table to move traffic for a certain +FQ (hash result) to another CPU. The ingress traffic arriving on one +of these frame queues will arrive at the same portal and will always +be processed by the same CPU. This ensures intra-flow order preservation +and workload distribution for multiple traffic flows. + +RSS can be turned off for a certain interface using ethtool, i.e.:: + + # ethtool -N fm1-mac9 rx-flow-hash tcp4 "" + +To turn it back on, one needs to set rx-flow-hash for tcp4/6 or udp4/6:: + + # ethtool -N fm1-mac9 rx-flow-hash udp4 sfdn + +There is no independent control for individual protocols, any command +run for one of tcp4|udp4|ah4|esp4|sctp4|tcp6|udp6|ah6|esp6|sctp6 is +going to control the rx-flow-hashing for all protocols on that interface. + +Besides using the FMan Keygen computed hash for spreading traffic on the +128 Rx FQs, the DPAA Ethernet driver also sets the skb hash value when +the NETIF_F_RXHASH feature is on (active by default). This can be turned +on or off through ethtool, i.e.:: + + # ethtool -K fm1-mac9 rx-hashing off + # ethtool -k fm1-mac9 | grep hash + receive-hashing: off + # ethtool -K fm1-mac9 rx-hashing on + Actual changes: + receive-hashing: on + # ethtool -k fm1-mac9 | grep hash + receive-hashing: on + +Please note that Rx hashing depends upon the rx-flow-hashing being on +for that interface - turning off rx-flow-hashing will also disable the +rx-hashing (without ethtool reporting it as off as that depends on the +NETIF_F_RXHASH feature flag). + +Debugging +========= + +The following statistics are exported for each interface through ethtool: + + - interrupt count per CPU + - Rx packets count per CPU + - Tx packets count per CPU + - Tx confirmed packets count per CPU + - Tx S/G frames count per CPU + - Tx error count per CPU + - Rx error count per CPU + - Rx error count per type + - congestion related statistics: + + - congestion status + - time spent in congestion + - number of time the device entered congestion + - dropped packets count per cause + +The driver also exports the following information in sysfs: + + - the FQ IDs for each FQ type + /sys/devices/platform/soc/.fman/.ethernet/dpaa-ethernet./net/fm-mac/fqids + + - the ID of the buffer pool in use + /sys/devices/platform/soc/.fman/.ethernet/dpaa-ethernet./net/fm-mac/bpids diff --git a/Documentation/networking/device_drivers/freescale/dpaa.txt b/Documentation/networking/device_drivers/freescale/dpaa.txt deleted file mode 100644 index b06601ff9200..000000000000 --- a/Documentation/networking/device_drivers/freescale/dpaa.txt +++ /dev/null @@ -1,260 +0,0 @@ -The QorIQ DPAA Ethernet Driver -============================== - -Authors: -Madalin Bucur -Camelia Groza - -Contents -======== - - - DPAA Ethernet Overview - - DPAA Ethernet Supported SoCs - - Configuring DPAA Ethernet in your kernel - - DPAA Ethernet Frame Processing - - DPAA Ethernet Features - - DPAA IRQ Affinity and Receive Side Scaling - - Debugging - -DPAA Ethernet Overview -====================== - -DPAA stands for Data Path Acceleration Architecture and it is a -set of networking acceleration IPs that are available on several -generations of SoCs, both on PowerPC and ARM64. - -The Freescale DPAA architecture consists of a series of hardware blocks -that support Ethernet connectivity. The Ethernet driver depends upon the -following drivers in the Linux kernel: - - - Peripheral Access Memory Unit (PAMU) (* needed only for PPC platforms) - drivers/iommu/fsl_* - - Frame Manager (FMan) - drivers/net/ethernet/freescale/fman - - Queue Manager (QMan), Buffer Manager (BMan) - drivers/soc/fsl/qbman - -A simplified view of the dpaa_eth interfaces mapped to FMan MACs: - - dpaa_eth /eth0\ ... /ethN\ - driver | | | | - ------------- ---- ----------- ---- ------------- - -Ports / Tx Rx \ ... / Tx Rx \ - FMan | | | | - -MACs | MAC0 | | MACN | - / dtsec0 \ ... / dtsecN \ (or tgec) - / \ / \(or memac) - --------- -------------- --- -------------- --------- - FMan, FMan Port, FMan SP, FMan MURAM drivers - --------------------------------------------------------- - FMan HW blocks: MURAM, MACs, Ports, SP - --------------------------------------------------------- - -The dpaa_eth relation to the QMan, BMan and FMan: - ________________________________ - dpaa_eth / eth0 \ - driver / \ - --------- -^- -^- -^- --- --------- - QMan driver / \ / \ / \ \ / | BMan | - |Rx | |Rx | |Tx | |Tx | | driver | - --------- |Dfl| |Err| |Cnf| |FQs| | | - QMan HW |FQ | |FQ | |FQs| | | | | - / \ / \ / \ \ / | | - --------- --- --- --- -v- --------- - | FMan QMI | | - | FMan HW FMan BMI | BMan HW | - ----------------------- -------- - -where the acronyms used above (and in the code) are: -DPAA = Data Path Acceleration Architecture -FMan = DPAA Frame Manager -QMan = DPAA Queue Manager -BMan = DPAA Buffers Manager -QMI = QMan interface in FMan -BMI = BMan interface in FMan -FMan SP = FMan Storage Profiles -MURAM = Multi-user RAM in FMan -FQ = QMan Frame Queue -Rx Dfl FQ = default reception FQ -Rx Err FQ = Rx error frames FQ -Tx Cnf FQ = Tx confirmation FQs -Tx FQs = transmission frame queues -dtsec = datapath three speed Ethernet controller (10/100/1000 Mbps) -tgec = ten gigabit Ethernet controller (10 Gbps) -memac = multirate Ethernet MAC (10/100/1000/10000) - -DPAA Ethernet Supported SoCs -============================ - -The DPAA drivers enable the Ethernet controllers present on the following SoCs: - -# PPC -P1023 -P2041 -P3041 -P4080 -P5020 -P5040 -T1023 -T1024 -T1040 -T1042 -T2080 -T4240 -B4860 - -# ARM -LS1043A -LS1046A - -Configuring DPAA Ethernet in your kernel -======================================== - -To enable the DPAA Ethernet driver, the following Kconfig options are required: - -# common for arch/arm64 and arch/powerpc platforms -CONFIG_FSL_DPAA=y -CONFIG_FSL_FMAN=y -CONFIG_FSL_DPAA_ETH=y -CONFIG_FSL_XGMAC_MDIO=y - -# for arch/powerpc only -CONFIG_FSL_PAMU=y - -# common options needed for the PHYs used on the RDBs -CONFIG_VITESSE_PHY=y -CONFIG_REALTEK_PHY=y -CONFIG_AQUANTIA_PHY=y - -DPAA Ethernet Frame Processing -============================== - -On Rx, buffers for the incoming frames are retrieved from the buffers found -in the dedicated interface buffer pool. The driver initializes and seeds these -with one page buffers. - -On Tx, all transmitted frames are returned to the driver through Tx -confirmation frame queues. The driver is then responsible for freeing the -buffers. In order to do this properly, a backpointer is added to the buffer -before transmission that points to the skb. When the buffer returns to the -driver on a confirmation FQ, the skb can be correctly consumed. - -DPAA Ethernet Features -====================== - -Currently the DPAA Ethernet driver enables the basic features required for -a Linux Ethernet driver. The support for advanced features will be added -gradually. - -The driver has Rx and Tx checksum offloading for UDP and TCP. Currently the Rx -checksum offload feature is enabled by default and cannot be controlled through -ethtool. Also, rx-flow-hash and rx-hashing was added. The addition of RSS -provides a big performance boost for the forwarding scenarios, allowing -different traffic flows received by one interface to be processed by different -CPUs in parallel. - -The driver has support for multiple prioritized Tx traffic classes. Priorities -range from 0 (lowest) to 3 (highest). These are mapped to HW workqueues with -strict priority levels. Each traffic class contains NR_CPU TX queues. By -default, only one traffic class is enabled and the lowest priority Tx queues -are used. Higher priority traffic classes can be enabled with the mqprio -qdisc. For example, all four traffic classes are enabled on an interface with -the following command. Furthermore, skb priority levels are mapped to traffic -classes as follows: - - * priorities 0 to 3 - traffic class 0 (low priority) - * priorities 4 to 7 - traffic class 1 (medium-low priority) - * priorities 8 to 11 - traffic class 2 (medium-high priority) - * priorities 12 to 15 - traffic class 3 (high priority) - -tc qdisc add dev root handle 1: \ - mqprio num_tc 4 map 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 hw 1 - -DPAA IRQ Affinity and Receive Side Scaling -========================================== - -Traffic coming on the DPAA Rx queues or on the DPAA Tx confirmation -queues is seen by the CPU as ingress traffic on a certain portal. -The DPAA QMan portal interrupts are affined each to a certain CPU. -The same portal interrupt services all the QMan portal consumers. - -By default the DPAA Ethernet driver enables RSS, making use of the -DPAA FMan Parser and Keygen blocks to distribute traffic on 128 -hardware frame queues using a hash on IP v4/v6 source and destination -and L4 source and destination ports, in present in the received frame. -When RSS is disabled, all traffic received by a certain interface is -received on the default Rx frame queue. The default DPAA Rx frame -queues are configured to put the received traffic into a pool channel -that allows any available CPU portal to dequeue the ingress traffic. -The default frame queues have the HOLDACTIVE option set, ensuring that -traffic bursts from a certain queue are serviced by the same CPU. -This ensures a very low rate of frame reordering. A drawback of this -is that only one CPU at a time can service the traffic received by a -certain interface when RSS is not enabled. - -To implement RSS, the DPAA Ethernet driver allocates an extra set of -128 Rx frame queues that are configured to dedicated channels, in a -round-robin manner. The mapping of the frame queues to CPUs is now -hardcoded, there is no indirection table to move traffic for a certain -FQ (hash result) to another CPU. The ingress traffic arriving on one -of these frame queues will arrive at the same portal and will always -be processed by the same CPU. This ensures intra-flow order preservation -and workload distribution for multiple traffic flows. - -RSS can be turned off for a certain interface using ethtool, i.e. - - # ethtool -N fm1-mac9 rx-flow-hash tcp4 "" - -To turn it back on, one needs to set rx-flow-hash for tcp4/6 or udp4/6: - - # ethtool -N fm1-mac9 rx-flow-hash udp4 sfdn - -There is no independent control for individual protocols, any command -run for one of tcp4|udp4|ah4|esp4|sctp4|tcp6|udp6|ah6|esp6|sctp6 is -going to control the rx-flow-hashing for all protocols on that interface. - -Besides using the FMan Keygen computed hash for spreading traffic on the -128 Rx FQs, the DPAA Ethernet driver also sets the skb hash value when -the NETIF_F_RXHASH feature is on (active by default). This can be turned -on or off through ethtool, i.e.: - - # ethtool -K fm1-mac9 rx-hashing off - # ethtool -k fm1-mac9 | grep hash - receive-hashing: off - # ethtool -K fm1-mac9 rx-hashing on - Actual changes: - receive-hashing: on - # ethtool -k fm1-mac9 | grep hash - receive-hashing: on - -Please note that Rx hashing depends upon the rx-flow-hashing being on -for that interface - turning off rx-flow-hashing will also disable the -rx-hashing (without ethtool reporting it as off as that depends on the -NETIF_F_RXHASH feature flag). - -Debugging -========= - -The following statistics are exported for each interface through ethtool: - - - interrupt count per CPU - - Rx packets count per CPU - - Tx packets count per CPU - - Tx confirmed packets count per CPU - - Tx S/G frames count per CPU - - Tx error count per CPU - - Rx error count per CPU - - Rx error count per type - - congestion related statistics: - - congestion status - - time spent in congestion - - number of time the device entered congestion - - dropped packets count per cause - -The driver also exports the following information in sysfs: - - - the FQ IDs for each FQ type - /sys/devices/platform/soc/.fman/.ethernet/dpaa-ethernet./net/fm-mac/fqids - - - the ID of the buffer pool in use - /sys/devices/platform/soc/.fman/.ethernet/dpaa-ethernet./net/fm-mac/bpids diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index e5d1863379cb..7e59ee43c030 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -37,6 +37,7 @@ Contents: dec/de4x5 dec/dmfe dlink/dl2k + freescale/dpaa .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From dc67e91e7f7b2245a3a341e62217cb5e7163d60b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:45 +0200 Subject: docs: networking: device drivers: convert freescale/gianfar.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - use :field: markup; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- .../device_drivers/freescale/gianfar.rst | 51 ++++++++++++++++++++++ .../device_drivers/freescale/gianfar.txt | 42 ------------------ Documentation/networking/device_drivers/index.rst | 1 + 3 files changed, 52 insertions(+), 42 deletions(-) create mode 100644 Documentation/networking/device_drivers/freescale/gianfar.rst delete mode 100644 Documentation/networking/device_drivers/freescale/gianfar.txt diff --git a/Documentation/networking/device_drivers/freescale/gianfar.rst b/Documentation/networking/device_drivers/freescale/gianfar.rst new file mode 100644 index 000000000000..9c4a91d3824b --- /dev/null +++ b/Documentation/networking/device_drivers/freescale/gianfar.rst @@ -0,0 +1,51 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +The Gianfar Ethernet Driver +=========================== + +:Author: Andy Fleming +:Updated: 2005-07-28 + + +Checksum Offloading +=================== + +The eTSEC controller (first included in parts from late 2005 like +the 8548) has the ability to perform TCP, UDP, and IP checksums +in hardware. The Linux kernel only offloads the TCP and UDP +checksums (and always performs the pseudo header checksums), so +the driver only supports checksumming for TCP/IP and UDP/IP +packets. Use ethtool to enable or disable this feature for RX +and TX. + +VLAN +==== + +In order to use VLAN, please consult Linux documentation on +configuring VLANs. The gianfar driver supports hardware insertion and +extraction of VLAN headers, but not filtering. Filtering will be +done by the kernel. + +Multicasting +============ + +The gianfar driver supports using the group hash table on the +TSEC (and the extended hash table on the eTSEC) for multicast +filtering. On the eTSEC, the exact-match MAC registers are used +before the hash tables. See Linux documentation on how to join +multicast groups. + +Padding +======= + +The gianfar driver supports padding received frames with 2 bytes +to align the IP header to a 16-byte boundary, when supported by +hardware. + +Ethtool +======= + +The gianfar driver supports the use of ethtool for many +configuration options. You must run ethtool only on currently +open interfaces. See ethtool documentation for details. diff --git a/Documentation/networking/device_drivers/freescale/gianfar.txt b/Documentation/networking/device_drivers/freescale/gianfar.txt deleted file mode 100644 index ba1daea7f2e4..000000000000 --- a/Documentation/networking/device_drivers/freescale/gianfar.txt +++ /dev/null @@ -1,42 +0,0 @@ -The Gianfar Ethernet Driver - -Author: Andy Fleming -Updated: 2005-07-28 - - -CHECKSUM OFFLOADING - -The eTSEC controller (first included in parts from late 2005 like -the 8548) has the ability to perform TCP, UDP, and IP checksums -in hardware. The Linux kernel only offloads the TCP and UDP -checksums (and always performs the pseudo header checksums), so -the driver only supports checksumming for TCP/IP and UDP/IP -packets. Use ethtool to enable or disable this feature for RX -and TX. - -VLAN - -In order to use VLAN, please consult Linux documentation on -configuring VLANs. The gianfar driver supports hardware insertion and -extraction of VLAN headers, but not filtering. Filtering will be -done by the kernel. - -MULTICASTING - -The gianfar driver supports using the group hash table on the -TSEC (and the extended hash table on the eTSEC) for multicast -filtering. On the eTSEC, the exact-match MAC registers are used -before the hash tables. See Linux documentation on how to join -multicast groups. - -PADDING - -The gianfar driver supports padding received frames with 2 bytes -to align the IP header to a 16-byte boundary, when supported by -hardware. - -ETHTOOL - -The gianfar driver supports the use of ethtool for many -configuration options. You must run ethtool only on currently -open interfaces. See ethtool documentation for details. diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 7e59ee43c030..cec3415ee459 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -38,6 +38,7 @@ Contents: dec/dmfe dlink/dl2k freescale/dpaa + freescale/gianfar .. only:: subproject and html -- cgit v1.2.3-59-g8ed1b From cf7eba49b2b160f98106b33ca12039b05d812140 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:46 +0200 Subject: docs: networking: device drivers: convert intel/ipw2100.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - comment out text-only TOC from html/pdf output; - use copyright symbol; - use :field: markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/device_drivers/index.rst | 1 + .../networking/device_drivers/intel/ipw2100.rst | 323 +++++++++++++++++++++ .../networking/device_drivers/intel/ipw2100.txt | 293 ------------------- MAINTAINERS | 2 +- drivers/net/wireless/intel/ipw2x00/Kconfig | 2 +- drivers/net/wireless/intel/ipw2x00/ipw2100.c | 2 +- 6 files changed, 327 insertions(+), 296 deletions(-) create mode 100644 Documentation/networking/device_drivers/intel/ipw2100.rst delete mode 100644 Documentation/networking/device_drivers/intel/ipw2100.txt diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index cec3415ee459..54ed10f3d1a7 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -39,6 +39,7 @@ Contents: dlink/dl2k freescale/dpaa freescale/gianfar + intel/ipw2100 .. only:: subproject and html diff --git a/Documentation/networking/device_drivers/intel/ipw2100.rst b/Documentation/networking/device_drivers/intel/ipw2100.rst new file mode 100644 index 000000000000..d54ad522f937 --- /dev/null +++ b/Documentation/networking/device_drivers/intel/ipw2100.rst @@ -0,0 +1,323 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=========================================== +Intel(R) PRO/Wireless 2100 Driver for Linux +=========================================== + +Support for: + +- Intel(R) PRO/Wireless 2100 Network Connection + +Copyright |copy| 2003-2006, Intel Corporation + +README.ipw2100 + +:Version: git-1.1.5 +:Date: January 25, 2006 + +.. Index + + 0. IMPORTANT INFORMATION BEFORE USING THIS DRIVER + 1. Introduction + 2. Release git-1.1.5 Current Features + 3. Command Line Parameters + 4. Sysfs Helper Files + 5. Radio Kill Switch + 6. Dynamic Firmware + 7. Power Management + 8. Support + 9. License + + +0. IMPORTANT INFORMATION BEFORE USING THIS DRIVER +================================================= + +Important Notice FOR ALL USERS OR DISTRIBUTORS!!!! + +Intel wireless LAN adapters are engineered, manufactured, tested, and +quality checked to ensure that they meet all necessary local and +governmental regulatory agency requirements for the regions that they +are designated and/or marked to ship into. Since wireless LANs are +generally unlicensed devices that share spectrum with radars, +satellites, and other licensed and unlicensed devices, it is sometimes +necessary to dynamically detect, avoid, and limit usage to avoid +interference with these devices. In many instances Intel is required to +provide test data to prove regional and local compliance to regional and +governmental regulations before certification or approval to use the +product is granted. Intel's wireless LAN's EEPROM, firmware, and +software driver are designed to carefully control parameters that affect +radio operation and to ensure electromagnetic compliance (EMC). These +parameters include, without limitation, RF power, spectrum usage, +channel scanning, and human exposure. + +For these reasons Intel cannot permit any manipulation by third parties +of the software provided in binary format with the wireless WLAN +adapters (e.g., the EEPROM and firmware). Furthermore, if you use any +patches, utilities, or code with the Intel wireless LAN adapters that +have been manipulated by an unauthorized party (i.e., patches, +utilities, or code (including open source code modifications) which have +not been validated by Intel), (i) you will be solely responsible for +ensuring the regulatory compliance of the products, (ii) Intel will bear +no liability, under any theory of liability for any issues associated +with the modified products, including without limitation, claims under +the warranty and/or issues arising from regulatory non-compliance, and +(iii) Intel will not provide or be required to assist in providing +support to any third parties for such modified products. + +Note: Many regulatory agencies consider Wireless LAN adapters to be +modules, and accordingly, condition system-level regulatory approval +upon receipt and review of test data documenting that the antennas and +system configuration do not cause the EMC and radio operation to be +non-compliant. + +The drivers available for download from SourceForge are provided as a +part of a development project. Conformance to local regulatory +requirements is the responsibility of the individual developer. As +such, if you are interested in deploying or shipping a driver as part of +solution intended to be used for purposes other than development, please +obtain a tested driver from Intel Customer Support at: + +http://www.intel.com/support/wireless/sb/CS-006408.htm + +1. Introduction +=============== + +This document provides a brief overview of the features supported by the +IPW2100 driver project. The main project website, where the latest +development version of the driver can be found, is: + + http://ipw2100.sourceforge.net + +There you can find the not only the latest releases, but also information about +potential fixes and patches, as well as links to the development mailing list +for the driver project. + + +2. Release git-1.1.5 Current Supported Features +=============================================== + +- Managed (BSS) and Ad-Hoc (IBSS) +- WEP (shared key and open) +- Wireless Tools support +- 802.1x (tested with XSupplicant 1.0.1) + +Enabled (but not supported) features: +- Monitor/RFMon mode +- WPA/WPA2 + +The distinction between officially supported and enabled is a reflection +on the amount of validation and interoperability testing that has been +performed on a given feature. + + +3. Command Line Parameters +========================== + +If the driver is built as a module, the following optional parameters are used +by entering them on the command line with the modprobe command using this +syntax:: + + modprobe ipw2100 [ do_softirq.part.4+0x4e/0x50 netif_rx_ni+0x60/0xd0 dev_loopback_xmit+0x83/0xf0 ip6_finish_output2+0x575/0x590 [ipv6] ? ip6_cork_release.isra.1+0x64/0x90 [ipv6] ? __ip6_make_skb+0x38d/0x680 [ipv6] ? ip6_output+0x6c/0x140 [ipv6] ip6_output+0x6c/0x140 [ipv6] ip6_send_skb+0x1e/0x60 [ipv6] rawv6_sendmsg+0xc4b/0xe10 [ipv6] ? proc_put_long+0xd0/0xd0 ? rw_copy_check_uvector+0x4e/0x110 ? sock_sendmsg+0x36/0x40 sock_sendmsg+0x36/0x40 ___sys_sendmsg+0x2b6/0x2d0 ? proc_dointvec+0x23/0x30 ? addrconf_sysctl_forward+0x8d/0x250 [ipv6] ? dev_forward_change+0x130/0x130 [ipv6] ? _raw_spin_unlock+0x12/0x30 ? proc_sys_call_handler.isra.14+0x9f/0x110 ? __call_rcu+0x213/0x510 ? get_max_files+0x10/0x10 ? trace_hardirqs_on+0x2c/0xe0 ? __sys_sendmsg+0x63/0xa0 __sys_sendmsg+0x63/0xa0 do_syscall_64+0x6c/0x1e0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Alexander Sverdlin Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 9a419d5102ce..563aed5b3d9f 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -448,6 +448,10 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) int ret; rx_handler_result_t handle_res; + /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) + return RX_HANDLER_PASS; + port = macvlan_port_get_rcu(skb->dev); if (is_multicast_ether_addr(eth->h_dest)) { unsigned int hash; -- cgit v1.2.3-59-g8ed1b From 53c0ec4f4db19d430570bbbfc80ce899419d29f4 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Tue, 26 May 2020 20:03:02 +0200 Subject: ne2k-pci: Fix various coding-style issues and improve printk() usage Fixed a ton of minor checkpatch errors/warnings and remove version printing at module init/when device is found and use MODULE_VERSION instead. Also modifying the RTL8029 PCI string to include the compatible RTL8029AS nic. The only mayor issue remaining is the missing SPDX tag, but since the exact version of the GPL is not stated anywhere inside the file, its impossible to add such a tag at the moment. But maybe it is possible, since 8390.h states Donald Becker's 8390 drivers are licensed under GPL 2.2 only (= GPL-2.0-only ?). The kernel module containing this patch compiles and runs without problems on a RTL8029AS-based NE2000 clone card with kernel 5.7.0-rc6. Signed-off-by: Armin Wolf Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/ne2k-pci.c | 345 ++++++++++++++++++----------------- 1 file changed, 182 insertions(+), 163 deletions(-) diff --git a/drivers/net/ethernet/8390/ne2k-pci.c b/drivers/net/ethernet/8390/ne2k-pci.c index 42985a82321a..77d78b4c59c4 100644 --- a/drivers/net/ethernet/8390/ne2k-pci.c +++ b/drivers/net/ethernet/8390/ne2k-pci.c @@ -1,39 +1,43 @@ -/* ne2k-pci.c: A NE2000 clone on PCI bus driver for Linux. */ -/* - A Linux device driver for PCI NE2000 clones. - - Authors and other copyright holders: - 1992-2000 by Donald Becker, NE2000 core and various modifications. - 1995-1998 by Paul Gortmaker, core modifications and PCI support. - Copyright 1993 assigned to the United States Government as represented - by the Director, National Security Agency. - - This software may be used and distributed according to the terms of - the GNU General Public License (GPL), incorporated herein by reference. - Drivers based on or derived from this code fall under the GPL and must - retain the authorship, copyright and license notice. This file is not - a complete program and may only be used when the entire operating - system is licensed under the GPL. - - The author may be reached as becker@scyld.com, or C/O - Scyld Computing Corporation - 410 Severn Ave., Suite 210 - Annapolis MD 21403 - - Issues remaining: - People are making PCI ne2000 clones! Oh the horror, the horror... - Limited full-duplex support. -*/ +/* A Linux device driver for PCI NE2000 clones. + * + * Authors and other copyright holders: + * 1992-2000 by Donald Becker, NE2000 core and various modifications. + * 1995-1998 by Paul Gortmaker, core modifications and PCI support. + * Copyright 1993 assigned to the United States Government as represented + * by the Director, National Security Agency. + * + * This software may be used and distributed according to the terms of + * the GNU General Public License (GPL), incorporated herein by reference. + * Drivers based on or derived from this code fall under the GPL and must + * retain the authorship, copyright and license notice. This file is not + * a complete program and may only be used when the entire operating + * system is licensed under the GPL. + * + * The author may be reached as becker@scyld.com, or C/O + * Scyld Computing Corporation + * 410 Severn Ave., Suite 210 + * Annapolis MD 21403 + * + * Issues remaining: + * People are making PCI NE2000 clones! Oh the horror, the horror... + * Limited full-duplex support. + */ #define DRV_NAME "ne2k-pci" +#define DRV_DESCRIPTION "PCI NE2000 clone driver" +#define DRV_AUTHOR "Donald Becker / Paul Gortmaker" #define DRV_VERSION "1.03" #define DRV_RELDATE "9/22/2003" +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* The user-configurable values. - These may be modified when a driver module is loaded.*/ + * These may be modified when a driver module is loaded. + */ + +/* More are supported, limit only on options */ +#define MAX_UNITS 8 -#define MAX_UNITS 8 /* More are supported, limit only on options */ /* Used to pass the full-duplex flag, etc. */ static int full_duplex[MAX_UNITS]; static int options[MAX_UNITS]; @@ -52,7 +56,7 @@ static int options[MAX_UNITS]; #include #include -#include +#include #include #include @@ -60,20 +64,14 @@ static int options[MAX_UNITS]; static u32 ne2k_msg_enable; -/* These identify the driver base version and may not be removed. */ -static const char version[] = - KERN_INFO DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE - " D. Becker/P. Gortmaker\n"; - #if defined(__powerpc__) #define inl_le(addr) le32_to_cpu(inl(addr)) #define inw_le(addr) le16_to_cpu(inw(addr)) #endif -#define PFX DRV_NAME ": " - -MODULE_AUTHOR("Donald Becker / Paul Gortmaker"); -MODULE_DESCRIPTION("PCI NE2000 clone driver"); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESCRIPTION); +MODULE_VERSION(DRV_VERSION); MODULE_LICENSE("GPL"); module_param_named(msg_enable, ne2k_msg_enable, uint, 0444); @@ -83,7 +81,8 @@ MODULE_PARM_DESC(msg_enable, "Debug message level (see linux/netdevice.h for bit MODULE_PARM_DESC(options, "Bit 5: full duplex"); MODULE_PARM_DESC(full_duplex, "full duplex setting(s) (1)"); -/* Some defines that people can play with if so inclined. */ +/* Some defines that people can play with if so inclined. + */ /* Use 32 bit data-movement operations instead of 16 bit. */ #define USE_LONGIO @@ -91,14 +90,18 @@ MODULE_PARM_DESC(full_duplex, "full duplex setting(s) (1)"); /* Do we implement the read before write bugfix ? */ /* #define NE_RW_BUGFIX */ -/* Flags. We rename an existing ei_status field to store flags! */ -/* Thus only the low 8 bits are usable for non-init-time flags. */ +/* Flags. We rename an existing ei_status field to store flags! + * Thus only the low 8 bits are usable for non-init-time flags. + */ #define ne2k_flags reg0 + enum { - ONLY_16BIT_IO=8, ONLY_32BIT_IO=4, /* Chip can do only 16/32-bit xfers. */ - FORCE_FDX=0x20, /* User override. */ - REALTEK_FDX=0x40, HOLTEK_FDX=0x80, - STOP_PG_0x60=0x100, + /* Chip can do only 16/32-bit xfers. */ + ONLY_16BIT_IO = 8, ONLY_32BIT_IO = 4, + /* User override. */ + FORCE_FDX = 0x20, + REALTEK_FDX = 0x40, HOLTEK_FDX = 0x80, + STOP_PG_0x60 = 0x100, }; enum ne2k_pci_chipsets { @@ -120,7 +123,7 @@ static struct { char *name; int flags; } pci_clone_list[] = { - {"RealTek RTL-8029", REALTEK_FDX}, + {"RealTek RTL-8029(AS)", REALTEK_FDX}, {"Winbond 89C940", 0}, {"Compex RL2000", 0}, {"KTI ET32P2", 0}, @@ -149,13 +152,14 @@ static const struct pci_device_id ne2k_pci_tbl[] = { { 0x8c4a, 0x1980, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CH_Winbond_89C940_8c4a }, { 0, } }; + MODULE_DEVICE_TABLE(pci, ne2k_pci_tbl); /* ---- No user-serviceable parts below ---- */ #define NE_BASE (dev->base_addr) -#define NE_CMD 0x00 +#define NE_CMD 0x00 #define NE_DATAPORT 0x10 /* NatSemi-defined port window offset. */ #define NE_RESET 0x1f /* Issue a read to reset, a write to clear. */ #define NE_IO_EXTENT 0x20 @@ -168,18 +172,20 @@ static int ne2k_pci_open(struct net_device *dev); static int ne2k_pci_close(struct net_device *dev); static void ne2k_pci_reset_8390(struct net_device *dev); -static void ne2k_pci_get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr *hdr, - int ring_page); +static void ne2k_pci_get_8390_hdr(struct net_device *dev, + struct e8390_pkt_hdr *hdr, int ring_page); static void ne2k_pci_block_input(struct net_device *dev, int count, - struct sk_buff *skb, int ring_offset); + struct sk_buff *skb, int ring_offset); static void ne2k_pci_block_output(struct net_device *dev, const int count, - const unsigned char *buf, const int start_page); + const unsigned char *buf, + const int start_page); static const struct ethtool_ops ne2k_pci_ethtool_ops; /* There is no room in the standard 8390 structure for extra info we need, - so we build a meta/outer-wrapper structure.. */ + * so we build a meta/outer-wrapper structure.. + */ struct ne2k_pci_card { struct net_device *dev; struct pci_dev *pci_dev; @@ -187,18 +193,17 @@ struct ne2k_pci_card { -/* - NEx000-clone boards have a Station Address (SA) PROM (SAPROM) in the packet - buffer memory space. By-the-spec NE2000 clones have 0x57,0x57 in bytes - 0x0e,0x0f of the SAPROM, while other supposed NE2000 clones must be - detected by their SA prefix. - - Reading the SAPROM from a word-wide card with the 8390 set in byte-wide - mode results in doubled values, which can be detected and compensated for. - - The probe is also responsible for initializing the card and filling - in the 'dev' and 'ei_status' structures. -*/ +/* NEx000-clone boards have a Station Address (SA) PROM (SAPROM) in the packet + * buffer memory space. By-the-spec NE2000 clones have 0x57,0x57 in bytes + * 0x0e,0x0f of the SAPROM, while other supposed NE2000 clones must be + * detected by their SA prefix. + * + * Reading the SAPROM from a word-wide card with the 8390 set in byte-wide + * mode results in doubled values, which can be detected and compensated for. + * + * The probe is also responsible for initializing the card and filling + * in the 'dev' and 'ei_status' structures. + */ static const struct net_device_ops ne2k_netdev_ops = { .ndo_open = ne2k_pci_open, @@ -208,7 +213,7 @@ static const struct net_device_ops ne2k_netdev_ops = { .ndo_get_stats = ei_get_stats, .ndo_set_rx_mode = ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, + .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = ei_poll, #endif @@ -227,28 +232,21 @@ static int ne2k_pci_init_one(struct pci_dev *pdev, int flags = pci_clone_list[chip_idx].flags; struct ei_device *ei_local; -/* when built into the kernel, we only print version if device is found */ -#ifndef MODULE - static int printed_version; - if (!printed_version++) - printk(version); -#endif - fnd_cnt++; - i = pci_enable_device (pdev); + i = pci_enable_device(pdev); if (i) return i; - ioaddr = pci_resource_start (pdev, 0); + ioaddr = pci_resource_start(pdev, 0); irq = pdev->irq; - if (!ioaddr || ((pci_resource_flags (pdev, 0) & IORESOURCE_IO) == 0)) { + if (!ioaddr || ((pci_resource_flags(pdev, 0) & IORESOURCE_IO) == 0)) { dev_err(&pdev->dev, "no I/O resource at PCI BAR #0\n"); goto err_out; } - if (request_region (ioaddr, NE_IO_EXTENT, DRV_NAME) == NULL) { + if (!request_region(ioaddr, NE_IO_EXTENT, DRV_NAME)) { dev_err(&pdev->dev, "I/O resource 0x%x @ 0x%lx busy\n", NE_IO_EXTENT, ioaddr); goto err_out; @@ -261,14 +259,17 @@ static int ne2k_pci_init_one(struct pci_dev *pdev, /* Do a preliminary verification that we have a 8390. */ { int regd; - outb(E8390_NODMA+E8390_PAGE1+E8390_STOP, ioaddr + E8390_CMD); + + outb(E8390_NODMA + E8390_PAGE1 + E8390_STOP, ioaddr + E8390_CMD); regd = inb(ioaddr + 0x0d); outb(0xff, ioaddr + 0x0d); - outb(E8390_NODMA+E8390_PAGE0, ioaddr + E8390_CMD); - inb(ioaddr + EN0_COUNTER0); /* Clear the counter by reading. */ + outb(E8390_NODMA + E8390_PAGE0, ioaddr + E8390_CMD); + /* Clear the counter by reading. */ + inb(ioaddr + EN0_COUNTER0); if (inb(ioaddr + EN0_COUNTER0) != 0) { outb(reg0, ioaddr); - outb(regd, ioaddr + 0x0d); /* Restore the old values. */ + /* Restore the old values. */ + outb(regd, ioaddr + 0x0d); goto err_out_free_res; } } @@ -291,9 +292,9 @@ static int ne2k_pci_init_one(struct pci_dev *pdev, outb(inb(ioaddr + NE_RESET), ioaddr + NE_RESET); - /* This looks like a horrible timing loop, but it should never take - more than a few cycles. - */ + /* This looks like a horrible timing loop, but it should never + * take more than a few cycles. + */ while ((inb(ioaddr + EN0_ISR) & ENISR_RESET) == 0) /* Limit wait: '2' avoids jiffy roll-over. */ if (jiffies - reset_start_time > 2) { @@ -301,42 +302,53 @@ static int ne2k_pci_init_one(struct pci_dev *pdev, "Card failure (no reset ack).\n"); goto err_out_free_netdev; } - - outb(0xff, ioaddr + EN0_ISR); /* Ack all intr. */ + /* Ack all intr. */ + outb(0xff, ioaddr + EN0_ISR); } /* Read the 16 bytes of station address PROM. - We must first initialize registers, similar to NS8390_init(eifdev, 0). - We can't reliably read the SAPROM address without this. - (I learned the hard way!). */ + * We must first initialize registers, similar + * to NS8390_init(eifdev, 0). + * We can't reliably read the SAPROM address without this. + * (I learned the hard way!). + */ { struct {unsigned char value, offset; } program_seq[] = { - {E8390_NODMA+E8390_PAGE0+E8390_STOP, E8390_CMD}, /* Select page 0*/ - {0x49, EN0_DCFG}, /* Set word-wide access. */ - {0x00, EN0_RCNTLO}, /* Clear the count regs. */ + /* Select page 0 */ + {E8390_NODMA + E8390_PAGE0 + E8390_STOP, E8390_CMD}, + /* Set word-wide access */ + {0x49, EN0_DCFG}, + /* Clear the count regs. */ + {0x00, EN0_RCNTLO}, + /* Mask completion IRQ */ {0x00, EN0_RCNTHI}, - {0x00, EN0_IMR}, /* Mask completion irq. */ + {0x00, EN0_IMR}, {0xFF, EN0_ISR}, - {E8390_RXOFF, EN0_RXCR}, /* 0x20 Set to monitor */ - {E8390_TXOFF, EN0_TXCR}, /* 0x02 and loopback mode. */ + /* 0x20 Set to monitor */ + {E8390_RXOFF, EN0_RXCR}, + /* 0x02 and loopback mode */ + {E8390_TXOFF, EN0_TXCR}, {32, EN0_RCNTLO}, {0x00, EN0_RCNTHI}, - {0x00, EN0_RSARLO}, /* DMA starting at 0x0000. */ + /* DMA starting at 0x0000 */ + {0x00, EN0_RSARLO}, {0x00, EN0_RSARHI}, {E8390_RREAD+E8390_START, E8390_CMD}, }; for (i = 0; i < ARRAY_SIZE(program_seq); i++) - outb(program_seq[i].value, ioaddr + program_seq[i].offset); + outb(program_seq[i].value, + ioaddr + program_seq[i].offset); } /* Note: all PCI cards have at least 16 bit access, so we don't have - to check for 8 bit cards. Most cards permit 32 bit access. */ + * to check for 8 bit cards. Most cards permit 32 bit access. + */ if (flags & ONLY_32BIT_IO) { for (i = 0; i < 4 ; i++) ((u32 *)SA_prom)[i] = le32_to_cpu(inl(ioaddr + NE_DATAPORT)); } else - for(i = 0; i < 32 /*sizeof(SA_prom)*/; i++) + for (i = 0; i < 32 /* sizeof(SA_prom )*/; i++) SA_prom[i] = inb(ioaddr + NE_DATAPORT); /* We always set the 8390 registers for word mode. */ @@ -356,7 +368,7 @@ static int ne2k_pci_init_one(struct pci_dev *pdev, ei_status.word16 = 1; ei_status.ne2k_flags = flags; if (fnd_cnt < MAX_UNITS) { - if (full_duplex[fnd_cnt] > 0 || (options[fnd_cnt] & FORCE_FDX)) + if (full_duplex[fnd_cnt] > 0 || (options[fnd_cnt] & FORCE_FDX)) ei_status.ne2k_flags |= FORCE_FDX; } @@ -388,16 +400,15 @@ static int ne2k_pci_init_one(struct pci_dev *pdev, return 0; err_out_free_netdev: - free_netdev (dev); + free_netdev(dev); err_out_free_res: - release_region (ioaddr, NE_IO_EXTENT); + release_region(ioaddr, NE_IO_EXTENT); err_out: pci_disable_device(pdev); return -ENODEV; } -/* - * Magic incantation sequence for full duplex on the supported cards. +/* Magic incantation sequence for full duplex on the supported cards. */ static inline int set_realtek_fdx(struct net_device *dev) { @@ -431,7 +442,9 @@ static int ne2k_pci_set_fdx(struct net_device *dev) static int ne2k_pci_open(struct net_device *dev) { - int ret = request_irq(dev->irq, ei_interrupt, IRQF_SHARED, dev->name, dev); + int ret = request_irq(dev->irq, ei_interrupt, IRQF_SHARED, + dev->name, dev); + if (ret) return ret; @@ -450,7 +463,8 @@ static int ne2k_pci_close(struct net_device *dev) } /* Hard reset the card. This used to pause for the same period that a - 8390 reset command required, but that shouldn't be necessary. */ + * 8390 reset command required, but that shouldn't be necessary. + */ static void ne2k_pci_reset_8390(struct net_device *dev) { unsigned long reset_start_time = jiffies; @@ -467,31 +481,34 @@ static void ne2k_pci_reset_8390(struct net_device *dev) /* This check _should_not_ be necessary, omit eventually. */ while ((inb(NE_BASE+EN0_ISR) & ENISR_RESET) == 0) if (jiffies - reset_start_time > 2) { - netdev_err(dev, "ne2k_pci_reset_8390() did not complete.\n"); + netdev_err(dev, "%s did not complete.\n", __func__); break; } - outb(ENISR_RESET, NE_BASE + EN0_ISR); /* Ack intr. */ + /* Ack intr. */ + outb(ENISR_RESET, NE_BASE + EN0_ISR); } /* Grab the 8390 specific header. Similar to the block_input routine, but - we don't need to be concerned with ring wrap as the header will be at - the start of a page, so we optimize accordingly. */ + * we don't need to be concerned with ring wrap as the header will be at + * the start of a page, so we optimize accordingly. + */ -static void ne2k_pci_get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr *hdr, int ring_page) +static void ne2k_pci_get_8390_hdr(struct net_device *dev, + struct e8390_pkt_hdr *hdr, int ring_page) { long nic_base = dev->base_addr; - /* This *shouldn't* happen. If it does, it's the last thing you'll see */ + /* This *shouldn't* happen. If it does, it's the last thing you'll see + */ if (ei_status.dmaing) { - netdev_err(dev, "DMAing conflict in ne2k_pci_get_8390_hdr " - "[DMAstat:%d][irqlock:%d].\n", - ei_status.dmaing, ei_status.irqlock); + netdev_err(dev, "DMAing conflict in %s [DMAstat:%d][irqlock:%d].\n", + __func__, ei_status.dmaing, ei_status.irqlock); return; } ei_status.dmaing |= 0x01; - outb(E8390_NODMA+E8390_PAGE0+E8390_START, nic_base+ NE_CMD); + outb(E8390_NODMA + E8390_PAGE0 + E8390_START, nic_base + NE_CMD); outb(sizeof(struct e8390_pkt_hdr), nic_base + EN0_RCNTLO); outb(0, nic_base + EN0_RCNTHI); outb(0, nic_base + EN0_RSARLO); /* On page boundary */ @@ -499,20 +516,22 @@ static void ne2k_pci_get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr * outb(E8390_RREAD+E8390_START, nic_base + NE_CMD); if (ei_status.ne2k_flags & ONLY_16BIT_IO) { - insw(NE_BASE + NE_DATAPORT, hdr, sizeof(struct e8390_pkt_hdr)>>1); + insw(NE_BASE + NE_DATAPORT, hdr, + sizeof(struct e8390_pkt_hdr) >> 1); } else { - *(u32*)hdr = le32_to_cpu(inl(NE_BASE + NE_DATAPORT)); + *(u32 *)hdr = le32_to_cpu(inl(NE_BASE + NE_DATAPORT)); le16_to_cpus(&hdr->count); } - - outb(ENISR_RDC, nic_base + EN0_ISR); /* Ack intr. */ + /* Ack intr. */ + outb(ENISR_RDC, nic_base + EN0_ISR); ei_status.dmaing &= ~0x01; } /* Block input and output, similar to the Crynwr packet driver. If you - are porting to a new ethercard, look at the packet driver source for hints. - The NEx000 doesn't share the on-board packet memory -- you have to put - the packet out through the "remote DMA" dataport using outb. */ + *are porting to a new ethercard, look at the packet driver source for hints. + *The NEx000 doesn't share the on-board packet memory -- you have to put + *the packet out through the "remote DMA" dataport using outb. + */ static void ne2k_pci_block_input(struct net_device *dev, int count, struct sk_buff *skb, int ring_offset) @@ -520,30 +539,30 @@ static void ne2k_pci_block_input(struct net_device *dev, int count, long nic_base = dev->base_addr; char *buf = skb->data; - /* This *shouldn't* happen. If it does, it's the last thing you'll see */ + /* This *shouldn't* happen. + * If it does, it's the last thing you'll see. + */ if (ei_status.dmaing) { - netdev_err(dev, "DMAing conflict in ne2k_pci_block_input " - "[DMAstat:%d][irqlock:%d].\n", - ei_status.dmaing, ei_status.irqlock); + netdev_err(dev, "DMAing conflict in %s [DMAstat:%d][irqlock:%d]\n", + __func__, ei_status.dmaing, ei_status.irqlock); return; } ei_status.dmaing |= 0x01; if (ei_status.ne2k_flags & ONLY_32BIT_IO) count = (count + 3) & 0xFFFC; - outb(E8390_NODMA+E8390_PAGE0+E8390_START, nic_base+ NE_CMD); + outb(E8390_NODMA + E8390_PAGE0 + E8390_START, nic_base + NE_CMD); outb(count & 0xff, nic_base + EN0_RCNTLO); outb(count >> 8, nic_base + EN0_RCNTHI); outb(ring_offset & 0xff, nic_base + EN0_RSARLO); outb(ring_offset >> 8, nic_base + EN0_RSARHI); - outb(E8390_RREAD+E8390_START, nic_base + NE_CMD); + outb(E8390_RREAD + E8390_START, nic_base + NE_CMD); if (ei_status.ne2k_flags & ONLY_16BIT_IO) { - insw(NE_BASE + NE_DATAPORT,buf,count>>1); - if (count & 0x01) { + insw(NE_BASE + NE_DATAPORT, buf, count >> 1); + if (count & 0x01) buf[count-1] = inb(NE_BASE + NE_DATAPORT); - } } else { - insl(NE_BASE + NE_DATAPORT, buf, count>>2); + insl(NE_BASE + NE_DATAPORT, buf, count >> 2); if (count & 3) { buf += count & ~3; if (count & 2) { @@ -556,30 +575,32 @@ static void ne2k_pci_block_input(struct net_device *dev, int count, *buf = inb(NE_BASE + NE_DATAPORT); } } - - outb(ENISR_RDC, nic_base + EN0_ISR); /* Ack intr. */ + /* Ack intr. */ + outb(ENISR_RDC, nic_base + EN0_ISR); ei_status.dmaing &= ~0x01; } static void ne2k_pci_block_output(struct net_device *dev, int count, - const unsigned char *buf, const int start_page) + const unsigned char *buf, const int start_page) { long nic_base = NE_BASE; unsigned long dma_start; /* On little-endian it's always safe to round the count up for - word writes. */ + * word writes. + */ if (ei_status.ne2k_flags & ONLY_32BIT_IO) count = (count + 3) & 0xFFFC; else if (count & 0x01) count++; - /* This *shouldn't* happen. If it does, it's the last thing you'll see */ + /* This *shouldn't* happen. + * If it does, it's the last thing you'll see. + */ if (ei_status.dmaing) { - netdev_err(dev, "DMAing conflict in ne2k_pci_block_output." - "[DMAstat:%d][irqlock:%d]\n", - ei_status.dmaing, ei_status.irqlock); + netdev_err(dev, "DMAing conflict in %s [DMAstat:%d][irqlock:%d]\n", + __func__, ei_status.dmaing, ei_status.irqlock); return; } ei_status.dmaing |= 0x01; @@ -588,9 +609,10 @@ static void ne2k_pci_block_output(struct net_device *dev, int count, #ifdef NE8390_RW_BUGFIX /* Handle the read-before-write bug the same way as the - Crynwr packet driver -- the NatSemi method doesn't work. - Actually this doesn't always work either, but if you have - problems with your NEx000 this is better than nothing! */ + * Crynwr packet driver -- the NatSemi method doesn't work. + * Actually this doesn't always work either, but if you have + * problems with your NEx000 this is better than nothing! + */ outb(0x42, nic_base + EN0_RCNTLO); outb(0x00, nic_base + EN0_RCNTHI); outb(0x42, nic_base + EN0_RSARLO); @@ -599,16 +621,16 @@ static void ne2k_pci_block_output(struct net_device *dev, int count, #endif outb(ENISR_RDC, nic_base + EN0_ISR); - /* Now the normal output. */ + /* Now the normal output. */ outb(count & 0xff, nic_base + EN0_RCNTLO); outb(count >> 8, nic_base + EN0_RCNTHI); outb(0x00, nic_base + EN0_RSARLO); outb(start_page, nic_base + EN0_RSARHI); outb(E8390_RWRITE+E8390_START, nic_base + NE_CMD); if (ei_status.ne2k_flags & ONLY_16BIT_IO) { - outsw(NE_BASE + NE_DATAPORT, buf, count>>1); + outsw(NE_BASE + NE_DATAPORT, buf, count >> 1); } else { - outsl(NE_BASE + NE_DATAPORT, buf, count>>2); + outsl(NE_BASE + NE_DATAPORT, buf, count >> 2); if (count & 3) { buf += count & ~3; if (count & 2) { @@ -623,14 +645,15 @@ static void ne2k_pci_block_output(struct net_device *dev, int count, dma_start = jiffies; while ((inb(nic_base + EN0_ISR) & ENISR_RDC) == 0) - if (jiffies - dma_start > 2) { /* Avoid clock roll-over. */ + /* Avoid clock roll-over. */ + if (jiffies - dma_start > 2) { netdev_warn(dev, "timeout waiting for Tx RDC.\n"); ne2k_pci_reset_8390(dev); - NS8390_init(dev,1); + NS8390_init(dev, 1); break; } - - outb(ENISR_RDC, nic_base + EN0_ISR); /* Ack intr. */ + /* Ack intr. */ + outb(ENISR_RDC, nic_base + EN0_ISR); ei_status.dmaing &= ~0x01; } @@ -640,9 +663,9 @@ static void ne2k_pci_get_drvinfo(struct net_device *dev, struct ei_device *ei = netdev_priv(dev); struct pci_dev *pci_dev = (struct pci_dev *) ei->priv; - strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); - strlcpy(info->version, DRV_VERSION, sizeof(info->version)); - strlcpy(info->bus_info, pci_name(pci_dev), sizeof(info->bus_info)); + strscpy(info->driver, DRV_NAME, sizeof(info->driver)); + strscpy(info->version, DRV_VERSION, sizeof(info->version)); + strscpy(info->bus_info, pci_name(pci_dev), sizeof(info->bus_info)); } static u32 ne2k_pci_get_msglevel(struct net_device *dev) @@ -677,9 +700,9 @@ static void ne2k_pci_remove_one(struct pci_dev *pdev) } #ifdef CONFIG_PM -static int ne2k_pci_suspend (struct pci_dev *pdev, pm_message_t state) +static int ne2k_pci_suspend(struct pci_dev *pdev, pm_message_t state) { - struct net_device *dev = pci_get_drvdata (pdev); + struct net_device *dev = pci_get_drvdata(pdev); netif_device_detach(dev); pci_save_state(pdev); @@ -689,9 +712,9 @@ static int ne2k_pci_suspend (struct pci_dev *pdev, pm_message_t state) return 0; } -static int ne2k_pci_resume (struct pci_dev *pdev) +static int ne2k_pci_resume(struct pci_dev *pdev) { - struct net_device *dev = pci_get_drvdata (pdev); + struct net_device *dev = pci_get_drvdata(pdev); int rc; pci_set_power_state(pdev, PCI_D0); @@ -718,24 +741,20 @@ static struct pci_driver ne2k_driver = { #ifdef CONFIG_PM .suspend = ne2k_pci_suspend, .resume = ne2k_pci_resume, -#endif /* CONFIG_PM */ +#endif }; static int __init ne2k_pci_init(void) { -/* when a module, this is printed whether or not devices are found in probe */ -#ifdef MODULE - printk(version); -#endif return pci_register_driver(&ne2k_driver); } static void __exit ne2k_pci_cleanup(void) { - pci_unregister_driver (&ne2k_driver); + pci_unregister_driver(&ne2k_driver); } module_init(ne2k_pci_init); -- cgit v1.2.3-59-g8ed1b From a331172b156b23e83dfb556ade0ca23426c3f149 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:37 +0200 Subject: net: ethtool: Add attributes for cable test TDR data Some Ethernet PHYs can return the raw time domain reflectromatry data. Add the attributes to allow this data to be requested and returned via netlink ethtool. Signed-off-by: Andrew Lunn v2: m -> cm Report what the PHY actually used for start/stop/step. Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 81 ++++++++++++++++++++++++++++ include/uapi/linux/ethtool_netlink.h | 63 ++++++++++++++++++++++ 2 files changed, 144 insertions(+) diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 7e651ea33eab..dae36227d590 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -205,6 +205,7 @@ Userspace to kernel: ``ETHTOOL_MSG_EEE_SET`` set EEE settings ``ETHTOOL_MSG_TSINFO_GET`` get timestamping info ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test + ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT`` action start raw TDR cable test ===================================== ================================ Kernel to userspace: @@ -237,6 +238,7 @@ Kernel to userspace: ``ETHTOOL_MSG_EEE_NTF`` EEE settings ``ETHTOOL_MSG_TSINFO_GET_REPLY`` timestamping info ``ETHTOOL_MSG_CABLE_TEST_NTF`` Cable test results + ``ETHTOOL_MSG_CABLE_TEST_TDR_NTF`` Cable test TDR results ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1014,6 +1016,84 @@ information. | | | ``ETHTOOL_A_CABLE_FAULT_LENGTH_CM`` | u32 | length in cm | +-+-+-----------------------------------------+--------+---------------------+ +CABLE_TEST TDR +============== + +Start a cable test and report raw TDR data + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` nested request header + ==================================== ====== ========================== + +Notification contents: + +Raw TDR data is gathered by sending a pulse down the cable and +recording the amplitude of the reflected pulse for a given distance. + +It can take a number of seconds to collect TDR data, especial if the +full 100 meters is probed at 1 meter intervals. When the test is +started a notification will be sent containing just +ETHTOOL_A_CABLE_TEST_TDR_STATUS with the value +ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED. + +When the test has completed a second notification will be sent +containing ETHTOOL_A_CABLE_TEST_TDR_STATUS with the value +ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED and the TDR data. + +The message may optionally contain the amplitude of the pulse send +down the cable. This is measured in mV. A reflection should not be +bigger than transmitted pulse. + +Before the raw TDR data should be an ETHTOOL_A_CABLE_TDR_NEST_STEP +nest containing information about the distance along the cable for the +first reading, the last reading, and the step between each +reading. Distances are measured in centimeters. These should be the +exact values the PHY used. These may be different to what the user +requested, if the native measurement resolution is greater than 1 cm. + +For each step along the cable, a ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE is +used to report the amplitude of the reflection for a given pair. + + +---------------------------------------------+--------+----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` | nested | reply header | + +---------------------------------------------+--------+----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_STATUS`` | u8 | completed | + +---------------------------------------------+--------+----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST`` | nested | all the results | + +-+-------------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_TDR_NEST_PULSE`` | nested | TX Pulse amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_PULSE_mV`` | s16 | Pulse amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_NEST_STEP`` | nested | TDR step info | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE ``| u32 | First data distance | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE `` | u32 | Last data distance | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE `` | u32 | distance of each step| + +-+-+-----------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE`` | nested | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_PAIR`` | u8 | pair number | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_AMPLITUDE_mV`` | s16 | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE`` | nested | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_PAIR`` | u8 | pair number | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_AMPLITUDE_mV`` | s16 | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE`` | nested | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_PAIR`` | u8 | pair number | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_AMPLITUDE_mV`` | s16 | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + Request translation =================== @@ -1110,4 +1190,5 @@ are netlink only. ``ETHTOOL_GFECPARAM`` n/a ``ETHTOOL_SFECPARAM`` n/a n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' + n/a ''ETHTOOL_MSG_CABLE_TEST_TDR_ACT'' =================================== ===================================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index e6f109b76c9a..739faa7070c6 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -40,6 +40,7 @@ enum { ETHTOOL_MSG_EEE_SET, ETHTOOL_MSG_TSINFO_GET, ETHTOOL_MSG_CABLE_TEST_ACT, + ETHTOOL_MSG_CABLE_TEST_TDR_ACT, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -76,6 +77,7 @@ enum { ETHTOOL_MSG_EEE_NTF, ETHTOOL_MSG_TSINFO_GET_REPLY, ETHTOOL_MSG_CABLE_TEST_NTF, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -478,6 +480,67 @@ enum { ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) }; +/* CABLE TEST TDR */ + +enum { + ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CNT, + ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 +}; + +/* CABLE TEST TDR NOTIFY */ + +enum { + ETHTOOL_A_CABLE_AMPLITUDE_UNSPEC, + ETHTOOL_A_CABLE_AMPLITUDE_PAIR, /* u8 */ + ETHTOOL_A_CABLE_AMPLITUDE_mV, /* s16 */ + + __ETHTOOL_A_CABLE_AMPLITUDE_CNT, + ETHTOOL_A_CABLE_AMPLITUDE_MAX = (__ETHTOOL_A_CABLE_AMPLITUDE_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_PULSE_UNSPEC, + ETHTOOL_A_CABLE_PULSE_mV, /* s16 */ + + __ETHTOOL_A_CABLE_PULSE_CNT, + ETHTOOL_A_CABLE_PULSE_MAX = (__ETHTOOL_A_CABLE_PULSE_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_STEP_UNSPEC, + ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE, /* u32 */ + ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, /* u32 */ + ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, /* u32 */ + + __ETHTOOL_A_CABLE_STEP_CNT, + ETHTOOL_A_CABLE_STEP_MAX = (__ETHTOOL_A_CABLE_STEP_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TDR_NEST_UNSPEC, + ETHTOOL_A_CABLE_TDR_NEST_STEP, /* nest - ETHTTOOL_A_CABLE_STEP */ + ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE, /* nest - ETHTOOL_A_CABLE_AMPLITUDE */ + ETHTOOL_A_CABLE_TDR_NEST_PULSE, /* nest - ETHTOOL_A_CABLE_PULSE */ + + __ETHTOOL_A_CABLE_TDR_NEST_CNT, + ETHTOOL_A_CABLE_TDR_NEST_MAX = (__ETHTOOL_A_CABLE_TDR_NEST_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, /* nest - of results: */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, + ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3-59-g8ed1b From 1a644de29f712771c2ec00e52caa391544eb6141 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:38 +0200 Subject: net: ethtool: Add generic parts of cable test TDR Add the generic parts of the code used to trigger a cable test and return raw TDR data. Any PHY driver which support this must implement the new driver op. Signed-off-by: Andrew Lunn v2 Update nxp-tja11xx for API change. Signed-off-by: David S. Miller --- drivers/net/phy/nxp-tja11xx.c | 2 +- drivers/net/phy/phy.c | 65 ++++++++++++++++++++++++++++++++++++++++- include/linux/ethtool_netlink.h | 4 +-- include/linux/phy.h | 13 +++++++++ net/ethtool/cabletest.c | 64 ++++++++++++++++++++++++++++++++++++---- net/ethtool/netlink.c | 5 ++++ net/ethtool/netlink.h | 1 + 7 files changed, 144 insertions(+), 10 deletions(-) diff --git a/drivers/net/phy/nxp-tja11xx.c b/drivers/net/phy/nxp-tja11xx.c index 1e79c30ca81a..a72fa0d2e7c7 100644 --- a/drivers/net/phy/nxp-tja11xx.c +++ b/drivers/net/phy/nxp-tja11xx.c @@ -194,7 +194,7 @@ static int tja11xx_config_aneg_cable_test(struct phy_device *phydev) !phydev->drv->cable_test_get_status) return 0; - ret = ethnl_cable_test_alloc(phydev); + ret = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); if (ret) return ret; diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 27da0c94818f..495d9ba3d5bf 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -519,7 +519,7 @@ int phy_start_cable_test(struct phy_device *phydev, goto out; } - err = ethnl_cable_test_alloc(phydev); + err = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); if (err) goto out; @@ -552,6 +552,69 @@ out: } EXPORT_SYMBOL(phy_start_cable_test); +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = phydev->attached_dev; + int err = -ENOMEM; + + if (!(phydev->drv && + phydev->drv->cable_test_tdr_start && + phydev->drv->cable_test_get_status)) { + NL_SET_ERR_MSG(extack, + "PHY driver does not support cable test TDR"); + return -EOPNOTSUPP; + } + + mutex_lock(&phydev->lock); + if (phydev->state == PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY already performing a test"); + err = -EBUSY; + goto out; + } + + if (phydev->state < PHY_UP || + phydev->state > PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY not configured. Try setting interface up"); + err = -EBUSY; + goto out; + } + + err = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_TDR_NTF); + if (err) + goto out; + + /* Mark the carrier down until the test is complete */ + phy_link_down(phydev); + + netif_testing_on(dev); + err = phydev->drv->cable_test_tdr_start(phydev); + if (err) { + netif_testing_off(dev); + phy_link_up(phydev); + goto out_free; + } + + phydev->state = PHY_CABLETEST; + + if (phy_polling_mode(phydev)) + phy_trigger_machine(phydev); + + mutex_unlock(&phydev->lock); + + return 0; + +out_free: + ethnl_cable_test_free(phydev); +out: + mutex_unlock(&phydev->lock); + + return err; +} +EXPORT_SYMBOL(phy_start_cable_test_tdr); + static int phy_config_aneg(struct phy_device *phydev) { if (phydev->drv->config_aneg) diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index e317fc99565e..24817ba252a0 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -17,13 +17,13 @@ enum ethtool_multicast_groups { struct phy_device; #if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) -int ethnl_cable_test_alloc(struct phy_device *phydev); +int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd); void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); #else -static inline int ethnl_cable_test_alloc(struct phy_device *phydev) +static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { return -EOPNOTSUPP; } diff --git a/include/linux/phy.h b/include/linux/phy.h index 6d256e720a66..d3c384f353ca 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -699,6 +699,10 @@ struct phy_driver { /* Start a cable test */ int (*cable_test_start)(struct phy_device *dev); + + /* Start a raw TDR cable test */ + int (*cable_test_tdr_start)(struct phy_device *dev); + /* Once per second, or on interrupt, request the status of the * test. */ @@ -1251,6 +1255,8 @@ int phy_reset_after_clk_enable(struct phy_device *phydev); #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack); #else static inline int phy_start_cable_test(struct phy_device *phydev, @@ -1259,6 +1265,13 @@ int phy_start_cable_test(struct phy_device *phydev, NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } +static inline +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); + return -EOPNOTSUPP; +} #endif int phy_cable_test_result(struct phy_device *phydev, u8 pair, u16 result); diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 5ba06eabe8c2..94e9d5f04353 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -13,7 +13,7 @@ cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, }; -static int ethnl_cable_test_started(struct phy_device *phydev) +static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd) { struct sk_buff *skb; int err = -ENOMEM; @@ -23,7 +23,7 @@ static int ethnl_cable_test_started(struct phy_device *phydev) if (!skb) goto out; - ehdr = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_CABLE_TEST_NTF); + ehdr = ethnl_bcastmsg_put(skb, cmd); if (!ehdr) { err = -EMSGSIZE; goto out; @@ -86,7 +86,8 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) ethnl_ops_complete(dev); if (!ret) - ethnl_cable_test_started(dev->phydev); + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_NTF); out_rtnl: rtnl_unlock(); @@ -95,7 +96,7 @@ out_dev_put: return ret; } -int ethnl_cable_test_alloc(struct phy_device *phydev) +int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { int err = -ENOMEM; @@ -103,8 +104,7 @@ int ethnl_cable_test_alloc(struct phy_device *phydev) if (!phydev->skb) goto out; - phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, - ETHTOOL_MSG_CABLE_TEST_NTF); + phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, cmd); if (!phydev->ehdr) { err = -EMSGSIZE; goto out; @@ -199,3 +199,55 @@ err: return ret; } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); + +static const struct nla_policy +cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, +}; + +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CABLE_TEST_TDR_MAX, + cable_test_tdr_act_policy, info->extack); + if (ret < 0) + return ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_dev_put; + } + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = phy_start_cable_test_tdr(dev->phydev, info->extack); + + ethnl_ops_complete(dev); + + if (!ret) + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF); + +out_rtnl: + rtnl_unlock(); +out_dev_put: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0f2f4754dcf9..88fd07f47040 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -844,6 +844,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test, }, + { + .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_act_cable_test_tdr, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index b0eb5d920099..9a96b6e90dc2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -360,5 +360,6 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3-59-g8ed1b From 6b4a0fc106521e480c00b55a7ef38c89f02dc4e8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:39 +0200 Subject: net: ethtool: Add helpers for cable test TDR data Add helpers for returning raw TDR helpers in netlink messages. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/ethtool_netlink.h | 21 +++++++++++ net/ethtool/cabletest.c | 80 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 24817ba252a0..8fbe4f97ffad 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -22,6 +22,10 @@ void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); +int ethnl_cable_test_amplitude(struct phy_device *phydev, u8 pair, s16 mV); +int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV); +int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last, + u32 step); #else static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { @@ -46,5 +50,22 @@ static inline int ethnl_cable_test_fault_length(struct phy_device *phydev, { return -EOPNOTSUPP; } + +static inline int ethnl_cable_test_amplitude(struct phy_device *phydev, + u8 pair, s16 mV) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_step(struct phy_device *phydev, u32 first, + u32 last, u32 step) +{ + return -EOPNOTSUPP; +} #endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 94e9d5f04353..390d0673ff01 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -100,7 +100,10 @@ int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { int err = -ENOMEM; - phydev->skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + /* One TDR sample occupies 20 bytes. For a 150 meter cable, + * with four pairs, around 12K is needed. + */ + phydev->skb = genlmsg_new(SZ_16K, GFP_KERNEL); if (!phydev->skb) goto out; @@ -251,3 +254,78 @@ out_dev_put: dev_put(dev); return ret; } + +int ethnl_cable_test_amplitude(struct phy_device *phydev, + u8 pair, s16 mV) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_PAIR, pair)) + goto err; + if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_mV, mV)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_amplitude); + +int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_PULSE); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_PULSE_mV, mV)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_pulse); + +int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last, + u32 step) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_STEP); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE, + first)) + goto err; + + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, last)) + goto err; + + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, step)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_step); -- cgit v1.2.3-59-g8ed1b From 0c9bcc1d2394acef2c6e89e652d984cc845c7bea Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:40 +0200 Subject: net: phy: marvell: Add support for amplitude graph The Marvell PHYs can measure the amplitude of the returned signal for a given distance. Implement this option of the cable test infrastructure. When reporting the step, convert the distance into cm. Signed-off-by: Andrew Lunn v2: Step based on the measurement resolution, and convert this to cm. Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 232 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 231 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 4bc7febf9248..e597bee2e966 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -42,6 +42,7 @@ #define MII_MARVELL_FIBER_PAGE 0x01 #define MII_MARVELL_MSCR_PAGE 0x02 #define MII_MARVELL_LED_PAGE 0x03 +#define MII_MARVELL_VCT5_PAGE 0x05 #define MII_MARVELL_MISC_TEST_PAGE 0x06 #define MII_MARVELL_VCT7_PAGE 0x07 #define MII_MARVELL_WOL_PAGE 0x11 @@ -164,6 +165,54 @@ #define MII_88E1510_GEN_CTRL_REG_1_MODE_SGMII 0x1 /* SGMII to copper */ #define MII_88E1510_GEN_CTRL_REG_1_RESET 0x8000 /* Soft reset */ +#define MII_VCT5_TX_RX_MDI0_COUPLING 0x10 +#define MII_VCT5_TX_RX_MDI1_COUPLING 0x11 +#define MII_VCT5_TX_RX_MDI2_COUPLING 0x12 +#define MII_VCT5_TX_RX_MDI3_COUPLING 0x13 +#define MII_VCT5_TX_RX_AMPLITUDE_MASK 0x7f00 +#define MII_VCT5_TX_RX_AMPLITUDE_SHIFT 8 +#define MII_VCT5_TX_RX_COUPLING_POSITIVE_REFLECTION BIT(15) + +#define MII_VCT5_CTRL 0x17 +#define MII_VCT5_CTRL_ENABLE BIT(15) +#define MII_VCT5_CTRL_COMPLETE BIT(14) +#define MII_VCT5_CTRL_TX_SAME_CHANNEL (0x0 << 11) +#define MII_VCT5_CTRL_TX0_CHANNEL (0x4 << 11) +#define MII_VCT5_CTRL_TX1_CHANNEL (0x5 << 11) +#define MII_VCT5_CTRL_TX2_CHANNEL (0x6 << 11) +#define MII_VCT5_CTRL_TX3_CHANNEL (0x7 << 11) +#define MII_VCT5_CTRL_SAMPLES_2 (0x0 << 8) +#define MII_VCT5_CTRL_SAMPLES_4 (0x1 << 8) +#define MII_VCT5_CTRL_SAMPLES_8 (0x2 << 8) +#define MII_VCT5_CTRL_SAMPLES_16 (0x3 << 8) +#define MII_VCT5_CTRL_SAMPLES_32 (0x4 << 8) +#define MII_VCT5_CTRL_SAMPLES_64 (0x5 << 8) +#define MII_VCT5_CTRL_SAMPLES_128 (0x6 << 8) +#define MII_VCT5_CTRL_SAMPLES_DEFAULT (0x6 << 8) +#define MII_VCT5_CTRL_SAMPLES_256 (0x7 << 8) +#define MII_VCT5_CTRL_SAMPLES_SHIFT 8 +#define MII_VCT5_CTRL_MODE_MAXIMUM_PEEK (0x0 << 6) +#define MII_VCT5_CTRL_MODE_FIRST_LAST_PEEK (0x1 << 6) +#define MII_VCT5_CTRL_MODE_OFFSET (0x2 << 6) +#define MII_VCT5_CTRL_SAMPLE_POINT (0x3 << 6) +#define MII_VCT5_CTRL_PEEK_HYST_DEFAULT 3 + +#define MII_VCT5_SAMPLE_POINT_DISTANCE 0x18 +#define MII_VCT5_TX_PULSE_CTRL 0x1c +#define MII_VCT5_TX_PULSE_CTRL_DONT_WAIT_LINK_DOWN BIT(12) +#define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_128nS (0x0 << 10) +#define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_96nS (0x1 << 10) +#define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_64nS (0x2 << 10) +#define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_32nS (0x3 << 10) +#define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_SHIFT 10 +#define MII_VCT5_TX_PULSE_CTRL_PULSE_AMPLITUDE_1000mV (0x0 << 8) +#define MII_VCT5_TX_PULSE_CTRL_PULSE_AMPLITUDE_750mV (0x1 << 8) +#define MII_VCT5_TX_PULSE_CTRL_PULSE_AMPLITUDE_500mV (0x2 << 8) +#define MII_VCT5_TX_PULSE_CTRL_PULSE_AMPLITUDE_250mV (0x3 << 8) +#define MII_VCT5_TX_PULSE_CTRL_PULSE_AMPLITUDE_SHIFT 8 +#define MII_VCT5_TX_PULSE_CTRL_MAX_AMP BIT(7) +#define MII_VCT5_TX_PULSE_CTRL_GT_140m_46_86mV (0x6 << 0) + #define MII_VCT7_PAIR_0_DISTANCE 0x10 #define MII_VCT7_PAIR_1_DISTANCE 0x11 #define MII_VCT7_PAIR_2_DISTANCE 0x12 @@ -220,6 +269,7 @@ struct marvell_priv { u64 stats[ARRAY_SIZE(marvell_hw_stats)]; char *hwmon_name; struct device *hwmon_dev; + bool cable_test_tdr; }; static int marvell_read_page(struct phy_device *phydev) @@ -1690,7 +1740,119 @@ static void marvell_get_stats(struct phy_device *phydev, data[i] = marvell_get_stat(phydev, i); } -static int marvell_vct7_cable_test_start(struct phy_device *phydev) +static int marvell_vct5_wait_complete(struct phy_device *phydev) +{ + int i; + int val; + + for (i = 0; i < 32; i++) { + val = phy_read_paged(phydev, MII_MARVELL_VCT5_PAGE, + MII_VCT5_CTRL); + if (val < 0) + return val; + + if (val & MII_VCT5_CTRL_COMPLETE) + return 0; + + usleep_range(1000, 2000); + } + + phydev_err(phydev, "Timeout while waiting for cable test to finish\n"); + return -ETIMEDOUT; +} + +static int marvell_vct5_amplitude(struct phy_device *phydev, int pair) +{ + int amplitude; + int val; + int reg; + + reg = MII_VCT5_TX_RX_MDI0_COUPLING + pair; + val = phy_read_paged(phydev, MII_MARVELL_VCT5_PAGE, reg); + + if (val < 0) + return 0; + + amplitude = (val & MII_VCT5_TX_RX_AMPLITUDE_MASK) >> + MII_VCT5_TX_RX_AMPLITUDE_SHIFT; + + if (!(val & MII_VCT5_TX_RX_COUPLING_POSITIVE_REFLECTION)) + amplitude = -amplitude; + + return 1000 * amplitude / 128; +} + +static u32 marvell_vct5_distance2cm(int distance) +{ + return distance * 805 / 10; +} + +static int marvell_vct5_amplitude_distance(struct phy_device *phydev, + int distance) +{ + int mV_pair0, mV_pair1, mV_pair2, mV_pair3; + u16 reg; + int err; + + err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, + MII_VCT5_SAMPLE_POINT_DISTANCE, + distance); + if (err) + return err; + + reg = MII_VCT5_CTRL_ENABLE | + MII_VCT5_CTRL_TX_SAME_CHANNEL | + MII_VCT5_CTRL_SAMPLES_DEFAULT | + MII_VCT5_CTRL_SAMPLE_POINT | + MII_VCT5_CTRL_PEEK_HYST_DEFAULT; + err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, + MII_VCT5_CTRL, reg); + if (err) + return err; + + err = marvell_vct5_wait_complete(phydev); + if (err) + return err; + + mV_pair0 = marvell_vct5_amplitude(phydev, 0); + mV_pair1 = marvell_vct5_amplitude(phydev, 1); + mV_pair2 = marvell_vct5_amplitude(phydev, 2); + mV_pair3 = marvell_vct5_amplitude(phydev, 3); + + ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_A, mV_pair0); + ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_B, mV_pair1); + ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_C, mV_pair2); + ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_D, mV_pair3); + + return 0; +} + +static int marvell_vct5_amplitude_graph(struct phy_device *phydev) +{ + int distance; + int err; + u16 reg; + + reg = MII_VCT5_TX_PULSE_CTRL_GT_140m_46_86mV | + MII_VCT5_TX_PULSE_CTRL_DONT_WAIT_LINK_DOWN | + MII_VCT5_TX_PULSE_CTRL_MAX_AMP | + MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_32nS; + + err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, + MII_VCT5_TX_PULSE_CTRL, reg); + if (err) + return err; + + for (distance = 0; distance <= 100; distance++) { + err = marvell_vct5_amplitude_distance(phydev, distance); + if (err) + return err; + } + + return 0; +} + +static int marvell_cable_test_start_common(struct phy_device *phydev) { int bmcr, bmsr, ret; @@ -1719,12 +1881,69 @@ static int marvell_vct7_cable_test_start(struct phy_device *phydev) if (bmsr & BMSR_LSTATUS) msleep(1500); + return 0; +} + +static int marvell_vct7_cable_test_start(struct phy_device *phydev) +{ + struct marvell_priv *priv = phydev->priv; + int ret; + + ret = marvell_cable_test_start_common(phydev); + if (ret) + return ret; + + priv->cable_test_tdr = false; + + /* Reset the VCT5 API control to defaults, otherwise + * VCT7 does not work correctly. + */ + ret = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, + MII_VCT5_CTRL, + MII_VCT5_CTRL_TX_SAME_CHANNEL | + MII_VCT5_CTRL_SAMPLES_DEFAULT | + MII_VCT5_CTRL_MODE_MAXIMUM_PEEK | + MII_VCT5_CTRL_PEEK_HYST_DEFAULT); + if (ret) + return ret; + + ret = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, + MII_VCT5_SAMPLE_POINT_DISTANCE, 0); + if (ret) + return ret; + return phy_write_paged(phydev, MII_MARVELL_VCT7_PAGE, MII_VCT7_CTRL, MII_VCT7_CTRL_RUN_NOW | MII_VCT7_CTRL_CENTIMETERS); } +static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) +{ + struct marvell_priv *priv = phydev->priv; + int ret; + + /* Disable VCT7 */ + ret = phy_write_paged(phydev, MII_MARVELL_VCT7_PAGE, + MII_VCT7_CTRL, 0); + if (ret) + return ret; + + ret = marvell_cable_test_start_common(phydev); + if (ret) + return ret; + + priv->cable_test_tdr = true; + ret = ethnl_cable_test_pulse(phydev, 1000); + if (ret) + return ret; + + return ethnl_cable_test_step(phydev, + marvell_vct5_distance2cm(0), + marvell_vct5_distance2cm(100), + marvell_vct5_distance2cm(1)); +} + static int marvell_vct7_distance_to_length(int distance, bool meter) { if (meter) @@ -1828,8 +2047,15 @@ static int marvell_vct7_cable_test_report(struct phy_device *phydev) static int marvell_vct7_cable_test_get_status(struct phy_device *phydev, bool *finished) { + struct marvell_priv *priv = phydev->priv; int ret; + if (priv->cable_test_tdr) { + ret = marvell_vct5_amplitude_graph(phydev); + *finished = true; + return ret; + } + *finished = false; ret = phy_read_paged(phydev, MII_MARVELL_VCT7_PAGE, @@ -2563,6 +2789,7 @@ static struct phy_driver marvell_drivers[] = { .get_tunable = m88e1011_get_tunable, .set_tunable = m88e1011_set_tunable, .cable_test_start = marvell_vct7_cable_test_start, + .cable_test_tdr_start = marvell_vct5_cable_test_tdr_start, .cable_test_get_status = marvell_vct7_cable_test_get_status, }, { @@ -2588,6 +2815,7 @@ static struct phy_driver marvell_drivers[] = { .get_tunable = m88e1540_get_tunable, .set_tunable = m88e1540_set_tunable, .cable_test_start = marvell_vct7_cable_test_start, + .cable_test_tdr_start = marvell_vct5_cable_test_tdr_start, .cable_test_get_status = marvell_vct7_cable_test_get_status, }, { @@ -2613,6 +2841,7 @@ static struct phy_driver marvell_drivers[] = { .get_tunable = m88e1540_get_tunable, .set_tunable = m88e1540_set_tunable, .cable_test_start = marvell_vct7_cable_test_start, + .cable_test_tdr_start = marvell_vct5_cable_test_tdr_start, .cable_test_get_status = marvell_vct7_cable_test_get_status, }, { @@ -2658,6 +2887,7 @@ static struct phy_driver marvell_drivers[] = { .get_tunable = m88e1540_get_tunable, .set_tunable = m88e1540_set_tunable, .cable_test_start = marvell_vct7_cable_test_start, + .cable_test_tdr_start = marvell_vct5_cable_test_tdr_start, .cable_test_get_status = marvell_vct7_cable_test_get_status, }, }; -- cgit v1.2.3-59-g8ed1b From f2bc8ad31a7f814237bc6301d59296d76505a688 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:41 +0200 Subject: net: ethtool: Allow PHY cable test TDR data to configured Allow the user to configure where on the cable the TDR data should be retrieved, in terms of first and last sample, and the step between samples. Also add the ability to ask for TDR data for just one pair. If this configuration is not provided, it defaults to 1-150m at 1m intervals for all pairs. Signed-off-by: Andrew Lunn v3: Move the TDR configuration into a structure Add a range check on step Use NL_SET_ERR_MSG_ATTR() when appropriate Move TDR configuration into a nest Document attributes in the request Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 22 +++++- drivers/net/phy/marvell.c | 59 ++++++++++----- drivers/net/phy/phy.c | 5 +- include/linux/phy.h | 21 +++++- include/uapi/linux/ethtool_netlink.h | 13 ++++ net/ethtool/cabletest.c | 104 ++++++++++++++++++++++++++- 6 files changed, 197 insertions(+), 27 deletions(-) diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index dae36227d590..d42661b91128 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1023,9 +1023,25 @@ Start a cable test and report raw TDR data Request contents: - ==================================== ====== ========================== - ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` nested request header - ==================================== ====== ========================== + +--------------------------------------------+--------+-----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` | nested | reply header | + +--------------------------------------------+--------+-----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_CFG`` | nested | test configuration | + +-+------------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE `` | u32 | first data distance | + +-+-+----------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE `` | u32 | last data distance | + +-+-+----------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE `` | u32 | distance of each step | + +-+-+----------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR`` | u8 | pair to test | + +-+-+----------------------------------------+--------+-----------------------+ + +The ETHTOOL_A_CABLE_TEST_TDR_CFG is optional, as well as all members +of the nest. All distances are expressed in centimeters. The PHY takes +the distances as a guide, and rounds to the nearest distance it +actually supports. If a pair is passed, only that one pair will be +tested. Otherwise all pairs are tested. Notification contents: diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e597bee2e966..335e51d6f138 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -198,6 +198,7 @@ #define MII_VCT5_CTRL_PEEK_HYST_DEFAULT 3 #define MII_VCT5_SAMPLE_POINT_DISTANCE 0x18 +#define MII_VCT5_SAMPLE_POINT_DISTANCE_MAX 511 #define MII_VCT5_TX_PULSE_CTRL 0x1c #define MII_VCT5_TX_PULSE_CTRL_DONT_WAIT_LINK_DOWN BIT(12) #define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_128nS (0x0 << 10) @@ -270,6 +271,10 @@ struct marvell_priv { char *hwmon_name; struct device *hwmon_dev; bool cable_test_tdr; + u32 first; + u32 last; + u32 step; + s8 pair; }; static int marvell_read_page(struct phy_device *phydev) @@ -1787,12 +1792,18 @@ static u32 marvell_vct5_distance2cm(int distance) return distance * 805 / 10; } +static u32 marvell_vct5_cm2distance(int cm) +{ + return cm * 10 / 805; +} + static int marvell_vct5_amplitude_distance(struct phy_device *phydev, - int distance) + int distance, int pair) { - int mV_pair0, mV_pair1, mV_pair2, mV_pair3; u16 reg; int err; + int mV; + int i; err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, MII_VCT5_SAMPLE_POINT_DISTANCE, @@ -1814,21 +1825,20 @@ static int marvell_vct5_amplitude_distance(struct phy_device *phydev, if (err) return err; - mV_pair0 = marvell_vct5_amplitude(phydev, 0); - mV_pair1 = marvell_vct5_amplitude(phydev, 1); - mV_pair2 = marvell_vct5_amplitude(phydev, 2); - mV_pair3 = marvell_vct5_amplitude(phydev, 3); + for (i = 0; i < 4; i++) { + if (pair != PHY_PAIR_ALL && i != pair) + continue; - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_A, mV_pair0); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_B, mV_pair1); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_C, mV_pair2); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_D, mV_pair3); + mV = marvell_vct5_amplitude(phydev, i); + ethnl_cable_test_amplitude(phydev, i, mV); + } return 0; } static int marvell_vct5_amplitude_graph(struct phy_device *phydev) { + struct marvell_priv *priv = phydev->priv; int distance; int err; u16 reg; @@ -1843,8 +1853,11 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) if (err) return err; - for (distance = 0; distance <= 100; distance++) { - err = marvell_vct5_amplitude_distance(phydev, distance); + for (distance = priv->first; + distance <= priv->last; + distance += priv->step) { + err = marvell_vct5_amplitude_distance(phydev, distance, + priv->pair); if (err) return err; } @@ -1918,11 +1931,24 @@ static int marvell_vct7_cable_test_start(struct phy_device *phydev) MII_VCT7_CTRL_CENTIMETERS); } -static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) +static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev, + const struct phy_tdr_config *cfg) { struct marvell_priv *priv = phydev->priv; int ret; + priv->cable_test_tdr = true; + priv->first = marvell_vct5_cm2distance(cfg->first); + priv->last = marvell_vct5_cm2distance(cfg->last); + priv->step = marvell_vct5_cm2distance(cfg->step); + priv->pair = cfg->pair; + + if (priv->first > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + + if (priv->last > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + /* Disable VCT7 */ ret = phy_write_paged(phydev, MII_MARVELL_VCT7_PAGE, MII_VCT7_CTRL, 0); @@ -1933,15 +1959,14 @@ static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) if (ret) return ret; - priv->cable_test_tdr = true; ret = ethnl_cable_test_pulse(phydev, 1000); if (ret) return ret; return ethnl_cable_test_step(phydev, - marvell_vct5_distance2cm(0), - marvell_vct5_distance2cm(100), - marvell_vct5_distance2cm(1)); + marvell_vct5_distance2cm(priv->first), + marvell_vct5_distance2cm(priv->last), + marvell_vct5_distance2cm(priv->step)); } static int marvell_vct7_distance_to_length(int distance, bool meter) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 495d9ba3d5bf..1de3938628f4 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -553,7 +553,8 @@ out: EXPORT_SYMBOL(phy_start_cable_test); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { struct net_device *dev = phydev->attached_dev; int err = -ENOMEM; @@ -590,7 +591,7 @@ int phy_start_cable_test_tdr(struct phy_device *phydev, phy_link_down(phydev); netif_testing_on(dev); - err = phydev->drv->cable_test_tdr_start(phydev); + err = phydev->drv->cable_test_tdr_start(phydev, config); if (err) { netif_testing_off(dev); phy_link_up(phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index d3c384f353ca..8c05d0fb5c00 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -548,6 +548,18 @@ struct phy_device { #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) +/* A structure containing possible configuration parameters + * for a TDR cable test. The driver does not need to implement + * all the parameters, but should report what is actually used. + */ +struct phy_tdr_config { + u32 first; + u32 last; + u32 step; + s8 pair; +}; +#define PHY_PAIR_ALL -1 + /* struct phy_driver: Driver structure for a particular PHY type * * driver_data: static driver data @@ -701,7 +713,8 @@ struct phy_driver { int (*cable_test_start)(struct phy_device *dev); /* Start a raw TDR cable test */ - int (*cable_test_tdr_start)(struct phy_device *dev); + int (*cable_test_tdr_start)(struct phy_device *dev, + const struct phy_tdr_config *config); /* Once per second, or on interrupt, request the status of the * test. @@ -1256,7 +1269,8 @@ int phy_reset_after_clk_enable(struct phy_device *phydev); int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config); #else static inline int phy_start_cable_test(struct phy_device *phydev, @@ -1267,7 +1281,8 @@ int phy_start_cable_test(struct phy_device *phydev, } static inline int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 739faa7070c6..fc9051f2eeac 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -482,9 +482,22 @@ enum { /* CABLE TEST TDR */ +enum { + ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 +}; + enum { ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_CNT, diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 390d0673ff01..9991688d7d1d 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -5,7 +5,11 @@ #include "netlink.h" #include "common.h" -/* CABLE_TEST_ACT */ +/* 802.3 standard allows 100 meters for BaseT cables. However longer + * cables might work, depending on the quality of the cables and the + * PHY. So allow testing for up to 150 meters. + */ +#define MAX_CABLE_LENGTH_CM (150 * 100) static const struct nla_policy cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { @@ -203,16 +207,107 @@ err: } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); +struct cable_test_tdr_req_info { + struct ethnl_req_info base; +}; + +static const struct nla_policy +cable_test_tdr_act_cfg_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 }, +}; + static const struct nla_policy cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED }, }; +/* CABLE_TEST_TDR_ACT */ +int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, + struct genl_info *info, + struct phy_tdr_config *cfg) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX, nest, + cable_test_tdr_act_cfg_policy, info->extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]) + cfg->first = nla_get_u32( + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]); + else + cfg->first = 100; + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]) + cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]); + else + cfg->last = MAX_CABLE_LENGTH_CM; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]) + cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]); + else + cfg->step = 100; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) { + cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]); + if (cfg->pair > ETHTOOL_A_CABLE_PAIR_D) { + NL_SET_ERR_MSG_ATTR( + info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR], + "invalid pair parameter"); + return -EINVAL; + } + } else { + cfg->pair = PHY_PAIR_ALL; + } + + if (cfg->first > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST], + "invalid first parameter"); + return -EINVAL; + } + + if (cfg->last > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST], + "invalid last parameter"); + return -EINVAL; + } + + if (cfg->first > cfg->last) { + NL_SET_ERR_MSG(info->extack, "invalid first/last parameter"); + return -EINVAL; + } + + if (!cfg->step) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "invalid step parameter"); + return -EINVAL; + } + + if (cfg->step > (cfg->last - cfg->first)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "step parameter too big"); + return -EINVAL; + } + + return 0; +} + int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; struct ethnl_req_info req_info = {}; + struct phy_tdr_config cfg; struct net_device *dev; int ret; @@ -235,12 +330,17 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; } + ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG], + info, &cfg); + if (ret) + goto out_dev_put; + rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = phy_start_cable_test_tdr(dev->phydev, info->extack); + ret = phy_start_cable_test_tdr(dev->phydev, info->extack, &cfg); ethnl_ops_complete(dev); -- cgit v1.2.3-59-g8ed1b From a618e86da91d259374caff065cc557899dc181ce Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:42 +0200 Subject: net : phy: marvell: Speedup TDR data retrieval by only changing page once Getting the TDR data requires a large number of MDIO bus transactions. The number can however be reduced if the page is only changed once. Add the needed locking to allow this, and make use of unlocked read/write methods where needed. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 335e51d6f138..e9deedea5f19 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1751,15 +1751,12 @@ static int marvell_vct5_wait_complete(struct phy_device *phydev) int val; for (i = 0; i < 32; i++) { - val = phy_read_paged(phydev, MII_MARVELL_VCT5_PAGE, - MII_VCT5_CTRL); + val = __phy_read(phydev, MII_VCT5_CTRL); if (val < 0) return val; if (val & MII_VCT5_CTRL_COMPLETE) return 0; - - usleep_range(1000, 2000); } phydev_err(phydev, "Timeout while waiting for cable test to finish\n"); @@ -1773,7 +1770,7 @@ static int marvell_vct5_amplitude(struct phy_device *phydev, int pair) int reg; reg = MII_VCT5_TX_RX_MDI0_COUPLING + pair; - val = phy_read_paged(phydev, MII_MARVELL_VCT5_PAGE, reg); + val = __phy_read(phydev, reg); if (val < 0) return 0; @@ -1805,9 +1802,8 @@ static int marvell_vct5_amplitude_distance(struct phy_device *phydev, int mV; int i; - err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, - MII_VCT5_SAMPLE_POINT_DISTANCE, - distance); + err = __phy_write(phydev, MII_VCT5_SAMPLE_POINT_DISTANCE, + distance); if (err) return err; @@ -1816,8 +1812,7 @@ static int marvell_vct5_amplitude_distance(struct phy_device *phydev, MII_VCT5_CTRL_SAMPLES_DEFAULT | MII_VCT5_CTRL_SAMPLE_POINT | MII_VCT5_CTRL_PEEK_HYST_DEFAULT; - err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, - MII_VCT5_CTRL, reg); + err = __phy_write(phydev, MII_VCT5_CTRL, reg); if (err) return err; @@ -1840,6 +1835,7 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) { struct marvell_priv *priv = phydev->priv; int distance; + int page; int err; u16 reg; @@ -1853,16 +1849,27 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) if (err) return err; + /* Reading the TDR data is very MDIO heavy. We need to optimize + * access to keep the time to a minimum. So lock the bus once, + * and don't release it until complete. We can then avoid having + * to change the page for every access, greatly speeding things + * up. + */ + page = phy_select_page(phydev, MII_MARVELL_VCT5_PAGE); + if (page < 0) + return page; + for (distance = priv->first; distance <= priv->last; distance += priv->step) { err = marvell_vct5_amplitude_distance(phydev, distance, priv->pair); if (err) - return err; + goto restore_page; } - return 0; +restore_page: + return phy_restore_page(phydev, page, err); } static int marvell_cable_test_start_common(struct phy_device *phydev) -- cgit v1.2.3-59-g8ed1b From db8668a1951954156c039b9f8fe2881d428a522c Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:43 +0200 Subject: net: phy: marvell: Configure TDR pulse based on measurement length When performing a TDR measurement for a short distance, the pulse width should be low, to help differentiate between the outgoing pulse and any reflection. For longer distances, the pulse should be wider, to help with attenuation. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e9deedea5f19..2c04e3b2b285 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -214,6 +214,11 @@ #define MII_VCT5_TX_PULSE_CTRL_MAX_AMP BIT(7) #define MII_VCT5_TX_PULSE_CTRL_GT_140m_46_86mV (0x6 << 0) +/* For TDR measurements less than 11 meters, a short pulse should be + * used. + */ +#define TDR_SHORT_CABLE_LENGTH 11 + #define MII_VCT7_PAIR_0_DISTANCE 0x10 #define MII_VCT7_PAIR_1_DISTANCE 0x11 #define MII_VCT7_PAIR_2_DISTANCE 0x12 @@ -1835,14 +1840,19 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) { struct marvell_priv *priv = phydev->priv; int distance; + u16 width; int page; int err; u16 reg; + if (priv->first <= TDR_SHORT_CABLE_LENGTH) + width = MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_32nS; + else + width = MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_128nS; + reg = MII_VCT5_TX_PULSE_CTRL_GT_140m_46_86mV | MII_VCT5_TX_PULSE_CTRL_DONT_WAIT_LINK_DOWN | - MII_VCT5_TX_PULSE_CTRL_MAX_AMP | - MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_32nS; + MII_VCT5_TX_PULSE_CTRL_MAX_AMP | width; err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, MII_VCT5_TX_PULSE_CTRL, reg); @@ -1866,6 +1876,17 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) priv->pair); if (err) goto restore_page; + + if (distance > TDR_SHORT_CABLE_LENGTH && + width == MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_32nS) { + width = MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_128nS; + reg = MII_VCT5_TX_PULSE_CTRL_GT_140m_46_86mV | + MII_VCT5_TX_PULSE_CTRL_DONT_WAIT_LINK_DOWN | + MII_VCT5_TX_PULSE_CTRL_MAX_AMP | width; + err = __phy_write(phydev, MII_VCT5_TX_PULSE_CTRL, reg); + if (err) + goto restore_page; + } } restore_page: -- cgit v1.2.3-59-g8ed1b From dc0f3ed1973f101508957b59e529e03da1349e09 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 27 May 2020 07:08:43 +0200 Subject: net: phy: at803x: add cable diagnostics support for ATH9331 and ATH8032 Add support for Atheros 100Base-T PHYs. The only difference seems to be the ability to test 2 pairs instead of 4 and the lack of 1000Base-T specific register. Only the ATH9331 was tested with this patch. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 822b3acf6be7..97cbe593f0ea 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -920,10 +920,16 @@ static int at803x_cable_test_one_pair(struct phy_device *phydev, int pair) static int at803x_cable_test_get_status(struct phy_device *phydev, bool *finished) { - unsigned long pair_mask = 0xf; + unsigned long pair_mask; int retries = 20; int pair, ret; + if (phydev->phy_id == ATH9331_PHY_ID || + phydev->phy_id == ATH8032_PHY_ID) + pair_mask = 0x3; + else + pair_mask = 0xf; + *finished = false; /* According to the datasheet the CDT can be performed when @@ -958,7 +964,9 @@ static int at803x_cable_test_start(struct phy_device *phydev) */ phy_write(phydev, MII_BMCR, BMCR_ANENABLE); phy_write(phydev, MII_ADVERTISE, ADVERTISE_CSMA); - phy_write(phydev, MII_CTRL1000, 0); + if (phydev->phy_id != ATH9331_PHY_ID && + phydev->phy_id != ATH8032_PHY_ID) + phy_write(phydev, MII_CTRL1000, 0); /* we do all the (time consuming) work later */ return 0; @@ -1030,6 +1038,7 @@ static struct phy_driver at803x_driver[] = { .name = "Qualcomm Atheros AR8032", .probe = at803x_probe, .remove = at803x_remove, + .flags = PHY_POLL_CABLE_TEST, .config_init = at803x_config_init, .link_change_notify = at803x_link_change_notify, .set_wol = at803x_set_wol, @@ -1039,15 +1048,20 @@ static struct phy_driver at803x_driver[] = { /* PHY_BASIC_FEATURES */ .ack_interrupt = at803x_ack_interrupt, .config_intr = at803x_config_intr, + .cable_test_start = at803x_cable_test_start, + .cable_test_get_status = at803x_cable_test_get_status, }, { /* ATHEROS AR9331 */ PHY_ID_MATCH_EXACT(ATH9331_PHY_ID), .name = "Qualcomm Atheros AR9331 built-in PHY", .suspend = at803x_suspend, .resume = at803x_resume, + .flags = PHY_POLL_CABLE_TEST, /* PHY_BASIC_FEATURES */ .ack_interrupt = &at803x_ack_interrupt, .config_intr = &at803x_config_intr, + .cable_test_start = at803x_cable_test_start, + .cable_test_get_status = at803x_cable_test_get_status, } }; module_phy_driver(at803x_driver); -- cgit v1.2.3-59-g8ed1b From 2d5d9b7ff49f52ab3ec66ffdc841471a1353ea1b Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 24 Apr 2020 14:29:01 +0300 Subject: cfg80211: fix mask type in cfg80211_tid_cfg structure TIDs mask type is u64 in wiphy settings and nl80211 processing, see: - wiphy TIDs mask sizes in tid_config_support structure - prepare driver command in parse_tid_conf Use the same type for TIDs mask in cfg80211_tid_cfg. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-2-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8b6d5c5184d1..e71d4f690ef1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -646,7 +646,7 @@ struct cfg80211_chan_def { struct cfg80211_tid_cfg { bool config_override; u8 tids; - u32 mask; + u64 mask; enum nl80211_tid_config noack; u8 retry_long, retry_short; enum nl80211_tid_config ampdu; -- cgit v1.2.3-59-g8ed1b From 60c2ef0ef07f319504763eaaed8cb003af879008 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 24 Apr 2020 14:29:02 +0300 Subject: mac80211: fix variable names in TID config methods Fix all variable names from 'tid' to 'tids' to avoid confusion. Now this is not TID number, but TID mask. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-3-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 6 +++--- net/mac80211/driver-ops.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 548a384b0509..06a2b7640a9d 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3957,7 +3957,7 @@ static int ieee80211_set_tid_config(struct wiphy *wiphy, static int ieee80211_reset_tid_config(struct wiphy *wiphy, struct net_device *dev, - const u8 *peer, u8 tid) + const u8 *peer, u8 tids) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; @@ -3967,7 +3967,7 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy, return -EOPNOTSUPP; if (!peer) - return drv_reset_tid_config(sdata->local, sdata, NULL, tid); + return drv_reset_tid_config(sdata->local, sdata, NULL, tids); mutex_lock(&sdata->local->sta_mtx); sta = sta_info_get_bss(sdata, peer); @@ -3976,7 +3976,7 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy, return -ENOENT; } - ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tid); + ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tids); mutex_unlock(&sdata->local->sta_mtx); return ret; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 3877710e3b48..de69fc9c4f07 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1375,12 +1375,12 @@ static inline int drv_set_tid_config(struct ieee80211_local *local, static inline int drv_reset_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, u8 tid) + struct ieee80211_sta *sta, u8 tids) { int ret; might_sleep(); - ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tid); + ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tids); trace_drv_return_int(local, ret); return ret; -- cgit v1.2.3-59-g8ed1b From 33462e68231bccfe563a87614f4c4dd5d333837c Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 24 Apr 2020 14:29:03 +0300 Subject: cfg80211: add support for TID specific AMSDU configuration This patch adds support to control per TID MSDU aggregation using the NL80211_TID_CONFIG_ATTR_AMSDU_CTRL attribute. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-4-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 10 +++++++--- net/wireless/nl80211.c | 8 ++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e71d4f690ef1..5cacf24cc9f0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -640,8 +640,9 @@ struct cfg80211_chan_def { * @noack: noack configuration value for the TID * @retry_long: retry count value * @retry_short: retry count value - * @ampdu: Enable/Disable aggregation + * @ampdu: Enable/Disable MPDU aggregation * @rtscts: Enable/Disable RTS/CTS + * @amsdu: Enable/Disable MSDU aggregation */ struct cfg80211_tid_cfg { bool config_override; @@ -651,6 +652,7 @@ struct cfg80211_tid_cfg { u8 retry_long, retry_short; enum nl80211_tid_config ampdu; enum nl80211_tid_config rtscts; + enum nl80211_tid_config amsdu; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9679d561f7d0..1ccb0bf657ec 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4844,12 +4844,15 @@ enum nl80211_tid_config { * &NL80211_CMD_SET_TID_CONFIG. Its type is u8, min value is 1 and * the max value is advertised by the driver in this attribute on * output in wiphy capabilities. - * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable aggregation for the TIDs - * specified in %NL80211_TID_CONFIG_ATTR_TIDS. Its type is u8, using - * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable MPDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. * @NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL: Enable/Disable RTS_CTS for the TIDs * specified in %NL80211_TID_CONFIG_ATTR_TIDS. It is u8 type, using * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_AMSDU_CTRL: Enable/Disable MSDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4863,6 +4866,7 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_RETRY_LONG, NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, + NL80211_TID_CONFIG_ATTR_AMSDU_CTRL, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fa66d5b6f557..482a80b78844 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -343,6 +343,8 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] = + NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -14080,6 +14082,12 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]) { + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_AMSDU_CTRL); + tid_conf->amsdu = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3-59-g8ed1b From c03369558c435f7e82f7c06b0173fa73c1ed15c0 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 24 Apr 2020 14:29:04 +0300 Subject: nl80211: simplify peer specific TID configuration Current rule for applying TID configuration for specific peer looks overly complicated. No need to reject new TID configuration when override flag is specified. Another call with the same TID configuration, but without override flag, allows to apply new configuration anyway. Use the same approach as for the 'all peers' case: if override flag is specified, then reset existing TID configuration and immediately apply a new one. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-5-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 10 ++++------ net/wireless/nl80211.c | 5 +---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1ccb0bf657ec..d1b1d9e49887 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4823,12 +4823,10 @@ enum nl80211_tid_config { * (%NL80211_TID_CONFIG_ATTR_TIDS, %NL80211_TID_CONFIG_ATTR_OVERRIDE). * @NL80211_TID_CONFIG_ATTR_PEER_SUPP: same as the previous per-vif one, but * per peer instead. - * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if no peer - * is selected, if set indicates that the new configuration overrides - * all previous peer configurations, otherwise previous peer specific - * configurations should be left untouched. If peer is selected then - * it will reset particular TID configuration of that peer and it will - * not accept other TID config attributes along with peer. + * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if set indicates + * that the new configuration overrides all previous peer + * configurations, otherwise previous peer specific configurations + * should be left untouched. * @NL80211_TID_CONFIG_ATTR_TIDS: a bitmask value of TIDs (bit 0 to 7) * Its type is u16. * @NL80211_TID_CONFIG_ATTR_NOACK: Configure ack policy for the TID. diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 482a80b78844..258c621f651c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14036,10 +14036,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, if (rdev->ops->reset_tid_config) { err = rdev_reset_tid_config(rdev, dev, peer, tid_conf->tids); - /* If peer is there no other configuration will be - * allowed - */ - if (err || peer) + if (err) return err; } else { return -EINVAL; -- cgit v1.2.3-59-g8ed1b From e76fede8bf7c90d92c799d9ceb092dec48346e2c Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:50 -0700 Subject: cfg80211: add KHz variants of frame RX API Drivers may wish to report the RX frequency in units of KHz. Provide cfg80211_rx_mgmt_khz() and wrap it with cfg80211_rx_mgmt() so exisiting drivers which can't report KHz anyway don't need to change. Add a similar wrapper for cfg80211_report_obss_beacon() so the frequency units stay somewhat consistent. This doesn't actually change the nl80211 API yet. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-2-thomas@adapt-ip.com [fix mac80211 calling the non-khz version of obss beacon report, drop trace point name changes] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 ++ include/net/cfg80211.h | 54 ++++++++++++++++++++++++++++++++++++++++++----- net/mac80211/rx.c | 12 ++++++----- net/wireless/mlme.c | 6 +++--- net/wireless/nl80211.c | 12 +++++------ net/wireless/trace.h | 8 +++---- 6 files changed, 71 insertions(+), 23 deletions(-) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a561db435a4b..41d5f000c0d9 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3333,6 +3333,8 @@ static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size) /* convert frequencies */ #define MHZ_TO_KHZ(freq) ((freq) * 1000) #define KHZ_TO_MHZ(freq) ((freq) / 1000) +#define PR_KHZ(f) KHZ_TO_MHZ(f), f % 1000 +#define KHZ_F "%d.%03d" /* convert powers */ #define DBI_TO_MBI(gain) ((gain) * 100) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5cacf24cc9f0..7415f77d99ca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6988,6 +6988,26 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, enum nl80211_connect_failed_reason reason, gfp_t gfp); +/** + * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame + * @wdev: wireless device receiving the frame + * @freq: Frequency on which the frame was received in KHz + * @sig_dbm: signal strength in dBm, or 0 if unknown + * @buf: Management frame (header + body) + * @len: length of the frame data + * @flags: flags, as defined in enum nl80211_rxmgmt_flags + * + * This function is called whenever an Action frame is received for a station + * mode interface, but is not processed in kernel. + * + * Return: %true if a user space application has registered for this frame. + * For action frames, that makes it responsible for rejecting unrecognized + * action frames; %false otherwise, in which case for action frames the + * driver is responsible for rejecting the frame. + */ +bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, + const u8 *buf, size_t len, u32 flags); + /** * cfg80211_rx_mgmt - notification of received, unprocessed management frame * @wdev: wireless device receiving the frame @@ -7005,8 +7025,13 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * action frames; %false otherwise, in which case for action frames the * driver is responsible for rejecting the frame. */ -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags); +static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, + int sig_dbm, const u8 *buf, size_t len, + u32 flags) +{ + return cfg80211_rx_mgmt_khz(wdev, MHZ_TO_KHZ(freq), sig_dbm, buf, len, + flags); +} /** * cfg80211_mgmt_tx_status - notification of TX status for management frame @@ -7204,6 +7229,21 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, u64 cookie, bool acked, s32 ack_signal, bool is_valid_ack_signal, gfp_t gfp); +/** + * cfg80211_report_obss_beacon_khz - report beacon from other APs + * @wiphy: The wiphy that received the beacon + * @frame: the frame + * @len: length of the frame + * @freq: frequency the frame was received on in KHz + * @sig_dbm: signal strength in dBm, or 0 if unknown + * + * Use this function to report to userspace when a beacon was + * received. It is not useful to call this when there is no + * netdev that is in AP/GO mode. + */ +void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, + size_t len, int freq, int sig_dbm); + /** * cfg80211_report_obss_beacon - report beacon from other APs * @wiphy: The wiphy that received the beacon @@ -7216,9 +7256,13 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, * received. It is not useful to call this when there is no * netdev that is in AP/GO mode. */ -void cfg80211_report_obss_beacon(struct wiphy *wiphy, - const u8 *frame, size_t len, - int freq, int sig_dbm); +static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy, + const u8 *frame, size_t len, + int freq, int sig_dbm) +{ + cfg80211_report_obss_beacon_khz(wiphy, frame, len, MHZ_TO_KHZ(freq), + sig_dbm); +} /** * cfg80211_reg_can_beacon - check if beaconing is allowed diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index eaf8931e4627..8e47b0d31051 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3095,9 +3095,10 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - cfg80211_report_obss_beacon(rx->local->hw.wiphy, - rx->skb->data, rx->skb->len, - status->freq, sig); + cfg80211_report_obss_beacon_khz(rx->local->hw.wiphy, + rx->skb->data, rx->skb->len, + ieee80211_rx_status_to_khz(status), + sig); rx->flags |= IEEE80211_RX_BEACON_REPORTED; } @@ -3443,8 +3444,9 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, 0)) { + if (cfg80211_rx_mgmt_khz(&rx->sdata->wdev, + ieee80211_rx_status_to_khz(status), sig, + rx->skb->data, rx->skb->len, 0)) { if (rx->sta) rx->sta->rx_stats.packets++; dev_kfree_skb(rx->skb); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 409497a3527d..189334314cba 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -729,8 +729,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return rdev_mgmt_tx(rdev, wdev, params, cookie); } -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags) +bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, + const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -785,7 +785,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, trace_cfg80211_return_bool(result); return result; } -EXPORT_SYMBOL(cfg80211_rx_mgmt); +EXPORT_SYMBOL(cfg80211_rx_mgmt_khz); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 258c621f651c..f6523f1485a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -16214,7 +16214,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, netdev->ifindex)) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || @@ -16840,9 +16840,8 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, } EXPORT_SYMBOL(cfg80211_probe_status); -void cfg80211_report_obss_beacon(struct wiphy *wiphy, - const u8 *frame, size_t len, - int freq, int sig_dbm) +void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, + size_t len, int freq, int sig_dbm) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; @@ -16865,7 +16864,8 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (freq && - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, + KHZ_TO_MHZ(freq))) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, frame)) @@ -16882,7 +16882,7 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, spin_unlock_bh(&rdev->beacon_registrations_lock); nlmsg_free(msg); } -EXPORT_SYMBOL(cfg80211_report_obss_beacon); +EXPORT_SYMBOL(cfg80211_report_obss_beacon_khz); #ifdef CONFIG_PM static int cfg80211_net_detect_results(struct sk_buff *msg, diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 53c887ea67c7..f2ab44a2a3e4 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2840,8 +2840,8 @@ TRACE_EVENT(cfg80211_rx_mgmt, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WDEV_PR_FMT ", freq: %d, sig dbm: %d", - WDEV_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d", + WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_mgmt_tx_status, @@ -3121,8 +3121,8 @@ TRACE_EVENT(cfg80211_report_obss_beacon, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WIPHY_PR_FMT ", freq: %d, sig_dbm: %d", - WIPHY_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WIPHY_PR_FMT ", freq: "KHZ_F", sig_dbm: %d", + WIPHY_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_tdls_oper_request, -- cgit v1.2.3-59-g8ed1b From 942ba88ba9c87f5e225574f1f0d6548f0105ed73 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:51 -0700 Subject: nl80211: add KHz frequency offset for most wifi commands cfg80211 recently gained the ability to understand a frequency offset component in KHz. Expose this in nl80211 through the new attributes NL80211_ATTR_WIPHY_FREQ_OFFSET, NL80211_FREQUENCY_ATTR_OFFSET, NL80211_ATTR_CENTER_FREQ1_OFFSET, and NL80211_BSS_FREQUENCY_OFFSET. These add support to send and receive a KHz offset component with the following NL80211 commands: - NL80211_CMD_FRAME - NL80211_CMD_GET_SCAN - NL80211_CMD_AUTHENTICATE - NL80211_CMD_ASSOCIATE - NL80211_CMD_CONNECT Along with any other command which takes a chandef, ie: - NL80211_CMD_SET_CHANNEL - NL80211_CMD_SET_WIPHY - NL80211_CMD_START_AP - NL80211_CMD_RADAR_DETECT - NL80211_CMD_NOTIFY_RADAR - NL80211_CMD_CHANNEL_SWITCH - NL80211_JOIN_IBSS - NL80211_CMD_REMAIN_ON_CHANNEL - NL80211_CMD_JOIN_OCB - NL80211_CMD_JOIN_MESH - NL80211_CMD_TDLS_CHANNEL_SWITCH If the driver advertises a band containing channels with frequency offset, it must also verify support for frequency offset channels in its cfg80211 ops, or return an error. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-3-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 50 ++++++++++++++++++---------- net/wireless/nl80211.c | 78 ++++++++++++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 37 deletions(-) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d1b1d9e49887..b1cd132c1d27 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -296,13 +296,14 @@ * to get a list of all present wiphys. * @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME, - * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ (and the - * attributes determining the channel width; this is used for setting - * monitor mode channel), %NL80211_ATTR_WIPHY_RETRY_SHORT, - * %NL80211_ATTR_WIPHY_RETRY_LONG, %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, - * and/or %NL80211_ATTR_WIPHY_RTS_THRESHOLD. - * However, for setting the channel, see %NL80211_CMD_SET_CHANNEL - * instead, the support here is for backward compatibility only. + * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET (and the attributes determining the + * channel width; this is used for setting monitor mode channel), + * %NL80211_ATTR_WIPHY_RETRY_SHORT, %NL80211_ATTR_WIPHY_RETRY_LONG, + * %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, and/or + * %NL80211_ATTR_WIPHY_RTS_THRESHOLD. However, for setting the channel, + * see %NL80211_CMD_SET_CHANNEL instead, the support here is for backward + * compatibility only. * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request * or rename notification. Has attributes %NL80211_ATTR_WIPHY and * %NL80211_ATTR_WIPHY_NAME. @@ -351,7 +352,8 @@ * %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_INACTIVITY_TIMEOUT, * %NL80211_ATTR_ACL_POLICY and %NL80211_ATTR_MAC_ADDRS. * The channel to use can be set on the interface or be given using the - * %NL80211_ATTR_WIPHY_FREQ and the attributes determining channel width. + * %NL80211_ATTR_WIPHY_FREQ and %NL80211_ATTR_WIPHY_FREQ_OFFSET, and the + * attributes determining channel width. * @NL80211_CMD_NEW_BEACON: old alias for %NL80211_CMD_START_AP * @NL80211_CMD_STOP_AP: Stop AP operation on the given interface * @NL80211_CMD_DEL_BEACON: old alias for %NL80211_CMD_STOP_AP @@ -536,11 +538,12 @@ * interface. %NL80211_ATTR_MAC is used to specify PeerSTAAddress (and * BSSID in case of station mode). %NL80211_ATTR_SSID is used to specify * the SSID (mainly for association, but is included in authentication - * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ is used - * to specify the frequence of the channel in MHz. %NL80211_ATTR_AUTH_TYPE - * is used to specify the authentication type. %NL80211_ATTR_IE is used to - * define IEs (VendorSpecificInfo, but also including RSN IE and FT IEs) - * to be added to the frame. + * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ + + * %NL80211_ATTR_WIPHY_FREQ_OFFSET is used to specify the frequence of the + * channel in MHz. %NL80211_ATTR_AUTH_TYPE is used to specify the + * authentication type. %NL80211_ATTR_IE is used to define IEs + * (VendorSpecificInfo, but also including RSN IE and FT IEs) to be added + * to the frame. * When used as an event, this reports reception of an Authentication * frame in station and IBSS modes when the local MLME processed the * frame, i.e., it was for the local STA and was received in correct @@ -595,8 +598,9 @@ * requests to connect to a specified network but without separating * auth and assoc steps. For this, you need to specify the SSID in a * %NL80211_ATTR_SSID attribute, and can optionally specify the association - * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_USE_MFP, - * %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, %NL80211_ATTR_CONTROL_PORT, + * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, + * %NL80211_ATTR_USE_MFP, %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET, %NL80211_ATTR_CONTROL_PORT, * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, * %NL80211_ATTR_CONTROL_PORT_OVER_NL80211, %NL80211_ATTR_MAC_HINT, and @@ -1433,7 +1437,8 @@ enum nl80211_commands { * of &enum nl80211_chan_width, describing the channel width. See the * documentation of the enum for more information. * @NL80211_ATTR_CENTER_FREQ1: Center frequency of the first part of the - * channel, used for anything but 20 MHz bandwidth + * channel, used for anything but 20 MHz bandwidth. In S1G this is the + * operating channel center frequency. * @NL80211_ATTR_CENTER_FREQ2: Center frequency of the second part of the * channel, used only for 80+80 MHz bandwidth * @NL80211_ATTR_WIPHY_CHANNEL_TYPE: included with NL80211_ATTR_WIPHY_FREQ @@ -2480,9 +2485,14 @@ enum nl80211_commands { * entry without having to force a disconnection after the PMK timeout. If * no roaming occurs between the reauth threshold and PMK expiration, * disassociation is still forced. - * * @NL80211_ATTR_RECEIVE_MULTICAST: multicast flag for the * %NL80211_CMD_REGISTER_FRAME command, see the description there. + * @NL80211_ATTR_WIPHY_FREQ_OFFSET: offset of the associated + * %NL80211_ATTR_WIPHY_FREQ in positive KHz. Only valid when supplied with + * an %NL80211_ATTR_WIPHY_FREQ_OFFSET. + * @NL80211_ATTR_CENTER_FREQ1_OFFSET: Center frequency offset in KHz for the + * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. + * * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2960,6 +2970,8 @@ enum nl80211_attrs { NL80211_ATTR_PMK_REAUTH_THRESHOLD, NL80211_ATTR_RECEIVE_MULTICAST, + NL80211_ATTR_WIPHY_FREQ_OFFSET, + NL80211_ATTR_CENTER_FREQ1_OFFSET, /* add attributes here, update the policy in nl80211.c */ @@ -3682,6 +3694,7 @@ enum nl80211_wmm_rule { * (see &enum nl80211_wmm_rule) * @NL80211_FREQUENCY_ATTR_NO_HE: HE operation is not allowed on this channel * in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_OFFSET: frequency offset in KHz * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3712,6 +3725,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_10MHZ, NL80211_FREQUENCY_ATTR_WMM, NL80211_FREQUENCY_ATTR_NO_HE, + NL80211_FREQUENCY_ATTR_OFFSET, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4482,6 +4496,7 @@ enum nl80211_bss_scan_width { * @NL80211_BSS_CHAIN_SIGNAL: per-chain signal strength of last BSS update. * Contains a nested array of signal strength attributes (u8, dBm), * using the nesting index as the antenna number. + * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -4506,6 +4521,7 @@ enum nl80211_bss { NL80211_BSS_PARENT_TSF, NL80211_BSS_PARENT_BSSID, NL80211_BSS_CHAIN_SIGNAL, + NL80211_BSS_FREQUENCY_OFFSET, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f6523f1485a3..87d7efd186d0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -365,6 +365,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 }, + [NL80211_ATTR_CENTER_FREQ1_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1), @@ -638,6 +639,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, + [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), }; /* policy for the key attributes */ @@ -904,6 +906,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, chan->center_freq)) goto nla_put_failure; + if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_OFFSET, chan->freq_offset)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_DISABLED) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DISABLED)) goto nla_put_failure; @@ -1309,13 +1314,11 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) } static struct ieee80211_channel *nl80211_get_valid_chan(struct wiphy *wiphy, - struct nlattr *tb) + u32 freq) { struct ieee80211_channel *chan; - if (tb == NULL) - return NULL; - chan = ieee80211_get_channel(wiphy, nla_get_u32(tb)); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) return NULL; return chan; @@ -2770,13 +2773,17 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, if (!attrs[NL80211_ATTR_WIPHY_FREQ]) return -EINVAL; - control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]); + control_freq = MHZ_TO_KHZ( + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + control_freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); memset(chandef, 0, sizeof(*chandef)); - - chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); + chandef->chan = ieee80211_get_channel_khz(&rdev->wiphy, control_freq); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; - chandef->center_freq1 = control_freq; + chandef->center_freq1 = KHZ_TO_MHZ(control_freq); + chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; /* Primary channel not allowed */ @@ -2824,9 +2831,15 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (attrs[NL80211_ATTR_CENTER_FREQ1]) + if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); + if (attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]) + chandef->freq1_offset = nla_get_u32( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]); + else + chandef->freq1_offset = 0; + } if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); @@ -3259,6 +3272,9 @@ static int nl80211_send_chandef(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chandef->chan->center_freq)) return -ENOBUFS; + if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + chandef->chan->freq_offset)) + return -ENOBUFS; switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: @@ -8873,6 +8889,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) || nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) || + nla_put_u32(msg, NL80211_BSS_FREQUENCY_OFFSET, + res->channel->freq_offset) || nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) || nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO, jiffies_to_msecs(jiffies - intbss->ts))) @@ -9141,6 +9159,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) enum nl80211_auth_type auth_type; struct key_parse key; bool local_state_change; + u32 freq; if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -9197,8 +9216,12 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -9388,6 +9411,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) struct cfg80211_assoc_request req = {}; const u8 *bssid, *ssid; int err, ssid_len = 0; + u32 freq; if (dev->ieee80211_ptr->conn_owner_nlportid && dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) @@ -9407,8 +9431,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -10088,6 +10115,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) struct cfg80211_connect_params connect; struct wiphy *wiphy; struct cfg80211_cached_keys *connkeys = NULL; + u32 freq = 0; int err; memset(&connect, 0, sizeof(connect)); @@ -10158,14 +10186,21 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); - if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - connect.channel = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ]); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) + freq = MHZ_TO_KHZ(nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + if (freq) { + connect.channel = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel) return -EINVAL; } else if (info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]) { - connect.channel_hint = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = MHZ_TO_KHZ(freq); + connect.channel_hint = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel_hint) return -EINVAL; } @@ -16215,6 +16250,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, freq % 1000) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || @@ -16864,8 +16900,10 @@ void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (freq && - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, - KHZ_TO_MHZ(freq))) || + (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, + KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + freq % 1000))) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, frame)) -- cgit v1.2.3-59-g8ed1b From 2032f3b2f943256ff40df23182913dfc7e73ec6a Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:52 -0700 Subject: nl80211: support scan frequencies in KHz If the driver advertises NL80211_EXT_FEATURE_SCAN_FREQ_KHZ userspace can omit NL80211_ATTR_SCAN_FREQUENCIES in favor of an NL80211_ATTR_SCAN_FREQ_KHZ. To get scan results in KHz userspace must also set the NL80211_SCAN_FLAG_FREQ_KHZ. This lets nl80211 remain compatible with older userspaces while not requring and sending redundant (and potentially incorrect) scan frequency sets. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-4-thomas@adapt-ip.com [use just nla_nest_start() (not _noflag) for NL80211_ATTR_SCAN_FREQ_KHZ] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 ++++++++++- net/mac80211/main.c | 2 ++ net/wireless/nl80211.c | 51 +++++++++++++++++++++++++++++++++----------- 3 files changed, 53 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b1cd132c1d27..47d39b6a073d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2492,7 +2492,7 @@ enum nl80211_commands { * an %NL80211_ATTR_WIPHY_FREQ_OFFSET. * @NL80211_ATTR_CENTER_FREQ1_OFFSET: Center frequency offset in KHz for the * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. - * + * @NL80211_ATTR_SCAN_FREQ_KHZ: nested attribute with KHz frequencies * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2972,6 +2972,7 @@ enum nl80211_attrs { NL80211_ATTR_RECEIVE_MULTICAST, NL80211_ATTR_WIPHY_FREQ_OFFSET, NL80211_ATTR_CENTER_FREQ1_OFFSET, + NL80211_ATTR_SCAN_FREQ_KHZ, /* add attributes here, update the policy in nl80211.c */ @@ -5723,6 +5724,11 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS: management frame registrations * are possible for multicast frames and those will be reported properly. * + * @NL80211_EXT_FEATURE_SCAN_FREQ_KHZ: This driver supports receiving and + * reporting scan request with %NL80211_ATTR_SCAN_FREQ_KHZ. In order to + * report %NL80211_ATTR_SCAN_FREQ_KHZ, %NL80211_SCAN_FLAG_FREQ_KHZ must be + * included in the scan request. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5776,6 +5782,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_DEL_IBSS_STA, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5887,6 +5894,9 @@ enum nl80211_timeout_reason { * @NL80211_SCAN_FLAG_MIN_PREQ_CONTENT: minimize probe request content to * only have supported rates and no additional capabilities (unless * added by userspace explicitly.) + * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with + * %NL80211_ATTR_SCAN_FREQ_KHZ. This also means + * %NL80211_ATTR_SCAN_FREQUENCIES will not be included. */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, @@ -5902,6 +5912,7 @@ enum nl80211_scan_flags { NL80211_SCAN_FLAG_HIGH_ACCURACY = 1<<10, NL80211_SCAN_FLAG_RANDOM_SN = 1<<11, NL80211_SCAN_FLAG_MIN_PREQ_CONTENT = 1<<12, + NL80211_SCAN_FLAG_FREQ_KHZ = 1<<13, }; /** diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 06c90d360633..ac74bd780b42 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -596,6 +596,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ); if (!ops->hw_scan) { wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 87d7efd186d0..84bfa147769a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -640,6 +640,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), + [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, }; /* policy for the key attributes */ @@ -7719,6 +7720,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_scan_request *request; + struct nlattr *scan_freqs = NULL; + bool scan_freqs_khz = false; struct nlattr *attr; struct wiphy *wiphy; int err, tmp, n_ssids = 0, n_channels, i; @@ -7737,9 +7740,17 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto unlock; } - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { - n_channels = validate_scan_freqs( - info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); + if (info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]) { + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ)) + return -EOPNOTSUPP; + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]; + scan_freqs_khz = true; + } else if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]; + + if (scan_freqs) { + n_channels = validate_scan_freqs(scan_freqs); if (!n_channels) { err = -EINVAL; goto unlock; @@ -7787,13 +7798,16 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } i = 0; - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + if (scan_freqs) { /* user specified, bail out if channel not found */ - nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) { + nla_for_each_nested(attr, scan_freqs, tmp) { struct ieee80211_channel *chan; + int freq = nla_get_u32(attr); - chan = ieee80211_get_channel(wiphy, nla_get_u32(attr)); + if (!scan_freqs_khz) + freq = MHZ_TO_KHZ(freq); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan) { err = -EINVAL; goto out_free; @@ -15231,14 +15245,27 @@ static int nl80211_add_scan_req(struct sk_buff *msg, } nla_nest_end(msg, nest); - nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_FREQUENCIES); - if (!nest) - goto nla_put_failure; - for (i = 0; i < req->n_channels; i++) { - if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + if (req->flags & NL80211_SCAN_FLAG_FREQ_KHZ) { + nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQ_KHZ); + if (!nest) + goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, + ieee80211_channel_to_khz(req->channels[i]))) + goto nla_put_failure; + } + nla_nest_end(msg, nest); + } else { + nest = nla_nest_start_noflag(msg, + NL80211_ATTR_SCAN_FREQUENCIES); + if (!nest) goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + goto nla_put_failure; + } + nla_nest_end(msg, nest); } - nla_nest_end(msg, nest); if (req->ie && nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie)) -- cgit v1.2.3-59-g8ed1b From d6fb67ff86bb991d5ac18471e5f739bc32e5090e Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:53 -0700 Subject: ieee80211: S1G defines These are found in IEEE-802.11ah-2016. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-5-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 221 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 41d5f000c0d9..f630b8978a43 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -105,6 +105,51 @@ /* extension, added by 802.11ad */ #define IEEE80211_STYPE_DMG_BEACON 0x0000 +#define IEEE80211_STYPE_S1G_BEACON 0x0010 + +/* bits unique to S1G beacon */ +#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 + +/* see 802.11ah-2016 9.9 NDP CMAC frames */ +#define IEEE80211_S1G_1MHZ_NDP_BITS 25 +#define IEEE80211_S1G_1MHZ_NDP_BYTES 4 +#define IEEE80211_S1G_2MHZ_NDP_BITS 37 +#define IEEE80211_S1G_2MHZ_NDP_BYTES 5 + +#define IEEE80211_NDP_FTYPE_CTS 0 +#define IEEE80211_NDP_FTYPE_CF_END 0 +#define IEEE80211_NDP_FTYPE_PS_POLL 1 +#define IEEE80211_NDP_FTYPE_ACK 2 +#define IEEE80211_NDP_FTYPE_PS_POLL_ACK 3 +#define IEEE80211_NDP_FTYPE_BA 4 +#define IEEE80211_NDP_FTYPE_BF_REPORT_POLL 5 +#define IEEE80211_NDP_FTYPE_PAGING 6 +#define IEEE80211_NDP_FTYPE_PREQ 7 + +#define SM64(f, v) ((((u64)v) << f##_S) & f) + +/* NDP CMAC frame fields */ +#define IEEE80211_NDP_FTYPE 0x0000000000000007 +#define IEEE80211_NDP_FTYPE_S 0x0000000000000000 + +/* 1M Probe Request 11ah 9.9.3.1.1 */ +#define IEEE80211_NDP_1M_PREQ_ANO 0x0000000000000008 +#define IEEE80211_NDP_1M_PREQ_ANO_S 3 +#define IEEE80211_NDP_1M_PREQ_CSSID 0x00000000000FFFF0 +#define IEEE80211_NDP_1M_PREQ_CSSID_S 4 +#define IEEE80211_NDP_1M_PREQ_RTYPE 0x0000000000100000 +#define IEEE80211_NDP_1M_PREQ_RTYPE_S 20 +#define IEEE80211_NDP_1M_PREQ_RSV 0x0000000001E00000 +#define IEEE80211_NDP_1M_PREQ_RSV 0x0000000001E00000 +/* 2M Probe Request 11ah 9.9.3.1.2 */ +#define IEEE80211_NDP_2M_PREQ_ANO 0x0000000000000008 +#define IEEE80211_NDP_2M_PREQ_ANO_S 3 +#define IEEE80211_NDP_2M_PREQ_CSSID 0x0000000FFFFFFFF0 +#define IEEE80211_NDP_2M_PREQ_CSSID_S 4 +#define IEEE80211_NDP_2M_PREQ_RTYPE 0x0000001000000000 +#define IEEE80211_NDP_2M_PREQ_RTYPE_S 36 + +#define IEEE80211_ANO_NETTYPE_WILD 15 /* control extension - for IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTL_EXT */ #define IEEE80211_CTL_EXT_POLL 0x2000 @@ -121,6 +166,21 @@ #define IEEE80211_MAX_SN IEEE80211_SN_MASK #define IEEE80211_SN_MODULO (IEEE80211_MAX_SN + 1) + +/* PV1 Layout 11ah 9.8.3.1 */ +#define IEEE80211_PV1_FCTL_VERS 0x0003 +#define IEEE80211_PV1_FCTL_FTYPE 0x001c +#define IEEE80211_PV1_FCTL_STYPE 0x00e0 +#define IEEE80211_PV1_FCTL_TODS 0x0100 +#define IEEE80211_PV1_FCTL_MOREFRAGS 0x0200 +#define IEEE80211_PV1_FCTL_PM 0x0400 +#define IEEE80211_PV1_FCTL_MOREDATA 0x0800 +#define IEEE80211_PV1_FCTL_PROTECTED 0x1000 +#define IEEE80211_PV1_FCTL_END_SP 0x2000 +#define IEEE80211_PV1_FCTL_RELAYED 0x4000 +#define IEEE80211_PV1_FCTL_ACK_POLICY 0x8000 +#define IEEE80211_PV1_FCTL_CTL_EXT 0x0f00 + static inline bool ieee80211_sn_less(u16 sn1, u16 sn2) { return ((sn1 - sn2) & IEEE80211_SN_MASK) > (IEEE80211_SN_MODULO >> 1); @@ -148,6 +208,7 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_FRAG_THRESHOLD 2352 #define IEEE80211_MAX_RTS_THRESHOLD 2353 #define IEEE80211_MAX_AID 2007 +#define IEEE80211_MAX_AID_S1G 8191 #define IEEE80211_MAX_TIM_LEN 251 #define IEEE80211_MAX_MESH_PEERINGS 63 /* Maximum size for the MA-UNITDATA primitive, 802.11 standard section @@ -371,6 +432,17 @@ static inline bool ieee80211_is_data(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_DATA); } +/** + * ieee80211_is_ext - check if type is IEEE80211_FTYPE_EXT + * @fc: frame control bytes in little-endian byteorder + */ +static inline bool ieee80211_is_ext(__le16 fc) +{ + return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE)) == + cpu_to_le16(IEEE80211_FTYPE_EXT); +} + + /** * ieee80211_is_data_qos - check if type is IEEE80211_FTYPE_DATA and IEEE80211_STYPE_QOS_DATA is set * @fc: frame control bytes in little-endian byteorder @@ -469,6 +541,18 @@ static inline bool ieee80211_is_beacon(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); } +/** + * ieee80211_is_s1g_beacon - check if IEEE80211_FTYPE_EXT && + * IEEE80211_STYPE_S1G_BEACON + * @fc: frame control bytes in little-endian byteorder + */ +static inline bool ieee80211_is_s1g_beacon(__le16 fc) +{ + return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | + IEEE80211_FCTL_STYPE)) == + cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); +} + /** * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM * @fc: frame control bytes in little-endian byteorder @@ -900,6 +984,59 @@ struct ieee80211_addba_ext_ie { u8 data; } __packed; +/** + * struct ieee80211_s1g_bcn_compat_ie + * + * S1G Beacon Compatibility element + */ +struct ieee80211_s1g_bcn_compat_ie { + __le16 compat_info; + __le16 beacon_int; + __le32 tsf_completion; +} __packed; + +/** + * struct ieee80211_s1g_oper_ie + * + * S1G Operation element + */ +struct ieee80211_s1g_oper_ie { + u8 ch_width; + u8 oper_class; + u8 primary_ch; + u8 oper_ch; + __le16 basic_mcs_nss; +} __packed; + +/** + * struct ieee80211_aid_response_ie + * + * AID Response element + */ +struct ieee80211_aid_response_ie { + __le16 aid; + u8 switch_count; + __le16 response_int; +} __packed; + +struct ieee80211_s1g_cap { + u8 capab_info[10]; + u8 supp_mcs_nss[5]; +} __packed; + +struct ieee80211_ext { + __le16 frame_control; + __le16 duration; + union { + struct { + u8 sa[ETH_ALEN]; + __le32 timestamp; + u8 change_seq; + u8 variable[0]; + } __packed s1g_beacon; + } u; +} __packed __aligned(2); + struct ieee80211_mgmt { __le16 frame_control; __le16 duration; @@ -2137,6 +2274,86 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) return spr_len; } +/* S1G Capabilities Information field */ +#define S1G_CAPAB_B0_S1G_LONG BIT(0) +#define S1G_CAPAB_B0_SGI_1MHZ BIT(1) +#define S1G_CAPAB_B0_SGI_2MHZ BIT(2) +#define S1G_CAPAB_B0_SGI_4MHZ BIT(3) +#define S1G_CAPAB_B0_SGI_8MHZ BIT(4) +#define S1G_CAPAB_B0_SGI_16MHZ BIT(5) +#define S1G_CAPAB_B0_SUPP_CH_WIDTH_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B0_SUPP_CH_WIDTH_SHIFT 6 + +#define S1G_CAPAB_B1_RX_LDPC BIT(0) +#define S1G_CAPAB_B1_TX_STBC BIT(1) +#define S1G_CAPAB_B1_RX_STBC BIT(2) +#define S1G_CAPAB_B1_SU_BFER BIT(3) +#define S1G_CAPAB_B1_SU_BFEE BIT(4) +#define S1G_CAPAB_B1_BFEE_STS_MASK (BIT(5) | BIT(6) | BIT(7)) +#define S1G_CAPAB_B1_BFEE_STS_SHIFT 5 + +#define S1G_CAPAB_B2_SOUNDING_DIMENSIONS_MASK (BIT(0) | BIT(1) | BIT(2)) +#define S1G_CAPAB_B2_SOUNDING_DIMENSIONS_SHIFT 0 +#define S1G_CAPAB_B2_MU_BFER BIT(3) +#define S1G_CAPAB_B2_MU_BFEE BIT(4) +#define S1G_CAPAB_B2_PLUS_HTC_VHT BIT(5) +#define S1G_CAPAB_B2_TRAVELING_PILOT_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B2_TRAVELING_PILOT_SHIFT 6 + +#define S1G_CAPAB_B3_RD_RESPONDER BIT(0) +#define S1G_CAPAB_B3_HT_DELAYED_BA BIT(1) +#define S1G_CAPAB_B3_MAX_MPDU_LEN BIT(2) +#define S1G_CAPAB_B3_MAX_AMPDU_LEN_EXP_MASK (BIT(3) | BIT(4)) +#define S1G_CAPAB_B3_MAX_AMPDU_LEN_EXP_SHIFT 3 +#define S1G_CAPAB_B3_MIN_MPDU_START_MASK (BIT(5) | BIT(6) | BIT(7)) +#define S1G_CAPAB_B3_MIN_MPDU_START_SHIFT 5 + +#define S1G_CAPAB_B4_UPLINK_SYNC BIT(0) +#define S1G_CAPAB_B4_DYNAMIC_AID BIT(1) +#define S1G_CAPAB_B4_BAT BIT(2) +#define S1G_CAPAB_B4_TIME_ADE BIT(3) +#define S1G_CAPAB_B4_NON_TIM BIT(4) +#define S1G_CAPAB_B4_GROUP_AID BIT(5) +#define S1G_CAPAB_B4_STA_TYPE_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B4_STA_TYPE_SHIFT 6 + +#define S1G_CAPAB_B5_CENT_AUTH_CONTROL BIT(0) +#define S1G_CAPAB_B5_DIST_AUTH_CONTROL BIT(1) +#define S1G_CAPAB_B5_AMSDU BIT(2) +#define S1G_CAPAB_B5_AMPDU BIT(3) +#define S1G_CAPAB_B5_ASYMMETRIC_BA BIT(4) +#define S1G_CAPAB_B5_FLOW_CONTROL BIT(5) +#define S1G_CAPAB_B5_SECTORIZED_BEAM_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B5_SECTORIZED_BEAM_SHIFT 6 + +#define S1G_CAPAB_B6_OBSS_MITIGATION BIT(0) +#define S1G_CAPAB_B6_FRAGMENT_BA BIT(1) +#define S1G_CAPAB_B6_NDP_PS_POLL BIT(2) +#define S1G_CAPAB_B6_RAW_OPERATION BIT(3) +#define S1G_CAPAB_B6_PAGE_SLICING BIT(4) +#define S1G_CAPAB_B6_TXOP_SHARING_IMP_ACK BIT(5) +#define S1G_CAPAB_B6_VHT_LINK_ADAPT_MASK (BIT(6) | BIT(7)) +#define S1G_CAPAB_B6_VHT_LINK_ADAPT_SHIFT 6 + +#define S1G_CAPAB_B7_TACK_AS_PS_POLL BIT(0) +#define S1G_CAPAB_B7_DUP_1MHZ BIT(1) +#define S1G_CAPAB_B7_MCS_NEGOTIATION BIT(2) +#define S1G_CAPAB_B7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) +#define S1G_CAPAB_B7_NDP_BFING_REPORT_POLL BIT(4) +#define S1G_CAPAB_B7_UNSOLICITED_DYN_AID BIT(5) +#define S1G_CAPAB_B7_SECTOR_TRAINING_OPERATION BIT(6) +#define S1G_CAPAB_B7_TEMP_PS_MODE_SWITCH BIT(7) + +#define S1G_CAPAB_B8_TWT_GROUPING BIT(0) +#define S1G_CAPAB_B8_BDT BIT(1) +#define S1G_CAPAB_B8_COLOR_MASK (BIT(2) | BIT(3) | BIT(4)) +#define S1G_CAPAB_B8_COLOR_SHIFT 2 +#define S1G_CAPAB_B8_TWT_REQUEST BIT(5) +#define S1G_CAPAB_B8_TWT_RESPOND BIT(6) +#define S1G_CAPAB_B8_PV1_FRAME BIT(7) + +#define S1G_CAPAB_B9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -2532,8 +2749,12 @@ enum ieee80211_eid { WLAN_EID_QUIET_CHANNEL = 198, WLAN_EID_OPMODE_NOTIF = 199, + WLAN_EID_S1G_BCN_COMPAT = 213, + WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214, + WLAN_EID_S1G_CAPABILITIES = 217, WLAN_EID_VENDOR_SPECIFIC = 221, WLAN_EID_QOS_PARAMETER = 222, + WLAN_EID_S1G_OPERATION = 232, WLAN_EID_CAG_NUMBER = 237, WLAN_EID_AP_CSN = 239, WLAN_EID_FILS_INDICATION = 240, -- cgit v1.2.3-59-g8ed1b From fedd0fe4e89b009f31eb53ec36dbdf1e457616c0 Mon Sep 17 00:00:00 2001 From: Tamizh Chelvam Date: Mon, 4 May 2020 22:34:59 +0530 Subject: mac80211: Add new AMPDU factor macro for HE peer caps Add IEEE80211_HE_VHT_MAX_AMPDU_FACTOR and IEEE80211_HE_HT_MAX_AMPDU_FACTOR as per spec to use for peer max ampdu factor. Signed-off-by: Tamizh Chelvam Link: https://lore.kernel.org/r/1588611900-21185-1-git-send-email-tamizhr@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f630b8978a43..2153d465d752 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1958,6 +1958,8 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 #define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_SHIFT 3 + #define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 #define IEEE80211_HE_MAC_CAP4_QTP 0x02 #define IEEE80211_HE_MAC_CAP4_BQR 0x04 @@ -1979,6 +1981,9 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING 0x40 #define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX 0x80 +#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR 20 +#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR 16 + /* 802.11ax HE PHY capabilities */ #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 -- cgit v1.2.3-59-g8ed1b From 396fba0a59f3c94d6fd6443fbeabd8bd9e3956eb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:39:09 -0500 Subject: cfg80211: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507183909.GA12993@embeddedor Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 6 +++--- include/net/cfg80211.h | 8 ++++---- net/wireless/core.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2153d465d752..0320ca4c7d28 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -800,7 +800,7 @@ struct ieee80211_msrment_ie { u8 token; u8 mode; u8 type; - u8 request[0]; + u8 request[]; } __packed; /** @@ -1781,7 +1781,7 @@ struct ieee80211_he_operation { __le32 he_oper_params; __le16 he_mcs_nss_set; /* Optional 0,1,3,4,5,7 or 8 bytes: depends on @he_oper_params */ - u8 optional[0]; + u8 optional[]; } __packed; /** @@ -1793,7 +1793,7 @@ struct ieee80211_he_operation { struct ieee80211_he_spr { u8 he_sr_control; /* Optional 0 to 19 bytes: depends on @he_sr_control */ - u8 optional[0]; + u8 optional[]; } __packed; /** diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7415f77d99ca..021366cfb2b0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2037,7 +2037,7 @@ struct cfg80211_scan_request { bool no_cck; /* keep last */ - struct ieee80211_channel *channels[0]; + struct ieee80211_channel *channels[]; }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) @@ -2183,7 +2183,7 @@ struct cfg80211_sched_scan_request { struct list_head list; /* keep last */ - struct ieee80211_channel *channels[0]; + struct ieee80211_channel *channels[]; }; /** @@ -2305,7 +2305,7 @@ struct cfg80211_bss { u8 bssid_index; u8 max_bssid_indicator; - u8 priv[0] __aligned(sizeof(void *)); + u8 priv[] __aligned(sizeof(void *)); }; /** @@ -4852,7 +4852,7 @@ struct wiphy { u8 max_data_retry_count; - char priv[0] __aligned(NETDEV_ALIGN); + char priv[] __aligned(NETDEV_ALIGN); }; static inline struct net *wiphy_net(struct wiphy *wiphy) diff --git a/net/wireless/core.h b/net/wireless/core.h index 639d41896573..e0e5b3ee9699 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -286,7 +286,7 @@ struct cfg80211_cqm_config { u32 rssi_hyst; s32 last_rssi_event_value; int n_rssi_thresholds; - s32 rssi_thresholds[0]; + s32 rssi_thresholds[]; }; void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); -- cgit v1.2.3-59-g8ed1b From 3c23215ba8c70c0e9b16beffb7f700a401391e38 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:59:07 -0500 Subject: mac80211: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507185907.GA15102@embeddedor Signed-off-by: Johannes Berg --- include/net/mac80211.h | 10 +++++----- net/mac80211/ieee80211_i.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 0d48e679efb0..7cb712427df1 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -230,7 +230,7 @@ struct ieee80211_chanctx_conf { bool radar_enabled; - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; /** @@ -1670,7 +1670,7 @@ struct ieee80211_vif { bool txqs_stopped[IEEE80211_NUM_ACS]; /* must be last */ - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) @@ -1798,7 +1798,7 @@ struct ieee80211_key_conf { s8 keyidx; u16 flags; u8 keylen; - u8 key[0]; + u8 key[]; }; #define IEEE80211_MAX_PN_LEN 16 @@ -2053,7 +2053,7 @@ struct ieee80211_sta { struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; /* must be last */ - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; /** @@ -2099,7 +2099,7 @@ struct ieee80211_txq { u8 ac; /* must be last */ - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; /** diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8cbae66b5cdb..2d1b6cb75497 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -267,7 +267,7 @@ struct probe_resp { struct rcu_head rcu_head; int len; u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM]; - u8 data[0]; + u8 data[]; }; struct ps_data { -- cgit v1.2.3-59-g8ed1b From dca9ca2d588bd2c0989c671f048540b82e57cf1e Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Fri, 8 May 2020 16:42:00 +0200 Subject: nl80211: add ability to report TX status for control port TX This adds the necessary capabilities in nl80211 to allow drivers to assign a cookie to control port TX frames (returned via extack in the netlink ACK message of the command) and then later report the frame's status. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200508144202.7678-2-markus.theil@tu-ilmenau.de [use extack cookie instead of explicit message, recombine patches] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 20 +++++++++++++++++++- include/uapi/linux/nl80211.h | 12 ++++++++++++ net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/tx.c | 3 ++- net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++-------- net/wireless/rdev-ops.h | 9 ++++++--- net/wireless/trace.h | 17 +++++++++++++++++ 7 files changed, 91 insertions(+), 14 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 021366cfb2b0..f842f3652026 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4069,7 +4069,8 @@ struct cfg80211_ops { struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, const __be16 proto, - const bool noencrypt); + const bool noencrypt, + u64 *cookie); int (*get_ftm_responder_stats)(struct wiphy *wiphy, struct net_device *dev, @@ -7049,6 +7050,23 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, const u8 *buf, size_t len, bool ack, gfp_t gfp); +/** + * cfg80211_control_port_tx_status - notification of TX status for control + * port frames + * @wdev: wireless device receiving the frame + * @cookie: Cookie returned by cfg80211_ops::tx_control_port() + * @buf: Data frame (header + body) + * @len: length of the frame data + * @ack: Whether frame was acknowledged + * @gfp: context flags + * + * This function is called whenever a control port frame was requested to be + * transmitted with cfg80211_ops::tx_control_port() to report the TX status of + * the transmission attempt. + */ +void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp); /** * cfg80211_rx_control_port - notification about a received control port frame diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 47d39b6a073d..0f324b6b81cc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1164,6 +1164,12 @@ * dropped because it did not include a valid MME MIC while beacon * protection was enabled (BIGTK configured in station mode). * + * @NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS: Report TX status of a control + * port frame transmitted with %NL80211_CMD_CONTROL_PORT_FRAME. + * %NL80211_ATTR_COOKIE identifies the TX command and %NL80211_ATTR_FRAME + * includes the contents of the frame. %NL80211_ATTR_ACK flag is included + * if the recipient acknowledged the frame. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1392,6 +1398,8 @@ enum nl80211_commands { NL80211_CMD_UNPROT_BEACON, + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -5729,6 +5737,9 @@ enum nl80211_feature_flags { * report %NL80211_ATTR_SCAN_FREQ_KHZ, %NL80211_SCAN_FLAG_FREQ_KHZ must be * included in the scan request. * + * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS: The driver + * can report tx status for control port over nl80211 tx operations. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5783,6 +5794,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2d1b6cb75497..b87dc873825b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1800,7 +1800,8 @@ void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted); + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 47f460c8bd74..5931128e1855 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5339,7 +5339,8 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted) + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 84bfa147769a..7ea764865546 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13866,6 +13866,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) { + bool dont_wait_for_ack = info->attrs[NL80211_ATTR_DONT_WAIT_FOR_ACK]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -13874,6 +13875,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) u8 *dest; u16 proto; bool noencrypt; + u64 cookie = 0; int err; if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -13918,9 +13920,12 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) noencrypt = nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); - return rdev_tx_control_port(rdev, dev, buf, len, - dest, cpu_to_be16(proto), noencrypt); - + err = rdev_tx_control_port(rdev, dev, buf, len, + dest, cpu_to_be16(proto), noencrypt, + dont_wait_for_ack ? NULL : &cookie); + if (!err && !dont_wait_for_ack) + nl_set_extack_cookie_u64(info->extack, cookie); + return err; out: wdev_unlock(wdev); return err; @@ -16294,8 +16299,9 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, return -ENOBUFS; } -void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp) +static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp, enum nl80211_commands command) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -16303,13 +16309,16 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, struct sk_buff *msg; void *hdr; - trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + if (command == NL80211_CMD_FRAME_TX_STATUS) + trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + else + trace_cfg80211_control_port_tx_status(wdev, cookie, ack); msg = nlmsg_new(100 + len, gfp); if (!msg) return; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS); + hdr = nl80211hdr_put(msg, 0, 0, 0, command); if (!hdr) { nlmsg_free(msg); return; @@ -16332,9 +16341,25 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, NL80211_MCGRP_MLME, gfp); return; - nla_put_failure: +nla_put_failure: nlmsg_free(msg); } + +void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS); +} +EXPORT_SYMBOL(cfg80211_control_port_tx_status); + +void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_FRAME_TX_STATUS); +} EXPORT_SYMBOL(cfg80211_mgmt_tx_status); static int __nl80211_rx_control_port(struct net_device *dev, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index df5142e86c4f..950d57494168 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -748,14 +748,17 @@ static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, - const bool noencrypt) + const bool noencrypt, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, - dest, proto, noencrypt); - trace_rdev_return_int(&rdev->wiphy, ret); + dest, proto, noencrypt, cookie); + if (cookie) + trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); + else + trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index f2ab44a2a3e4..b23cab016521 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2861,6 +2861,23 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) ); +TRACE_EVENT(cfg80211_control_port_tx_status, + TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack), + TP_ARGS(wdev, cookie, ack), + TP_STRUCT__entry( + WDEV_ENTRY + __field(u64, cookie) + __field(bool, ack) + ), + TP_fast_assign( + WDEV_ASSIGN; + __entry->cookie = cookie; + __entry->ack = ack; + ), + TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s", + WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) +); + TRACE_EVENT(cfg80211_rx_control_port, TP_PROTO(struct net_device *netdev, struct sk_buff *skb, bool unencrypted), -- cgit v1.2.3-59-g8ed1b From 1ea02224afc29431880a67b8c3198146cc01d33e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 26 May 2020 10:31:33 +0200 Subject: mac80211: allow SA-QUERY processing in userspace As discussed with Mathy almost two years ago in http://lore.kernel.org/r/20180806224857.14853-1-Mathy.Vanhoef@cs.kuleuven.be we should let userspace process SA-QUERY frames if it wants to, so that it can handle OCV (operating channel validation) which mac80211 doesn't know how to. Evidently I had been expecting Mathy to (re)send such a patch, but he never did, perhaps expecting me to do it after our discussion. In any case, this came up now with OCV getting more attention, so move the code around as discussed there to let userspace handle it, and do it properly. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200526103131.1f9cf7e5b6db.Iae5b42b09ad2b1cbcbe13492002c43f0d1d51dfc@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 49 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8e47b0d31051..8b0fa5e345f4 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3354,19 +3354,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } } break; - case WLAN_CATEGORY_SA_QUERY: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.sa_query))) - break; - - switch (mgmt->u.action.u.sa_query.action) { - case WLAN_ACTION_SA_QUERY_REQUEST: - if (sdata->vif.type != NL80211_IFTYPE_STATION) - break; - ieee80211_process_sa_query_req(sdata, mgmt, len); - goto handled; - } - break; case WLAN_CATEGORY_SELF_PROTECTED: if (len < (IEEE80211_MIN_ACTION_SIZE + sizeof(mgmt->u.action.u.self_prot.action_code))) @@ -3456,6 +3443,41 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_action_post_userspace(struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + int len = rx->skb->len; + + if (!ieee80211_is_action(mgmt->frame_control)) + return RX_CONTINUE; + + switch (mgmt->u.action.category) { + case WLAN_CATEGORY_SA_QUERY: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.sa_query))) + break; + + switch (mgmt->u.action.u.sa_query.action) { + case WLAN_ACTION_SA_QUERY_REQUEST: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + ieee80211_process_sa_query_req(sdata, mgmt, len); + goto handled; + } + break; + } + + return RX_CONTINUE; + + handled: + if (rx->sta) + rx->sta->rx_stats.packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; +} + static ieee80211_rx_result debug_noinline ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) { @@ -3736,6 +3758,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, CALL_RXH(ieee80211_rx_h_mgmt_check); CALL_RXH(ieee80211_rx_h_action); CALL_RXH(ieee80211_rx_h_userspace_mgmt); + CALL_RXH(ieee80211_rx_h_action_post_userspace); CALL_RXH(ieee80211_rx_h_action_return); CALL_RXH(ieee80211_rx_h_mgmt); -- cgit v1.2.3-59-g8ed1b From 9a5f6488623730dc16cca0836ade23869761adee Mon Sep 17 00:00:00 2001 From: Tamizh Chelvam Date: Wed, 13 May 2020 13:41:44 +0530 Subject: nl80211: Add support to configure TID specific Tx rate configuration This patch adds support to configure per TID Tx Rate configuration through NL80211_TID_CONFIG_ATTR_TX_RATE* attributes. And it uses nl80211_parse_tx_bitrate_mask api to validate the Tx rate mask. Signed-off-by: Tamizh Chelvam Link: https://lore.kernel.org/r/1589357504-10175-1-git-send-email-tamizhr@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 29 +++++++++++++---------- include/uapi/linux/nl80211.h | 21 +++++++++++++++++ net/wireless/nl80211.c | 56 +++++++++++++++++++++++++++++++++----------- 3 files changed, 80 insertions(+), 26 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f842f3652026..e2dbc9c02ef3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -630,6 +630,19 @@ struct cfg80211_chan_def { u16 freq1_offset; }; +/* + * cfg80211_bitrate_mask - masks for bitrate control + */ +struct cfg80211_bitrate_mask { + struct { + u32 legacy; + u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; + u16 vht_mcs[NL80211_VHT_NSS_MAX]; + enum nl80211_txrate_gi gi; + } control[NUM_NL80211_BANDS]; +}; + + /** * struct cfg80211_tid_cfg - TID specific configuration * @config_override: Flag to notify driver to reset TID configuration @@ -643,6 +656,8 @@ struct cfg80211_chan_def { * @ampdu: Enable/Disable MPDU aggregation * @rtscts: Enable/Disable RTS/CTS * @amsdu: Enable/Disable MSDU aggregation + * @txrate_type: Tx bitrate mask type + * @txrate_mask: Tx bitrate to be applied for the TID */ struct cfg80211_tid_cfg { bool config_override; @@ -653,6 +668,8 @@ struct cfg80211_tid_cfg { enum nl80211_tid_config ampdu; enum nl80211_tid_config rtscts; enum nl80211_tid_config amsdu; + enum nl80211_tx_rate_setting txrate_type; + struct cfg80211_bitrate_mask txrate_mask; }; /** @@ -1007,18 +1024,6 @@ struct cfg80211_acl_data { struct mac_address mac_addrs[]; }; -/* - * cfg80211_bitrate_mask - masks for bitrate control - */ -struct cfg80211_bitrate_mask { - struct { - u32 legacy; - u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; - u16 vht_mcs[NL80211_VHT_NSS_MAX]; - enum nl80211_txrate_gi gi; - } control[NUM_NL80211_BANDS]; -}; - /** * enum cfg80211_ap_settings_flags - AP settings flags * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0f324b6b81cc..c14666b75e57 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4841,6 +4841,17 @@ enum nl80211_tid_config { NL80211_TID_CONFIG_DISABLE, }; +/* enum nl80211_tx_rate_setting - TX rate configuration type + * @NL80211_TX_RATE_AUTOMATIC: automatically determine TX rate + * @NL80211_TX_RATE_LIMITED: limit the TX rate by the TX rate parameter + * @NL80211_TX_RATE_FIXED: fix TX rate to the TX rate parameter + */ +enum nl80211_tx_rate_setting { + NL80211_TX_RATE_AUTOMATIC, + NL80211_TX_RATE_LIMITED, + NL80211_TX_RATE_FIXED, +}; + /* enum nl80211_tid_config_attr - TID specific configuration. * @NL80211_TID_CONFIG_ATTR_PAD: pad attribute for 64-bit values * @NL80211_TID_CONFIG_ATTR_VIF_SUPP: a bitmap (u64) of attributes supported @@ -4876,6 +4887,14 @@ enum nl80211_tid_config { * @NL80211_TID_CONFIG_ATTR_AMSDU_CTRL: Enable/Disable MSDU aggregation * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. * Its type is u8, using the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE: This attribute will be useful + * to notfiy the driver that what type of txrate should be used + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. using + * the values form &nl80211_tx_rate_setting. + * @NL80211_TID_CONFIG_ATTR_TX_RATE: Data frame TX rate mask should be applied + * with the parameters passed through %NL80211_ATTR_TX_RATES. + * configuration is applied to the data frame for the tid to that connected + * station. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4890,6 +4909,8 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, NL80211_TID_CONFIG_ATTR_AMSDU_CTRL, + NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, + NL80211_TID_CONFIG_ATTR_TX_RATE, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7ea764865546..22c4d13e28cb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -329,6 +329,15 @@ he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = { [NL80211_HE_BSS_COLOR_ATTR_PARTIAL] = { .type = NLA_FLAG }, }; +static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_TXRATE_HT] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_HT_RATES }, + [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), + [NL80211_TXRATE_GI] = { .type = NLA_U8 }, +}; + static const struct nla_policy nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { [NL80211_TID_CONFIG_ATTR_VIF_SUPP] = { .type = NLA_U64 }, @@ -345,6 +354,10 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE] = + NLA_POLICY_MAX(NLA_U8, NL80211_TX_RATE_FIXED), + [NL80211_TID_CONFIG_ATTR_TX_RATE] = + NLA_POLICY_NESTED(nl80211_txattr_policy), }; static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -4388,16 +4401,9 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, return true; } -static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { - [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_RATES }, - [NL80211_TXRATE_HT] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), - [NL80211_TXRATE_GI] = { .type = NLA_U8 }, -}; - static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, + struct nlattr *attrs[], + enum nl80211_attrs attr, struct cfg80211_bitrate_mask *mask) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; @@ -4428,14 +4434,14 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, } /* if no rates are given set it back to the defaults */ - if (!info->attrs[NL80211_ATTR_TX_RATES]) + if (!attrs[attr]) goto out; /* The nested attribute uses enum nl80211_band as the index. This maps * directly to the enum nl80211_band values used in cfg80211. */ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); - nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { + nla_for_each_nested(tx_rates, attrs[attr], rem) { enum nl80211_band band = nla_type(tx_rates); int err; @@ -4940,7 +4946,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, ¶ms.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + ¶ms.beacon_rate); if (err) return err; @@ -10753,7 +10761,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; - err = nl80211_parse_tx_bitrate_mask(info, &mask); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, &mask); if (err) return err; @@ -11359,7 +11368,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + &setup.beacon_rate); if (err) return err; @@ -14139,6 +14150,23 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE]) { + u32 idx = NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, attr; + + tid_conf->txrate_type = nla_get_u8(attrs[idx]); + + if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { + attr = NL80211_TID_CONFIG_ATTR_TX_RATE; + err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, + &tid_conf->txrate_mask); + if (err) + return err; + + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE); + } + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3-59-g8ed1b From a3b018febccd3686c39e86e98b5081bde014fc66 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Sun, 17 May 2020 18:30:19 +0200 Subject: cfg80211: fix CFG82011_CRDA_SUPPORT still mentioning internal regdb Back with commit c8c240e284b3 (cfg80211: reg: remove support for built-in regdb, 2015-10-15), support for using CFG80211_INTERNAL_REGDB was removed in favor of loading the regulatory database as firmware file. The documentation of CFG80211_CRDA_SUPPORT was not adjusted, though, which is why it still mentions mentions the old way of loading via the internal regulatory database. Remove it so that the kernel option only mentions using the firmware file. Signed-off-by: Patrick Steinhardt Link: https://lore.kernel.org/r/c56e60207fbd0512029de8c6276ee00f73491924.1589732954.git.ps@pks.im Signed-off-by: Johannes Berg --- net/wireless/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 63cf7131f601..813e93644ae7 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -181,8 +181,8 @@ config CFG80211_CRDA_SUPPORT default y help You should enable this option unless you know for sure you have no - need for it, for example when using internal regdb (above) or the - database loaded as a firmware file. + need for it, for example when using the regulatory database loaded as + a firmware file. If unsure, say Y. -- cgit v1.2.3-59-g8ed1b From 1b9ae0c92925ac40489be526d67d0010d0724ce0 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Thu, 21 May 2020 22:14:22 +0200 Subject: wireless: Use linux/stddef.h instead of stddef.h When compiling inside the kernel include linux/stddef.h instead of stddef.h. When I compile this header file in backports for power PC I run into a conflict with ptrdiff_t. I was unable to reproduce this in mainline kernel. I still would like to fix this problem in the kernel. Fixes: 6989310f5d43 ("wireless: Use offsetof instead of custom macro.") Signed-off-by: Hauke Mehrtens Link: https://lore.kernel.org/r/20200521201422.16493-1-hauke@hauke-m.de Signed-off-by: Johannes Berg --- include/uapi/linux/wireless.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/wireless.h b/include/uapi/linux/wireless.h index a2c006a364e0..24f3371ad826 100644 --- a/include/uapi/linux/wireless.h +++ b/include/uapi/linux/wireless.h @@ -74,7 +74,11 @@ #include /* for "struct sockaddr" et al */ #include /* for IFNAMSIZ and co... */ -#include /* for offsetof */ +#ifdef __KERNEL__ +# include /* for offsetof */ +#else +# include /* for offsetof */ +#endif /***************************** VERSION *****************************/ /* -- cgit v1.2.3-59-g8ed1b From c11299243370580832c27882dcedf2604f9f48f8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 26 May 2020 14:33:48 +0200 Subject: mac80211: fix HT-Control field reception for management frames If we receive management frames with an HT-Control field, we cannot parse them properly, as we assume a fixed length management header. Since we don't even need the HTC field (for these frames, or really at all), just remove it at the beginning of RX. Reported-by: Haggai Abramovsky Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200526143346.cf5ce70521c5.I333251a084ec4cfe67b7ef7efe2d2f1a33883931@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8b0fa5e345f4..21854a61a2b7 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -93,13 +93,44 @@ static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, * This function cleans up the SKB, i.e. it removes all the stuff * only useful for monitoring. */ -static void remove_monitor_info(struct sk_buff *skb, - unsigned int present_fcs_len, - unsigned int rtap_space) +static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb, + unsigned int present_fcs_len, + unsigned int rtap_space) { + struct ieee80211_hdr *hdr; + unsigned int hdrlen; + __le16 fc; + if (present_fcs_len) __pskb_trim(skb, skb->len - present_fcs_len); __pskb_pull(skb, rtap_space); + + hdr = (void *)skb->data; + fc = hdr->frame_control; + + /* + * Remove the HT-Control field (if present) on management + * frames after we've sent the frame to monitoring. We + * (currently) don't need it, and don't properly parse + * frames with it present, due to the assumption of a + * fixed management header length. + */ + if (likely(!ieee80211_is_mgmt(fc) || !ieee80211_has_order(fc))) + return skb; + + hdrlen = ieee80211_hdrlen(fc); + hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_ORDER); + + if (!pskb_may_pull(skb, hdrlen)) { + dev_kfree_skb(skb); + return NULL; + } + + memmove(skb->data + IEEE80211_HT_CTL_LEN, skb->data, + hdrlen - IEEE80211_HT_CTL_LEN); + __pskb_pull(skb, IEEE80211_HT_CTL_LEN); + + return skb; } static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, @@ -827,8 +858,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } - remove_monitor_info(origskb, present_fcs_len, rtap_space); - return origskb; + return ieee80211_clean_skb(origskb, present_fcs_len, + rtap_space); } ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space); @@ -871,8 +902,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (!origskb) return NULL; - remove_monitor_info(origskb, present_fcs_len, rtap_space); - return origskb; + return ieee80211_clean_skb(origskb, present_fcs_len, rtap_space); } static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) -- cgit v1.2.3-59-g8ed1b From 119aadf816f5373dc82ca4109d6d5b777e00475b Mon Sep 17 00:00:00 2001 From: Ramon Fontes Date: Fri, 15 May 2020 13:46:40 -0300 Subject: mac80211_hwsim: report the WIPHY_FLAG_SUPPORTS_5_10_MHZ capability Signed-off-by: Ramon Fontes Link: https://lore.kernel.org/r/20200515164640.97276-1-ramonreisfontes@gmail.com [fix indentation to use tabs] Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index f4ded2f2ee3b..1356e8cbe617 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3054,6 +3054,7 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_TDLS | WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL | WIPHY_FLAG_AP_UAPSD | + WIPHY_FLAG_SUPPORTS_5_10_MHZ | WIPHY_FLAG_HAS_CHANNEL_SWITCH; hw->wiphy->features |= NL80211_FEATURE_ACTIVE_MONITOR | NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE | -- cgit v1.2.3-59-g8ed1b From 6b646a7e4af69814dd1a3340fca0f02d4977420d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 9 Mar 2020 16:44:25 +0200 Subject: net/mlx5: Add ability to read and write ECE options The end result of RDMA-CM ECE handshake is ECE options, which is needed to be used while configuring data QPs. Such options can come in any QP state, so add in/out fields to set and query ECE options. OUT fields: * create_qp() - default ECE options for that type of QP. * modify_qp() - enabled ECE options after QP state transition. IN fields: * create_qp() - create QP with this ECE option. * modify_qp() - requested options. For unconnected QPs, the FW will return an error if ECE is already configured with any options that not equal to previously set. Reviewed-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fd8da4875ea0..1a56dc079c32 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1208,7 +1208,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_99[0x2]; u8 log_max_qp[0x5]; - u8 reserved_at_a0[0xb]; + u8 reserved_at_a0[0x3]; + u8 ece_support[0x1]; + u8 reserved_at_a4[0x7]; u8 log_max_srq[0x5]; u8 reserved_at_b0[0x10]; @@ -4216,7 +4218,8 @@ struct mlx5_ifc_rts2rts_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_rts2rts_qp_in_bits { @@ -4233,7 +4236,7 @@ struct mlx5_ifc_rts2rts_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; @@ -4246,7 +4249,8 @@ struct mlx5_ifc_rtr2rts_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_rtr2rts_qp_in_bits { @@ -4263,7 +4267,7 @@ struct mlx5_ifc_rtr2rts_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; @@ -4815,7 +4819,8 @@ struct mlx5_ifc_query_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; u8 opt_param_mask[0x20]; @@ -6580,7 +6585,8 @@ struct mlx5_ifc_init2rtr_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_init2rtr_qp_in_bits { @@ -6597,7 +6603,7 @@ struct mlx5_ifc_init2rtr_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; @@ -7693,7 +7699,7 @@ struct mlx5_ifc_create_qp_out_bits { u8 reserved_at_40[0x8]; u8 qpn[0x18]; - u8 reserved_at_60[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_create_qp_in_bits { @@ -7707,7 +7713,7 @@ struct mlx5_ifc_create_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; -- cgit v1.2.3-59-g8ed1b From d8e79f1dbcee8032667c0718a654c749d64f6304 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 27 May 2020 01:00:20 -0700 Subject: nexthop: Fix type of event_type in call_nexthop_notifiers Clang warns: net/ipv4/nexthop.c:841:30: warning: implicit conversion from enumeration type 'enum nexthop_event_type' to different enumeration type 'enum fib_event_type' [-Wenum-conversion] call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); ~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~~~~~~ 1 warning generated. Use the right type for event_type so that clang does not warn. Fixes: 8590ceedb701 ("nexthop: add support for notifiers") Link: https://github.com/ClangBuiltLinux/linux/issues/1038 Signed-off-by: Nathan Chancellor Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 143011f9b580..ec1282858cb7 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -37,7 +37,7 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { }; static int call_nexthop_notifiers(struct net *net, - enum fib_event_type event_type, + enum nexthop_event_type event_type, struct nexthop *nh) { int err; -- cgit v1.2.3-59-g8ed1b From 7cf4eda481b235cbc7c210715cce19fde3d23d55 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 27 May 2020 09:15:55 +0100 Subject: mlxsw: spectrum_router: remove redundant initialization of pointer br_dev The pointer br_dev is being initialized with a value that is never read and is being updated with a new value later on. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index c939b3596566..770de0222e7b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -7574,7 +7574,7 @@ static struct mlxsw_sp_fid * mlxsw_sp_rif_vlan_fid_get(struct mlxsw_sp_rif *rif, struct netlink_ext_ack *extack) { - struct net_device *br_dev = rif->dev; + struct net_device *br_dev; u16 vid; int err; -- cgit v1.2.3-59-g8ed1b From f96e9641e92b54de27b93d1af03d74b8304ce00a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 27 May 2020 11:24:04 +0200 Subject: net: ethernet: mtk-star-emac: fix error path in RX handling The dma_addr field in desc_data must not be overwritten until after the new skb is mapped. Currently we do replace it with uninitialized value in error path. This change fixes it by moving the assignment before the label to which we jump after mapping or allocation errors. Fixes: 8c7bd5a454ff ("net: ethernet: mtk-star-emac: new driver") Reported-by: Nathan Chancellor Signed-off-by: Bartosz Golaszewski Tested-by: Nathan Chancellor # build Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_star_emac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index b74349cede28..72bb624a6a68 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -1308,6 +1308,8 @@ static int mtk_star_receive_packet(struct mtk_star_priv *priv) goto push_new_skb; } + desc_data.dma_addr = new_dma_addr; + /* We can't fail anymore at this point: it's safe to unmap the skb. */ mtk_star_dma_unmap_rx(priv, &desc_data); @@ -1318,7 +1320,6 @@ static int mtk_star_receive_packet(struct mtk_star_priv *priv) netif_receive_skb(desc_data.skb); push_new_skb: - desc_data.dma_addr = new_dma_addr; desc_data.len = skb_tailroom(new_skb); desc_data.skb = new_skb; -- cgit v1.2.3-59-g8ed1b From 9f01a71c5cbec10b851588457089d17c20dc5a40 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 27 May 2020 13:01:29 +0100 Subject: net: dsa: b53: remove redundant premature assignment to new_pvid Variable new_pvid is being assigned with a value that is never read, the following if statement updates new_pvid with a new value in both of the if paths. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index ceb8be653182..1df05841ab6b 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1325,7 +1325,6 @@ int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) u16 pvid, new_pvid; b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &pvid); - new_pvid = pvid; if (!vlan_filtering) { /* Filtering is currently enabled, use the default PVID since * the bridge does not expect tagging anymore -- cgit v1.2.3-59-g8ed1b From 20f6a05ef63594feb0c6dfbd629da0448b43124d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 27 May 2020 12:34:30 +0000 Subject: bridge: mrp: Rework the MRP netlink interface This patch reworks the MRP netlink interface. Before, each attribute represented a binary structure which made it hard to be extended. Therefore update the MRP netlink interface such that each existing attribute to be a nested attribute which contains the fields of the binary structures. In this way the MRP netlink interface can be extended without breaking the backwards compatibility. It is also using strict checking for attributes under the MRP top attribute. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 64 ++++++++-- net/bridge/br_mrp.c | 8 +- net/bridge/br_mrp_netlink.c | 266 +++++++++++++++++++++++++++++++++++------ net/bridge/br_private_mrp.h | 2 +- 4 files changed, 290 insertions(+), 50 deletions(-) diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index bd8c95488f16..5a43eb86c93b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -169,17 +169,69 @@ enum { __IFLA_BRIDGE_MRP_MAX, }; +#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_INSTANCE_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE_RING_ID, + IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX, + __IFLA_BRIDGE_MRP_INSTANCE_MAX, +}; + +#define IFLA_BRIDGE_MRP_INSTANCE_MAX (__IFLA_BRIDGE_MRP_INSTANCE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_STATE_STATE, + __IFLA_BRIDGE_MRP_PORT_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_STATE_MAX (__IFLA_BRIDGE_MRP_PORT_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_ROLE_ROLE, + __IFLA_BRIDGE_MRP_PORT_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_ROLE_MAX (__IFLA_BRIDGE_MRP_PORT_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_STATE_UNSPEC, + IFLA_BRIDGE_MRP_RING_STATE_RING_ID, + IFLA_BRIDGE_MRP_RING_STATE_STATE, + __IFLA_BRIDGE_MRP_RING_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_STATE_MAX (__IFLA_BRIDGE_MRP_RING_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_RING_ROLE_RING_ID, + IFLA_BRIDGE_MRP_RING_ROLE_ROLE, + __IFLA_BRIDGE_MRP_RING_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_ROLE_MAX (__IFLA_BRIDGE_MRP_RING_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_START_TEST_UNSPEC, + IFLA_BRIDGE_MRP_START_TEST_RING_ID, + IFLA_BRIDGE_MRP_START_TEST_INTERVAL, + IFLA_BRIDGE_MRP_START_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_START_TEST_PERIOD, + __IFLA_BRIDGE_MRP_START_TEST_MAX, +}; + +#define IFLA_BRIDGE_MRP_START_TEST_MAX (__IFLA_BRIDGE_MRP_START_TEST_MAX - 1) + struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; __u32 s_ifindex; }; -struct br_mrp_port_role { - __u32 ring_id; - __u32 role; -}; - struct br_mrp_ring_state { __u32 ring_id; __u32 ring_state; @@ -197,8 +249,6 @@ struct br_mrp_start_test { __u32 period; }; -#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) - struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 528d767eb026..8ea59504ef47 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -376,24 +376,24 @@ int br_mrp_set_port_state(struct net_bridge_port *p, * note: already called with rtnl_lock */ int br_mrp_set_port_role(struct net_bridge_port *p, - struct br_mrp_port_role *role) + enum br_mrp_port_role_type role) { struct br_mrp *mrp; if (!p || !(p->flags & BR_MRP_AWARE)) return -EINVAL; - mrp = br_mrp_find_id(p->br, role->ring_id); + mrp = br_mrp_find_port(p->br, p); if (!mrp) return -EINVAL; - if (role->role == BR_MRP_PORT_ROLE_PRIMARY) + if (role == BR_MRP_PORT_ROLE_PRIMARY) rcu_assign_pointer(mrp->p_port, p); else rcu_assign_pointer(mrp->s_port, p); - br_mrp_port_switchdev_set_role(p, role->role); + br_mrp_port_switchdev_set_role(p, role); return 0; } diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 4a08a99519b0..d9de780d2ce0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -8,19 +8,222 @@ static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, - [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_instance)}, - [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_U32 }, - [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_port_role)}, - [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_ring_state)}, - [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_ring_role)}, - [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_start_test)}, + [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED }, }; +static const struct nla_policy +br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { + [IFLA_BRIDGE_MRP_INSTANCE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, +}; + +static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, + int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1]; + struct br_mrp_instance inst; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_INSTANCE_MAX, attr, + br_mrp_instance_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or P_IFINDEX or S_IFINDEX"); + return -EINVAL; + } + + memset(&inst, 0, sizeof(inst)); + + inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); + inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); + inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); + + if (cmd == RTM_SETLINK) + return br_mrp_add(br, &inst); + else + return br_mrp_del(br, &inst); + + return 0; +} + +static const struct nla_policy +br_mrp_port_state_policy[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_state_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1]; + enum br_mrp_port_state_type state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_STATE_MAX, attr, + br_mrp_port_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: STATE"); + return -EINVAL; + } + + state = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]); + + return br_mrp_set_port_state(p, state); +} + +static const struct nla_policy +br_mrp_port_role_policy[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_role_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1]; + enum br_mrp_port_role_type role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_ROLE_MAX, attr, + br_mrp_port_role_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: ROLE"); + return -EINVAL; + } + + role = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]); + + return br_mrp_set_port_role(p, role); +} + +static const struct nla_policy +br_mrp_ring_state_policy[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_STATE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_state_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1]; + struct br_mrp_ring_state state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_STATE_MAX, attr, + br_mrp_ring_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or STATE"); + return -EINVAL; + } + + memset(&state, 0x0, sizeof(state)); + + state.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID]); + state.ring_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]); + + return br_mrp_set_ring_state(br, &state); +} + +static const struct nla_policy +br_mrp_ring_role_policy[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_role_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1]; + struct br_mrp_ring_role role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_ROLE_MAX, attr, + br_mrp_ring_role_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or ROLE"); + return -EINVAL; + } + + memset(&role, 0x0, sizeof(role)); + + role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID]); + role.ring_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]); + + return br_mrp_set_ring_role(br, &role); +} + +static const struct nla_policy +br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { + [IFLA_BRIDGE_MRP_START_TEST_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, +}; + +static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_START_TEST_MAX + 1]; + struct br_mrp_start_test test; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_TEST_MAX, attr, + br_mrp_start_test_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID] || + !tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL] || + !tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] || + !tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD"); + return -EINVAL; + } + + memset(&test, 0x0, sizeof(test)); + + test.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID]); + test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); + test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); + test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); + + return br_mrp_start_test(br, &test); +} + int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { @@ -44,58 +247,45 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, return err; if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { - struct br_mrp_instance *instance = - nla_data(tb[IFLA_BRIDGE_MRP_INSTANCE]); - - if (cmd == RTM_SETLINK) - err = br_mrp_add(br, instance); - else - err = br_mrp_del(br, instance); + err = br_mrp_instance_parse(br, tb[IFLA_BRIDGE_MRP_INSTANCE], + cmd, extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { - enum br_mrp_port_state_type state = - nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE]); - - err = br_mrp_set_port_state(p, state); + err = br_mrp_port_state_parse(p, tb[IFLA_BRIDGE_MRP_PORT_STATE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { - struct br_mrp_port_role *role = - nla_data(tb[IFLA_BRIDGE_MRP_PORT_ROLE]); - - err = br_mrp_set_port_role(p, role); + err = br_mrp_port_role_parse(p, tb[IFLA_BRIDGE_MRP_PORT_ROLE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { - struct br_mrp_ring_state *state = - nla_data(tb[IFLA_BRIDGE_MRP_RING_STATE]); - - err = br_mrp_set_ring_state(br, state); + err = br_mrp_ring_state_parse(br, + tb[IFLA_BRIDGE_MRP_RING_STATE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { - struct br_mrp_ring_role *role = - nla_data(tb[IFLA_BRIDGE_MRP_RING_ROLE]); - - err = br_mrp_set_ring_role(br, role); + err = br_mrp_ring_role_parse(br, tb[IFLA_BRIDGE_MRP_RING_ROLE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_START_TEST]) { - struct br_mrp_start_test *test = - nla_data(tb[IFLA_BRIDGE_MRP_START_TEST]); - - err = br_mrp_start_test(br, test); + err = br_mrp_start_test_parse(br, + tb[IFLA_BRIDGE_MRP_START_TEST], + extack); if (err) return err; } diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 2921a4b59f8e..a0f53cc3ab85 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -37,7 +37,7 @@ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); int br_mrp_set_port_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_set_port_role(struct net_bridge_port *p, - struct br_mrp_port_role *role); + enum br_mrp_port_role_type role); int br_mrp_set_ring_state(struct net_bridge *br, struct br_mrp_ring_state *state); int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); -- cgit v1.2.3-59-g8ed1b From f99c0646ef83076dba88255c42482d1b4325f890 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 27 May 2020 15:34:45 +0200 Subject: mtk-star-emac: mark PM functions as __maybe_unused Without CONFIG_PM, the compiler warns about two unused functions: drivers/net/ethernet/mediatek/mtk_star_emac.c:1472:12: error: unused function 'mtk_star_suspend' [-Werror,-Wunused-function] drivers/net/ethernet/mediatek/mtk_star_emac.c:1488:12: error: unused function 'mtk_star_resume' [-Werror,-Wunused-function] Mark these as __maybe_unused. Fixes: 8c7bd5a454ff ("net: ethernet: mtk-star-emac: new driver") Signed-off-by: Arnd Bergmann Acked-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_star_emac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index 72bb624a6a68..8596ca0e60eb 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -1468,7 +1468,7 @@ out_put_node: return ret; } -static int mtk_star_suspend(struct device *dev) +static __maybe_unused int mtk_star_suspend(struct device *dev) { struct mtk_star_priv *priv; struct net_device *ndev; @@ -1484,7 +1484,7 @@ static int mtk_star_suspend(struct device *dev) return 0; } -static int mtk_star_resume(struct device *dev) +static __maybe_unused int mtk_star_resume(struct device *dev) { struct mtk_star_priv *priv; struct net_device *ndev; -- cgit v1.2.3-59-g8ed1b From 5a1b72cebc774ec68854dab59c6838d795ee9370 Mon Sep 17 00:00:00 2001 From: Stephen Worley Date: Wed, 27 May 2020 12:41:42 -0400 Subject: net: add large ecmp group nexthop tests Add a couple large ecmp group nexthop selftests to cover the remnant fixed by d69100b8eee27c2d60ee52df76e0b80a8d492d34. The tests create 100 x32 ecmp groups of ipv4 and ipv6 and then dump them. On kernels without the fix, they will fail due to data remnant during the dump. Signed-off-by: Stephen Worley Reviewed-by: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 84 ++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 51f8e9afe6ae..1e2f61262e4e 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode ipv4_fdb_grp_fcnal" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode ipv6_fdb_grp_fcnal" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -254,6 +254,60 @@ check_route6() check_output "${out}" "${expected}" } +check_large_grp() +{ + local ipv=$1 + local ecmp=$2 + local grpnum=100 + local nhidstart=100 + local grpidstart=1000 + local iter=0 + local nhidstr="" + local grpidstr="" + local grpstr="" + local ipstr="" + + if [ $ipv -eq 4 ]; then + ipstr="172.16.1." + else + ipstr="2001:db8:91::" + fi + + # + # Create $grpnum groups with specified $ecmp and dump them + # + + # create nexthops with different gateways + iter=2 + while [ $iter -le $(($ecmp + 1)) ] + do + nhidstr="$(($nhidstart + $iter))" + run_cmd "$IP nexthop add id $nhidstr via $ipstr$iter dev veth1" + check_nexthop "id $nhidstr" "id $nhidstr via $ipstr$iter dev veth1 scope link" + + if [ $iter -le $ecmp ]; then + grpstr+="$nhidstr/" + else + grpstr+="$nhidstr" + fi + ((iter++)) + done + + # create duplicate large ecmp groups + iter=0 + while [ $iter -le $grpnum ] + do + grpidstr="$(($grpidstart + $iter))" + run_cmd "$IP nexthop add id $grpidstr group $grpstr" + check_nexthop "id $grpidstr" "id $grpidstr group $grpstr" + ((iter++)) + done + + # dump large groups + run_cmd "$IP nexthop list" + log_test $? 0 "Dump large (x$ecmp) ecmp groups" +} + start_ip_monitor() { local mtype=$1 @@ -700,6 +754,19 @@ ipv6_fcnal_runtime() # route with src address and using nexthop - not allowed } +ipv6_large_grp() +{ + local ecmp=32 + + echo + echo "IPv6 large groups (x$ecmp)" + echo "---------------------" + + check_large_grp 6 $ecmp + + $IP nexthop flush >/dev/null 2>&1 +} + ipv4_fcnal() { local rc @@ -1066,6 +1133,19 @@ ipv4_fcnal_runtime() log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check" } +ipv4_large_grp() +{ + local ecmp=32 + + echo + echo "IPv4 large groups (x$ecmp)" + echo "---------------------" + + check_large_grp 4 $ecmp + + $IP nexthop flush >/dev/null 2>&1 +} + sysctl_nexthop_compat_mode_check() { local sysctlname="net.ipv4.nexthop_compat_mode" -- cgit v1.2.3-59-g8ed1b From 626a83238e6a63d88a5b5291febe797b244b5f18 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 27 May 2020 19:45:38 +0300 Subject: net: dsa: felix: accept VLAN config regardless of bridge VLAN awareness state The ocelot core library is written with the idea in mind that the VLAN table is populated by the bridge. Otherwise, not even a sane default pvid is provided: in standalone mode, the default pvid is 0, and the core expects the bridge layer to change it to 1. So without this patch, the VLAN table is completely empty at the end of the commands below, and traffic is broken as a result: ip link add dev br0 type bridge vlan_filtering 0 && ip link set dev br0 up for eth in $(ls /sys/bus/pci/devices/0000\:00\:00.5/net/); do ip link set dev $eth master br0 ip link set dev $eth up done ip link set dev br0 type bridge vlan_filtering 1 Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 18c23ffd6b40..a6e272d2110d 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -594,6 +594,7 @@ static int felix_setup(struct dsa_switch *ds) ANA_FLOODING, tc); ds->mtu_enforcement_ingress = true; + ds->configure_vlan_while_not_filtering = true; /* It looks like the MAC/PCS interrupt register - PM0_IEVENT (0x8040) * isn't instantiated for the Felix PF. * In-band AN may take a few ms to complete, so we need to poll. -- cgit v1.2.3-59-g8ed1b From cb8aa9a3affb7d23b11b11fbed41e2feaabc4b0a Mon Sep 17 00:00:00 2001 From: Romain Bellan Date: Mon, 4 May 2020 21:34:29 +0200 Subject: netfilter: ctnetlink: add kernel side filtering for dump Conntrack dump does not support kernel side filtering (only get exists, but it returns only one entry. And user has to give a full valid tuple) It means that userspace has to implement filtering after receiving many irrelevant entries, consuming resources (conntrack table is sometimes very huge, much more than a routing table for example). This patch adds filtering in kernel side. To achieve this goal, we: * Add a new CTA_FILTER netlink attributes, actually a flag list to parametize filtering * Convert some *nlattr_to_tuple() functions, to allow a partial parsing of CTA_TUPLE_ORIG and CTA_TUPLE_REPLY (so nf_conntrack_tuple it not fully set) Filtering is now possible on: * IP SRC/DST values * Ports for TCP and UDP flows * IMCP(v6) codes types and IDs Filtering is done as an "AND" operator. For example, when flags PROTO_SRC_PORT, PROTO_NUM and IP_SRC are sets, only entries matching all values are dumped. Changes since v1: Set NLM_F_DUMP_FILTERED in nlm flags if entries are filtered Changes since v2: Move several constants to nf_internals.h Move a fix on netlink values check in a separate patch Add a check on not-supported flags Return EOPNOTSUPP if CDA_FILTER is set in ctnetlink_flush_conntrack (not yet implemented) Code style issues Changes since v3: Fix compilation warning reported by kbuild test robot Changes since v4: Fix a regression introduced in v3 (returned EINVAL for valid netlink messages without CTA_MARK) Changes since v5: Change definition of CTA_FILTER_F_ALL Fix a regression when CTA_TUPLE_ZONE is not set Signed-off-by: Romain Bellan Signed-off-by: Florent Fourcot Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 6 +- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 9 + net/netfilter/nf_conntrack_core.c | 19 +- net/netfilter/nf_conntrack_netlink.c | 334 ++++++++++++++++++--- net/netfilter/nf_conntrack_proto_icmp.c | 40 ++- net/netfilter/nf_conntrack_proto_icmpv6.c | 42 ++- net/netfilter/nf_internals.h | 17 ++ 7 files changed, 394 insertions(+), 73 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 4cad1f0a327a..88186b95b3c2 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -42,7 +42,8 @@ struct nf_conntrack_l4proto { /* Calculate tuple nlattr size */ unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], - struct nf_conntrack_tuple *t); + struct nf_conntrack_tuple *t, + u_int32_t flags); const struct nla_policy *nla_policy; struct { @@ -152,7 +153,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t); + struct nf_conntrack_tuple *t, + u_int32_t flags); unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 1d41810d17e2..262881792671 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -55,6 +55,7 @@ enum ctattr_type { CTA_LABELS, CTA_LABELS_MASK, CTA_SYNPROXY, + CTA_FILTER, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -276,4 +277,12 @@ enum ctattr_expect_stats { }; #define CTA_STATS_EXP_MAX (__CTA_STATS_EXP_MAX - 1) +enum ctattr_filter { + CTA_FILTER_UNSPEC, + CTA_FILTER_ORIG_FLAGS, + CTA_FILTER_REPLY_FLAGS, + __CTA_FILTER_MAX +}; +#define CTA_FILTER_MAX (__CTA_FILTER_MAX - 1) + #endif /* _IPCONNTRACK_NETLINK_H */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1d57b95d3481..8abb1727bcc4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1974,13 +1974,22 @@ const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { + if (!tb[CTA_PROTO_SRC_PORT]) + return -EINVAL; + + t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); + } - t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); - t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { + if (!tb[CTA_PROTO_DST_PORT]) + return -EINVAL; + + t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9ddfcd002d3b..d7bd8b1f27d5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -54,6 +54,8 @@ #include #include +#include "nf_internals.h" + MODULE_LICENSE("GPL"); static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, @@ -544,14 +546,16 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct, bool extinfo) + struct nf_conn *ct, bool extinfo, unsigned int flags) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; - unsigned int flags = portid ? NLM_F_MULTI : 0, event; + unsigned int event; + if (portid) + flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) @@ -847,17 +851,70 @@ static int ctnetlink_done(struct netlink_callback *cb) } struct ctnetlink_filter { + u_int32_t cta_flags; u8 family; + + u_int32_t orig_flags; + u_int32_t reply_flags; + + struct nf_conntrack_tuple orig; + struct nf_conntrack_tuple reply; + struct nf_conntrack_zone zone; + struct { u_int32_t val; u_int32_t mask; } mark; }; +static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { + [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 }, + [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 }, +}; + +static int ctnetlink_parse_filter(const struct nlattr *attr, + struct ctnetlink_filter *filter) +{ + struct nlattr *tb[CTA_FILTER_MAX + 1]; + int ret = 0; + + ret = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy, + NULL); + if (ret) + return ret; + + if (tb[CTA_FILTER_ORIG_FLAGS]) { + filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]); + if (filter->orig_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + if (tb[CTA_FILTER_REPLY_FLAGS]) { + filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]); + if (filter->reply_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + return 0; +} + +static int ctnetlink_parse_zone(const struct nlattr *attr, + struct nf_conntrack_zone *zone); +static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, + u32 type, u_int8_t l3num, + struct nf_conntrack_zone *zone, + u_int32_t flags); + +/* applied on filters */ +#define CTA_FILTER_F_CTA_MARK (1 << 0) +#define CTA_FILTER_F_CTA_MARK_MASK (1 << 1) + static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { struct ctnetlink_filter *filter; + int err; #ifndef CONFIG_NF_CONNTRACK_MARK if (cda[CTA_MARK] || cda[CTA_MARK_MASK]) @@ -871,14 +928,65 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) filter->family = family; #ifdef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + if (cda[CTA_MARK]) { filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK); + + if (cda[CTA_MARK_MASK]) { + filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK_MASK); + } else { + filter->mark.mask = 0xffffffff; + } + } else if (cda[CTA_MARK_MASK]) { + return ERR_PTR(-EINVAL); } #endif + if (!cda[CTA_FILTER]) + return filter; + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); + if (err < 0) + return ERR_PTR(err); + + err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); + if (err < 0) + return ERR_PTR(err); + + if (filter->orig_flags) { + if (!cda[CTA_TUPLE_ORIG]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->orig, + CTA_TUPLE_ORIG, + filter->family, + &filter->zone, + filter->orig_flags); + if (err < 0) + return ERR_PTR(err); + } + + if (filter->reply_flags) { + if (!cda[CTA_TUPLE_REPLY]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->reply, + CTA_TUPLE_REPLY, + filter->family, + &filter->zone, + filter->orig_flags); + if (err < 0) + return ERR_PTR(err); + } + return filter; } +static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) +{ + return family || cda[CTA_MARK] || cda[CTA_FILTER]; +} + static int ctnetlink_start(struct netlink_callback *cb) { const struct nlattr * const *cda = cb->data; @@ -886,7 +994,7 @@ static int ctnetlink_start(struct netlink_callback *cb) struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u8 family = nfmsg->nfgen_family; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -896,9 +1004,79 @@ static int ctnetlink_start(struct netlink_callback *cb) return 0; } +static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple, + struct nf_conntrack_tuple *ct_tuple, + u_int32_t flags, int family) +{ + switch (family) { + case NFPROTO_IPV4: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + filter_tuple->src.u3.ip != ct_tuple->src.u3.ip) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip) + return 0; + break; + case NFPROTO_IPV6: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + !ipv6_addr_cmp(&filter_tuple->src.u3.in6, + &ct_tuple->src.u3.in6)) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + !ipv6_addr_cmp(&filter_tuple->dst.u3.in6, + &ct_tuple->dst.u3.in6)) + return 0; + break; + } + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) && + filter_tuple->dst.protonum != ct_tuple->dst.protonum) + return 0; + + switch (ct_tuple->dst.protonum) { + case IPPROTO_TCP: + case IPPROTO_UDP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) && + filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) && + filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port) + return 0; + break; + case IPPROTO_ICMP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + case IPPROTO_ICMPV6: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + } + + return 1; +} + static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; + struct nf_conntrack_tuple *tuple; if (filter == NULL) goto out; @@ -910,8 +1088,28 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; + if (filter->orig_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); + if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, + filter->orig_flags, + filter->family)) + goto ignore_entry; + } + + if (filter->reply_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY); + if (!ctnetlink_filter_match_tuple(&filter->reply, tuple, + filter->reply_flags, + filter->family)) + goto ignore_entry; + } + #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) != filter->mark.val) + if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK_MASK)) && + (ct->mark & filter->mark.mask) != filter->mark.val) + goto ignore_entry; + else if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK)) && + ct->mark != filter->mark.val) goto ignore_entry; #endif @@ -925,6 +1123,7 @@ ignore_entry: static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; @@ -979,7 +1178,7 @@ restart: ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, true); + ct, true, flags); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1014,31 +1213,50 @@ out: } static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V4_SRC]) + return -EINVAL; + + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V4_DST]) + return -EINVAL; - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + } return 0; } static int ipv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V6_SRC]) + return -EINVAL; - t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); - t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V6_DST]) + return -EINVAL; + + t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + } return 0; } static int ctnetlink_parse_tuple_ip(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; @@ -1054,10 +1272,10 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, switch (tuple->src.l3num) { case NFPROTO_IPV4: - ret = ipv4_nlattr_to_tuple(tb, tuple); + ret = ipv4_nlattr_to_tuple(tb, tuple, flags); break; case NFPROTO_IPV6: - ret = ipv6_nlattr_to_tuple(tb, tuple); + ret = ipv6_nlattr_to_tuple(tb, tuple, flags); break; } @@ -1069,7 +1287,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { }; static int ctnetlink_parse_tuple_proto(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTO_MAX+1]; @@ -1080,8 +1299,12 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, if (ret < 0) return ret; + if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM))) + return 0; + if (!tb[CTA_PROTO_NUM]) return -EINVAL; + tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); @@ -1092,7 +1315,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, l4proto->nla_policy, NULL); if (ret == 0) - ret = l4proto->nlattr_to_tuple(tb, tuple); + ret = l4proto->nlattr_to_tuple(tb, tuple, flags); } rcu_read_unlock(); @@ -1143,10 +1366,21 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; +#define CTA_FILTER_F_ALL_CTA_PROTO \ + (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \ + CTA_FILTER_F_CTA_PROTO_DST_PORT | \ + CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_ID | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_ID) + static int -ctnetlink_parse_tuple(const struct nlattr * const cda[], - struct nf_conntrack_tuple *tuple, u32 type, - u_int8_t l3num, struct nf_conntrack_zone *zone) +ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone, + u_int32_t flags) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -1158,23 +1392,32 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (err < 0) return err; - if (!tb[CTA_TUPLE_IP]) - return -EINVAL; tuple->src.l3num = l3num; - err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_IP_DST) || + flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_TUPLE_IP]) + return -EINVAL; - if (!tb[CTA_TUPLE_PROTO]) - return -EINVAL; + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags); + if (err < 0) + return err; + } - err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) { + if (!tb[CTA_TUPLE_PROTO]) + return -EINVAL; - if (tb[CTA_TUPLE_ZONE]) { + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags); + if (err < 0) + return err; + } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) { + /* Can't manage proto flags without a protonum */ + return -EINVAL; + } + + if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) { if (!zone) return -EINVAL; @@ -1193,6 +1436,15 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], return 0; } +static int +ctnetlink_parse_tuple(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone) +{ + return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone, + CTA_FILTER_FLAG(ALL)); +} + static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, @@ -1240,6 +1492,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, + [CTA_FILTER] = { .type = NLA_NESTED }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) @@ -1256,7 +1509,10 @@ static int ctnetlink_flush_conntrack(struct net *net, { struct ctnetlink_filter *filter = NULL; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { + if (cda[CTA_FILTER]) + return -EOPNOTSUPP; + filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -1385,7 +1641,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, } err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); if (err <= 0) goto free; @@ -1458,7 +1714,7 @@ restart: res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, dying ? true : false); + ct, dying ? true : false, 0); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index c2e3dff773bc..4efd8741c105 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -20,6 +20,8 @@ #include #include +#include "nf_internals.h" + static const unsigned int nf_ct_icmp_timeout = 30*HZ; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -271,20 +273,32 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmp_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMP_TYPE] || - !tb[CTA_PROTO_ICMP_CODE] || - !tb[CTA_PROTO_ICMP_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) { + if (!tb[CTA_PROTO_ICMP_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) { + if (!tb[CTA_PROTO_ICMP_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) { + if (!tb[CTA_PROTO_ICMP_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 6f9144e1f1c1..facd8c64ec4e 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -24,6 +24,8 @@ #include #include +#include "nf_internals.h" + static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, @@ -193,21 +195,33 @@ static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMPV6_TYPE] || - !tb[CTA_PROTO_ICMPV6_CODE] || - !tb[CTA_PROTO_ICMPV6_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); - - if (tuple->dst.u.icmp.type < 128 || - tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type - 128]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { + if (!tb[CTA_PROTO_ICMPV6_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { + if (!tb[CTA_PROTO_ICMPV6_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { + if (!tb[CTA_PROTO_ICMPV6_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); + } return 0; } diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index d6c43902ebd7..832ae64179f0 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -6,6 +6,23 @@ #include #include +/* nf_conntrack_netlink.c: applied on tuple filters */ +#define CTA_FILTER_F_CTA_IP_SRC (1 << 0) +#define CTA_FILTER_F_CTA_IP_DST (1 << 1) +#define CTA_FILTER_F_CTA_TUPLE_ZONE (1 << 2) +#define CTA_FILTER_F_CTA_PROTO_NUM (1 << 3) +#define CTA_FILTER_F_CTA_PROTO_SRC_PORT (1 << 4) +#define CTA_FILTER_F_CTA_PROTO_DST_PORT (1 << 5) +#define CTA_FILTER_F_CTA_PROTO_ICMP_TYPE (1 << 6) +#define CTA_FILTER_F_CTA_PROTO_ICMP_CODE (1 << 7) +#define CTA_FILTER_F_CTA_PROTO_ICMP_ID (1 << 8) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE (1 << 9) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE (1 << 10) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID (1 << 11) +#define CTA_FILTER_F_MAX (1 << 12) +#define CTA_FILTER_F_ALL (CTA_FILTER_F_MAX-1) +#define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr + /* nf_queue.c */ void nf_queue_nf_hook_drop(struct net *net); -- cgit v1.2.3-59-g8ed1b From d9246a53752fdb777ed176d5f091c4ac0e482bba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:42:44 +0200 Subject: netfilter: nf_tables: generalise flowtable hook parsing Update nft_flowtable_parse_hook() to take the flowtable hook list as parameter. This allows to reuse this function to update the hooks. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3558e76e2733..87945b4a6789 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6178,21 +6178,30 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } +struct nft_flowtable_hook { + u32 num; + int priority; + struct list_head list; +}; + static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED }, }; -static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, - const struct nlattr *attr, - struct nft_flowtable *flowtable) +static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct nft_flowtable_hook *flowtable_hook, + struct nf_flowtable *ft) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; struct nft_hook *hook; int hooknum, priority; int err; + INIT_LIST_HEAD(&flowtable_hook->list); + err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, nft_flowtable_hook_policy, NULL); if (err < 0) @@ -6211,19 +6220,19 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, err = nf_tables_parse_netdev_hooks(ctx->net, tb[NFTA_FLOWTABLE_HOOK_DEVS], - &flowtable->hook_list); + &flowtable_hook->list); if (err < 0) return err; - flowtable->hooknum = hooknum; - flowtable->data.priority = priority; + flowtable_hook->priority = priority; + flowtable_hook->num = hooknum; - list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(hook, &flowtable_hook->list, list) { hook->ops.pf = NFPROTO_NETDEV; hook->ops.hooknum = hooknum; hook->ops.priority = priority; - hook->ops.priv = &flowtable->data; - hook->ops.hook = flowtable->data.type->hook; + hook->ops.priv = ft; + hook->ops.hook = ft->type->hook; } return err; @@ -6336,6 +6345,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_flowtable_hook flowtable_hook; const struct nf_flowtable_type *type; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -6409,11 +6419,15 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err3; - err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], - flowtable); + err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, &flowtable->data); if (err < 0) goto err4; + list_splice(&flowtable_hook.list, &flowtable->hook_list); + flowtable->data.priority = flowtable_hook.priority; + flowtable->hooknum = flowtable_hook.num; + err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); if (err < 0) { list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { -- cgit v1.2.3-59-g8ed1b From f9382669cf5e75ebc7636bd78e637facf27d53f7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 May 2020 01:00:07 +0200 Subject: netfilter: nf_tables: pass hook list to nft_{un,}register_flowtable_net_hooks() This patch prepares for incremental flowtable hook updates. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 87945b4a6789..1505552aaa74 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6279,23 +6279,24 @@ static void nft_unregister_flowtable_hook(struct net *net, } static void nft_unregister_flowtable_net_hooks(struct net *net, - struct nft_flowtable *flowtable) + struct list_head *hook_list) { struct nft_hook *hook; - list_for_each_entry(hook, &flowtable->hook_list, list) + list_for_each_entry(hook, hook_list, list) nf_unregister_net_hook(net, &hook->ops); } static int nft_register_flowtable_net_hooks(struct net *net, struct nft_table *table, + struct list_head *hook_list, struct nft_flowtable *flowtable) { struct nft_hook *hook, *hook2, *next; struct nft_flowtable *ft; int err, i = 0; - list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(hook, hook_list, list) { list_for_each_entry(ft, &table->flowtables, list) { list_for_each_entry(hook2, &ft->hook_list, list) { if (hook->ops.dev == hook2->ops.dev && @@ -6326,7 +6327,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, return 0; err_unregister_net_hooks: - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + list_for_each_entry_safe(hook, next, hook_list, list) { if (i-- <= 0) break; @@ -6428,7 +6429,9 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, flowtable->data.priority = flowtable_hook.priority; flowtable->hooknum = flowtable_hook.num; - err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); + err = nft_register_flowtable_net_hooks(ctx.net, table, + &flowtable->hook_list, + flowtable); if (err < 0) { list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { list_del_rcu(&hook->list); @@ -7493,7 +7496,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_flowtable(trans), NFT_MSG_DELFLOWTABLE); nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans)); + &nft_trans_flowtable(trans)->hook_list); break; } } @@ -7652,7 +7655,7 @@ static int __nf_tables_abort(struct net *net, bool autoload) trans->ctx.table->use--; list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans)); + &nft_trans_flowtable(trans)->hook_list); break; case NFT_MSG_DELFLOWTABLE: trans->ctx.table->use++; -- cgit v1.2.3-59-g8ed1b From 389a2cbcb7f15e2af9babdc0c63cec318537e7ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:43:43 +0200 Subject: netfilter: nf_tables: add nft_flowtable_hooks_destroy() This patch adds a helper function destroy the flowtable hooks. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1505552aaa74..f5d100787ccb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6339,6 +6339,16 @@ err_unregister_net_hooks: return err; } +static void nft_flowtable_hooks_destroy(struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } +} + static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6433,10 +6443,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, &flowtable->hook_list, flowtable); if (err < 0) { - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); - } + nft_flowtable_hooks_destroy(&flowtable->hook_list); goto err4; } -- cgit v1.2.3-59-g8ed1b From c42d8bda69e291c5551497a02db71b50d95510d4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:44:18 +0200 Subject: netfilter: nf_tables: pass hook list to flowtable event notifier Update the flowtable netlink notifier to take the list of hooks as input. This allows to reuse this function in incremental flowtable hook updates. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f5d100787ccb..4db70e68d7f4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6523,7 +6523,8 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, - struct nft_flowtable *flowtable) + struct nft_flowtable *flowtable, + struct list_head *hook_list) { struct nlattr *nest, *nest_devs; struct nfgenmsg *nfmsg; @@ -6559,7 +6560,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!nest_devs) goto nla_put_failure; - list_for_each_entry_rcu(hook, &flowtable->hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list) { if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } @@ -6612,7 +6613,9 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, cb->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, - table->family, flowtable) < 0) + table->family, + flowtable, + &flowtable->hook_list) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -6709,7 +6712,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, - flowtable); + flowtable, &flowtable->hook_list); if (err < 0) goto err; @@ -6721,6 +6724,7 @@ err: static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, + struct list_head *hook_list, int event) { struct sk_buff *skb; @@ -6736,7 +6740,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, 0, - ctx->family, flowtable); + ctx->family, flowtable, hook_list); if (err < 0) { kfree_skb(skb); goto err; @@ -7494,6 +7498,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_clear(net, nft_trans_flowtable(trans)); nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, NFT_MSG_NEWFLOWTABLE); nft_trans_destroy(trans); break; @@ -7501,6 +7506,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) list_del_rcu(&nft_trans_flowtable(trans)->list); nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, NFT_MSG_DELFLOWTABLE); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable(trans)->hook_list); -- cgit v1.2.3-59-g8ed1b From 78d9f48f7f44431a25da2b46b3a8812f6ff2b981 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:46:47 +0200 Subject: netfilter: nf_tables: add devices to existing flowtable This patch allows users to add devices to an existing flowtable. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 +++ net/netfilter/nf_tables_api.c | 97 ++++++++++++++++++++++++++++++++++----- 2 files changed, 92 insertions(+), 11 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d4e29c952c40..4f58c4411bb4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1481,10 +1481,16 @@ struct nft_trans_obj { struct nft_trans_flowtable { struct nft_flowtable *flowtable; + bool update; + struct list_head hook_list; }; #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) +#define nft_trans_flowtable_update(trans) \ + (((struct nft_trans_flowtable *)trans->data)->update) +#define nft_trans_flowtable_hooks(trans) \ + (((struct nft_trans_flowtable *)trans->data)->hook_list) int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4db70e68d7f4..98f2cbb97e39 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6349,6 +6349,62 @@ static void nft_flowtable_hooks_destroy(struct list_head *hook_list) } } +static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, + struct nft_flowtable *flowtable) +{ + const struct nlattr * const *nla = ctx->nla; + struct nft_flowtable_hook flowtable_hook; + struct nft_hook *hook, *next; + struct nft_trans *trans; + bool unregister = false; + int err; + + err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, &flowtable->data); + if (err < 0) + return err; + + list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { + if (nft_hook_list_find(&flowtable->hook_list, hook)) { + list_del(&hook->list); + kfree(hook); + } + } + + err = nft_register_flowtable_net_hooks(ctx->net, ctx->table, + &flowtable_hook.list, flowtable); + if (err < 0) + goto err_flowtable_update_hook; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE, + sizeof(struct nft_trans_flowtable)); + if (!trans) { + unregister = true; + err = -ENOMEM; + goto err_flowtable_update_hook; + } + + nft_trans_flowtable(trans) = flowtable; + nft_trans_flowtable_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans)); + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_flowtable_update_hook: + list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { + if (unregister) + nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + + return err; + +} + static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6392,7 +6448,9 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return -EEXIST; } - return 0; + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + return nft_flowtable_update(&ctx, nlh, flowtable); } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ -7495,11 +7553,20 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NFT_MSG_DELOBJ); break; case NFT_MSG_NEWFLOWTABLE: - nft_clear(net, nft_trans_flowtable(trans)); - nf_tables_flowtable_notify(&trans->ctx, - nft_trans_flowtable(trans), - &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_NEWFLOWTABLE); + if (nft_trans_flowtable_update(trans)) { + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans), + NFT_MSG_NEWFLOWTABLE); + list_splice(&nft_trans_flowtable_hooks(trans), + &nft_trans_flowtable(trans)->hook_list); + } else { + nft_clear(net, nft_trans_flowtable(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, + NFT_MSG_NEWFLOWTABLE); + } nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: @@ -7558,7 +7625,10 @@ static void nf_tables_abort_release(struct nft_trans *trans) nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_NEWFLOWTABLE: - nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) + nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + else + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } kfree(trans); @@ -7665,10 +7735,15 @@ static int __nf_tables_abort(struct net *net, bool autoload) nft_trans_destroy(trans); break; case NFT_MSG_NEWFLOWTABLE: - trans->ctx.table->use--; - list_del_rcu(&nft_trans_flowtable(trans)->list); - nft_unregister_flowtable_net_hooks(net, - &nft_trans_flowtable(trans)->hook_list); + if (nft_trans_flowtable_update(trans)) { + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable_hooks(trans)); + } else { + trans->ctx.table->use--; + list_del_rcu(&nft_trans_flowtable(trans)->list); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable(trans)->hook_list); + } break; case NFT_MSG_DELFLOWTABLE: trans->ctx.table->use++; -- cgit v1.2.3-59-g8ed1b From abadb2f865d72a223d691fc68e006943ecadf0d9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:46:51 +0200 Subject: netfilter: nf_tables: delete devices from flowtable This patch allows users to delete devices from existing flowtables. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 113 ++++++++++++++++++++++++++++++++------ 2 files changed, 98 insertions(+), 16 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4f58c4411bb4..6f0f6fca9ac3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1002,6 +1002,7 @@ struct nft_stats { struct nft_hook { struct list_head list; + bool inactive; struct nf_hook_ops ops; struct rcu_head rcu; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 98f2cbb97e39..1c2c3bb78fa0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1669,6 +1669,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_dev; } hook->ops.dev = dev; + hook->inactive = false; return hook; @@ -1678,17 +1679,17 @@ err_hook_alloc: return ERR_PTR(err); } -static bool nft_hook_list_find(struct list_head *hook_list, - const struct nft_hook *this) +static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, + const struct nft_hook *this) { struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { if (this->ops.dev == hook->ops.dev) - return true; + return hook; } - return false; + return NULL; } static int nf_tables_parse_netdev_hooks(struct net *net, @@ -6530,6 +6531,51 @@ err1: return err; } +static int nft_delflowtable_hook(struct nft_ctx *ctx, + struct nft_flowtable *flowtable) +{ + const struct nlattr * const *nla = ctx->nla; + struct nft_flowtable_hook flowtable_hook; + struct nft_hook *this, *next, *hook; + struct nft_trans *trans; + int err; + + err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, &flowtable->data); + if (err < 0) + return err; + + list_for_each_entry_safe(this, next, &flowtable_hook.list, list) { + hook = nft_hook_list_find(&flowtable->hook_list, this); + if (!hook) { + err = -ENOENT; + goto err_flowtable_del_hook; + } + hook->inactive = true; + list_del(&this->list); + kfree(this); + } + + trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, + sizeof(struct nft_trans_flowtable)); + if (!trans) + return -ENOMEM; + + nft_trans_flowtable(trans) = flowtable; + nft_trans_flowtable_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_flowtable_del_hook: + list_for_each_entry(hook, &flowtable_hook.list, list) + hook->inactive = false; + + return err; +} + static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6568,13 +6614,17 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } + + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + if (nla[NFTA_FLOWTABLE_HOOK]) + return nft_delflowtable_hook(&ctx, flowtable); + if (flowtable->use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - return nft_delflowtable(&ctx, flowtable); } @@ -7184,7 +7234,10 @@ static void nft_commit_release(struct nft_trans *trans) nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: - nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) + nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + else + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } @@ -7345,6 +7398,17 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, + struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + if (hook->inactive) + list_move(&hook->list, hook_list); + } +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nft_module_request *req, *next; @@ -7570,13 +7634,24 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: - list_del_rcu(&nft_trans_flowtable(trans)->list); - nf_tables_flowtable_notify(&trans->ctx, - nft_trans_flowtable(trans), - &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_DELFLOWTABLE); - nft_unregister_flowtable_net_hooks(net, - &nft_trans_flowtable(trans)->hook_list); + if (nft_trans_flowtable_update(trans)) { + nft_flowtable_hooks_del(nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans), + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable_hooks(trans)); + } else { + list_del_rcu(&nft_trans_flowtable(trans)->list); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable(trans)->hook_list); + } break; } } @@ -7638,6 +7713,7 @@ static int __nf_tables_abort(struct net *net, bool autoload) { struct nft_trans *trans, *next; struct nft_trans_elem *te; + struct nft_hook *hook; list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { @@ -7746,8 +7822,13 @@ static int __nf_tables_abort(struct net *net, bool autoload) } break; case NFT_MSG_DELFLOWTABLE: - trans->ctx.table->use++; - nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) { + list_for_each_entry(hook, &nft_trans_flowtable(trans)->hook_list, list) + hook->inactive = false; + } else { + trans->ctx.table->use++; + nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + } nft_trans_destroy(trans); break; } -- cgit v1.2.3-59-g8ed1b From 05abe4456fa376040f6cc3cc6830d2e328723478 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 15:44:37 +0200 Subject: netfilter: nf_tables: allow to register flowtable with no devices A flowtable might be composed of dynamic interfaces only. Such dynamic interfaces might show up at a later stage. This patch allows users to register a flowtable with no devices. Once the dynamic interface becomes available, the user adds the dynamic devices to the flowtable. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1c2c3bb78fa0..897ac5fbe079 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1724,8 +1724,6 @@ static int nf_tables_parse_netdev_hooks(struct net *net, goto err_hook; } } - if (!n) - return -EINVAL; return 0; @@ -1762,6 +1760,9 @@ static int nft_chain_parse_netdev(struct net *net, hook_list); if (err < 0) return err; + + if (list_empty(hook_list)) + return -EINVAL; } else { return -EINVAL; } @@ -6209,8 +6210,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, return err; if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || - !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] || - !tb[NFTA_FLOWTABLE_HOOK_DEVS]) + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) return -EINVAL; hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); @@ -6219,11 +6219,13 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); - err = nf_tables_parse_netdev_hooks(ctx->net, - tb[NFTA_FLOWTABLE_HOOK_DEVS], - &flowtable_hook->list); - if (err < 0) - return err; + if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { + err = nf_tables_parse_netdev_hooks(ctx->net, + tb[NFTA_FLOWTABLE_HOOK_DEVS], + &flowtable_hook->list); + if (err < 0) + return err; + } flowtable_hook->priority = priority; flowtable_hook->num = hooknum; -- cgit v1.2.3-59-g8ed1b From 5b6743fb2c2a1fcb31c8b227558f537095dbece4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 23 May 2020 12:05:22 +0200 Subject: netfilter: nf_tables: skip flowtable hooknum and priority on device updates On device updates, the hooknum and priority attributes are not required. This patch makes optional these two netlink attributes. Moreover, bail out with EOPNOTSUPP if userspace tries to update the hooknum and priority for existing flowtables. While at this, turn EINVAL into EOPNOTSUPP in case the hooknum is not ingress. EINVAL is reserved for missing netlink attribute / malformed netlink messages. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 53 ++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 897ac5fbe079..073aa1051d43 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6195,7 +6195,7 @@ static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr *attr, struct nft_flowtable_hook *flowtable_hook, - struct nf_flowtable *ft) + struct nft_flowtable *flowtable, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; struct nft_hook *hook; @@ -6209,15 +6209,35 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, if (err < 0) return err; - if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || - !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) - return -EINVAL; + if (add) { + if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) + return -EINVAL; - hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); - if (hooknum != NF_NETDEV_INGRESS) - return -EINVAL; + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum != NF_NETDEV_INGRESS) + return -EOPNOTSUPP; + + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + + flowtable_hook->priority = priority; + flowtable_hook->num = hooknum; + } else { + if (tb[NFTA_FLOWTABLE_HOOK_NUM]) { + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum != flowtable->hooknum) + return -EOPNOTSUPP; + } + + if (tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) { + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + if (priority != flowtable->data.priority) + return -EOPNOTSUPP; + } - priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + flowtable_hook->priority = flowtable->data.priority; + flowtable_hook->num = flowtable->hooknum; + } if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(ctx->net, @@ -6227,15 +6247,12 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, return err; } - flowtable_hook->priority = priority; - flowtable_hook->num = hooknum; - list_for_each_entry(hook, &flowtable_hook->list, list) { hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = hooknum; - hook->ops.priority = priority; - hook->ops.priv = ft; - hook->ops.hook = ft->type->hook; + hook->ops.hooknum = flowtable_hook->num; + hook->ops.priority = flowtable_hook->priority; + hook->ops.priv = &flowtable->data; + hook->ops.hook = flowtable->data.type->hook; } return err; @@ -6363,7 +6380,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, int err; err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, &flowtable->data); + &flowtable_hook, flowtable, false); if (err < 0) return err; @@ -6492,7 +6509,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err3; err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, &flowtable->data); + &flowtable_hook, flowtable, true); if (err < 0) goto err4; @@ -6543,7 +6560,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, int err; err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, &flowtable->data); + &flowtable_hook, flowtable, false); if (err < 0) return err; -- cgit v1.2.3-59-g8ed1b From 7ff4f3f315db361e35c1d61a6fdbfddbe345b633 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 May 2020 17:01:48 +0200 Subject: net: mscc: use the PHY MII ioctl interface when possible Allow ioctl to be implemented by the PHY, when a PHY is attached to the Ocelot switch. In case the ioctl is a request to set or get the hardware timestamp, use the Ocelot switch implementation for now. Signed-off-by: Antoine Tenart Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index e621c4c3ee86..2151c08a57c7 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1204,18 +1204,16 @@ static int ocelot_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct ocelot *ocelot = priv->port.ocelot; int port = priv->chip_port; - /* The function is only used for PTP operations for now */ - if (!ocelot->ptp) - return -EOPNOTSUPP; - - switch (cmd) { - case SIOCSHWTSTAMP: - return ocelot_hwstamp_set(ocelot, port, ifr); - case SIOCGHWTSTAMP: - return ocelot_hwstamp_get(ocelot, port, ifr); - default: - return -EOPNOTSUPP; + if (ocelot->ptp) { + switch (cmd) { + case SIOCSHWTSTAMP: + return ocelot_hwstamp_set(ocelot, port, ifr); + case SIOCGHWTSTAMP: + return ocelot_hwstamp_get(ocelot, port, ifr); + } } + + return phy_mii_ioctl(dev->phydev, ifr, cmd); } static const struct net_device_ops ocelot_port_netdev_ops = { -- cgit v1.2.3-59-g8ed1b From b2e118f638fb8984e430624a8cf27483cc23cf8d Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 May 2020 17:01:49 +0200 Subject: net: mscc: allow offloading timestamping operations to the PHY This patch adds support for offloading timestamping operations not only to the Ocelot switch (as already supported) but to compatible PHYs. When both the PHY and the Ocelot switch support timestamping operations, the PHY implementation is chosen as the timestamp will happen closer to the medium. Signed-off-by: Antoine Tenart Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 5 ++++- drivers/net/ethernet/mscc/ocelot_board.c | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 2151c08a57c7..9cfe1fd98c30 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1204,7 +1204,10 @@ static int ocelot_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct ocelot *ocelot = priv->port.ocelot; int port = priv->chip_port; - if (ocelot->ptp) { + /* If the attached PHY device isn't capable of timestamping operations, + * use our own (when possible). + */ + if (!phy_has_hwtstamp(dev->phydev) && ocelot->ptp) { switch (cmd) { case SIOCSHWTSTAMP: return ocelot_hwstamp_set(ocelot, port, ifr); diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c index 67a8d61c926a..4a15d2ff8b70 100644 --- a/drivers/net/ethernet/mscc/ocelot_board.c +++ b/drivers/net/ethernet/mscc/ocelot_board.c @@ -189,7 +189,8 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) skb->offload_fwd_mark = 1; skb->protocol = eth_type_trans(skb, dev); - netif_rx(skb); + if (!skb_defer_rx_timestamp(skb)) + netif_rx(skb); dev->stats.rx_bytes += len; dev->stats.rx_packets++; } while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)); -- cgit v1.2.3-59-g8ed1b From 4cd5beaa89a95b331e5586d55469f5569faa18f6 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Wed, 27 May 2020 08:59:14 +0800 Subject: net: hns3: add a resetting check in hclgevf_init_nic_client_instance() To prevent from initializing VF NIC client in reset handling state, this patch adds resetting check in hclgevf_init_nic_client_instance(). Signed-off-by: Guangbin Huang Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 32341dcaa6c1..59fcb80671c8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2713,6 +2713,7 @@ static int hclgevf_init_nic_client_instance(struct hnae3_ae_dev *ae_dev, struct hnae3_client *client) { struct hclgevf_dev *hdev = ae_dev->priv; + int rst_cnt = hdev->rst_stats.rst_cnt; int ret; ret = client->ops->init_instance(&hdev->nic); @@ -2720,6 +2721,14 @@ static int hclgevf_init_nic_client_instance(struct hnae3_ae_dev *ae_dev, return ret; set_bit(HCLGEVF_STATE_NIC_REGISTERED, &hdev->state); + if (test_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state) || + rst_cnt != hdev->rst_stats.rst_cnt) { + clear_bit(HCLGEVF_STATE_NIC_REGISTERED, &hdev->state); + + client->ops->uninit_instance(&hdev->nic, 0); + return -EBUSY; + } + hnae3_set_client_init_flag(client, ae_dev, 1); if (netif_msg_drv(&hdev->nic)) -- cgit v1.2.3-59-g8ed1b From 60c800c64d8fdd2fc5b8c62686fb08c6a6fb1045 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Wed, 27 May 2020 08:59:15 +0800 Subject: net: hns3: change the order of reinitializing RoCE and NIC client during reset The HNS RDMA driver will support VF device later, whose re-initialization should be done after PF's. This patch changes the order of hclge_reset_prepare_up() and hclge_notify_roce_client(), so that PF's RoCE client will be reinitialized before VF's. Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index b796d3fb5b0b..6e1e2cf385c8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3770,11 +3770,6 @@ static int hclge_reset_rebuild(struct hclge_dev *hdev) hclge_clear_reset_cause(hdev); - ret = hclge_reset_prepare_up(hdev); - if (ret) - return ret; - - ret = hclge_notify_roce_client(hdev, HNAE3_INIT_CLIENT); /* ignore RoCE notify error if it fails HCLGE_RESET_MAX_FAIL_CNT - 1 * times @@ -3783,6 +3778,10 @@ static int hclge_reset_rebuild(struct hclge_dev *hdev) hdev->rst_stats.reset_fail_cnt < HCLGE_RESET_MAX_FAIL_CNT - 1) return ret; + ret = hclge_reset_prepare_up(hdev); + if (ret) + return ret; + rtnl_lock(); ret = hclge_notify_client(hdev, HNAE3_UP_CLIENT); rtnl_unlock(); -- cgit v1.2.3-59-g8ed1b From 01952206e17ee34b5fe32f211619ac59ec043990 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Wed, 27 May 2020 08:59:16 +0800 Subject: net: hns3: remove unnecessary MAC enable in app loopback Packets will not pass through MAC during app loopback. Therefore, it is meaningless to enable MAC while doing app loopback. This patch removes this unnecessary action. Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 6e1e2cf385c8..7c9f2ba1f272 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6583,8 +6583,6 @@ static int hclge_set_app_loopback(struct hclge_dev *hdev, bool en) /* 2 Then setup the loopback flag */ loop_en = le32_to_cpu(req->txrx_pad_fcs_loop_en); hnae3_set_bit(loop_en, HCLGE_MAC_APP_LP_B, en ? 1 : 0); - hnae3_set_bit(loop_en, HCLGE_MAC_TX_EN_B, en ? 1 : 0); - hnae3_set_bit(loop_en, HCLGE_MAC_RX_EN_B, en ? 1 : 0); req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en); -- cgit v1.2.3-59-g8ed1b From 6f45a9bdd2aee6cc1e4223fdc9e4f548a3f54595 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Wed, 27 May 2020 08:59:17 +0800 Subject: net: hns3: add a print for initializing CMDQ when reset pending When initializing CMDQ fails because of reset pending, there is no hint for debugging, so adds a log for it. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index 7f509eff562e..64a1d0bdd7d1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -426,6 +426,9 @@ int hclge_cmd_init(struct hclge_dev *hdev) * reset may happen when lower level reset is being processed. */ if ((hclge_is_reset_pending(hdev))) { + dev_err(&hdev->pdev->dev, + "failed to init cmd since reset %#lx pending\n", + hdev->reset_pending); ret = -EBUSY; goto err_cmd_init; } -- cgit v1.2.3-59-g8ed1b From f745664257b62f6ba29f45fd21fbe7193b4da57b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2020 19:48:49 -0700 Subject: tcp: add tcp_ld_RTO_revert() helper RFC 6069 logic has been implemented for IPv4 only so far, right in the middle of tcp_v4_err() and was error prone. Move this code to one helper, to make tcp_v4_err() more readable and to eventually expand RFC 6069 to IPv6 in the future. Also perform sock_owned_by_user() check a bit sooner. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 85 ++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6789671f0f5a..f32dcadc91b7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -403,6 +403,45 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) } EXPORT_SYMBOL(tcp_req_err); +/* TCP-LD (RFC 6069) logic */ +static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + s32 remaining; + u32 delta_us; + + if (sock_owned_by_user(sk)) + return; + + if (seq != tp->snd_una || !icsk->icsk_retransmits || + !icsk->icsk_backoff) + return; + + skb = tcp_rtx_queue_head(sk); + if (WARN_ON_ONCE(!skb)) + return; + + icsk->icsk_backoff--; + icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; + icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + + tcp_mstamp_refresh(tp); + delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); + remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); + + if (remaining > 0) { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + remaining, TCP_RTO_MAX); + } else { + /* RTO revert clocked out retransmission. + * Will retransmit now. + */ + tcp_retransmit_timer(sk); + } +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -423,17 +462,13 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); - struct inet_connection_sock *icsk; struct tcp_sock *tp; struct inet_sock *inet; const int type = icmp_hdr(icmp_skb)->type; const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; - struct sk_buff *skb; struct request_sock *fastopen; u32 seq, snd_una; - s32 remaining; - u32 delta_us; int err; struct net *net = dev_net(icmp_skb->dev); @@ -476,7 +511,6 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; } - icsk = inet_csk(sk); tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); @@ -521,41 +555,12 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } err = icmp_err_convert[code].errno; - /* check if icmp_skb allows revert of backoff - * (see draft-zimmermann-tcp-lcd) */ - if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) - break; - if (seq != tp->snd_una || !icsk->icsk_retransmits || - !icsk->icsk_backoff || fastopen) - break; - - if (sock_owned_by_user(sk)) - break; - - skb = tcp_rtx_queue_head(sk); - if (WARN_ON_ONCE(!skb)) - break; - - icsk->icsk_backoff--; - icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : - TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - - - tcp_mstamp_refresh(tp); - delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); - remaining = icsk->icsk_rto - - usecs_to_jiffies(delta_us); - - if (remaining > 0) { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - remaining, TCP_RTO_MAX); - } else { - /* RTO revert clocked out retransmission. - * Will retransmit now */ - tcp_retransmit_timer(sk); - } - + /* check if this ICMP message allows revert of backoff. + * (see RFC 6069) + */ + if (!fastopen && + (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) + tcp_ld_RTO_revert(sk, seq); break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; -- cgit v1.2.3-59-g8ed1b From a12daf13a4492bb7fb26231e52afb381927f938e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2020 19:48:50 -0700 Subject: tcp: rename tcp_v4_err() skb parameter This essentially reverts 4d1a2d9ec1c1 ("Revert Backoff [v3]: Rename skb to icmp_skb in tcp_v4_err()") Now we have tcp_ld_RTO_revert() helper, we can use the usual name for sk_buff parameter, so that tcp_v4_err() and tcp_v6_err() use similar names. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f32dcadc91b7..4eef5b84fff1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -458,23 +458,23 @@ static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) * */ -int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; - struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; struct inet_sock *inet; - const int type = icmp_hdr(icmp_skb)->type; - const int code = icmp_hdr(icmp_skb)->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; struct request_sock *fastopen; u32 seq, snd_una; int err; - struct net *net = dev_net(icmp_skb->dev); + struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, ntohs(th->source), - inet_iif(icmp_skb), 0); + inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; @@ -524,7 +524,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) switch (type) { case ICMP_REDIRECT: if (!sock_owned_by_user(sk)) - do_redirect(icmp_skb, sk); + do_redirect(skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ @@ -578,7 +578,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; - ip_icmp_error(sk, icmp_skb, err, th->dest, info, (u8 *)th); + ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); if (!sock_owned_by_user(sk)) { sk->sk_err = err; -- cgit v1.2.3-59-g8ed1b From 8fa54b1160721e5d94a039cd736aa0c63044d58a Mon Sep 17 00:00:00 2001 From: Wang Wenhu Date: Tue, 26 May 2020 20:19:24 -0700 Subject: drivers: ipa: fix typoes for ipa Change "transactio" -> "transaction". Also an alignment correction. Signed-off-by: Wang Wenhu Signed-off-by: David S. Miller --- drivers/net/ipa/gsi.c | 2 +- drivers/net/ipa/ipa_endpoint.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index 012304ddaed2..55226b264e3c 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -1358,7 +1358,7 @@ static void gsi_channel_update(struct gsi_channel *channel) * gsi_channel_poll_one() - Return a single completed transaction on a channel * @channel: Channel to be polled * - * @Return: Transaction pointer, or null if none are available + * @Return: Transaction pointer, or null if none are available * * This function returns the first entry on a channel's completed transaction * list. If that list is empty, the hardware is consulted to determine diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 82066a223a67..66649a806dd1 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -364,7 +364,7 @@ int ipa_endpoint_modem_exception_reset_all(struct ipa *ipa) /* We need one command per modem TX endpoint. We can get an upper * bound on that by assuming all initialized endpoints are modem->IPA. * That won't happen, and we could be more precise, but this is fine - * for now. We need to end the transactio with a "tag process." + * for now. We need to end the transaction with a "tag process." */ count = hweight32(initialized) + ipa_cmd_tag_process_count(); trans = ipa_cmd_trans_alloc(ipa, count); -- cgit v1.2.3-59-g8ed1b From b3037ac50130a169c3980e63f8df2f0b599db411 Mon Sep 17 00:00:00 2001 From: Wang Wenhu Date: Tue, 26 May 2020 20:32:22 -0700 Subject: drivers: ipa: remove discription of nonexistent element No element named "client" exists within "struct ipa_endpoint". It might be a heritage forgotten to be removed. Delete it now. Signed-off-by: Wang Wenhu Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_endpoint.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ipa/ipa_endpoint.h b/drivers/net/ipa/ipa_endpoint.h index 3b297d65828e..58a245de488e 100644 --- a/drivers/net/ipa/ipa_endpoint.h +++ b/drivers/net/ipa/ipa_endpoint.h @@ -41,7 +41,6 @@ enum ipa_endpoint_name { /** * struct ipa_endpoint - IPA endpoint information - * @client: Client associated with the endpoint * @channel_id: EP's GSI channel * @evt_ring_id: EP's GSI channel event ring */ -- cgit v1.2.3-59-g8ed1b From 4909daba37846317ec7dcba16fba009636f7fe21 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:23 -0700 Subject: net_sched: use qdisc_reset() in qdisc_destroy() qdisc_destroy() calls ops->reset() and cleans up qdisc->gso_skb and qdisc->skb_bad_txq, these are nearly same with qdisc_reset(), so just call it directly, and cosolidate the code for the next patch. Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ebc55d884247..7a0b06001e48 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -949,7 +949,6 @@ static void qdisc_free_cb(struct rcu_head *head) static void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; - struct sk_buff *skb, *tmp; #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); @@ -957,24 +956,15 @@ static void qdisc_destroy(struct Qdisc *qdisc) qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->rate_est); - if (ops->reset) - ops->reset(qdisc); + + qdisc_reset(qdisc); + if (ops->destroy) ops->destroy(qdisc); module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { - __skb_unlink(skb, &qdisc->gso_skb); - kfree_skb_list(skb); - } - - skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { - __skb_unlink(skb, &qdisc->skb_bad_txq); - kfree_skb_list(skb); - } - call_rcu(&qdisc->rcu, qdisc_free_cb); } -- cgit v1.2.3-59-g8ed1b From a34dac0b9055202cf9c64e08d8d8dc5e23029d3a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:24 -0700 Subject: net_sched: add tracepoints for qdisc_reset() and qdisc_destroy() Add two tracepoints for qdisc_reset() and qdisc_destroy() to track qdisc resetting and destroying. Sample output: tc-756 [000] ...3 138.355662: qdisc_reset: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-756 [000] ...1 138.355720: qdisc_reset: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-756 [000] ...1 138.355867: qdisc_reset: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-756 [000] ...1 138.355930: qdisc_destroy: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-757 [000] ...2 143.073780: qdisc_reset: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 tc-757 [000] ...1 143.073878: qdisc_reset: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 tc-757 [000] ...1 143.074114: qdisc_reset: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 tc-757 [000] ...1 143.074228: qdisc_destroy: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/trace/events/qdisc.h | 52 ++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_generic.c | 4 ++++ 2 files changed, 56 insertions(+) diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index 0d1a9ebf55ba..2b948801afa3 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include TRACE_EVENT(qdisc_dequeue, @@ -44,6 +46,56 @@ TRACE_EVENT(qdisc_dequeue, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); +TRACE_EVENT(qdisc_reset, + + TP_PROTO(struct Qdisc *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __string( dev, qdisc_dev(q) ) + __string( kind, q->ops->id ) + __field( u32, parent ) + __field( u32, handle ) + ), + + TP_fast_assign( + __assign_str(dev, qdisc_dev(q)); + __assign_str(kind, q->ops->id); + __entry->parent = q->parent; + __entry->handle = q->handle; + ), + + TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), + __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), + TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) +); + +TRACE_EVENT(qdisc_destroy, + + TP_PROTO(struct Qdisc *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __string( dev, qdisc_dev(q) ) + __string( kind, q->ops->id ) + __field( u32, parent ) + __field( u32, handle ) + ), + + TP_fast_assign( + __assign_str(dev, qdisc_dev(q)); + __assign_str(kind, q->ops->id); + __entry->parent = q->parent; + __entry->handle = q->handle; + ), + + TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), + __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), + TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) +); + #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7a0b06001e48..abaa446ed01a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -911,6 +911,8 @@ void qdisc_reset(struct Qdisc *qdisc) const struct Qdisc_ops *ops = qdisc->ops; struct sk_buff *skb, *tmp; + trace_qdisc_reset(qdisc); + if (ops->reset) ops->reset(qdisc); @@ -965,6 +967,8 @@ static void qdisc_destroy(struct Qdisc *qdisc) module_put(ops->owner); dev_put(qdisc_dev(qdisc)); + trace_qdisc_destroy(qdisc); + call_rcu(&qdisc->rcu, qdisc_free_cb); } -- cgit v1.2.3-59-g8ed1b From f5a7833e83628f18c1ee94e6ffcb1d232f029be9 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:25 -0700 Subject: net_sched: add a tracepoint for qdisc creation With this tracepoint, we could know when qdisc's are created, especially those default qdisc's. Sample output: tc-736 [001] ...1 56.230107: qdisc_create: dev=ens3 kind=pfifo parent=1:0 tc-736 [001] ...1 56.230113: qdisc_create: dev=ens3 kind=hfsc parent=ffff:ffff tc-738 [001] ...1 56.256816: qdisc_create: dev=ens3 kind=pfifo parent=1:100 tc-739 [001] ...1 56.267584: qdisc_create: dev=ens3 kind=pfifo parent=1:200 tc-740 [001] ...1 56.279649: qdisc_create: dev=ens3 kind=fq_codel parent=1:100 tc-741 [001] ...1 56.289996: qdisc_create: dev=ens3 kind=pfifo_fast parent=1:200 tc-745 [000] .N.1 111.687483: qdisc_create: dev=ens3 kind=ingress parent=ffff:fff1 Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/trace/events/qdisc.h | 23 +++++++++++++++++++++++ net/sched/sch_api.c | 3 +++ net/sched/sch_generic.c | 4 +++- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index 2b948801afa3..330d32d84485 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -96,6 +96,29 @@ TRACE_EVENT(qdisc_destroy, TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); +TRACE_EVENT(qdisc_create, + + TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), + + TP_ARGS(ops, dev, parent), + + TP_STRUCT__entry( + __string( dev, dev->name ) + __string( kind, ops->id ) + __field( u32, parent ) + ), + + TP_fast_assign( + __assign_str(dev, dev->name); + __assign_str(kind, ops->id); + __entry->parent = parent; + ), + + TP_printk("dev=%s kind=%s parent=%x:%x", + __get_str(dev), __get_str(kind), + TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) +); + #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0d99df1e764d..9a3449b56bd6 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -32,6 +32,8 @@ #include #include +#include + /* Short review. @@ -1283,6 +1285,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, } qdisc_hash_add(sch, false); + trace_qdisc_create(ops, dev, parent); return sch; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index abaa446ed01a..a4271e47f220 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -896,8 +896,10 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, } sch->parent = parentid; - if (!ops->init || ops->init(sch, NULL, extack) == 0) + if (!ops->init || ops->init(sch, NULL, extack) == 0) { + trace_qdisc_create(ops, dev_queue->dev, parentid); return sch; + } qdisc_put(sch); return NULL; -- cgit v1.2.3-59-g8ed1b From 70f50965338a12e17454c31ad5ece27069719358 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:26 -0700 Subject: net_sched: avoid resetting active qdisc for multiple times MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Except for sch_mq and sch_mqprio, each dev queue points to the same root qdisc, so when we reset the dev queues with netdev_for_each_tx_queue() we end up resetting the same instance of the root qdisc for multiple times. Avoid this by checking the __QDISC_STATE_DEACTIVATED bit in each iteration, so for sch_mq/sch_mqprio, we still reset all of them like before, for the rest, we only reset it once. Reported-by: Václav Zindulka Tested-by: Václav Zindulka Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a4271e47f220..d13e27467470 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1128,6 +1128,28 @@ void dev_activate(struct net_device *dev) } EXPORT_SYMBOL(dev_activate); +static void qdisc_deactivate(struct Qdisc *qdisc) +{ + bool nolock = qdisc->flags & TCQ_F_NOLOCK; + + if (qdisc->flags & TCQ_F_BUILTIN) + return; + if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state)) + return; + + if (nolock) + spin_lock_bh(&qdisc->seqlock); + spin_lock_bh(qdisc_lock(qdisc)); + + set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); + + qdisc_reset(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + if (nolock) + spin_unlock_bh(&qdisc->seqlock); +} + static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) @@ -1137,21 +1159,8 @@ static void dev_deactivate_queue(struct net_device *dev, qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { - bool nolock = qdisc->flags & TCQ_F_NOLOCK; - - if (nolock) - spin_lock_bh(&qdisc->seqlock); - spin_lock_bh(qdisc_lock(qdisc)); - - if (!(qdisc->flags & TCQ_F_BUILTIN)) - set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); - + qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - qdisc_reset(qdisc); - - spin_unlock_bh(qdisc_lock(qdisc)); - if (nolock) - spin_unlock_bh(&qdisc->seqlock); } } -- cgit v1.2.3-59-g8ed1b From 759ae57f1b7bf5dfea1816f786f568d69efe34e2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:27 -0700 Subject: net_sched: get rid of unnecessary dev_qdisc_reset() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resetting old qdisc on dev_queue->qdisc_sleeping in dev_qdisc_reset() is redundant, because this qdisc, even if not same with dev_queue->qdisc, is reset via qdisc_put() right after calling dev_graft_qdisc() when hitting refcnt 0. This is very easy to observe with qdisc_reset() tracepoint and stack traces. Reported-by: Václav Zindulka Tested-by: Václav Zindulka Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index d13e27467470..b19a0021a0bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1191,16 +1191,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } -static void dev_qdisc_reset(struct net_device *dev, - struct netdev_queue *dev_queue, - void *none) -{ - struct Qdisc *qdisc = dev_queue->qdisc_sleeping; - - if (qdisc) - qdisc_reset(qdisc); -} - /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -1237,12 +1227,6 @@ void dev_deactivate_many(struct list_head *head) */ schedule_timeout_uninterruptible(1); } - /* The new qdisc is assigned at this point so we can safely - * unwind stale skb lists and qdisc statistics - */ - netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL); - if (dev_ingress_queue(dev)) - dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL); } } -- cgit v1.2.3-59-g8ed1b From b3ae2459f89773adcbf16fef4b68deaaa3be1929 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 27 May 2020 12:25:26 +0300 Subject: net/tls: Add force_resync for driver resync This patch adds a field to the tls rx offload context which enables drivers to force a send_resync call. This field can be used by drivers to request a resync at the next possible tls record. It is beneficial for hardware that provides the resync sequence number asynchronously. In such cases, the packet that triggered the resync does not contain the information required for a resync. Instead, the driver requests resync for all the following TLS record until the asynchronous notification with the resync request TCP sequence arrives. A following series for mlx5e ConnectX-6DX TLS RX offload support will use this mechanism. Signed-off-by: Boris Pismenny Signed-off-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/tls.h | 12 +++++++++++- net/tls/tls_device.c | 9 ++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index bf9eb4823933..cf9ec152fbb7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -594,12 +594,22 @@ tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction) #endif /* The TLS context is valid until sk_destruct is called */ +#define RESYNC_REQ (1 << 0) +#define RESYNC_REQ_FORCE (1 << 1) static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); - atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | 1); + atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); +} + +static inline void tls_offload_rx_force_resync_request(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); + + atomic64_set(&rx_ctx->resync_req, RESYNC_REQ | RESYNC_REQ_FORCE); } static inline void diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a562ebaaa33c..0e55f8365ce2 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -694,10 +694,11 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; + bool is_req_pending, is_force_resync; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; - u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; + u32 sock_data; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) @@ -712,9 +713,11 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; - is_req_pending = resync_req; + is_req_pending = resync_req & RESYNC_REQ; + is_force_resync = resync_req & RESYNC_REQ_FORCE; - if (likely(!is_req_pending) || req_seq != seq || + if (likely(!is_req_pending) || + (!is_force_resync && req_seq != seq) || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; -- cgit v1.2.3-59-g8ed1b From 50ce4c099bebf56be86c9448f7f4bcd34f33663c Mon Sep 17 00:00:00 2001 From: Jonas Falkevik Date: Wed, 27 May 2020 11:59:43 +0200 Subject: sctp: fix typo sctp_ulpevent_nofity_peer_addr_change change typo in function name "nofity" to "notify" sctp_ulpevent_nofity_peer_addr_change -> sctp_ulpevent_notify_peer_addr_change Signed-off-by: Jonas Falkevik Signed-off-by: David S. Miller --- include/net/sctp/ulpevent.h | 2 +- net/sctp/associola.c | 8 ++++---- net/sctp/ulpevent.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 0b032b92da0b..994e984eef32 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -80,7 +80,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( struct sctp_chunk *chunk, gfp_t gfp); -void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, +void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport, int state, int error); struct sctp_ulpevent *sctp_ulpevent_make_remote_error( diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 437079a4883d..72315137d7e7 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -432,7 +432,7 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, changeover = 1 ; asoc->peer.primary_path = transport; - sctp_ulpevent_nofity_peer_addr_change(transport, + sctp_ulpevent_notify_peer_addr_change(transport, SCTP_ADDR_MADE_PRIM, 0); /* Set a default msg_name for events. */ @@ -574,7 +574,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, asoc->peer.transport_count--; - sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); + sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); sctp_transport_free(peer); } @@ -714,7 +714,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; - sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); + sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { @@ -840,7 +840,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to the user. */ if (ulp_notify) - sctp_ulpevent_nofity_peer_addr_change(transport, + sctp_ulpevent_notify_peer_addr_change(transport, spc_state, error); /* Select new active and retran paths. */ diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index c82dbdcf13f2..f0640306e77f 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -336,7 +336,7 @@ fail: return NULL; } -void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, +void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport, int state, int error) { struct sctp_association *asoc = transport->asoc; -- cgit v1.2.3-59-g8ed1b From 0774dc7643db525f0bb9d0aa212cbfad3a412fc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 20:22:28 +0200 Subject: dlm: use the tcp version of accept_from_sock for sctp as well The only difference between a few missing fixes applied to the SCTP one is that TCP uses ->getpeername to get the remote address, while SCTP uses kernel_getsockopt(.. SCTP_PRIMARY_ADDR). But given that getpeername is defined to return the primary address for sctp, there doesn't seem to be any reason for the different way of quering the peername, or all the code duplication. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 123 ++---------------------------------------------------- 1 file changed, 3 insertions(+), 120 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index cdfaf4f0e11a..f13dad0fd9ef 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -724,7 +724,7 @@ out_close: } /* Listening socket is busy, accept a connection */ -static int tcp_accept_from_sock(struct connection *con) +static int accept_from_sock(struct connection *con) { int result; struct sockaddr_storage peeraddr; @@ -852,123 +852,6 @@ accept_err: return result; } -static int sctp_accept_from_sock(struct connection *con) -{ - /* Check that the new node is in the lockspace */ - struct sctp_prim prim; - int nodeid; - int prim_len, ret; - int addr_len; - struct connection *newcon; - struct connection *addcon; - struct socket *newsock; - - mutex_lock(&connections_lock); - if (!dlm_allow_conn) { - mutex_unlock(&connections_lock); - return -1; - } - mutex_unlock(&connections_lock); - - mutex_lock_nested(&con->sock_mutex, 0); - - ret = kernel_accept(con->sock, &newsock, O_NONBLOCK); - if (ret < 0) - goto accept_err; - - memset(&prim, 0, sizeof(struct sctp_prim)); - prim_len = sizeof(struct sctp_prim); - - ret = kernel_getsockopt(newsock, IPPROTO_SCTP, SCTP_PRIMARY_ADDR, - (char *)&prim, &prim_len); - if (ret < 0) { - log_print("getsockopt/sctp_primary_addr failed: %d", ret); - goto accept_err; - } - - make_sockaddr(&prim.ssp_addr, 0, &addr_len); - ret = addr_to_nodeid(&prim.ssp_addr, &nodeid); - if (ret) { - unsigned char *b = (unsigned char *)&prim.ssp_addr; - - log_print("reject connect from unknown addr"); - print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, - b, sizeof(struct sockaddr_storage)); - goto accept_err; - } - - newcon = nodeid2con(nodeid, GFP_NOFS); - if (!newcon) { - ret = -ENOMEM; - goto accept_err; - } - - mutex_lock_nested(&newcon->sock_mutex, 1); - - if (newcon->sock) { - struct connection *othercon = newcon->othercon; - - if (!othercon) { - othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); - if (!othercon) { - log_print("failed to allocate incoming socket"); - mutex_unlock(&newcon->sock_mutex); - ret = -ENOMEM; - goto accept_err; - } - othercon->nodeid = nodeid; - othercon->rx_action = receive_from_sock; - mutex_init(&othercon->sock_mutex); - INIT_LIST_HEAD(&othercon->writequeue); - spin_lock_init(&othercon->writequeue_lock); - INIT_WORK(&othercon->swork, process_send_sockets); - INIT_WORK(&othercon->rwork, process_recv_sockets); - set_bit(CF_IS_OTHERCON, &othercon->flags); - } - mutex_lock_nested(&othercon->sock_mutex, 2); - if (!othercon->sock) { - newcon->othercon = othercon; - add_sock(newsock, othercon); - addcon = othercon; - mutex_unlock(&othercon->sock_mutex); - } else { - printk("Extra connection from node %d attempted\n", nodeid); - ret = -EAGAIN; - mutex_unlock(&othercon->sock_mutex); - mutex_unlock(&newcon->sock_mutex); - goto accept_err; - } - } else { - newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon); - addcon = newcon; - } - - log_print("connected to %d", nodeid); - - mutex_unlock(&newcon->sock_mutex); - - /* - * Add it to the active queue in case we got data - * between processing the accept adding the socket - * to the read_sockets list - */ - if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) - queue_work(recv_workqueue, &addcon->rwork); - mutex_unlock(&con->sock_mutex); - - return 0; - -accept_err: - mutex_unlock(&con->sock_mutex); - if (newsock) - sock_release(newsock); - if (ret != -EAGAIN) - log_print("error accepting connection from node: %d", ret); - - return ret; -} - static void free_entry(struct writequeue_entry *e) { __free_page(e->page); @@ -1253,7 +1136,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = con; save_listen_callbacks(sock); - con->rx_action = tcp_accept_from_sock; + con->rx_action = accept_from_sock; con->connect_action = tcp_connect_to_sock; write_unlock_bh(&sock->sk->sk_callback_lock); @@ -1340,7 +1223,7 @@ static int sctp_listen_for_all(void) save_listen_callbacks(sock); con->sock = sock; con->sock->sk->sk_data_ready = lowcomms_data_ready; - con->rx_action = sctp_accept_from_sock; + con->rx_action = accept_from_sock; con->connect_action = sctp_connect_to_sock; write_unlock_bh(&sock->sk->sk_callback_lock); -- cgit v1.2.3-59-g8ed1b From 7a15b2e013f535a125ad7351ffc808c79bc6de35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 20:22:29 +0200 Subject: net: remove kernel_getsockopt No users left. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/net.h | 2 -- net/socket.c | 34 ---------------------------------- 2 files changed, 36 deletions(-) diff --git a/include/linux/net.h b/include/linux/net.h index 6451425e828f..74ef5d7315f7 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, - int *optlen); int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, diff --git a/net/socket.c b/net/socket.c index 80422fc3c836..81a98b6cbd08 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3624,40 +3624,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_getsockopt - get a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Assigns the option length to @optlen. - * Returns 0 or an error. - */ - -int kernel_getsockopt(struct socket *sock, int level, int optname, - char *optval, int *optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int __user *uoptlen; - int err; - - uoptval = (char __user __force *) optval; - uoptlen = (int __user __force *) optlen; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); - else - err = sock->ops->getsockopt(sock, level, optname, uoptval, - uoptlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_getsockopt); - /** * kernel_setsockopt - set a socket option (kernel space) * @sock: socket -- cgit v1.2.3-59-g8ed1b From 22bef5e78f1193b664f59834361704cb22f9d5d7 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Fri, 15 May 2020 17:36:38 -0700 Subject: ice: fix signed vs unsigned comparisons Fix the remaining signed vs unsigned issues, which appear when compiling with -Werror=sign-compare. Many of these are because there is an external interface that is passing an int to us (which we can't change) but that we (rightfully) store and compare against as an unsigned in our data structures. Signed-off-by: Jesse Brandeburg Signed-off-by: Bruce Allan Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_base.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_main.c | 8 ++++---- drivers/net/ethernet/intel/ice/ice_txrx.h | 7 ++++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 94d833b4e745..9452c0eb70b0 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -13,7 +13,7 @@ */ static int __ice_vsi_get_qs_contig(struct ice_qs_cfg *qs_cfg) { - int offset, i; + unsigned int offset, i; mutex_lock(qs_cfg->qs_mutex); offset = bitmap_find_next_zero_area(qs_cfg->pf_map, qs_cfg->pf_map_size, @@ -39,7 +39,7 @@ static int __ice_vsi_get_qs_contig(struct ice_qs_cfg *qs_cfg) */ static int __ice_vsi_get_qs_sc(struct ice_qs_cfg *qs_cfg) { - int i, index = 0; + unsigned int i, index = 0; mutex_lock(qs_cfg->qs_mutex); for (i = 0; i < qs_cfg->q_count; i++) { diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 1c255b27244c..c2da3e1a2e17 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5035,7 +5035,7 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) struct ice_pf *pf = vsi->back; u8 count = 0; - if (new_mtu == netdev->mtu) { + if (new_mtu == (int)netdev->mtu) { netdev_warn(netdev, "MTU is already %u\n", netdev->mtu); return 0; } @@ -5050,11 +5050,11 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) } } - if (new_mtu < netdev->min_mtu) { + if (new_mtu < (int)netdev->min_mtu) { netdev_err(netdev, "new MTU invalid. min_mtu is %d\n", netdev->min_mtu); return -EINVAL; - } else if (new_mtu > netdev->max_mtu) { + } else if (new_mtu > (int)netdev->max_mtu) { netdev_err(netdev, "new MTU invalid. max_mtu is %d\n", netdev->min_mtu); return -EINVAL; @@ -5075,7 +5075,7 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) return -EBUSY; } - netdev->mtu = new_mtu; + netdev->mtu = (unsigned int)new_mtu; /* if VSI is up, bring it down and then back up */ if (!test_and_set_bit(__ICE_DOWN, vsi->state)) { diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index cf21b4fe928a..e70c4619edc3 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -38,7 +38,8 @@ */ #if (PAGE_SIZE < 8192) #define ICE_2K_TOO_SMALL_WITH_PADDING \ -((NET_SKB_PAD + ICE_RXBUF_1536) > SKB_WITH_OVERHEAD(ICE_RXBUF_2048)) + ((unsigned int)(NET_SKB_PAD + ICE_RXBUF_1536) > \ + SKB_WITH_OVERHEAD(ICE_RXBUF_2048)) /** * ice_compute_pad - compute the padding @@ -107,8 +108,8 @@ static inline int ice_skb_pad(void) #define DESC_NEEDED (MAX_SKB_FRAGS + ICE_DESCS_FOR_CTX_DESC + \ ICE_DESCS_PER_CACHE_LINE + ICE_DESCS_FOR_SKB_DATA_PTR) #define ICE_DESC_UNUSED(R) \ - ((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ - (R)->next_to_clean - (R)->next_to_use - 1) + (u16)((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \ + (R)->next_to_clean - (R)->next_to_use - 1) #define ICE_TX_FLAGS_TSO BIT(0) #define ICE_TX_FLAGS_HW_VLAN BIT(1) -- cgit v1.2.3-59-g8ed1b From f0cbbb9c6e06532fc839770de20e1e94e0d999dd Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Fri, 15 May 2020 17:36:39 -0700 Subject: ice: remove unused macro The driver had an unused define that can be removed. Found by compiler -Werror=unused-macros check. Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index 93cf70d06fe5..87f91b750d59 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -7,8 +7,6 @@ #include "ice_dcb_nl.h" #include -#define ICE_APP_PROT_ID_ROCE 0x8915 - /** * ice_dcbnl_devreset - perform enough of a ifdown/ifup to sync DCBNL info * @netdev: device associated with interface that needs reset -- cgit v1.2.3-59-g8ed1b From bf8987df8aa57da884276b07a64307eac577eaaf Mon Sep 17 00:00:00 2001 From: Paul Greenwalt Date: Fri, 15 May 2020 17:36:40 -0700 Subject: ice: set VF default LAN address Remove is_zero_ether_add() check when setting the VF default LAN address. This check assumed that the address had been delete and zeroed before calling ice_vc_add_mac_addr(). Now the default LAN address will be set to the last unicast MAC address added by the VF. The default LAN address is reported by the PF via ndo_get_vf_config. Signed-off-by: Paul Greenwalt Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 9b09a111321c..efd54299a220 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -2862,9 +2862,11 @@ ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, u8 *mac_addr) return -EIO; } - /* only set dflt_lan_addr once */ - if (is_zero_ether_addr(vf->dflt_lan_addr.addr) && - is_unicast_ether_addr(mac_addr)) + /* Set the default LAN address to the latest unicast MAC address added + * by the VF. The default LAN address is reported by the PF via + * ndo_get_vf_config. + */ + if (is_unicast_ether_addr(mac_addr)) ether_addr_copy(vf->dflt_lan_addr.addr, mac_addr); vf->num_mac++; -- cgit v1.2.3-59-g8ed1b From 5df42c8267418bfb8da54cc4772b397ea4c88aea Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Fri, 15 May 2020 17:36:41 -0700 Subject: ice: fix MAC write command The manage MAC write command was implemented in an overly complex way that actually didn't work, as it wasn't symmetric to the manage MAC read command, and was feeding bytes out of order to the firmware. Fix the implementation by just using a simple array to represent the MAC address when it is being written via firmware command. Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 10 ++++------ drivers/net/ethernet/intel/ice/ice_common.c | 5 +---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 586d69491268..f04c338fb6e0 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -156,13 +156,11 @@ struct ice_aqc_manage_mac_write { #define ICE_AQC_MAN_MAC_WR_MC_MAG_EN BIT(0) #define ICE_AQC_MAN_MAC_WR_WOL_LAA_PFR_KEEP BIT(1) #define ICE_AQC_MAN_MAC_WR_S 6 -#define ICE_AQC_MAN_MAC_WR_M (3 << ICE_AQC_MAN_MAC_WR_S) +#define ICE_AQC_MAN_MAC_WR_M ICE_M(3, ICE_AQC_MAN_MAC_WR_S) #define ICE_AQC_MAN_MAC_UPDATE_LAA 0 -#define ICE_AQC_MAN_MAC_UPDATE_LAA_WOL (BIT(0) << ICE_AQC_MAN_MAC_WR_S) - /* High 16 bits of MAC address in big endian order */ - __be16 sah; - /* Low 32 bits of MAC address in big endian order */ - __be32 sal; +#define ICE_AQC_MAN_MAC_UPDATE_LAA_WOL BIT(ICE_AQC_MAN_MAC_WR_S) + /* byte stream in network order */ + u8 mac_addr[ETH_ALEN]; __le32 addr_high; __le32 addr_low; }; diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 0a0b00fffaf7..5da369ae33e0 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1994,10 +1994,7 @@ ice_aq_manage_mac_write(struct ice_hw *hw, const u8 *mac_addr, u8 flags, ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_manage_mac_write); cmd->flags = flags; - - /* Prep values for flags, sah, sal */ - cmd->sah = htons(*((const u16 *)mac_addr)); - cmd->sal = htonl(*((const u32 *)(mac_addr + 2))); + ether_addr_copy(cmd->mac_addr, mac_addr); return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } -- cgit v1.2.3-59-g8ed1b From 1aaef2bc4e0a5ce9e4dd86359e6a0bf52c6aa64f Mon Sep 17 00:00:00 2001 From: Surabhi Boob Date: Fri, 15 May 2020 17:36:42 -0700 Subject: ice: Fix memory leak Handle memory leak on filter management initialization failure. Signed-off-by: Surabhi Boob Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_common.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 5da369ae33e0..ee62cfa3a69e 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -387,6 +387,7 @@ ice_aq_set_mac_cfg(struct ice_hw *hw, u16 max_frame_size, struct ice_sq_cd *cd) static enum ice_status ice_init_fltr_mgmt_struct(struct ice_hw *hw) { struct ice_switch_info *sw; + enum ice_status status; hw->switch_info = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*hw->switch_info), GFP_KERNEL); @@ -397,7 +398,12 @@ static enum ice_status ice_init_fltr_mgmt_struct(struct ice_hw *hw) INIT_LIST_HEAD(&sw->vsi_list_map_head); - return ice_init_def_sw_recp(hw); + status = ice_init_def_sw_recp(hw); + if (status) { + devm_kfree(ice_hw_to_dev(hw), hw->switch_info); + return status; + } + return 0; } /** -- cgit v1.2.3-59-g8ed1b From 68d270783742783f96e89ef92ac24ab3c7fb1d31 Mon Sep 17 00:00:00 2001 From: Surabhi Boob Date: Fri, 15 May 2020 17:36:43 -0700 Subject: ice: Fix for memory leaks and modify ICE_FREE_CQ_BUFS Handle memory leaks during control queue initialization and buffer allocation failures. The macro ICE_FREE_CQ_BUFS is modified to re-use for this fix. Signed-off-by: Surabhi Boob Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_controlq.c | 49 +++++++++++++++------------ 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c index 9a865962296d..62c2c1e621d2 100644 --- a/drivers/net/ethernet/intel/ice/ice_controlq.c +++ b/drivers/net/ethernet/intel/ice/ice_controlq.c @@ -199,7 +199,9 @@ unwind_alloc_rq_bufs: cq->rq.r.rq_bi[i].pa = 0; cq->rq.r.rq_bi[i].size = 0; } + cq->rq.r.rq_bi = NULL; devm_kfree(ice_hw_to_dev(hw), cq->rq.dma_head); + cq->rq.dma_head = NULL; return ICE_ERR_NO_MEMORY; } @@ -245,7 +247,9 @@ unwind_alloc_sq_bufs: cq->sq.r.sq_bi[i].pa = 0; cq->sq.r.sq_bi[i].size = 0; } + cq->sq.r.sq_bi = NULL; devm_kfree(ice_hw_to_dev(hw), cq->sq.dma_head); + cq->sq.dma_head = NULL; return ICE_ERR_NO_MEMORY; } @@ -304,6 +308,28 @@ ice_cfg_rq_regs(struct ice_hw *hw, struct ice_ctl_q_info *cq) return 0; } +#define ICE_FREE_CQ_BUFS(hw, qi, ring) \ +do { \ + int i; \ + /* free descriptors */ \ + if ((qi)->ring.r.ring##_bi) \ + for (i = 0; i < (qi)->num_##ring##_entries; i++) \ + if ((qi)->ring.r.ring##_bi[i].pa) { \ + dmam_free_coherent(ice_hw_to_dev(hw), \ + (qi)->ring.r.ring##_bi[i].size, \ + (qi)->ring.r.ring##_bi[i].va, \ + (qi)->ring.r.ring##_bi[i].pa); \ + (qi)->ring.r.ring##_bi[i].va = NULL;\ + (qi)->ring.r.ring##_bi[i].pa = 0;\ + (qi)->ring.r.ring##_bi[i].size = 0;\ + } \ + /* free the buffer info list */ \ + if ((qi)->ring.cmd_buf) \ + devm_kfree(ice_hw_to_dev(hw), (qi)->ring.cmd_buf); \ + /* free DMA head */ \ + devm_kfree(ice_hw_to_dev(hw), (qi)->ring.dma_head); \ +} while (0) + /** * ice_init_sq - main initialization routine for Control ATQ * @hw: pointer to the hardware structure @@ -357,6 +383,7 @@ static enum ice_status ice_init_sq(struct ice_hw *hw, struct ice_ctl_q_info *cq) goto init_ctrlq_exit; init_ctrlq_free_rings: + ICE_FREE_CQ_BUFS(hw, cq, sq); ice_free_cq_ring(hw, &cq->sq); init_ctrlq_exit: @@ -416,33 +443,13 @@ static enum ice_status ice_init_rq(struct ice_hw *hw, struct ice_ctl_q_info *cq) goto init_ctrlq_exit; init_ctrlq_free_rings: + ICE_FREE_CQ_BUFS(hw, cq, rq); ice_free_cq_ring(hw, &cq->rq); init_ctrlq_exit: return ret_code; } -#define ICE_FREE_CQ_BUFS(hw, qi, ring) \ -do { \ - int i; \ - /* free descriptors */ \ - for (i = 0; i < (qi)->num_##ring##_entries; i++) \ - if ((qi)->ring.r.ring##_bi[i].pa) { \ - dmam_free_coherent(ice_hw_to_dev(hw), \ - (qi)->ring.r.ring##_bi[i].size,\ - (qi)->ring.r.ring##_bi[i].va,\ - (qi)->ring.r.ring##_bi[i].pa);\ - (qi)->ring.r.ring##_bi[i].va = NULL; \ - (qi)->ring.r.ring##_bi[i].pa = 0; \ - (qi)->ring.r.ring##_bi[i].size = 0; \ - } \ - /* free the buffer info list */ \ - if ((qi)->ring.cmd_buf) \ - devm_kfree(ice_hw_to_dev(hw), (qi)->ring.cmd_buf); \ - /* free DMA head */ \ - devm_kfree(ice_hw_to_dev(hw), (qi)->ring.dma_head); \ -} while (0) - /** * ice_shutdown_sq - shutdown the Control ATQ * @hw: pointer to the hardware structure -- cgit v1.2.3-59-g8ed1b From 4f1fe43c920b92ac41c34f151fe452d46936b79d Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:36:44 -0700 Subject: ice: Add more Rx errors to netdev's rx_error counter Currently we are only including illegal_bytes and rx_crc_errors in the PF netdev's rx_error counter. There are many more causes of Rx errors that the device supports and reports via Ethtool. Accumulate all Rx errors in the PF netdev's rx_error counter. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index c2da3e1a2e17..93a42ff7496b 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4295,7 +4295,13 @@ void ice_update_vsi_stats(struct ice_vsi *vsi) if (vsi->type == ICE_VSI_PF) { cur_ns->rx_crc_errors = pf->stats.crc_errors; cur_ns->rx_errors = pf->stats.crc_errors + - pf->stats.illegal_bytes; + pf->stats.illegal_bytes + + pf->stats.rx_len_errors + + pf->stats.rx_undersize + + pf->hw_csum_rx_error + + pf->stats.rx_jabber + + pf->stats.rx_fragments + + pf->stats.rx_oversize; cur_ns->rx_length_errors = pf->stats.rx_len_errors; /* record drops from the port level */ cur_ns->rx_missed_errors = pf->stats.eth.rx_discards; -- cgit v1.2.3-59-g8ed1b From 1960827570c7ed83fb0725debf856b06f46e1a77 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:42:13 -0700 Subject: ice: Don't allow VLAN stripping change when pvid set Currently, if the PVID is set in the VLAN handling section of the VSI context the driver still allows VLAN stripping to be enabled/disabled. VLAN stripping should only be modifiable when the PVID is not set. Fix this by preventing VLAN stripping modification when PVID is set. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_lib.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index f81bd4c30bbc..89962c14e31f 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1812,6 +1812,12 @@ int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena) enum ice_status status; int ret = 0; + /* do not allow modifying VLAN stripping when a port VLAN is configured + * on this VSI + */ + if (vsi->info.pvid) + return 0; + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (!ctxt) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From b5c7f857e5c98c21271678992fb2634df38292f5 Mon Sep 17 00:00:00 2001 From: Evan Swanson Date: Fri, 15 May 2020 17:42:14 -0700 Subject: ice: Handle critical FW error during admin queue initialization A race condition between FW and SW can occur between admin queue setup and the first command sent. A link event may occur and FW attempts to notify a non-existent queue. FW will set the critical error bit and disable the queue. When this happens retry queue setup. Signed-off-by: Evan Swanson Signed-off-by: Anirudh Venkataramanan Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_controlq.c | 126 ++++++++++++++---------- drivers/net/ethernet/intel/ice/ice_controlq.h | 3 + drivers/net/ethernet/intel/ice/ice_hw_autogen.h | 2 + drivers/net/ethernet/intel/ice/ice_main.c | 2 + drivers/net/ethernet/intel/ice/ice_status.h | 1 + 5 files changed, 80 insertions(+), 54 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c index 62c2c1e621d2..479a74efc536 100644 --- a/drivers/net/ethernet/intel/ice/ice_controlq.c +++ b/drivers/net/ethernet/intel/ice/ice_controlq.c @@ -12,6 +12,7 @@ do { \ (qinfo)->sq.bal = prefix##_ATQBAL; \ (qinfo)->sq.len_mask = prefix##_ATQLEN_ATQLEN_M; \ (qinfo)->sq.len_ena_mask = prefix##_ATQLEN_ATQENABLE_M; \ + (qinfo)->sq.len_crit_mask = prefix##_ATQLEN_ATQCRIT_M; \ (qinfo)->sq.head_mask = prefix##_ATQH_ATQH_M; \ (qinfo)->rq.head = prefix##_ARQH; \ (qinfo)->rq.tail = prefix##_ARQT; \ @@ -20,6 +21,7 @@ do { \ (qinfo)->rq.bal = prefix##_ARQBAL; \ (qinfo)->rq.len_mask = prefix##_ARQLEN_ARQLEN_M; \ (qinfo)->rq.len_ena_mask = prefix##_ARQLEN_ARQENABLE_M; \ + (qinfo)->rq.len_crit_mask = prefix##_ARQLEN_ARQCRIT_M; \ (qinfo)->rq.head_mask = prefix##_ARQH_ARQH_M; \ } while (0) @@ -641,6 +643,50 @@ init_ctrlq_free_sq: return ret_code; } +/** + * ice_shutdown_ctrlq - shutdown routine for any control queue + * @hw: pointer to the hardware structure + * @q_type: specific Control queue type + * + * NOTE: this function does not destroy the control queue locks. + */ +static void ice_shutdown_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) +{ + struct ice_ctl_q_info *cq; + + switch (q_type) { + case ICE_CTL_Q_ADMIN: + cq = &hw->adminq; + if (ice_check_sq_alive(hw, cq)) + ice_aq_q_shutdown(hw, true); + break; + case ICE_CTL_Q_MAILBOX: + cq = &hw->mailboxq; + break; + default: + return; + } + + ice_shutdown_sq(hw, cq); + ice_shutdown_rq(hw, cq); +} + +/** + * ice_shutdown_all_ctrlq - shutdown routine for all control queues + * @hw: pointer to the hardware structure + * + * NOTE: this function does not destroy the control queue locks. The driver + * may call this at runtime to shutdown and later restart control queues, such + * as in response to a reset event. + */ +void ice_shutdown_all_ctrlq(struct ice_hw *hw) +{ + /* Shutdown FW admin queue */ + ice_shutdown_ctrlq(hw, ICE_CTL_Q_ADMIN); + /* Shutdown PF-VF Mailbox */ + ice_shutdown_ctrlq(hw, ICE_CTL_Q_MAILBOX); +} + /** * ice_init_all_ctrlq - main initialization routine for all control queues * @hw: pointer to the hardware structure @@ -656,17 +702,27 @@ init_ctrlq_free_sq: */ enum ice_status ice_init_all_ctrlq(struct ice_hw *hw) { - enum ice_status ret_code; + enum ice_status status; + u32 retry = 0; /* Init FW admin queue */ - ret_code = ice_init_ctrlq(hw, ICE_CTL_Q_ADMIN); - if (ret_code) - return ret_code; + do { + status = ice_init_ctrlq(hw, ICE_CTL_Q_ADMIN); + if (status) + return status; - ret_code = ice_init_check_adminq(hw); - if (ret_code) - return ret_code; + status = ice_init_check_adminq(hw); + if (status != ICE_ERR_AQ_FW_CRITICAL) + break; + ice_debug(hw, ICE_DBG_AQ_MSG, + "Retry Admin Queue init due to FW critical error\n"); + ice_shutdown_ctrlq(hw, ICE_CTL_Q_ADMIN); + msleep(ICE_CTL_Q_ADMIN_INIT_MSEC); + } while (retry++ < ICE_CTL_Q_ADMIN_INIT_TIMEOUT); + + if (status) + return status; /* Init Mailbox queue */ return ice_init_ctrlq(hw, ICE_CTL_Q_MAILBOX); } @@ -707,50 +763,6 @@ enum ice_status ice_create_all_ctrlq(struct ice_hw *hw) return ice_init_all_ctrlq(hw); } -/** - * ice_shutdown_ctrlq - shutdown routine for any control queue - * @hw: pointer to the hardware structure - * @q_type: specific Control queue type - * - * NOTE: this function does not destroy the control queue locks. - */ -static void ice_shutdown_ctrlq(struct ice_hw *hw, enum ice_ctl_q q_type) -{ - struct ice_ctl_q_info *cq; - - switch (q_type) { - case ICE_CTL_Q_ADMIN: - cq = &hw->adminq; - if (ice_check_sq_alive(hw, cq)) - ice_aq_q_shutdown(hw, true); - break; - case ICE_CTL_Q_MAILBOX: - cq = &hw->mailboxq; - break; - default: - return; - } - - ice_shutdown_sq(hw, cq); - ice_shutdown_rq(hw, cq); -} - -/** - * ice_shutdown_all_ctrlq - shutdown routine for all control queues - * @hw: pointer to the hardware structure - * - * NOTE: this function does not destroy the control queue locks. The driver - * may call this at runtime to shutdown and later restart control queues, such - * as in response to a reset event. - */ -void ice_shutdown_all_ctrlq(struct ice_hw *hw) -{ - /* Shutdown FW admin queue */ - ice_shutdown_ctrlq(hw, ICE_CTL_Q_ADMIN); - /* Shutdown PF-VF Mailbox */ - ice_shutdown_ctrlq(hw, ICE_CTL_Q_MAILBOX); -} - /** * ice_destroy_ctrlq_locks - Destroy locks for a control queue * @cq: pointer to the control queue @@ -1049,9 +1061,15 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq, /* update the error if time out occurred */ if (!cmd_completed) { - ice_debug(hw, ICE_DBG_AQ_MSG, - "Control Send Queue Writeback timeout.\n"); - status = ICE_ERR_AQ_TIMEOUT; + if (rd32(hw, cq->rq.len) & cq->rq.len_crit_mask || + rd32(hw, cq->sq.len) & cq->sq.len_crit_mask) { + ice_debug(hw, ICE_DBG_AQ_MSG, "Critical FW error.\n"); + status = ICE_ERR_AQ_FW_CRITICAL; + } else { + ice_debug(hw, ICE_DBG_AQ_MSG, + "Control Send Queue Writeback timeout.\n"); + status = ICE_ERR_AQ_TIMEOUT; + } } sq_send_command_error: diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.h b/drivers/net/ethernet/intel/ice/ice_controlq.h index bf0ebe6149e8..faaa08e8171b 100644 --- a/drivers/net/ethernet/intel/ice/ice_controlq.h +++ b/drivers/net/ethernet/intel/ice/ice_controlq.h @@ -34,6 +34,8 @@ enum ice_ctl_q { /* Control Queue timeout settings - max delay 250ms */ #define ICE_CTL_Q_SQ_CMD_TIMEOUT 2500 /* Count 2500 times */ #define ICE_CTL_Q_SQ_CMD_USEC 100 /* Check every 100usec */ +#define ICE_CTL_Q_ADMIN_INIT_TIMEOUT 10 /* Count 10 times */ +#define ICE_CTL_Q_ADMIN_INIT_MSEC 100 /* Check every 100msec */ struct ice_ctl_q_ring { void *dma_head; /* Virtual address to DMA head */ @@ -59,6 +61,7 @@ struct ice_ctl_q_ring { u32 bal; u32 len_mask; u32 len_ena_mask; + u32 len_crit_mask; u32 head_mask; }; diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h index 2f1c776747a4..1086c9f778b4 100644 --- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h +++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h @@ -39,6 +39,7 @@ #define PF_MBX_ARQH_ARQH_M ICE_M(0x3FF, 0) #define PF_MBX_ARQLEN 0x0022E480 #define PF_MBX_ARQLEN_ARQLEN_M ICE_M(0x3FF, 0) +#define PF_MBX_ARQLEN_ARQCRIT_M BIT(30) #define PF_MBX_ARQLEN_ARQENABLE_M BIT(31) #define PF_MBX_ARQT 0x0022E580 #define PF_MBX_ATQBAH 0x0022E180 @@ -47,6 +48,7 @@ #define PF_MBX_ATQH_ATQH_M ICE_M(0x3FF, 0) #define PF_MBX_ATQLEN 0x0022E200 #define PF_MBX_ATQLEN_ATQLEN_M ICE_M(0x3FF, 0) +#define PF_MBX_ATQLEN_ATQCRIT_M BIT(30) #define PF_MBX_ATQLEN_ATQENABLE_M BIT(31) #define PF_MBX_ATQT 0x0022E300 #define PRTDCB_GENC 0x00083000 diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 93a42ff7496b..247e7b186b3c 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5207,6 +5207,8 @@ const char *ice_stat_str(enum ice_status stat_err) return "ICE_ERR_AQ_NO_WORK"; case ICE_ERR_AQ_EMPTY: return "ICE_ERR_AQ_EMPTY"; + case ICE_ERR_AQ_FW_CRITICAL: + return "ICE_ERR_AQ_FW_CRITICAL"; } return "ICE_ERR_UNKNOWN"; diff --git a/drivers/net/ethernet/intel/ice/ice_status.h b/drivers/net/ethernet/intel/ice/ice_status.h index 546a02856d09..4028c6365172 100644 --- a/drivers/net/ethernet/intel/ice/ice_status.h +++ b/drivers/net/ethernet/intel/ice/ice_status.h @@ -37,6 +37,7 @@ enum ice_status { ICE_ERR_AQ_FULL = -102, ICE_ERR_AQ_NO_WORK = -103, ICE_ERR_AQ_EMPTY = -104, + ICE_ERR_AQ_FW_CRITICAL = -105, }; #endif /* _ICE_STATUS_H_ */ -- cgit v1.2.3-59-g8ed1b From c8f135c6ee7851ad72bd4d877216950fcbd45fb6 Mon Sep 17 00:00:00 2001 From: Marta Plantykow Date: Fri, 15 May 2020 17:42:15 -0700 Subject: ice: Change number of XDP TxQ to 0 when destroying rings When XDP Tx rings are destroyed the number of XDP Tx queues is not changing. This patch is changing this number to 0. Signed-off-by: Marta Plantykow Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 247e7b186b3c..081fec3131cd 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1899,6 +1899,9 @@ free_qmap: for (i = 0; i < vsi->tc_cfg.numtc; i++) max_txqs[i] = vsi->num_txq; + /* change number of XDP Tx queues to 0 */ + vsi->num_xdp_txq = 0; + return ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, max_txqs); } -- cgit v1.2.3-59-g8ed1b From 49d358e0e746dc24bfb1b1cf98c17064e5177424 Mon Sep 17 00:00:00 2001 From: Marta Plantykow Date: Fri, 15 May 2020 17:42:16 -0700 Subject: ice: Add XDP Tx to VSI ring stats When XDP Tx program is loaded and packets are sent from interface, VSI statistics are not updated. This patch adds packets sent on Tx XDP ring to VSI ring stats. Signed-off-by: Marta Plantykow Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 42 ++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 081fec3131cd..81c5f0ce5b8f 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4219,6 +4219,33 @@ ice_fetch_u64_stats_per_ring(struct ice_ring *ring, u64 *pkts, u64 *bytes) } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); } +/** + * ice_update_vsi_tx_ring_stats - Update VSI Tx ring stats counters + * @vsi: the VSI to be updated + * @rings: rings to work on + * @count: number of rings + */ +static void +ice_update_vsi_tx_ring_stats(struct ice_vsi *vsi, struct ice_ring **rings, + u16 count) +{ + struct rtnl_link_stats64 *vsi_stats = &vsi->net_stats; + u16 i; + + for (i = 0; i < count; i++) { + struct ice_ring *ring; + u64 pkts, bytes; + + ring = READ_ONCE(rings[i]); + ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); + vsi_stats->tx_packets += pkts; + vsi_stats->tx_bytes += bytes; + vsi->tx_restart += ring->tx_stats.restart_q; + vsi->tx_busy += ring->tx_stats.tx_busy; + vsi->tx_linearize += ring->tx_stats.tx_linearize; + } +} + /** * ice_update_vsi_ring_stats - Update VSI stats counters * @vsi: the VSI to be updated @@ -4246,15 +4273,7 @@ static void ice_update_vsi_ring_stats(struct ice_vsi *vsi) rcu_read_lock(); /* update Tx rings counters */ - ice_for_each_txq(vsi, i) { - ring = READ_ONCE(vsi->tx_rings[i]); - ice_fetch_u64_stats_per_ring(ring, &pkts, &bytes); - vsi_stats->tx_packets += pkts; - vsi_stats->tx_bytes += bytes; - vsi->tx_restart += ring->tx_stats.restart_q; - vsi->tx_busy += ring->tx_stats.tx_busy; - vsi->tx_linearize += ring->tx_stats.tx_linearize; - } + ice_update_vsi_tx_ring_stats(vsi, vsi->tx_rings, vsi->num_txq); /* update Rx rings counters */ ice_for_each_rxq(vsi, i) { @@ -4266,6 +4285,11 @@ static void ice_update_vsi_ring_stats(struct ice_vsi *vsi) vsi->rx_page_failed += ring->rx_stats.alloc_page_failed; } + /* update XDP Tx rings counters */ + if (ice_is_xdp_ena_vsi(vsi)) + ice_update_vsi_tx_ring_stats(vsi, vsi->xdp_rings, + vsi->num_xdp_txq); + rcu_read_unlock(); } -- cgit v1.2.3-59-g8ed1b From ae15e0ba1b333f391ab0d678abb752cb6a7f2782 Mon Sep 17 00:00:00 2001 From: Marta Plantykow Date: Fri, 15 May 2020 17:42:17 -0700 Subject: ice: Change number of XDP Tx queues to match number of Rx queues In current implementation number of XDP Tx queues is the same as the number of transmit queues, which is not always true. This patch changes this number to match the number of receive queues. XDP programs are running on Rx rings, so what we actually need to provide is the XDP Tx ring per each Rx ring so that the whole XDP ecosystem is functional, e.g. if the result of XDP prog is XDP_TX then you have the need to access the XDP Tx ring. Signed-off-by: Marta Plantykow Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_lib.c | 2 +- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 89962c14e31f..6f3ee8ac11ce 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2785,7 +2785,7 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi) ice_vsi_map_rings_to_vectors(vsi); if (ice_is_xdp_ena_vsi(vsi)) { - vsi->num_xdp_txq = vsi->alloc_txq; + vsi->num_xdp_txq = vsi->alloc_rxq; ret = ice_prepare_xdp_rings(vsi, vsi->xdp_prog); if (ret) goto err_vectors; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 81c5f0ce5b8f..b64c4e796636 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1935,7 +1935,7 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, } if (!ice_is_xdp_ena_vsi(vsi) && prog) { - vsi->num_xdp_txq = vsi->alloc_txq; + vsi->num_xdp_txq = vsi->alloc_rxq; xdp_ring_err = ice_prepare_xdp_rings(vsi, prog); if (xdp_ring_err) NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Tx resources failed"); -- cgit v1.2.3-59-g8ed1b From 7e34786a74e14038faadabb24b0d7f4436961c6b Mon Sep 17 00:00:00 2001 From: Bruce Allan Date: Fri, 15 May 2020 17:42:18 -0700 Subject: ice: avoid undefined behavior When writing the driver's struct ice_tlan_ctx structure, do not write the 8-bit element int_q_state with the associated internal-to-hardware field which is 122-bits, otherwise the helper function ice_write_byte() will use undefined behavior when setting the mask used for that write. This should not cause any functional change and will avoid use of undefined behavior. Also, update a comment to highlight this structure element is not written. Signed-off-by: Bruce Allan Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_base.c | 5 +++-- drivers/net/ethernet/intel/ice/ice_common.c | 12 ++++++++++-- drivers/net/ethernet/intel/ice/ice_common.h | 3 ++- drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 9452c0eb70b0..18076e0d12d0 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -638,6 +638,7 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, struct ice_aqc_add_txqs_perq *txq; struct ice_pf *pf = vsi->back; u8 buf_len = sizeof(*qg_buf); + struct ice_hw *hw = &pf->hw; enum ice_status status; u16 pf_q; u8 tc; @@ -646,13 +647,13 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, ice_setup_tx_ctx(ring, &tlan_ctx, pf_q); /* copy context contents into the qg_buf */ qg_buf->txqs[0].txq_id = cpu_to_le16(pf_q); - ice_set_ctx((u8 *)&tlan_ctx, qg_buf->txqs[0].txq_ctx, + ice_set_ctx(hw, (u8 *)&tlan_ctx, qg_buf->txqs[0].txq_ctx, ice_tlan_ctx_info); /* init queue specific tail reg. It is referred as * transmit comm scheduler queue doorbell. */ - ring->tail = pf->hw.hw_addr + QTX_COMM_DBELL(pf_q); + ring->tail = hw->hw_addr + QTX_COMM_DBELL(pf_q); if (IS_ENABLED(CONFIG_DCB)) tc = ring->dcb_tc; diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index ee62cfa3a69e..8c73e161829d 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1098,7 +1098,7 @@ ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, rlan_ctx->prefena = 1; - ice_set_ctx((u8 *)rlan_ctx, ctx_buf, ice_rlan_ctx_info); + ice_set_ctx(hw, (u8 *)rlan_ctx, ctx_buf, ice_rlan_ctx_info); return ice_copy_rxq_ctx_to_hw(hw, ctx_buf, rxq_index); } @@ -3199,12 +3199,14 @@ ice_write_qword(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) /** * ice_set_ctx - set context bits in packed structure + * @hw: pointer to the hardware structure * @src_ctx: pointer to a generic non-packed context structure * @dest_ctx: pointer to memory for the packed structure * @ce_info: a description of the structure to be transformed */ enum ice_status -ice_set_ctx(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) +ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, + const struct ice_ctx_ele *ce_info) { int f; @@ -3213,6 +3215,12 @@ ice_set_ctx(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info) * using the correct size so that we are correct regardless * of the endianness of the machine. */ + if (ce_info[f].width > (ce_info[f].size_of * BITS_PER_BYTE)) { + ice_debug(hw, ICE_DBG_QCTX, + "Field %d width of %d bits larger than size of %d byte(s) ... skipping write\n", + f, ce_info[f].width, ce_info[f].size_of); + continue; + } switch (ce_info[f].size_of) { case sizeof(u8): ice_write_byte(src_ctx, dest_ctx, &ce_info[f]); diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index bea755a658eb..9b9e50d2398b 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -70,7 +70,8 @@ enum ice_status ice_aq_q_shutdown(struct ice_hw *hw, bool unloading); void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode); extern const struct ice_ctx_ele ice_tlan_ctx_info[]; enum ice_status -ice_set_ctx(u8 *src_ctx, u8 *dest_ctx, const struct ice_ctx_ele *ce_info); +ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, + const struct ice_ctx_ele *ce_info); extern struct mutex ice_global_cfg_lock_sw; diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index bd2cd3435768..14dfbbc1b2cf 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -581,7 +581,7 @@ struct ice_tlan_ctx { u8 drop_ena; u8 cache_prof_idx; u8 pkt_shaper_prof_idx; - u8 int_q_state; /* width not needed - internal do not write */ + u8 int_q_state; /* width not needed - internal - DO NOT WRITE!!! */ }; /* macro to make the table lines short */ -- cgit v1.2.3-59-g8ed1b From 13f90b393f7338e6a2b2646fa1b677cc8b50cd23 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Fri, 15 May 2020 17:42:19 -0700 Subject: ice: Refactor Rx checksum checks We don't need both rx_status and rx_error parameters, as the latter is a subset of the former. Remove rx_error completely and check the right bit in rx_status. Rename rx_status to rx_status0, and rx_status_err1 to rx_status1. This naming more closely reflects the specification. Signed-off-by: Anirudh Venkataramanan Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 1ba97172d8d0..ab2031b1c635 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -84,17 +84,12 @@ ice_rx_csum(struct ice_ring *ring, struct sk_buff *skb, union ice_32b_rx_flex_desc *rx_desc, u8 ptype) { struct ice_rx_ptype_decoded decoded; - u16 rx_error, rx_status; - u16 rx_stat_err1; + u16 rx_status0, rx_status1; bool ipv4, ipv6; - rx_status = le16_to_cpu(rx_desc->wb.status_error0); - rx_error = rx_status & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | - BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S) | - BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S) | - BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S)); + rx_status0 = le16_to_cpu(rx_desc->wb.status_error0); + rx_status1 = le16_to_cpu(rx_desc->wb.status_error1); - rx_stat_err1 = le16_to_cpu(rx_desc->wb.status_error1); decoded = ice_decode_rx_desc_ptype(ptype); /* Start with CHECKSUM_NONE and by default csum_level = 0 */ @@ -106,7 +101,7 @@ ice_rx_csum(struct ice_ring *ring, struct sk_buff *skb, return; /* check if HW has decoded the packet and checksum */ - if (!(rx_status & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) + if (!(rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) return; if (!(decoded.known && decoded.outer_ip)) @@ -117,22 +112,22 @@ ice_rx_csum(struct ice_ring *ring, struct sk_buff *skb, ipv6 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6); - if (ipv4 && (rx_error & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | - BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)))) + if (ipv4 && (rx_status0 & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | + BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)))) goto checksum_fail; - else if (ipv6 && (rx_status & - (BIT(ICE_RX_FLEX_DESC_STATUS0_IPV6EXADD_S)))) + + if (ipv6 && (rx_status0 & (BIT(ICE_RX_FLEX_DESC_STATUS0_IPV6EXADD_S)))) goto checksum_fail; /* check for L4 errors and handle packets that were not able to be * checksummed due to arrival speed */ - if (rx_error & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) + if (rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S)) goto checksum_fail; /* check for outer UDP checksum error in tunneled packets */ - if ((rx_stat_err1 & BIT(ICE_RX_FLEX_DESC_STATUS1_NAT_S)) && - (rx_error & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S))) + if ((rx_status1 & BIT(ICE_RX_FLEX_DESC_STATUS1_NAT_S)) && + (rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S))) goto checksum_fail; /* If there is an outer header present that might contain a checksum -- cgit v1.2.3-59-g8ed1b From ea651a86d46895a8b342664db66c3dee3412ad34 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Wed, 6 Nov 2019 09:57:12 -0800 Subject: net/mlx5: E-Switch, Refactor eswitch egress acl codes Refactor the egress acl codes so that offloads and legacy modes can configure specifically their own needs of egress acl table, groups and rules. While at it, restructure the eswitch egress acl codes into eswitch directory and different files: . Acl egress helper functions to acl_helper.c/h . Acl egress functions used in offloads mode to acl_egress_ofld.c . Acl egress functions used in legacy mode to acl_egress_lgy.c This patch does not change any functionality. Signed-off-by: Vu Pham Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 3 + .../mellanox/mlx5/core/esw/acl/egress_lgcy.c | 170 +++++++++++++++ .../mellanox/mlx5/core/esw/acl/egress_ofld.c | 88 ++++++++ .../ethernet/mellanox/mlx5/core/esw/acl/helper.c | 142 +++++++++++++ .../ethernet/mellanox/mlx5/core/esw/acl/helper.h | 22 ++ .../net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h | 13 ++ .../net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h | 13 ++ drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 235 +-------------------- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 15 +- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 36 +--- 10 files changed, 462 insertions(+), 275 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index e5ee9103fefb..ad046b2ea4f9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -46,6 +46,9 @@ mlx5_core-$(CONFIG_MLX5_TC_CT) += en/tc_ct.o # mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \ ecpf.o rdma.o +mlx5_core-$(CONFIG_MLX5_ESWITCH) += esw/acl/helper.o \ + esw/acl/egress_lgcy.o esw/acl/egress_ofld.o + mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c new file mode 100644 index 000000000000..d46f8b225ebe --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "lgcy.h" + +static void esw_acl_egress_lgcy_rules_destroy(struct mlx5_vport *vport) +{ + esw_acl_egress_vlan_destroy(vport); + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_rule)) { + mlx5_del_flow_rules(vport->egress.legacy.drop_rule); + vport->egress.legacy.drop_rule = NULL; + } +} + +static int esw_acl_egress_lgcy_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_group *drop_grp; + u32 *flow_group_in; + int err = 0; + + err = esw_acl_egress_vlan_grp_create(esw, vport); + if (err) + return err; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) { + err = -ENOMEM; + goto alloc_err; + } + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + drop_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(drop_grp)) { + err = PTR_ERR(drop_grp); + esw_warn(dev, "Failed to create E-Switch vport[%d] egress drop flow group, err(%d)\n", + vport->vport, err); + goto drop_grp_err; + } + + vport->egress.legacy.drop_grp = drop_grp; + kvfree(flow_group_in); + return 0; + +drop_grp_err: + kvfree(flow_group_in); +alloc_err: + esw_acl_egress_vlan_grp_destroy(vport); + return err; +} + +static void esw_acl_egress_lgcy_groups_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_grp)) { + mlx5_destroy_flow_group(vport->egress.legacy.drop_grp); + vport->egress.legacy.drop_grp = NULL; + } + esw_acl_egress_vlan_grp_destroy(vport); +} + +int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_flow_destination drop_ctr_dst = {}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_fc *drop_counter = NULL; + struct mlx5_flow_act flow_act = {}; + /* The egress acl table contains 2 rules: + * 1)Allow traffic with vlan_tag=vst_vlan_id + * 2)Drop all other traffic. + */ + int table_size = 2; + int dest_num = 0; + int err = 0; + + if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, flow_counter)) { + drop_counter = mlx5_fc_create(esw->dev, false); + if (IS_ERR(drop_counter)) + esw_warn(esw->dev, + "vport[%d] configure egress drop rule counter err(%ld)\n", + vport->vport, PTR_ERR(drop_counter)); + vport->egress.legacy.drop_counter = drop_counter; + } + + esw_acl_egress_lgcy_rules_destroy(vport); + + if (!vport->info.vlan && !vport->info.qos) { + esw_acl_egress_lgcy_cleanup(esw, vport); + return 0; + } + + if (!IS_ERR_OR_NULL(vport->egress.acl)) + return 0; + + vport->egress.acl = esw_acl_table_create(esw, vport->vport, + MLX5_FLOW_NAMESPACE_ESW_EGRESS, + table_size); + if (IS_ERR_OR_NULL(vport->egress.acl)) { + err = PTR_ERR(vport->egress.acl); + vport->egress.acl = NULL; + goto out; + } + + err = esw_acl_egress_lgcy_groups_create(esw, vport); + if (err) + goto out; + + esw_debug(esw->dev, + "vport[%d] configure egress rules, vlan(%d) qos(%d)\n", + vport->vport, vport->info.vlan, vport->info.qos); + + /* Allowed vlan rule */ + err = esw_egress_acl_vlan_create(esw, vport, NULL, vport->info.vlan, + MLX5_FLOW_CONTEXT_ACTION_ALLOW); + if (err) + goto out; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* Attach egress drop flow counter */ + if (!IS_ERR_OR_NULL(drop_counter)) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter_id = mlx5_fc_id(drop_counter); + dst = &drop_ctr_dst; + dest_num++; + } + vport->egress.legacy.drop_rule = + mlx5_add_flow_rules(vport->egress.acl, NULL, + &flow_act, dst, dest_num); + if (IS_ERR(vport->egress.legacy.drop_rule)) { + err = PTR_ERR(vport->egress.legacy.drop_rule); + esw_warn(esw->dev, + "vport[%d] configure egress drop rule failed, err(%d)\n", + vport->vport, err); + vport->egress.legacy.drop_rule = NULL; + goto out; + } + + return err; + +out: + esw_acl_egress_lgcy_cleanup(esw, vport); + return err; +} + +void esw_acl_egress_lgcy_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->egress.acl)) + goto clean_drop_counter; + + esw_debug(esw->dev, "Destroy vport[%d] E-Switch egress ACL\n", vport->vport); + + esw_acl_egress_lgcy_rules_destroy(vport); + esw_acl_egress_lgcy_groups_destroy(vport); + esw_acl_egress_table_destroy(vport); + +clean_drop_counter: + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_counter)) { + mlx5_fc_destroy(esw->dev, vport->egress.legacy.drop_counter); + vport->egress.legacy.drop_counter = NULL; + } +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c new file mode 100644 index 000000000000..49a53ebf56dd --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "ofld.h" + +static int esw_acl_egress_ofld_rules_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (!MLX5_CAP_GEN(esw->dev, prio_tag_required)) + return 0; + + /* For prio tag mode, there is only 1 FTEs: + * 1) prio tag packets - pop the prio tag VLAN, allow + * Unmatched traffic is allowed by default + */ + esw_debug(esw->dev, + "vport[%d] configure prio tag egress rules\n", vport->vport); + + /* prio tag vlan rule - pop it so vport receives untagged packets */ + return esw_egress_acl_vlan_create(esw, vport, NULL, 0, + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP | + MLX5_FLOW_CONTEXT_ACTION_ALLOW); +} + +static void esw_acl_egress_ofld_rules_destroy(struct mlx5_vport *vport) +{ + esw_acl_egress_vlan_destroy(vport); +} + +static int esw_acl_egress_ofld_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (!MLX5_CAP_GEN(esw->dev, prio_tag_required)) + return 0; + + return esw_acl_egress_vlan_grp_create(esw, vport); +} + +static void esw_acl_egress_ofld_groups_destroy(struct mlx5_vport *vport) +{ + esw_acl_egress_vlan_grp_destroy(vport); +} + +int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + int err; + + if (!MLX5_CAP_GEN(esw->dev, prio_tag_required)) + return 0; + + esw_acl_egress_ofld_rules_destroy(vport); + + vport->egress.acl = esw_acl_table_create(esw, vport->vport, + MLX5_FLOW_NAMESPACE_ESW_EGRESS, 0); + if (IS_ERR_OR_NULL(vport->egress.acl)) { + err = PTR_ERR(vport->egress.acl); + vport->egress.acl = NULL; + return err; + } + + err = esw_acl_egress_ofld_groups_create(esw, vport); + if (err) + goto group_err; + + esw_debug(esw->dev, "vport[%d] configure egress rules\n", vport->vport); + + err = esw_acl_egress_ofld_rules_create(esw, vport); + if (err) + goto rules_err; + + return 0; + +rules_err: + esw_acl_egress_ofld_groups_destroy(vport); +group_err: + esw_acl_egress_table_destroy(vport); + return err; +} + +void esw_acl_egress_ofld_cleanup(struct mlx5_vport *vport) +{ + esw_acl_egress_ofld_rules_destroy(vport); + esw_acl_egress_ofld_groups_destroy(vport); + esw_acl_egress_table_destroy(vport); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c new file mode 100644 index 000000000000..8b7996721a7c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" + +struct mlx5_flow_table * +esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *acl; + int acl_supported; + int vport_index; + int err; + + acl_supported = (ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS) ? + MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support) : + MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support); + + if (!acl_supported) + return ERR_PTR(-EOPNOTSUPP); + + esw_debug(dev, "Create vport[%d] %s ACL table\n", vport_num, + ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS ? "ingress" : "egress"); + + vport_index = mlx5_eswitch_vport_num_to_index(esw, vport_num); + root_ns = mlx5_get_flow_vport_acl_namespace(dev, ns, vport_index); + if (!root_ns) { + esw_warn(dev, "Failed to get E-Switch root namespace for vport (%d)\n", + vport_num); + return ERR_PTR(-EOPNOTSUPP); + } + + acl = mlx5_create_vport_flow_table(root_ns, 0, size, 0, vport_num); + if (IS_ERR(acl)) { + err = PTR_ERR(acl); + esw_warn(dev, "vport[%d] create %s ACL table, err(%d)\n", vport_num, + ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS ? "ingress" : "egress", err); + } + return acl; +} + +int esw_egress_acl_vlan_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest, + u16 vlan_id, u32 flow_action) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + int err = 0; + + if (vport->egress.allowed_vlan) + return -EEXIST; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vlan_id); + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = flow_action; + vport->egress.allowed_vlan = + mlx5_add_flow_rules(vport->egress.acl, spec, + &flow_act, fwd_dest, 0); + if (IS_ERR(vport->egress.allowed_vlan)) { + err = PTR_ERR(vport->egress.allowed_vlan); + esw_warn(esw->dev, + "vport[%d] configure egress vlan rule failed, err(%d)\n", + vport->vport, err); + vport->egress.allowed_vlan = NULL; + } + + kvfree(spec); + return err; +} + +void esw_acl_egress_vlan_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan)) { + mlx5_del_flow_rules(vport->egress.allowed_vlan); + vport->egress.allowed_vlan = NULL; + } +} + +int esw_acl_egress_vlan_grp_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *vlan_grp; + void *match_criteria; + u32 *flow_group_in; + int ret = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, flow_group_in, + match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, + flow_group_in, match_criteria); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + vlan_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(vlan_grp)) { + ret = PTR_ERR(vlan_grp); + esw_warn(esw->dev, + "Failed to create E-Switch vport[%d] egress pop vlans flow group, err(%d)\n", + vport->vport, ret); + goto out; + } + vport->egress.vlan_grp = vlan_grp; + +out: + kvfree(flow_group_in); + return ret; +} + +void esw_acl_egress_vlan_grp_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.vlan_grp)) { + mlx5_destroy_flow_group(vport->egress.vlan_grp); + vport->egress.vlan_grp = NULL; + } +} + +void esw_acl_egress_table_destroy(struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->egress.acl)) + return; + + mlx5_destroy_flow_table(vport->egress.acl); + vport->egress.acl = NULL; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h new file mode 100644 index 000000000000..543372df6196 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#ifndef __MLX5_ESWITCH_ACL_HELPER_H__ +#define __MLX5_ESWITCH_ACL_HELPER_H__ + +#include "eswitch.h" + +/* General acl helper functions */ +struct mlx5_flow_table * +esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size); + +/* Egress acl helper functions */ +void esw_acl_egress_table_destroy(struct mlx5_vport *vport); +int esw_egress_acl_vlan_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest, + u16 vlan_id, u32 flow_action); +void esw_acl_egress_vlan_destroy(struct mlx5_vport *vport); +int esw_acl_egress_vlan_grp_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_egress_vlan_grp_destroy(struct mlx5_vport *vport); + +#endif /* __MLX5_ESWITCH_ACL_HELPER_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h new file mode 100644 index 000000000000..6b05a3af4462 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#ifndef __MLX5_ESWITCH_ACL_LGCY_H__ +#define __MLX5_ESWITCH_ACL_LGCY_H__ + +#include "eswitch.h" + +/* Eswitch acl egress external APIs */ +int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_egress_lgcy_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + +#endif /* __MLX5_ESWITCH_ACL_LGCY_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h new file mode 100644 index 000000000000..fc912b254226 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#ifndef __MLX5_ESWITCH_ACL_OFLD_H__ +#define __MLX5_ESWITCH_ACL_OFLD_H__ + +#include "eswitch.h" + +/* Eswitch acl egress external APIs */ +int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_egress_ofld_cleanup(struct mlx5_vport *vport); + +#endif /* __MLX5_ESWITCH_ACL_OFLD_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index ac79b7c9aeb3..ae74486b9c9e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -35,6 +35,7 @@ #include #include #include +#include "esw/acl/lgcy.h" #include "mlx5_core.h" #include "lib/eq.h" #include "eswitch.h" @@ -936,121 +937,6 @@ static void esw_vport_change_handler(struct work_struct *work) mutex_unlock(&esw->state_lock); } -int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); - struct mlx5_flow_group *vlan_grp = NULL; - struct mlx5_flow_group *drop_grp = NULL; - struct mlx5_core_dev *dev = esw->dev; - struct mlx5_flow_namespace *root_ns; - struct mlx5_flow_table *acl; - void *match_criteria; - u32 *flow_group_in; - /* The egress acl table contains 2 rules: - * 1)Allow traffic with vlan_tag=vst_vlan_id - * 2)Drop all other traffic. - */ - int table_size = 2; - int err = 0; - - if (!MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) - return -EOPNOTSUPP; - - if (!IS_ERR_OR_NULL(vport->egress.acl)) - return 0; - - esw_debug(dev, "Create vport[%d] egress ACL log_max_size(%d)\n", - vport->vport, MLX5_CAP_ESW_EGRESS_ACL(dev, log_max_ft_size)); - - root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_EGRESS, - mlx5_eswitch_vport_num_to_index(esw, vport->vport)); - if (!root_ns) { - esw_warn(dev, "Failed to get E-Switch egress flow namespace for vport (%d)\n", vport->vport); - return -EOPNOTSUPP; - } - - flow_group_in = kvzalloc(inlen, GFP_KERNEL); - if (!flow_group_in) - return -ENOMEM; - - acl = mlx5_create_vport_flow_table(root_ns, 0, table_size, 0, vport->vport); - if (IS_ERR(acl)) { - err = PTR_ERR(acl); - esw_warn(dev, "Failed to create E-Switch vport[%d] egress flow Table, err(%d)\n", - vport->vport, err); - goto out; - } - - MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); - - vlan_grp = mlx5_create_flow_group(acl, flow_group_in); - if (IS_ERR(vlan_grp)) { - err = PTR_ERR(vlan_grp); - esw_warn(dev, "Failed to create E-Switch vport[%d] egress allowed vlans flow group, err(%d)\n", - vport->vport, err); - goto out; - } - - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); - drop_grp = mlx5_create_flow_group(acl, flow_group_in); - if (IS_ERR(drop_grp)) { - err = PTR_ERR(drop_grp); - esw_warn(dev, "Failed to create E-Switch vport[%d] egress drop flow group, err(%d)\n", - vport->vport, err); - goto out; - } - - vport->egress.acl = acl; - vport->egress.drop_grp = drop_grp; - vport->egress.allowed_vlans_grp = vlan_grp; -out: - kvfree(flow_group_in); - if (err && !IS_ERR_OR_NULL(vlan_grp)) - mlx5_destroy_flow_group(vlan_grp); - if (err && !IS_ERR_OR_NULL(acl)) - mlx5_destroy_flow_table(acl); - return err; -} - -void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan)) { - mlx5_del_flow_rules(vport->egress.allowed_vlan); - vport->egress.allowed_vlan = NULL; - } - - if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_rule)) { - mlx5_del_flow_rules(vport->egress.legacy.drop_rule); - vport->egress.legacy.drop_rule = NULL; - } -} - -void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (IS_ERR_OR_NULL(vport->egress.acl)) - return; - - esw_debug(esw->dev, "Destroy vport[%d] E-Switch egress ACL\n", vport->vport); - - esw_vport_cleanup_egress_rules(esw, vport); - mlx5_destroy_flow_group(vport->egress.allowed_vlans_grp); - mlx5_destroy_flow_group(vport->egress.drop_grp); - mlx5_destroy_flow_table(vport->egress.acl); - vport->egress.allowed_vlans_grp = NULL; - vport->egress.drop_grp = NULL; - vport->egress.acl = NULL; -} - static int esw_vport_create_legacy_ingress_acl_groups(struct mlx5_eswitch *esw, struct mlx5_vport *vport) @@ -1346,102 +1232,6 @@ out: return err; } -int mlx5_esw_create_vport_egress_acl_vlan(struct mlx5_eswitch *esw, - struct mlx5_vport *vport, - u16 vlan_id, u32 flow_action) -{ - struct mlx5_flow_act flow_act = {}; - struct mlx5_flow_spec *spec; - int err = 0; - - if (vport->egress.allowed_vlan) - return -EEXIST; - - spec = kvzalloc(sizeof(*spec), GFP_KERNEL); - if (!spec) - return -ENOMEM; - - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); - MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag); - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid); - MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vlan_id); - - spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - flow_act.action = flow_action; - vport->egress.allowed_vlan = - mlx5_add_flow_rules(vport->egress.acl, spec, - &flow_act, NULL, 0); - if (IS_ERR(vport->egress.allowed_vlan)) { - err = PTR_ERR(vport->egress.allowed_vlan); - esw_warn(esw->dev, - "vport[%d] configure egress vlan rule failed, err(%d)\n", - vport->vport, err); - vport->egress.allowed_vlan = NULL; - } - - kvfree(spec); - return err; -} - -static int esw_vport_egress_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - struct mlx5_fc *counter = vport->egress.legacy.drop_counter; - struct mlx5_flow_destination drop_ctr_dst = {0}; - struct mlx5_flow_destination *dst = NULL; - struct mlx5_flow_act flow_act = {0}; - int dest_num = 0; - int err = 0; - - esw_vport_cleanup_egress_rules(esw, vport); - - if (!vport->info.vlan && !vport->info.qos) { - esw_vport_disable_egress_acl(esw, vport); - return 0; - } - - err = esw_vport_enable_egress_acl(esw, vport); - if (err) { - mlx5_core_warn(esw->dev, - "failed to enable egress acl (%d) on vport[%d]\n", - err, vport->vport); - return err; - } - - esw_debug(esw->dev, - "vport[%d] configure egress rules, vlan(%d) qos(%d)\n", - vport->vport, vport->info.vlan, vport->info.qos); - - /* Allowed vlan rule */ - err = mlx5_esw_create_vport_egress_acl_vlan(esw, vport, vport->info.vlan, - MLX5_FLOW_CONTEXT_ACTION_ALLOW); - if (err) - return err; - - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; - - /* Attach egress drop flow counter */ - if (counter) { - flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; - drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - drop_ctr_dst.counter_id = mlx5_fc_id(counter); - dst = &drop_ctr_dst; - dest_num++; - } - vport->egress.legacy.drop_rule = - mlx5_add_flow_rules(vport->egress.acl, NULL, - &flow_act, dst, dest_num); - if (IS_ERR(vport->egress.legacy.drop_rule)) { - err = PTR_ERR(vport->egress.legacy.drop_rule); - esw_warn(esw->dev, - "vport[%d] configure egress drop rule failed, err(%d)\n", - vport->vport, err); - vport->egress.legacy.drop_rule = NULL; - } - - return err; -} - static bool element_type_supported(struct mlx5_eswitch *esw, int type) { const struct mlx5_core_dev *dev = esw->dev; @@ -1667,17 +1457,7 @@ static int esw_vport_create_legacy_acl_tables(struct mlx5_eswitch *esw, if (ret) goto ingress_err; - if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, flow_counter)) { - vport->egress.legacy.drop_counter = mlx5_fc_create(esw->dev, false); - if (IS_ERR(vport->egress.legacy.drop_counter)) { - esw_warn(esw->dev, - "vport[%d] configure egress drop rule counter failed\n", - vport->vport); - vport->egress.legacy.drop_counter = NULL; - } - } - - ret = esw_vport_egress_config(esw, vport); + ret = esw_acl_egress_lgcy_setup(esw, vport); if (ret) goto egress_err; @@ -1685,9 +1465,6 @@ static int esw_vport_create_legacy_acl_tables(struct mlx5_eswitch *esw, egress_err: esw_vport_disable_legacy_ingress_acl(esw, vport); - mlx5_fc_destroy(esw->dev, vport->egress.legacy.drop_counter); - vport->egress.legacy.drop_counter = NULL; - ingress_err: mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); vport->ingress.legacy.drop_counter = NULL; @@ -1710,9 +1487,7 @@ static void esw_vport_destroy_legacy_acl_tables(struct mlx5_eswitch *esw, if (mlx5_esw_is_manager_vport(esw, vport->vport)) return; - esw_vport_disable_egress_acl(esw, vport); - mlx5_fc_destroy(esw->dev, vport->egress.legacy.drop_counter); - vport->egress.legacy.drop_counter = NULL; + esw_acl_egress_lgcy_cleanup(esw, vport); esw_vport_disable_legacy_ingress_acl(esw, vport); mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); @@ -2433,7 +2208,7 @@ int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, err = esw_vport_ingress_config(esw, evport); if (err) return err; - err = esw_vport_egress_config(esw, evport); + err = esw_acl_egress_lgcy_setup(esw, evport); } return err; @@ -2734,7 +2509,7 @@ static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev, if (!vport->enabled) goto unlock; - if (vport->egress.legacy.drop_counter) + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_counter)) mlx5_fc_query(dev, vport->egress.legacy.drop_counter, &stats->rx_dropped, &bytes); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index ccbbea3e0505..490410401631 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -99,10 +99,10 @@ struct vport_ingress { struct vport_egress { struct mlx5_flow_table *acl; - struct mlx5_flow_group *allowed_vlans_grp; - struct mlx5_flow_group *drop_grp; struct mlx5_flow_handle *allowed_vlan; + struct mlx5_flow_group *vlan_grp; struct { + struct mlx5_flow_group *drop_grp; struct mlx5_flow_handle *drop_rule; struct mlx5_fc *drop_counter; } legacy; @@ -291,12 +291,7 @@ int esw_vport_create_ingress_acl_table(struct mlx5_eswitch *esw, struct mlx5_vport *vport, int table_size); void esw_vport_destroy_ingress_acl_table(struct mlx5_vport *vport); -void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw, - struct mlx5_vport *vport); -int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport); -void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport); + int mlx5_esw_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32 rate_mbps); @@ -458,10 +453,6 @@ int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw, int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, u16 vport, u16 vlan, u8 qos, u8 set_flags); -int mlx5_esw_create_vport_egress_acl_vlan(struct mlx5_eswitch *esw, - struct mlx5_vport *vport, - u16 vlan_id, u32 flow_action); - static inline bool mlx5_esw_qos_enabled(struct mlx5_eswitch *esw) { return esw->qos.enabled; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 554fc64d8ef6..0b00b30187ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -37,6 +37,7 @@ #include #include "mlx5_core.h" #include "eswitch.h" +#include "esw/acl/ofld.h" #include "esw/chains.h" #include "rdma.h" #include "en.h" @@ -2093,37 +2094,6 @@ group_err: return err; } -static int esw_vport_egress_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int err; - - if (!MLX5_CAP_GEN(esw->dev, prio_tag_required)) - return 0; - - esw_vport_cleanup_egress_rules(esw, vport); - - err = esw_vport_enable_egress_acl(esw, vport); - if (err) - return err; - - /* For prio tag mode, there is only 1 FTEs: - * 1) prio tag packets - pop the prio tag VLAN, allow - * Unmatched traffic is allowed by default - */ - esw_debug(esw->dev, - "vport[%d] configure prio tag egress rules\n", vport->vport); - - /* prio tag vlan rule - pop it so VF receives untagged packets */ - err = mlx5_esw_create_vport_egress_acl_vlan(esw, vport, 0, - MLX5_FLOW_CONTEXT_ACTION_VLAN_POP | - MLX5_FLOW_CONTEXT_ACTION_ALLOW); - if (err) - esw_vport_disable_egress_acl(esw, vport); - - return err; -} - static bool esw_check_vport_match_metadata_supported(const struct mlx5_eswitch *esw) { @@ -2167,7 +2137,7 @@ esw_vport_create_offloads_acl_tables(struct mlx5_eswitch *esw, return err; if (mlx5_eswitch_is_vf_vport(esw, vport->vport)) { - err = esw_vport_egress_config(esw, vport); + err = esw_acl_egress_ofld_setup(esw, vport); if (err) { esw_vport_cleanup_ingress_rules(esw, vport); esw_vport_del_ingress_acl_modify_metadata(esw, vport); @@ -2182,7 +2152,7 @@ void esw_vport_destroy_offloads_acl_tables(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { - esw_vport_disable_egress_acl(esw, vport); + esw_acl_egress_ofld_cleanup(vport); esw_vport_cleanup_ingress_rules(esw, vport); esw_vport_del_ingress_acl_modify_metadata(esw, vport); esw_vport_destroy_ingress_acl_group(vport); -- cgit v1.2.3-59-g8ed1b From 07bab9502641dff9c3c864162270d12c6dd0e834 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Fri, 27 Mar 2020 23:12:22 -0700 Subject: net/mlx5: E-Switch, Refactor eswitch ingress acl codes Restructure the eswitch ingress acl codes into eswitch directory and different files: . Acl ingress helper functions to acl_helper.c/h . Acl ingress functions used in offloads mode to acl_ingress_ofld.c . Acl ingress functions used in legacy mode to acl_ingress_lgy.c This patch does not change any functionality. Signed-off-by: Vu Pham --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 3 +- .../ethernet/mellanox/mlx5/core/esw/acl/helper.c | 18 ++ .../ethernet/mellanox/mlx5/core/esw/acl/helper.h | 4 + .../mellanox/mlx5/core/esw/acl/ingress_lgcy.c | 279 ++++++++++++++++++ .../mellanox/mlx5/core/esw/acl/ingress_ofld.c | 293 +++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h | 4 + .../net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h | 4 + drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 322 +-------------------- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 6 - .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 269 +---------------- 10 files changed, 619 insertions(+), 583 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index ad046b2ea4f9..3934dc258041 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -47,7 +47,8 @@ mlx5_core-$(CONFIG_MLX5_TC_CT) += en/tc_ct.o mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \ ecpf.o rdma.o mlx5_core-$(CONFIG_MLX5_ESWITCH) += esw/acl/helper.o \ - esw/acl/egress_lgcy.o esw/acl/egress_ofld.o + esw/acl/egress_lgcy.o esw/acl/egress_ofld.o \ + esw/acl/ingress_lgcy.o esw/acl/ingress_ofld.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c index 8b7996721a7c..22f4c1c28006 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c @@ -140,3 +140,21 @@ void esw_acl_egress_table_destroy(struct mlx5_vport *vport) mlx5_destroy_flow_table(vport->egress.acl); vport->egress.acl = NULL; } + +void esw_acl_ingress_table_destroy(struct mlx5_vport *vport) +{ + if (!vport->ingress.acl) + return; + + mlx5_destroy_flow_table(vport->ingress.acl); + vport->ingress.acl = NULL; +} + +void esw_acl_ingress_allow_rule_destroy(struct mlx5_vport *vport) +{ + if (!vport->ingress.allow_rule) + return; + + mlx5_del_flow_rules(vport->ingress.allow_rule); + vport->ingress.allow_rule = NULL; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h index 543372df6196..8dc4cab66a71 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h @@ -19,4 +19,8 @@ void esw_acl_egress_vlan_destroy(struct mlx5_vport *vport); int esw_acl_egress_vlan_grp_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport); void esw_acl_egress_vlan_grp_destroy(struct mlx5_vport *vport); +/* Ingress acl helper functions */ +void esw_acl_ingress_table_destroy(struct mlx5_vport *vport); +void esw_acl_ingress_allow_rule_destroy(struct mlx5_vport *vport); + #endif /* __MLX5_ESWITCH_ACL_HELPER_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c new file mode 100644 index 000000000000..9bda4fe2eafa --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "lgcy.h" + +static void esw_acl_ingress_lgcy_rules_destroy(struct mlx5_vport *vport) +{ + if (vport->ingress.legacy.drop_rule) { + mlx5_del_flow_rules(vport->ingress.legacy.drop_rule); + vport->ingress.legacy.drop_rule = NULL; + } + esw_acl_ingress_allow_rule_destroy(vport); +} + +static int esw_acl_ingress_lgcy_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_group *g; + void *match_criteria; + u32 *flow_group_in; + int err; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "vport[%d] ingress create untagged spoofchk flow group, err(%d)\n", + vport->vport, err); + goto spoof_err; + } + vport->ingress.legacy.allow_untagged_spoofchk_grp = g; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "vport[%d] ingress create untagged flow group, err(%d)\n", + vport->vport, err); + goto untagged_err; + } + vport->ingress.legacy.allow_untagged_only_grp = g; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 2); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 2); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "vport[%d] ingress create spoofchk flow group, err(%d)\n", + vport->vport, err); + goto allow_spoof_err; + } + vport->ingress.legacy.allow_spoofchk_only_grp = g; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 3); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 3); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "vport[%d] ingress create drop flow group, err(%d)\n", + vport->vport, err); + goto drop_err; + } + vport->ingress.legacy.drop_grp = g; + kvfree(flow_group_in); + return 0; + +drop_err: + if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_spoofchk_only_grp)) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_spoofchk_only_grp); + vport->ingress.legacy.allow_spoofchk_only_grp = NULL; + } +allow_spoof_err: + if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_untagged_only_grp)) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_only_grp); + vport->ingress.legacy.allow_untagged_only_grp = NULL; + } +untagged_err: + if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_untagged_spoofchk_grp)) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_spoofchk_grp); + vport->ingress.legacy.allow_untagged_spoofchk_grp = NULL; + } +spoof_err: + kvfree(flow_group_in); + return err; +} + +static void esw_acl_ingress_lgcy_groups_destroy(struct mlx5_vport *vport) +{ + if (vport->ingress.legacy.allow_spoofchk_only_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_spoofchk_only_grp); + vport->ingress.legacy.allow_spoofchk_only_grp = NULL; + } + if (vport->ingress.legacy.allow_untagged_only_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_only_grp); + vport->ingress.legacy.allow_untagged_only_grp = NULL; + } + if (vport->ingress.legacy.allow_untagged_spoofchk_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_spoofchk_grp); + vport->ingress.legacy.allow_untagged_spoofchk_grp = NULL; + } + if (vport->ingress.legacy.drop_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.drop_grp); + vport->ingress.legacy.drop_grp = NULL; + } +} + +int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_flow_destination drop_ctr_dst = {}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec = NULL; + struct mlx5_fc *counter = NULL; + /* The ingress acl table contains 4 groups + * (2 active rules at the same time - + * 1 allow rule from one of the first 3 groups. + * 1 drop rule from the last group): + * 1)Allow untagged traffic with smac=original mac. + * 2)Allow untagged traffic. + * 3)Allow traffic with smac=original mac. + * 4)Drop all other traffic. + */ + int table_size = 4; + int dest_num = 0; + int err = 0; + u8 *smac_v; + + esw_acl_ingress_lgcy_rules_destroy(vport); + + if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { + counter = mlx5_fc_create(esw->dev, false); + if (IS_ERR(counter)) + esw_warn(esw->dev, + "vport[%d] configure ingress drop rule counter failed\n", + vport->vport); + vport->ingress.legacy.drop_counter = counter; + } + + if (!vport->info.vlan && !vport->info.qos && !vport->info.spoofchk) { + esw_acl_ingress_lgcy_cleanup(esw, vport); + return 0; + } + + if (!vport->ingress.acl) { + vport->ingress.acl = esw_acl_table_create(esw, vport->vport, + MLX5_FLOW_NAMESPACE_ESW_INGRESS, + table_size); + if (IS_ERR_OR_NULL(vport->ingress.acl)) { + err = PTR_ERR(vport->ingress.acl); + vport->ingress.acl = NULL; + return err; + } + + err = esw_acl_ingress_lgcy_groups_create(esw, vport); + if (err) + goto out; + } + + esw_debug(esw->dev, + "vport[%d] configure ingress rules, vlan(%d) qos(%d)\n", + vport->vport, vport->info.vlan, vport->info.qos); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + + if (vport->info.vlan || vport->info.qos) + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + + if (vport->info.spoofchk) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.smac_15_0); + smac_v = MLX5_ADDR_OF(fte_match_param, + spec->match_value, + outer_headers.smac_47_16); + ether_addr_copy(smac_v, vport->info.mac); + } + + /* Create ingress allow rule */ + memset(spec, 0, sizeof(*spec)); + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + vport->ingress.allow_rule = mlx5_add_flow_rules(vport->ingress.acl, spec, + &flow_act, NULL, 0); + if (IS_ERR(vport->ingress.allow_rule)) { + err = PTR_ERR(vport->ingress.allow_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress allow rule, err(%d)\n", + vport->vport, err); + vport->ingress.allow_rule = NULL; + goto out; + } + + memset(&flow_act, 0, sizeof(flow_act)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + /* Attach drop flow counter */ + if (counter) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter_id = mlx5_fc_id(counter); + dst = &drop_ctr_dst; + dest_num++; + } + vport->ingress.legacy.drop_rule = + mlx5_add_flow_rules(vport->ingress.acl, NULL, + &flow_act, dst, dest_num); + if (IS_ERR(vport->ingress.legacy.drop_rule)) { + err = PTR_ERR(vport->ingress.legacy.drop_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress drop rule, err(%d)\n", + vport->vport, err); + vport->ingress.legacy.drop_rule = NULL; + goto out; + } + kvfree(spec); + return 0; + +out: + esw_acl_ingress_lgcy_cleanup(esw, vport); + kvfree(spec); + return err; +} + +void esw_acl_ingress_lgcy_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->ingress.acl)) + goto clean_drop_counter; + + esw_debug(esw->dev, "Destroy vport[%d] E-Switch ingress ACL\n", vport->vport); + + esw_acl_ingress_lgcy_rules_destroy(vport); + esw_acl_ingress_lgcy_groups_destroy(vport); + esw_acl_ingress_table_destroy(vport); + +clean_drop_counter: + if (!IS_ERR_OR_NULL(vport->ingress.legacy.drop_counter)) { + mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); + vport->ingress.legacy.drop_counter = NULL; + } +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c new file mode 100644 index 000000000000..1bae549f3fa7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "ofld.h" + +static bool +esw_acl_ingress_prio_tag_enabled(const struct mlx5_eswitch *esw, + const struct mlx5_vport *vport) +{ + return (MLX5_CAP_GEN(esw->dev, prio_tag_required) && + mlx5_eswitch_is_vf_vport(esw, vport->vport)); +} + +static int esw_acl_ingress_prio_tag_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + int err = 0; + + /* For prio tag mode, there is only 1 FTEs: + * 1) Untagged packets - push prio tag VLAN and modify metadata if + * required, allow + * Unmatched traffic is allowed by default + */ + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + /* Untagged packets - push prio tag VLAN, allow */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 0); + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | + MLX5_FLOW_CONTEXT_ACTION_ALLOW; + flow_act.vlan[0].ethtype = ETH_P_8021Q; + flow_act.vlan[0].vid = 0; + flow_act.vlan[0].prio = 0; + + if (vport->ingress.offloads.modify_metadata_rule) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; + } + + vport->ingress.allow_rule = mlx5_add_flow_rules(vport->ingress.acl, spec, + &flow_act, NULL, 0); + if (IS_ERR(vport->ingress.allow_rule)) { + err = PTR_ERR(vport->ingress.allow_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress untagged allow rule, err(%d)\n", + vport->vport, err); + vport->ingress.allow_rule = NULL; + } + + kvfree(spec); + return err; +} + +static int esw_acl_ingress_mod_metadata_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct mlx5_flow_act flow_act = {}; + int err = 0; + u32 key; + + key = mlx5_eswitch_get_vport_metadata_for_match(esw, vport->vport); + key >>= ESW_SOURCE_PORT_METADATA_OFFSET; + + MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, action, field, + MLX5_ACTION_IN_FIELD_METADATA_REG_C_0); + MLX5_SET(set_action_in, action, data, key); + MLX5_SET(set_action_in, action, offset, + ESW_SOURCE_PORT_METADATA_OFFSET); + MLX5_SET(set_action_in, action, length, + ESW_SOURCE_PORT_METADATA_BITS); + + vport->ingress.offloads.modify_metadata = + mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS, + 1, action); + if (IS_ERR(vport->ingress.offloads.modify_metadata)) { + err = PTR_ERR(vport->ingress.offloads.modify_metadata); + esw_warn(esw->dev, + "failed to alloc modify header for vport %d ingress acl (%d)\n", + vport->vport, err); + return err; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | MLX5_FLOW_CONTEXT_ACTION_ALLOW; + flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; + vport->ingress.offloads.modify_metadata_rule = + mlx5_add_flow_rules(vport->ingress.acl, + NULL, &flow_act, NULL, 0); + if (IS_ERR(vport->ingress.offloads.modify_metadata_rule)) { + err = PTR_ERR(vport->ingress.offloads.modify_metadata_rule); + esw_warn(esw->dev, + "failed to add setting metadata rule for vport %d ingress acl, err(%d)\n", + vport->vport, err); + mlx5_modify_header_dealloc(esw->dev, vport->ingress.offloads.modify_metadata); + vport->ingress.offloads.modify_metadata_rule = NULL; + } + return err; +} + +static void esw_acl_ingress_mod_metadata_destroy(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (!vport->ingress.offloads.modify_metadata_rule) + return; + + mlx5_del_flow_rules(vport->ingress.offloads.modify_metadata_rule); + mlx5_modify_header_dealloc(esw->dev, vport->ingress.offloads.modify_metadata); + vport->ingress.offloads.modify_metadata_rule = NULL; +} + +static int esw_acl_ingress_ofld_rules_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int err; + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + err = esw_acl_ingress_mod_metadata_create(esw, vport); + if (err) { + esw_warn(esw->dev, + "vport(%d) create ingress modify metadata, err(%d)\n", + vport->vport, err); + return err; + } + } + + if (esw_acl_ingress_prio_tag_enabled(esw, vport)) { + err = esw_acl_ingress_prio_tag_create(esw, vport); + if (err) { + esw_warn(esw->dev, + "vport(%d) create ingress prio tag rule, err(%d)\n", + vport->vport, err); + goto prio_tag_err; + } + } + + return 0; + +prio_tag_err: + esw_acl_ingress_mod_metadata_destroy(esw, vport); + return err; +} + +static void esw_acl_ingress_ofld_rules_destroy(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + esw_acl_ingress_allow_rule_destroy(vport); + esw_acl_ingress_mod_metadata_destroy(esw, vport); +} + +static int esw_acl_ingress_ofld_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + void *match_criteria; + u32 *flow_group_in; + u32 flow_index = 0; + int ret = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + if (esw_acl_ingress_prio_tag_enabled(esw, vport)) { + /* This group is to hold FTE to match untagged packets when prio_tag + * is enabled. + */ + match_criteria = MLX5_ADDR_OF(create_flow_group_in, + flow_group_in, match_criteria); + MLX5_SET(create_flow_group_in, flow_group_in, + match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + ret = PTR_ERR(g); + esw_warn(esw->dev, "vport[%d] ingress create untagged flow group, err(%d)\n", + vport->vport, ret); + goto prio_tag_err; + } + vport->ingress.offloads.metadata_prio_tag_grp = g; + flow_index++; + } + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + /* This group holds an FTE with no match to add metadata for + * tagged packets if prio-tag is enabled, or for all untagged + * traffic in case prio-tag is disabled. + */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + ret = PTR_ERR(g); + esw_warn(esw->dev, "vport[%d] ingress create drop flow group, err(%d)\n", + vport->vport, ret); + goto metadata_err; + } + vport->ingress.offloads.metadata_allmatch_grp = g; + } + + kvfree(flow_group_in); + return 0; + +metadata_err: + if (!IS_ERR_OR_NULL(vport->ingress.offloads.metadata_prio_tag_grp)) { + mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); + vport->ingress.offloads.metadata_prio_tag_grp = NULL; + } +prio_tag_err: + kvfree(flow_group_in); + return ret; +} + +static void esw_acl_ingress_ofld_groups_destroy(struct mlx5_vport *vport) +{ + if (vport->ingress.offloads.metadata_allmatch_grp) { + mlx5_destroy_flow_group(vport->ingress.offloads.metadata_allmatch_grp); + vport->ingress.offloads.metadata_allmatch_grp = NULL; + } + + if (vport->ingress.offloads.metadata_prio_tag_grp) { + mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); + vport->ingress.offloads.metadata_prio_tag_grp = NULL; + } +} + +int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int num_ftes = 0; + int err; + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw) && + !esw_acl_ingress_prio_tag_enabled(esw, vport)) + return 0; + + esw_acl_ingress_allow_rule_destroy(vport); + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) + num_ftes++; + if (esw_acl_ingress_prio_tag_enabled(esw, vport)) + num_ftes++; + + vport->ingress.acl = esw_acl_table_create(esw, vport->vport, + MLX5_FLOW_NAMESPACE_ESW_INGRESS, + num_ftes); + if (IS_ERR_OR_NULL(vport->ingress.acl)) { + err = PTR_ERR(vport->ingress.acl); + vport->ingress.acl = NULL; + return err; + } + + err = esw_acl_ingress_ofld_groups_create(esw, vport); + if (err) + goto group_err; + + esw_debug(esw->dev, + "vport[%d] configure ingress rules\n", vport->vport); + + err = esw_acl_ingress_ofld_rules_create(esw, vport); + if (err) + goto rules_err; + + return 0; + +rules_err: + esw_acl_ingress_ofld_groups_destroy(vport); +group_err: + esw_acl_ingress_table_destroy(vport); + return err; +} + +void esw_acl_ingress_ofld_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + esw_acl_ingress_ofld_rules_destroy(esw, vport); + esw_acl_ingress_ofld_groups_destroy(vport); + esw_acl_ingress_table_destroy(vport); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h index 6b05a3af4462..44c152da3d83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h @@ -10,4 +10,8 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); void esw_acl_egress_lgcy_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +/* Eswitch acl ingress external APIs */ +int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_ingress_lgcy_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + #endif /* __MLX5_ESWITCH_ACL_LGCY_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h index fc912b254226..9e5e0fac29ef 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h @@ -10,4 +10,8 @@ int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); void esw_acl_egress_ofld_cleanup(struct mlx5_vport *vport); +/* Eswitch acl ingress external APIs */ +int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_ingress_ofld_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + #endif /* __MLX5_ESWITCH_ACL_OFLD_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index ae74486b9c9e..20ab13ff2303 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -937,301 +937,6 @@ static void esw_vport_change_handler(struct work_struct *work) mutex_unlock(&esw->state_lock); } -static int -esw_vport_create_legacy_ingress_acl_groups(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); - struct mlx5_core_dev *dev = esw->dev; - struct mlx5_flow_group *g; - void *match_criteria; - u32 *flow_group_in; - int err; - - flow_group_in = kvzalloc(inlen, GFP_KERNEL); - if (!flow_group_in) - return -ENOMEM; - - match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); - - MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - err = PTR_ERR(g); - esw_warn(dev, "vport[%d] ingress create untagged spoofchk flow group, err(%d)\n", - vport->vport, err); - goto spoof_err; - } - vport->ingress.legacy.allow_untagged_spoofchk_grp = g; - - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - err = PTR_ERR(g); - esw_warn(dev, "vport[%d] ingress create untagged flow group, err(%d)\n", - vport->vport, err); - goto untagged_err; - } - vport->ingress.legacy.allow_untagged_only_grp = g; - - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 2); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 2); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - err = PTR_ERR(g); - esw_warn(dev, "vport[%d] ingress create spoofchk flow group, err(%d)\n", - vport->vport, err); - goto allow_spoof_err; - } - vport->ingress.legacy.allow_spoofchk_only_grp = g; - - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 3); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 3); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - err = PTR_ERR(g); - esw_warn(dev, "vport[%d] ingress create drop flow group, err(%d)\n", - vport->vport, err); - goto drop_err; - } - vport->ingress.legacy.drop_grp = g; - kvfree(flow_group_in); - return 0; - -drop_err: - if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_spoofchk_only_grp)) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_spoofchk_only_grp); - vport->ingress.legacy.allow_spoofchk_only_grp = NULL; - } -allow_spoof_err: - if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_untagged_only_grp)) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_only_grp); - vport->ingress.legacy.allow_untagged_only_grp = NULL; - } -untagged_err: - if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_untagged_spoofchk_grp)) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_spoofchk_grp); - vport->ingress.legacy.allow_untagged_spoofchk_grp = NULL; - } -spoof_err: - kvfree(flow_group_in); - return err; -} - -int esw_vport_create_ingress_acl_table(struct mlx5_eswitch *esw, - struct mlx5_vport *vport, int table_size) -{ - struct mlx5_core_dev *dev = esw->dev; - struct mlx5_flow_namespace *root_ns; - struct mlx5_flow_table *acl; - int vport_index; - int err; - - if (!MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) - return -EOPNOTSUPP; - - esw_debug(dev, "Create vport[%d] ingress ACL log_max_size(%d)\n", - vport->vport, MLX5_CAP_ESW_INGRESS_ACL(dev, log_max_ft_size)); - - vport_index = mlx5_eswitch_vport_num_to_index(esw, vport->vport); - root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS, - vport_index); - if (!root_ns) { - esw_warn(dev, "Failed to get E-Switch ingress flow namespace for vport (%d)\n", - vport->vport); - return -EOPNOTSUPP; - } - - acl = mlx5_create_vport_flow_table(root_ns, 0, table_size, 0, vport->vport); - if (IS_ERR(acl)) { - err = PTR_ERR(acl); - esw_warn(dev, "vport[%d] ingress create flow Table, err(%d)\n", - vport->vport, err); - return err; - } - vport->ingress.acl = acl; - return 0; -} - -void esw_vport_destroy_ingress_acl_table(struct mlx5_vport *vport) -{ - if (!vport->ingress.acl) - return; - - mlx5_destroy_flow_table(vport->ingress.acl); - vport->ingress.acl = NULL; -} - -void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (vport->ingress.legacy.drop_rule) { - mlx5_del_flow_rules(vport->ingress.legacy.drop_rule); - vport->ingress.legacy.drop_rule = NULL; - } - - if (vport->ingress.allow_rule) { - mlx5_del_flow_rules(vport->ingress.allow_rule); - vport->ingress.allow_rule = NULL; - } -} - -static void esw_vport_disable_legacy_ingress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (!vport->ingress.acl) - return; - - esw_debug(esw->dev, "Destroy vport[%d] E-Switch ingress ACL\n", vport->vport); - - esw_vport_cleanup_ingress_rules(esw, vport); - if (vport->ingress.legacy.allow_spoofchk_only_grp) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_spoofchk_only_grp); - vport->ingress.legacy.allow_spoofchk_only_grp = NULL; - } - if (vport->ingress.legacy.allow_untagged_only_grp) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_only_grp); - vport->ingress.legacy.allow_untagged_only_grp = NULL; - } - if (vport->ingress.legacy.allow_untagged_spoofchk_grp) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_spoofchk_grp); - vport->ingress.legacy.allow_untagged_spoofchk_grp = NULL; - } - if (vport->ingress.legacy.drop_grp) { - mlx5_destroy_flow_group(vport->ingress.legacy.drop_grp); - vport->ingress.legacy.drop_grp = NULL; - } - esw_vport_destroy_ingress_acl_table(vport); -} - -static int esw_vport_ingress_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - struct mlx5_fc *counter = vport->ingress.legacy.drop_counter; - struct mlx5_flow_destination drop_ctr_dst = {0}; - struct mlx5_flow_destination *dst = NULL; - struct mlx5_flow_act flow_act = {0}; - struct mlx5_flow_spec *spec = NULL; - int dest_num = 0; - int err = 0; - u8 *smac_v; - - /* The ingress acl table contains 4 groups - * (2 active rules at the same time - - * 1 allow rule from one of the first 3 groups. - * 1 drop rule from the last group): - * 1)Allow untagged traffic with smac=original mac. - * 2)Allow untagged traffic. - * 3)Allow traffic with smac=original mac. - * 4)Drop all other traffic. - */ - int table_size = 4; - - esw_vport_cleanup_ingress_rules(esw, vport); - - if (!vport->info.vlan && !vport->info.qos && !vport->info.spoofchk) { - esw_vport_disable_legacy_ingress_acl(esw, vport); - return 0; - } - - if (!vport->ingress.acl) { - err = esw_vport_create_ingress_acl_table(esw, vport, table_size); - if (err) { - esw_warn(esw->dev, - "vport[%d] enable ingress acl err (%d)\n", - err, vport->vport); - return err; - } - - err = esw_vport_create_legacy_ingress_acl_groups(esw, vport); - if (err) - goto out; - } - - esw_debug(esw->dev, - "vport[%d] configure ingress rules, vlan(%d) qos(%d)\n", - vport->vport, vport->info.vlan, vport->info.qos); - - spec = kvzalloc(sizeof(*spec), GFP_KERNEL); - if (!spec) { - err = -ENOMEM; - goto out; - } - - if (vport->info.vlan || vport->info.qos) - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); - - if (vport->info.spoofchk) { - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_47_16); - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_15_0); - smac_v = MLX5_ADDR_OF(fte_match_param, - spec->match_value, - outer_headers.smac_47_16); - ether_addr_copy(smac_v, vport->info.mac); - } - - spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; - vport->ingress.allow_rule = - mlx5_add_flow_rules(vport->ingress.acl, spec, - &flow_act, NULL, 0); - if (IS_ERR(vport->ingress.allow_rule)) { - err = PTR_ERR(vport->ingress.allow_rule); - esw_warn(esw->dev, - "vport[%d] configure ingress allow rule, err(%d)\n", - vport->vport, err); - vport->ingress.allow_rule = NULL; - goto out; - } - - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; - - /* Attach drop flow counter */ - if (counter) { - flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; - drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - drop_ctr_dst.counter_id = mlx5_fc_id(counter); - dst = &drop_ctr_dst; - dest_num++; - } - vport->ingress.legacy.drop_rule = - mlx5_add_flow_rules(vport->ingress.acl, NULL, - &flow_act, dst, dest_num); - if (IS_ERR(vport->ingress.legacy.drop_rule)) { - err = PTR_ERR(vport->ingress.legacy.drop_rule); - esw_warn(esw->dev, - "vport[%d] configure ingress drop rule, err(%d)\n", - vport->vport, err); - vport->ingress.legacy.drop_rule = NULL; - goto out; - } - kvfree(spec); - return 0; - -out: - esw_vport_disable_legacy_ingress_acl(esw, vport); - kvfree(spec); - return err; -} - static bool element_type_supported(struct mlx5_eswitch *esw, int type) { const struct mlx5_core_dev *dev = esw->dev; @@ -1443,17 +1148,7 @@ static int esw_vport_create_legacy_acl_tables(struct mlx5_eswitch *esw, if (mlx5_esw_is_manager_vport(esw, vport->vport)) return 0; - if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { - vport->ingress.legacy.drop_counter = mlx5_fc_create(esw->dev, false); - if (IS_ERR(vport->ingress.legacy.drop_counter)) { - esw_warn(esw->dev, - "vport[%d] configure ingress drop rule counter failed\n", - vport->vport); - vport->ingress.legacy.drop_counter = NULL; - } - } - - ret = esw_vport_ingress_config(esw, vport); + ret = esw_acl_ingress_lgcy_setup(esw, vport); if (ret) goto ingress_err; @@ -1464,10 +1159,8 @@ static int esw_vport_create_legacy_acl_tables(struct mlx5_eswitch *esw, return 0; egress_err: - esw_vport_disable_legacy_ingress_acl(esw, vport); + esw_acl_ingress_lgcy_cleanup(esw, vport); ingress_err: - mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); - vport->ingress.legacy.drop_counter = NULL; return ret; } @@ -1488,10 +1181,7 @@ static void esw_vport_destroy_legacy_acl_tables(struct mlx5_eswitch *esw, return; esw_acl_egress_lgcy_cleanup(esw, vport); - - esw_vport_disable_legacy_ingress_acl(esw, vport); - mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); - vport->ingress.legacy.drop_counter = NULL; + esw_acl_ingress_lgcy_cleanup(esw, vport); } static void esw_vport_cleanup_acl(struct mlx5_eswitch *esw, @@ -2123,7 +1813,7 @@ int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, ether_addr_copy(evport->info.mac, mac); evport->info.node_guid = node_guid; if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) - err = esw_vport_ingress_config(esw, evport); + err = esw_acl_ingress_lgcy_setup(esw, evport); unlock: mutex_unlock(&esw->state_lock); @@ -2205,7 +1895,7 @@ int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, evport->info.vlan = vlan; evport->info.qos = qos; if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) { - err = esw_vport_ingress_config(esw, evport); + err = esw_acl_ingress_lgcy_setup(esw, evport); if (err) return err; err = esw_acl_egress_lgcy_setup(esw, evport); @@ -2250,7 +1940,7 @@ int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw, "Spoofchk in set while MAC is invalid, vport(%d)\n", evport->vport); if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) - err = esw_vport_ingress_config(esw, evport); + err = esw_acl_ingress_lgcy_setup(esw, evport); if (err) evport->info.spoofchk = pschk; mutex_unlock(&esw->state_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 490410401631..ca7b7961c295 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -285,12 +285,6 @@ void esw_offloads_disable(struct mlx5_eswitch *esw); int esw_offloads_enable(struct mlx5_eswitch *esw); void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw); int esw_offloads_init_reps(struct mlx5_eswitch *esw); -void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw, - struct mlx5_vport *vport); -int esw_vport_create_ingress_acl_table(struct mlx5_eswitch *esw, - struct mlx5_vport *vport, - int table_size); -void esw_vport_destroy_ingress_acl_table(struct mlx5_vport *vport); int mlx5_esw_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32 rate_mbps); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 0b00b30187ce..11bc9cc1d5f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -235,13 +235,6 @@ static struct mlx5_eswitch_rep *mlx5_eswitch_get_rep(struct mlx5_eswitch *esw, return &esw->offloads.vport_reps[idx]; } -static bool -esw_check_ingress_prio_tag_enabled(const struct mlx5_eswitch *esw, - const struct mlx5_vport *vport) -{ - return (MLX5_CAP_GEN(esw->dev, prio_tag_required) && - mlx5_eswitch_is_vf_vport(esw, vport->vport)); -} static void mlx5_eswitch_set_rule_source_port(struct mlx5_eswitch *esw, @@ -1852,248 +1845,6 @@ static void esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw) mlx5_devcom_unregister_component(devcom, MLX5_DEVCOM_ESW_OFFLOADS); } -static int esw_vport_ingress_prio_tag_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - struct mlx5_flow_act flow_act = {0}; - struct mlx5_flow_spec *spec; - int err = 0; - - /* For prio tag mode, there is only 1 FTEs: - * 1) Untagged packets - push prio tag VLAN and modify metadata if - * required, allow - * Unmatched traffic is allowed by default - */ - spec = kvzalloc(sizeof(*spec), GFP_KERNEL); - if (!spec) - return -ENOMEM; - - /* Untagged packets - push prio tag VLAN, allow */ - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); - MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 0); - spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | - MLX5_FLOW_CONTEXT_ACTION_ALLOW; - flow_act.vlan[0].ethtype = ETH_P_8021Q; - flow_act.vlan[0].vid = 0; - flow_act.vlan[0].prio = 0; - - if (vport->ingress.offloads.modify_metadata_rule) { - flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; - flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; - } - - vport->ingress.allow_rule = - mlx5_add_flow_rules(vport->ingress.acl, spec, - &flow_act, NULL, 0); - if (IS_ERR(vport->ingress.allow_rule)) { - err = PTR_ERR(vport->ingress.allow_rule); - esw_warn(esw->dev, - "vport[%d] configure ingress untagged allow rule, err(%d)\n", - vport->vport, err); - vport->ingress.allow_rule = NULL; - } - - kvfree(spec); - return err; -} - -static int esw_vport_add_ingress_acl_modify_metadata(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; - struct mlx5_flow_act flow_act = {}; - int err = 0; - u32 key; - - key = mlx5_eswitch_get_vport_metadata_for_match(esw, vport->vport); - key >>= ESW_SOURCE_PORT_METADATA_OFFSET; - - MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); - MLX5_SET(set_action_in, action, field, - MLX5_ACTION_IN_FIELD_METADATA_REG_C_0); - MLX5_SET(set_action_in, action, data, key); - MLX5_SET(set_action_in, action, offset, - ESW_SOURCE_PORT_METADATA_OFFSET); - MLX5_SET(set_action_in, action, length, - ESW_SOURCE_PORT_METADATA_BITS); - - vport->ingress.offloads.modify_metadata = - mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS, - 1, action); - if (IS_ERR(vport->ingress.offloads.modify_metadata)) { - err = PTR_ERR(vport->ingress.offloads.modify_metadata); - esw_warn(esw->dev, - "failed to alloc modify header for vport %d ingress acl (%d)\n", - vport->vport, err); - return err; - } - - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | MLX5_FLOW_CONTEXT_ACTION_ALLOW; - flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; - vport->ingress.offloads.modify_metadata_rule = - mlx5_add_flow_rules(vport->ingress.acl, - NULL, &flow_act, NULL, 0); - if (IS_ERR(vport->ingress.offloads.modify_metadata_rule)) { - err = PTR_ERR(vport->ingress.offloads.modify_metadata_rule); - esw_warn(esw->dev, - "failed to add setting metadata rule for vport %d ingress acl, err(%d)\n", - vport->vport, err); - mlx5_modify_header_dealloc(esw->dev, vport->ingress.offloads.modify_metadata); - vport->ingress.offloads.modify_metadata_rule = NULL; - } - return err; -} - -static void esw_vport_del_ingress_acl_modify_metadata(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (vport->ingress.offloads.modify_metadata_rule) { - mlx5_del_flow_rules(vport->ingress.offloads.modify_metadata_rule); - mlx5_modify_header_dealloc(esw->dev, vport->ingress.offloads.modify_metadata); - - vport->ingress.offloads.modify_metadata_rule = NULL; - } -} - -static int esw_vport_create_ingress_acl_group(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); - struct mlx5_flow_group *g; - void *match_criteria; - u32 *flow_group_in; - u32 flow_index = 0; - int ret = 0; - - flow_group_in = kvzalloc(inlen, GFP_KERNEL); - if (!flow_group_in) - return -ENOMEM; - - if (esw_check_ingress_prio_tag_enabled(esw, vport)) { - /* This group is to hold FTE to match untagged packets when prio_tag - * is enabled. - */ - memset(flow_group_in, 0, inlen); - - match_criteria = MLX5_ADDR_OF(create_flow_group_in, - flow_group_in, match_criteria); - MLX5_SET(create_flow_group_in, flow_group_in, - match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - ret = PTR_ERR(g); - esw_warn(esw->dev, "vport[%d] ingress create untagged flow group, err(%d)\n", - vport->vport, ret); - goto prio_tag_err; - } - vport->ingress.offloads.metadata_prio_tag_grp = g; - flow_index++; - } - - if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { - /* This group holds an FTE with no matches for add metadata for - * tagged packets, if prio-tag is enabled (as a fallthrough), - * or all traffic in case prio-tag is disabled. - */ - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - ret = PTR_ERR(g); - esw_warn(esw->dev, "vport[%d] ingress create drop flow group, err(%d)\n", - vport->vport, ret); - goto metadata_err; - } - vport->ingress.offloads.metadata_allmatch_grp = g; - } - - kvfree(flow_group_in); - return 0; - -metadata_err: - if (!IS_ERR_OR_NULL(vport->ingress.offloads.metadata_prio_tag_grp)) { - mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); - vport->ingress.offloads.metadata_prio_tag_grp = NULL; - } -prio_tag_err: - kvfree(flow_group_in); - return ret; -} - -static void esw_vport_destroy_ingress_acl_group(struct mlx5_vport *vport) -{ - if (vport->ingress.offloads.metadata_allmatch_grp) { - mlx5_destroy_flow_group(vport->ingress.offloads.metadata_allmatch_grp); - vport->ingress.offloads.metadata_allmatch_grp = NULL; - } - - if (vport->ingress.offloads.metadata_prio_tag_grp) { - mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); - vport->ingress.offloads.metadata_prio_tag_grp = NULL; - } -} - -static int esw_vport_ingress_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int num_ftes = 0; - int err; - - if (!mlx5_eswitch_vport_match_metadata_enabled(esw) && - !esw_check_ingress_prio_tag_enabled(esw, vport)) - return 0; - - esw_vport_cleanup_ingress_rules(esw, vport); - - if (mlx5_eswitch_vport_match_metadata_enabled(esw)) - num_ftes++; - if (esw_check_ingress_prio_tag_enabled(esw, vport)) - num_ftes++; - - err = esw_vport_create_ingress_acl_table(esw, vport, num_ftes); - if (err) { - esw_warn(esw->dev, - "failed to enable ingress acl (%d) on vport[%d]\n", - err, vport->vport); - return err; - } - - err = esw_vport_create_ingress_acl_group(esw, vport); - if (err) - goto group_err; - - esw_debug(esw->dev, - "vport[%d] configure ingress rules\n", vport->vport); - - if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { - err = esw_vport_add_ingress_acl_modify_metadata(esw, vport); - if (err) - goto metadata_err; - } - - if (esw_check_ingress_prio_tag_enabled(esw, vport)) { - err = esw_vport_ingress_prio_tag_config(esw, vport); - if (err) - goto prio_tag_err; - } - return 0; - -prio_tag_err: - esw_vport_del_ingress_acl_modify_metadata(esw, vport); -metadata_err: - esw_vport_destroy_ingress_acl_group(vport); -group_err: - esw_vport_destroy_ingress_acl_table(vport); - return err; -} - static bool esw_check_vport_match_metadata_supported(const struct mlx5_eswitch *esw) { @@ -2132,19 +1883,20 @@ esw_vport_create_offloads_acl_tables(struct mlx5_eswitch *esw, { int err; - err = esw_vport_ingress_config(esw, vport); + err = esw_acl_ingress_ofld_setup(esw, vport); if (err) return err; if (mlx5_eswitch_is_vf_vport(esw, vport->vport)) { err = esw_acl_egress_ofld_setup(esw, vport); - if (err) { - esw_vport_cleanup_ingress_rules(esw, vport); - esw_vport_del_ingress_acl_modify_metadata(esw, vport); - esw_vport_destroy_ingress_acl_group(vport); - esw_vport_destroy_ingress_acl_table(vport); - } + if (err) + goto egress_err; } + + return 0; + +egress_err: + esw_acl_ingress_ofld_cleanup(esw, vport); return err; } @@ -2153,10 +1905,7 @@ esw_vport_destroy_offloads_acl_tables(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { esw_acl_egress_ofld_cleanup(vport); - esw_vport_cleanup_ingress_rules(esw, vport); - esw_vport_del_ingress_acl_modify_metadata(esw, vport); - esw_vport_destroy_ingress_acl_group(vport); - esw_vport_destroy_ingress_acl_table(vport); + esw_acl_ingress_ofld_cleanup(esw, vport); } static int esw_create_uplink_offloads_acl_tables(struct mlx5_eswitch *esw) -- cgit v1.2.3-59-g8ed1b From bf773dc0e6d55a828a9111124b1d7836f2d4492c Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Mon, 16 Mar 2020 17:32:50 -0700 Subject: net/mlx5: E-Switch, Introduce APIs to enable egress acl forward-to-vport rule By default, e-switch vport's egress acl just forward packets to its counterpart NIC vport using existing egress acl table. During port failover in bonding scenario where two VFs representors are bonded, the egress acl forward-to-vport rule will be added to the existing egress acl table of e-switch vport of passive/inactive slave representor to forward packets to other NIC vport ie. the active slave representor's NIC vport to handle egress "failover" traffic. Enable egress acl and have APIs to create and destroy egress acl forward-to-vport rule and group. Signed-off-by: Vu Pham Reviewed-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/esw/acl/egress_ofld.c | 185 ++++++++++++++++++--- .../net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h | 10 ++ drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 16 +- 3 files changed, 187 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c index 49a53ebf56dd..07b2acd7e6b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c @@ -6,55 +6,165 @@ #include "helper.h" #include "ofld.h" +static void esw_acl_egress_ofld_fwd2vport_destroy(struct mlx5_vport *vport) +{ + if (!vport->egress.offloads.fwd_rule) + return; + + mlx5_del_flow_rules(vport->egress.offloads.fwd_rule); + vport->egress.offloads.fwd_rule = NULL; +} + +static int esw_acl_egress_ofld_fwd2vport_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest) +{ + struct mlx5_flow_act flow_act = {}; + int err = 0; + + esw_debug(esw->dev, "vport(%d) configure egress acl rule fwd2vport(%d)\n", + vport->vport, fwd_dest->vport.num); + + /* Delete the old egress forward-to-vport rule if any */ + esw_acl_egress_ofld_fwd2vport_destroy(vport); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + vport->egress.offloads.fwd_rule = + mlx5_add_flow_rules(vport->egress.acl, NULL, + &flow_act, fwd_dest, 1); + if (IS_ERR(vport->egress.offloads.fwd_rule)) { + err = PTR_ERR(vport->egress.offloads.fwd_rule); + esw_warn(esw->dev, + "vport(%d) failed to add fwd2vport acl rule err(%d)\n", + vport->vport, err); + vport->egress.offloads.fwd_rule = NULL; + } + + return err; +} + static int esw_acl_egress_ofld_rules_create(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) + struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest) { - if (!MLX5_CAP_GEN(esw->dev, prio_tag_required)) - return 0; + int err = 0; + int action; + + if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) { + /* For prio tag mode, there is only 1 FTEs: + * 1) prio tag packets - pop the prio tag VLAN, allow + * Unmatched traffic is allowed by default + */ + esw_debug(esw->dev, + "vport[%d] configure prio tag egress rules\n", vport->vport); + + action = MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; + action |= fwd_dest ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + /* prio tag vlan rule - pop it so vport receives untagged packets */ + err = esw_egress_acl_vlan_create(esw, vport, fwd_dest, 0, action); + if (err) + goto prio_err; + } - /* For prio tag mode, there is only 1 FTEs: - * 1) prio tag packets - pop the prio tag VLAN, allow - * Unmatched traffic is allowed by default - */ - esw_debug(esw->dev, - "vport[%d] configure prio tag egress rules\n", vport->vport); + if (fwd_dest) { + err = esw_acl_egress_ofld_fwd2vport_create(esw, vport, fwd_dest); + if (err) + goto fwd_err; + } - /* prio tag vlan rule - pop it so vport receives untagged packets */ - return esw_egress_acl_vlan_create(esw, vport, NULL, 0, - MLX5_FLOW_CONTEXT_ACTION_VLAN_POP | - MLX5_FLOW_CONTEXT_ACTION_ALLOW); + return 0; + +fwd_err: + esw_acl_egress_vlan_destroy(vport); +prio_err: + return err; } static void esw_acl_egress_ofld_rules_destroy(struct mlx5_vport *vport) { esw_acl_egress_vlan_destroy(vport); + esw_acl_egress_ofld_fwd2vport_destroy(vport); } static int esw_acl_egress_ofld_groups_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { - if (!MLX5_CAP_GEN(esw->dev, prio_tag_required)) - return 0; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fwd_grp; + u32 *flow_group_in; + u32 flow_index = 0; + int ret = 0; + + if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) { + ret = esw_acl_egress_vlan_grp_create(esw, vport); + if (ret) + return ret; + + flow_index++; + } + + if (!mlx5_esw_acl_egress_fwd2vport_supported(esw)) + goto out; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) { + ret = -ENOMEM; + goto fwd_grp_err; + } + + /* This group holds 1 FTE to forward all packets to other vport + * when bond vports is supported. + */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + fwd_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(fwd_grp)) { + ret = PTR_ERR(fwd_grp); + esw_warn(esw->dev, + "Failed to create vport[%d] egress fwd2vport flow group, err(%d)\n", + vport->vport, ret); + kvfree(flow_group_in); + goto fwd_grp_err; + } + vport->egress.offloads.fwd_grp = fwd_grp; + kvfree(flow_group_in); + return 0; - return esw_acl_egress_vlan_grp_create(esw, vport); +fwd_grp_err: + esw_acl_egress_vlan_grp_destroy(vport); +out: + return ret; } static void esw_acl_egress_ofld_groups_destroy(struct mlx5_vport *vport) { + if (!IS_ERR_OR_NULL(vport->egress.offloads.fwd_grp)) { + mlx5_destroy_flow_group(vport->egress.offloads.fwd_grp); + vport->egress.offloads.fwd_grp = NULL; + } esw_acl_egress_vlan_grp_destroy(vport); } int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { + int table_size = 0; int err; - if (!MLX5_CAP_GEN(esw->dev, prio_tag_required)) + if (!mlx5_esw_acl_egress_fwd2vport_supported(esw) && + !MLX5_CAP_GEN(esw->dev, prio_tag_required)) return 0; esw_acl_egress_ofld_rules_destroy(vport); + if (mlx5_esw_acl_egress_fwd2vport_supported(esw)) + table_size++; + if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) + table_size++; vport->egress.acl = esw_acl_table_create(esw, vport->vport, - MLX5_FLOW_NAMESPACE_ESW_EGRESS, 0); + MLX5_FLOW_NAMESPACE_ESW_EGRESS, table_size); if (IS_ERR_OR_NULL(vport->egress.acl)) { err = PTR_ERR(vport->egress.acl); vport->egress.acl = NULL; @@ -67,7 +177,7 @@ int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport esw_debug(esw->dev, "vport[%d] configure egress rules\n", vport->vport); - err = esw_acl_egress_ofld_rules_create(esw, vport); + err = esw_acl_egress_ofld_rules_create(esw, vport, NULL); if (err) goto rules_err; @@ -86,3 +196,40 @@ void esw_acl_egress_ofld_cleanup(struct mlx5_vport *vport) esw_acl_egress_ofld_groups_destroy(vport); esw_acl_egress_table_destroy(vport); } + +int mlx5_esw_acl_egress_vport_bond(struct mlx5_eswitch *esw, u16 active_vport_num, + u16 passive_vport_num) +{ + struct mlx5_vport *passive_vport = mlx5_eswitch_get_vport(esw, passive_vport_num); + struct mlx5_vport *active_vport = mlx5_eswitch_get_vport(esw, active_vport_num); + struct mlx5_flow_destination fwd_dest = {}; + + if (IS_ERR(active_vport)) + return PTR_ERR(active_vport); + if (IS_ERR(passive_vport)) + return PTR_ERR(passive_vport); + + /* Cleanup and recreate rules WITHOUT fwd2vport of active vport */ + esw_acl_egress_ofld_rules_destroy(active_vport); + esw_acl_egress_ofld_rules_create(esw, active_vport, NULL); + + /* Cleanup and recreate all rules + fwd2vport rule of passive vport to forward */ + esw_acl_egress_ofld_rules_destroy(passive_vport); + fwd_dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + fwd_dest.vport.num = active_vport_num; + fwd_dest.vport.vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); + fwd_dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + + return esw_acl_egress_ofld_rules_create(esw, passive_vport, &fwd_dest); +} + +int mlx5_esw_acl_egress_vport_unbond(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + if (IS_ERR(vport)) + return PTR_ERR(vport); + + esw_acl_egress_ofld_rules_destroy(vport); + return esw_acl_egress_ofld_rules_create(esw, vport, NULL); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h index 9e5e0fac29ef..90ddc5d7da46 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h @@ -9,6 +9,16 @@ /* Eswitch acl egress external APIs */ int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); void esw_acl_egress_ofld_cleanup(struct mlx5_vport *vport); +int mlx5_esw_acl_egress_vport_bond(struct mlx5_eswitch *esw, u16 active_vport_num, + u16 passive_vport_num); +int mlx5_esw_acl_egress_vport_unbond(struct mlx5_eswitch *esw, u16 vport_num); + +static inline bool mlx5_esw_acl_egress_fwd2vport_supported(struct mlx5_eswitch *esw) +{ + return esw && esw->mode == MLX5_ESWITCH_OFFLOADS && + mlx5_eswitch_vport_match_metadata_enabled(esw) && + MLX5_CAP_ESW_FLOWTABLE(esw->dev, egress_acl_forward_to_vport); +} /* Eswitch acl ingress external APIs */ int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index ca7b7961c295..7b6b3686b666 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -101,11 +101,17 @@ struct vport_egress { struct mlx5_flow_table *acl; struct mlx5_flow_handle *allowed_vlan; struct mlx5_flow_group *vlan_grp; - struct { - struct mlx5_flow_group *drop_grp; - struct mlx5_flow_handle *drop_rule; - struct mlx5_fc *drop_counter; - } legacy; + union { + struct { + struct mlx5_flow_group *drop_grp; + struct mlx5_flow_handle *drop_rule; + struct mlx5_fc *drop_counter; + } legacy; + struct { + struct mlx5_flow_group *fwd_grp; + struct mlx5_flow_handle *fwd_rule; + } offloads; + }; }; struct mlx5_vport_drop_stats { -- cgit v1.2.3-59-g8ed1b From 7e51891a237f9ea319f53f9beb83afb0077d88e6 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Fri, 21 Jun 2019 13:23:44 -0700 Subject: net/mlx5e: Use netdev events to set/del egress acl forward-to-vport rule Register a notifier block to handle netdev events for bond device of non-uplink representors to support eswitch vports bonding. When a non-uplink representor is a lower dev (slave) of bond and becomes active, adding egress acl forward-to-vport rule of all slave netdevs (active + standby) to forward to this representor's vport. Use change lower netdev event to do this. Use change upper event to detect slave representor unslaved from lag device to delete its vport egress acl forward rule if any. Signed-off-by: Or Gerlitz Signed-off-by: Vu Pham Reviewed-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 3 +- .../net/ethernet/mellanox/mlx5/core/en/rep/bond.c | 161 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 8 +- drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 7 + 4 files changed, 175 insertions(+), 4 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 3934dc258041..b61e47bc16e8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -34,7 +34,8 @@ mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o -mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o lib/geneve.o lib/port_tun.o lag_mp.o +mlx5_core-$(CONFIG_MLX5_ESWITCH) += lag_mp.o lib/geneve.o lib/port_tun.o \ + en_rep.o en/rep/bond.o mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \ en/mapping.o esw/chains.o en/tc_tun.o \ en/tc_tun_vxlan.o en/tc_tun_gre.o en/tc_tun_geneve.o \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c new file mode 100644 index 000000000000..d0aab36f1947 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include + +#include "mlx5_core.h" +#include "eswitch.h" +#include "esw/acl/ofld.h" +#include "en_rep.h" + +struct mlx5e_rep_bond { + struct notifier_block nb; + struct netdev_net_notifier nn; +}; + +static bool mlx5e_rep_is_lag_netdev(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + /* A given netdev is not a representor or not a slave of LAG configuration */ + if (!mlx5e_eswitch_rep(netdev) || !bond_slave_get_rtnl(netdev)) + return false; + + /* Egress acl forward to vport is supported only non-uplink representor */ + return rpriv->rep->vport != MLX5_VPORT_UPLINK; +} + +static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *ptr) +{ + struct netdev_notifier_changelowerstate_info *info; + struct netdev_lag_lower_state_info *lag_info; + struct mlx5e_rep_priv *rpriv; + struct net_device *lag_dev; + struct mlx5e_priv *priv; + struct list_head *iter; + struct net_device *dev; + u16 acl_vport_num; + u16 fwd_vport_num; + + if (!mlx5e_rep_is_lag_netdev(netdev)) + return; + + info = ptr; + lag_info = info->lower_state_info; + /* This is not an event of a representor becoming active slave */ + if (!lag_info->tx_enabled) + return; + + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + fwd_vport_num = rpriv->rep->vport; + lag_dev = netdev_master_upper_dev_get(netdev); + + netdev_dbg(netdev, "lag_dev(%s)'s slave vport(%d) is txable(%d)\n", + lag_dev->name, fwd_vport_num, net_lag_port_dev_txable(netdev)); + + /* Point everyone's egress acl to the vport of the active representor */ + netdev_for_each_lower_dev(lag_dev, dev, iter) { + priv = netdev_priv(dev); + rpriv = priv->ppriv; + acl_vport_num = rpriv->rep->vport; + if (acl_vport_num != fwd_vport_num) { + mlx5_esw_acl_egress_vport_bond(priv->mdev->priv.eswitch, + fwd_vport_num, + acl_vport_num); + } + } +} + +static void mlx5e_rep_changeupper_event(struct net_device *netdev, void *ptr) +{ + struct netdev_notifier_changeupper_info *info = ptr; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *priv; + + if (!mlx5e_rep_is_lag_netdev(netdev)) + return; + + /* Nothing to setup for new enslaved representor */ + if (info->linking) + return; + + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + netdev_dbg(netdev, "Unslave, reset vport(%d) egress acl\n", rpriv->rep->vport); + + /* Reset all egress acl rules of unslave representor's vport */ + mlx5_esw_acl_egress_vport_unbond(priv->mdev->priv.eswitch, + rpriv->rep->vport); +} + +/* Bond device of representors and netdev events are used here in specific way + * to support eswitch vports bonding and to perform failover of eswitch vport + * by modifying the vport's egress acl of lower dev representors. Thus this + * also change the traditional behavior of lower dev under bond device. + * All non-representor netdevs or representors of other vendors as lower dev + * of bond device are not supported. + */ +static int mlx5e_rep_esw_bond_netevent(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_CHANGELOWERSTATE: + mlx5e_rep_changelowerstate_event(netdev, ptr); + break; + case NETDEV_CHANGEUPPER: + mlx5e_rep_changeupper_event(netdev, ptr); + break; + } + return NOTIFY_DONE; +} + +/* If HW support eswitch vports bonding, register a specific notifier to + * handle it when two or more representors are bonded + */ +int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv; + int ret = 0; + + priv = netdev_priv(netdev); + if (!mlx5_esw_acl_egress_fwd2vport_supported(priv->mdev->priv.eswitch)) + goto out; + + uplink_priv->bond = kvzalloc(sizeof(*uplink_priv->bond), GFP_KERNEL); + if (!uplink_priv->bond) { + ret = -ENOMEM; + goto out; + } + + uplink_priv->bond->nb.notifier_call = mlx5e_rep_esw_bond_netevent; + ret = register_netdevice_notifier_dev_net(netdev, + &uplink_priv->bond->nb, + &uplink_priv->bond->nn); + if (ret) { + netdev_err(netdev, "register bonding netevent notifier, err(%d)\n", ret); + kvfree(uplink_priv->bond); + uplink_priv->bond = NULL; + } +out: + return ret; +} + +void mlx5e_rep_bond_cleanup(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + + if (!mlx5_esw_acl_egress_fwd2vport_supported(priv->mdev->priv.eswitch) || + !rpriv->uplink_priv.bond) + return; + + unregister_netdevice_notifier_dev_net(rpriv->netdev, + &rpriv->uplink_priv.bond->nb, + &rpriv->uplink_priv.bond->nn); + kvfree(rpriv->uplink_priv.bond); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 4e13e37a9ecd..12593d75e885 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -959,16 +959,18 @@ static int mlx5e_init_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev); + mlx5e_rep_bond_init(rpriv); err = mlx5e_rep_tc_netdevice_event_register(rpriv); if (err) { mlx5_core_err(priv->mdev, "Failed to register netdev notifier, err: %d\n", err); - goto tc_rep_cleanup; + goto err_event_reg; } return 0; -tc_rep_cleanup: +err_event_reg: + mlx5e_rep_bond_cleanup(rpriv); mlx5e_rep_tc_cleanup(rpriv); return err; } @@ -1001,7 +1003,7 @@ static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) { mlx5e_rep_tc_netdevice_event_unregister(rpriv); mlx5e_rep_indr_clean_block_privs(rpriv); - + mlx5e_rep_bond_cleanup(rpriv); mlx5e_rep_tc_cleanup(rpriv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 1c4af8522467..7e56787aa224 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -56,6 +56,7 @@ struct mlx5e_neigh_update_table { }; struct mlx5_tc_ct_priv; +struct mlx5e_rep_bond; struct mlx5_rep_uplink_priv { /* Filters DB - instantiated by the uplink representor and shared by * the uplink's VFs @@ -89,6 +90,9 @@ struct mlx5_rep_uplink_priv { struct mapping_ctx *tunnel_enc_opts_mapping; struct mlx5_tc_ct_priv *ct_priv; + + /* support eswitch vports bonding */ + struct mlx5e_rep_bond *bond; }; struct mlx5e_rep_priv { @@ -211,6 +215,9 @@ struct mlx5e_rep_sq { void mlx5e_rep_register_vport_reps(struct mlx5_core_dev *mdev); void mlx5e_rep_unregister_vport_reps(struct mlx5_core_dev *mdev); +int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv); +void mlx5e_rep_bond_cleanup(struct mlx5e_rep_priv *rpriv); + bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv); int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv); void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv); -- cgit v1.2.3-59-g8ed1b From 553f9328385d954644d74dedb655f85b687a9470 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Fri, 2 Aug 2019 16:13:10 -0700 Subject: net/mlx5e: Support tc block sharing for representors Currently offloading a rule over a tc block shared by multiple representors fails because an e-switch global hashtable to keep the mapping from tc cookies to mlx5e flow instances is used, and tc block sharing offloads the same rule/cookie multiple times, each time for different representor sharing the tc block. Changing the implementation and behavior by acknowledging and returning success if the same rule/cookie is offloaded again to other slave representor sharing the tc block by setting, checking and comparing the netdev that added the rule first. Signed-off-by: Vu Pham Reviewed-by: Parav Pandit Reviewed-by: Roi Dayan Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 571da14809fe..f3e65a15c950 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -145,6 +145,7 @@ struct mlx5e_tc_flow { struct list_head hairpin; /* flows sharing the same hairpin */ struct list_head peer; /* flows with peer flow */ struct list_head unready; /* flows not ready to be offloaded (e.g due to missing route) */ + struct net_device *orig_dev; /* netdev adding flow first */ int tmp_efi_index; struct list_head tmp_list; /* temporary flow list used by neigh update */ refcount_t refcnt; @@ -4624,11 +4625,21 @@ mlx5e_tc_add_flow(struct mlx5e_priv *priv, return err; } +static bool is_flow_rule_duplicate_allowed(struct net_device *dev, + struct mlx5e_rep_priv *rpriv) +{ + /* Offloaded flow rule is allowed to duplicate on non-uplink representor + * sharing tc block with other slaves of a lag device. + */ + return netif_is_lag_port(dev) && rpriv->rep->vport != MLX5_VPORT_UPLINK; +} + int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, struct flow_cls_offload *f, unsigned long flags) { struct netlink_ext_ack *extack = f->common.extack; struct rhashtable *tc_ht = get_tc_ht(priv, flags); + struct mlx5e_rep_priv *rpriv = priv->ppriv; struct mlx5e_tc_flow *flow; int err = 0; @@ -4636,6 +4647,12 @@ int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, flow = rhashtable_lookup(tc_ht, &f->cookie, tc_ht_params); rcu_read_unlock(); if (flow) { + /* Same flow rule offloaded to non-uplink representor sharing tc block, + * just return 0. + */ + if (is_flow_rule_duplicate_allowed(dev, rpriv) && flow->orig_dev != dev) + goto out; + NL_SET_ERR_MSG_MOD(extack, "flow cookie already exists, ignoring"); netdev_warn_once(priv->netdev, @@ -4650,6 +4667,12 @@ int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, if (err) goto out; + /* Flow rule offloaded to non-uplink representor sharing tc block, + * set the flow's owner dev. + */ + if (is_flow_rule_duplicate_allowed(dev, rpriv)) + flow->orig_dev = dev; + err = rhashtable_lookup_insert_fast(tc_ht, &flow->node, tc_ht_params); if (err) goto err_free; -- cgit v1.2.3-59-g8ed1b From d34eb2fcd00472323d9e26ee0aec498c2c6f5b6f Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 5 Mar 2019 21:11:14 +0200 Subject: net/mlx5e: Offload flow rules to active lower representor When a bond device is created over one or more non uplink representors, and when a flow rule is offloaded to such bond device, offload a rule to the active lower device. Assuming that this is active-backup lag, the rules should be offloaded to the active lower device which is the representor of the direct path (not the failover). Signed-off-by: Or Gerlitz Signed-off-by: Parav Pandit Signed-off-by: Vu Pham Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 35 ++++++++++++++++++------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index f3e65a15c950..58f797da4d8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "en.h" #include "en_rep.h" #include "en/rep/tc.h" @@ -3759,6 +3760,28 @@ static int parse_tc_vlan_action(struct mlx5e_priv *priv, return 0; } +static struct net_device *get_fdb_out_dev(struct net_device *uplink_dev, + struct net_device *out_dev) +{ + struct net_device *fdb_out_dev = out_dev; + struct net_device *uplink_upper; + + rcu_read_lock(); + uplink_upper = netdev_master_upper_dev_get_rcu(uplink_dev); + if (uplink_upper && netif_is_lag_master(uplink_upper) && + uplink_upper == out_dev) { + fdb_out_dev = uplink_dev; + } else if (netif_is_lag_master(out_dev)) { + fdb_out_dev = bond_option_active_slave_get_rcu(netdev_priv(out_dev)); + if (fdb_out_dev && + (!mlx5e_eswitch_rep(fdb_out_dev) || + !netdev_port_same_parent_id(fdb_out_dev, uplink_dev))) + fdb_out_dev = NULL; + } + rcu_read_unlock(); + return fdb_out_dev; +} + static int add_vlan_push_action(struct mlx5e_priv *priv, struct mlx5_esw_flow_attr *attr, struct net_device **out_dev, @@ -4074,7 +4097,6 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, } else if (netdev_port_same_parent_id(priv->netdev, out_dev)) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct net_device *uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH); - struct net_device *uplink_upper; if (is_duplicated_output_device(priv->netdev, out_dev, @@ -4086,14 +4108,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, ifindexes[if_count] = out_dev->ifindex; if_count++; - rcu_read_lock(); - uplink_upper = - netdev_master_upper_dev_get_rcu(uplink_dev); - if (uplink_upper && - netif_is_lag_master(uplink_upper) && - uplink_upper == out_dev) - out_dev = uplink_dev; - rcu_read_unlock(); + out_dev = get_fdb_out_dev(uplink_dev, out_dev); + if (!out_dev) + return -ENODEV; if (is_vlan_dev(out_dev)) { err = add_vlan_push_action(priv, attr, -- cgit v1.2.3-59-g8ed1b From d97555e1452943264295cd3c1f066474bc3660dd Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Fri, 28 Feb 2020 14:28:27 -0800 Subject: net/mlx5e: Add bond_metadata and its slave entries Adding bond_metadata and its slave entries to represent a lag device and its slaves VF representors. Bond_metadata structure includes a unique metadata shared by slaves VF respresentors, and a list of slaves representors slave entries. On enslaving event, create a bond_metadata structure representing the upper lag device of this slave representor if it has not been created yet. Create and add entry for the slave representor to the slaves list. On unslaving event, free the slave entry of the slave representor. On the last unslave event, free the bond_metadata structure and its resources. Introduce APIs to create and remove bond_metadata and its resources, enslave and unslave VF representor slave entries. Signed-off-by: Vu Pham Reviewed-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/rep/bond.c | 128 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 5 + 2 files changed, 133 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c index d0aab36f1947..932e94362ceb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ +#include +#include #include #include "mlx5_core.h" @@ -11,8 +13,132 @@ struct mlx5e_rep_bond { struct notifier_block nb; struct netdev_net_notifier nn; + struct list_head metadata_list; }; +struct mlx5e_rep_bond_slave_entry { + struct list_head list; + struct net_device *netdev; +}; + +struct mlx5e_rep_bond_metadata { + struct list_head list; /* link to global list of rep_bond_metadata */ + struct mlx5_eswitch *esw; + /* private of uplink holding rep bond metadata list */ + struct net_device *lag_dev; + u32 metadata_reg_c_0; + + struct list_head slaves_list; /* slaves list */ + int slaves; +}; + +static struct mlx5e_rep_bond_metadata * +mlx5e_lookup_rep_bond_metadata(struct mlx5_rep_uplink_priv *uplink_priv, + const struct net_device *lag_dev) +{ + struct mlx5e_rep_bond_metadata *found = NULL; + struct mlx5e_rep_bond_metadata *cur; + + list_for_each_entry(cur, &uplink_priv->bond->metadata_list, list) { + if (cur->lag_dev == lag_dev) { + found = cur; + break; + } + } + + return found; +} + +static struct mlx5e_rep_bond_slave_entry * +mlx5e_lookup_rep_bond_slave_entry(struct mlx5e_rep_bond_metadata *mdata, + const struct net_device *netdev) +{ + struct mlx5e_rep_bond_slave_entry *found = NULL; + struct mlx5e_rep_bond_slave_entry *cur; + + list_for_each_entry(cur, &mdata->slaves_list, list) { + if (cur->netdev == netdev) { + found = cur; + break; + } + } + + return found; +} + +static void mlx5e_rep_bond_metadata_release(struct mlx5e_rep_bond_metadata *mdata) +{ + netdev_dbg(mdata->lag_dev, "destroy rep_bond_metadata(%d)\n", + mdata->metadata_reg_c_0); + list_del(&mdata->list); + WARN_ON(!list_empty(&mdata->slaves_list)); + kfree(mdata); +} + +/* This must be called under rtnl_lock */ +int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, + struct net_device *lag_dev) +{ + struct mlx5e_rep_bond_slave_entry *s_entry; + struct mlx5e_rep_bond_metadata *mdata; + struct mlx5e_rep_priv *rpriv; + + ASSERT_RTNL(); + + rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + mdata = mlx5e_lookup_rep_bond_metadata(&rpriv->uplink_priv, lag_dev); + if (!mdata) { + /* First netdev becomes slave, no metadata presents the lag_dev. Create one */ + mdata = kzalloc(sizeof(*mdata), GFP_KERNEL); + if (!mdata) + return -ENOMEM; + + mdata->lag_dev = lag_dev; + mdata->esw = esw; + INIT_LIST_HEAD(&mdata->slaves_list); + list_add(&mdata->list, &rpriv->uplink_priv.bond->metadata_list); + + netdev_dbg(lag_dev, "create rep_bond_metadata(%d)\n", + mdata->metadata_reg_c_0); + } + + s_entry = kzalloc(sizeof(*s_entry), GFP_KERNEL); + if (!s_entry) + return -ENOMEM; + + s_entry->netdev = netdev; + mdata->slaves++; + list_add_tail(&s_entry->list, &mdata->slaves_list); + + return 0; +} + +/* This must be called under rtnl_lock */ +void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, + const struct net_device *netdev, + const struct net_device *lag_dev) +{ + struct mlx5e_rep_bond_slave_entry *s_entry; + struct mlx5e_rep_bond_metadata *mdata; + struct mlx5e_rep_priv *rpriv; + + ASSERT_RTNL(); + + rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + mdata = mlx5e_lookup_rep_bond_metadata(&rpriv->uplink_priv, lag_dev); + if (!mdata) + return; + + s_entry = mlx5e_lookup_rep_bond_slave_entry(mdata, netdev); + if (!s_entry) + return; + + list_del(&s_entry->list); + if (--mdata->slaves == 0) + mlx5e_rep_bond_metadata_release(mdata); + kfree(s_entry); +} + static bool mlx5e_rep_is_lag_netdev(struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -133,6 +259,7 @@ int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv) goto out; } + INIT_LIST_HEAD(&uplink_priv->bond->metadata_list); uplink_priv->bond->nb.notifier_call = mlx5e_rep_esw_bond_netevent; ret = register_netdevice_notifier_dev_net(netdev, &uplink_priv->bond->nb, @@ -142,6 +269,7 @@ int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv) kvfree(uplink_priv->bond); uplink_priv->bond = NULL; } + out: return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 7e56787aa224..ed741b6e6af2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -217,6 +217,11 @@ void mlx5e_rep_register_vport_reps(struct mlx5_core_dev *mdev); void mlx5e_rep_unregister_vport_reps(struct mlx5_core_dev *mdev); int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv); void mlx5e_rep_bond_cleanup(struct mlx5e_rep_priv *rpriv); +int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, + struct net_device *lag_dev); +void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, + const struct net_device *netdev, + const struct net_device *lag_dev); bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv); int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv); -- cgit v1.2.3-59-g8ed1b From 133dcfc577eaec6538db4ebd8b9205b361f59018 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Fri, 28 Feb 2020 16:10:34 -0800 Subject: net/mlx5: E-Switch, Alloc and free unique metadata for match Introduce infrastructure to create unique metadata for match for vport without depending on vport_num. Vport uses its default metadata for match in standalone configuration but will share a different unique "bond_metadata" for match with other vports in bond configuration. Using ida to generate unique metadata for match for vports in default and bond configurations. Introduce APIs to generate, free metadata for match. Introduce APIs to set vport's bond_metadata and replace its ingress acl rules with bond_metatada. Signed-off-by: Vu Pham Reviewed-by: Parav Pandit Reviewed-by: Roi Dayan Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/esw/acl/ingress_ofld.c | 29 +++++++ .../net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h | 2 + drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 + drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 6 ++ .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 96 ++++++++++++++-------- 5 files changed, 103 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c index 1bae549f3fa7..4e55d7225a26 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c @@ -291,3 +291,32 @@ void esw_acl_ingress_ofld_cleanup(struct mlx5_eswitch *esw, esw_acl_ingress_ofld_groups_destroy(vport); esw_acl_ingress_table_destroy(vport); } + +/* Caller must hold rtnl_lock */ +int mlx5_esw_acl_ingress_vport_bond_update(struct mlx5_eswitch *esw, u16 vport_num, + u32 metadata) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + int err; + + if (WARN_ON_ONCE(IS_ERR(vport))) { + esw_warn(esw->dev, "vport(%d) invalid!\n", vport_num); + err = PTR_ERR(vport); + goto out; + } + + esw_acl_ingress_ofld_rules_destroy(esw, vport); + + vport->metadata = metadata ? metadata : vport->default_metadata; + + /* Recreate ingress acl rules with vport->metadata */ + err = esw_acl_ingress_ofld_rules_create(esw, vport); + if (err) + goto out; + + return 0; + +out: + vport->metadata = vport->default_metadata; + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h index 90ddc5d7da46..c57869b93d60 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h @@ -23,5 +23,7 @@ static inline bool mlx5_esw_acl_egress_fwd2vport_supported(struct mlx5_eswitch * /* Eswitch acl ingress external APIs */ int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); void esw_acl_ingress_ofld_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +int mlx5_esw_acl_ingress_vport_bond_update(struct mlx5_eswitch *esw, u16 vport_num, + u32 metadata); #endif /* __MLX5_ESWITCH_ACL_OFLD_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 20ab13ff2303..1116ab9bea6c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1730,6 +1730,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) mutex_init(&esw->offloads.decap_tbl_lock); hash_init(esw->offloads.decap_tbl); atomic64_set(&esw->offloads.num_flows, 0); + ida_init(&esw->offloads.vport_metadata_ida); mutex_init(&esw->state_lock); mutex_init(&esw->mode_lock); @@ -1768,6 +1769,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) esw_offloads_cleanup_reps(esw); mutex_destroy(&esw->mode_lock); mutex_destroy(&esw->state_lock); + ida_destroy(&esw->offloads.vport_metadata_ida); mutex_destroy(&esw->offloads.mod_hdr.lock); mutex_destroy(&esw->offloads.encap_tbl_lock); mutex_destroy(&esw->offloads.decap_tbl_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 7b6b3686b666..a5175e98c0b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -149,6 +149,8 @@ struct mlx5_vport { struct vport_ingress ingress; struct vport_egress egress; + u32 default_metadata; + u32 metadata; struct mlx5_vport_info info; @@ -224,6 +226,7 @@ struct mlx5_esw_offload { u8 inline_mode; atomic64_t num_flows; enum devlink_eswitch_encap_mode encap; + struct ida vport_metadata_ida; }; /* E-Switch MC FDB table hash node */ @@ -292,6 +295,9 @@ int esw_offloads_enable(struct mlx5_eswitch *esw); void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw); int esw_offloads_init_reps(struct mlx5_eswitch *esw); +u32 mlx5_esw_match_metadata_alloc(struct mlx5_eswitch *esw); +void mlx5_esw_match_metadata_free(struct mlx5_eswitch *esw, u32 metadata); + int mlx5_esw_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32 rate_mbps); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 11bc9cc1d5f0..060354bb211a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -31,6 +31,7 @@ */ #include +#include #include #include #include @@ -1877,15 +1878,69 @@ static bool esw_use_vport_metadata(const struct mlx5_eswitch *esw) esw_check_vport_match_metadata_supported(esw); } +u32 mlx5_esw_match_metadata_alloc(struct mlx5_eswitch *esw) +{ + u32 num_vports = GENMASK(ESW_VPORT_BITS - 1, 0) - 1; + u32 vhca_id_mask = GENMASK(ESW_VHCA_ID_BITS - 1, 0); + u32 vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); + u32 start; + u32 end; + int id; + + /* Make sure the vhca_id fits the ESW_VHCA_ID_BITS */ + WARN_ON_ONCE(vhca_id >= BIT(ESW_VHCA_ID_BITS)); + + /* Trim vhca_id to ESW_VHCA_ID_BITS */ + vhca_id &= vhca_id_mask; + + start = (vhca_id << ESW_VPORT_BITS); + end = start + num_vports; + if (!vhca_id) + start += 1; /* zero is reserved/invalid metadata */ + id = ida_alloc_range(&esw->offloads.vport_metadata_ida, start, end, GFP_KERNEL); + + return (id < 0) ? 0 : id; +} + +void mlx5_esw_match_metadata_free(struct mlx5_eswitch *esw, u32 metadata) +{ + ida_free(&esw->offloads.vport_metadata_ida, metadata); +} + +static int esw_offloads_vport_metadata_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (vport->vport == MLX5_VPORT_UPLINK) + return 0; + + vport->default_metadata = mlx5_esw_match_metadata_alloc(esw); + vport->metadata = vport->default_metadata; + return vport->metadata ? 0 : -ENOSPC; +} + +static void esw_offloads_vport_metadata_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (vport->vport == MLX5_VPORT_UPLINK || !vport->default_metadata) + return; + + WARN_ON(vport->metadata != vport->default_metadata); + mlx5_esw_match_metadata_free(esw, vport->default_metadata); +} + int esw_vport_create_offloads_acl_tables(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { int err; + err = esw_offloads_vport_metadata_setup(esw, vport); + if (err) + goto metadata_err; + err = esw_acl_ingress_ofld_setup(esw, vport); if (err) - return err; + goto ingress_err; if (mlx5_eswitch_is_vf_vport(esw, vport->vport)) { err = esw_acl_egress_ofld_setup(esw, vport); @@ -1897,6 +1952,9 @@ esw_vport_create_offloads_acl_tables(struct mlx5_eswitch *esw, egress_err: esw_acl_ingress_ofld_cleanup(esw, vport); +ingress_err: + esw_offloads_vport_metadata_cleanup(esw, vport); +metadata_err: return err; } @@ -1906,6 +1964,7 @@ esw_vport_destroy_offloads_acl_tables(struct mlx5_eswitch *esw, { esw_acl_egress_ofld_cleanup(vport); esw_acl_ingress_ofld_cleanup(esw, vport); + esw_offloads_vport_metadata_cleanup(esw, vport); } static int esw_create_uplink_offloads_acl_tables(struct mlx5_eswitch *esw) @@ -2571,38 +2630,11 @@ EXPORT_SYMBOL(mlx5_eswitch_vport_match_metadata_enabled); u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, u16 vport_num) { - u32 vport_num_mask = GENMASK(ESW_VPORT_BITS - 1, 0); - u32 vhca_id_mask = GENMASK(ESW_VHCA_ID_BITS - 1, 0); - u32 vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); - u32 val; + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); - /* Make sure the vhca_id fits the ESW_VHCA_ID_BITS */ - WARN_ON_ONCE(vhca_id >= BIT(ESW_VHCA_ID_BITS)); - - /* Trim vhca_id to ESW_VHCA_ID_BITS */ - vhca_id &= vhca_id_mask; - - /* Make sure pf and ecpf map to end of ESW_VPORT_BITS range so they - * don't overlap with VF numbers, and themselves, after trimming. - */ - WARN_ON_ONCE((MLX5_VPORT_UPLINK & vport_num_mask) < - vport_num_mask - 1); - WARN_ON_ONCE((MLX5_VPORT_ECPF & vport_num_mask) < - vport_num_mask - 1); - WARN_ON_ONCE((MLX5_VPORT_UPLINK & vport_num_mask) == - (MLX5_VPORT_ECPF & vport_num_mask)); - - /* Make sure that the VF vport_num fits ESW_VPORT_BITS and don't - * overlap with pf and ecpf. - */ - if (vport_num != MLX5_VPORT_UPLINK && - vport_num != MLX5_VPORT_ECPF) - WARN_ON_ONCE(vport_num >= vport_num_mask - 1); - - /* We can now trim vport_num to ESW_VPORT_BITS */ - vport_num &= vport_num_mask; + if (WARN_ON_ONCE(IS_ERR(vport))) + return 0; - val = (vhca_id << ESW_VPORT_BITS) | vport_num; - return val << (32 - ESW_SOURCE_PORT_METADATA_BITS); + return vport->metadata << (32 - ESW_SOURCE_PORT_METADATA_BITS); } EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_match); -- cgit v1.2.3-59-g8ed1b From 88e96e533cfa11e996c59a44bbb6b0e0b9891970 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Mon, 2 Mar 2020 10:33:49 -0800 Subject: net/mlx5e: Slave representors sharing unique metadata for match Bonded slave representors' vports must share a unique metadata for match. On enslaving event of slave representor to lag device, allocate new unique "bond_metadata" for match if this is the first slave. The subsequent enslaved representors will share the same unique "bond_metadata". On unslaving event of slave representor, reset the slave representor's vport to use its own default metadata. Replace ingress acl and rx rules of the slave representors' vports using new vport->bond_metadata. Signed-off-by: Vu Pham Reviewed-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/rep/bond.c | 65 ++++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 22 +++++++- drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 1 + 3 files changed, 80 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c index 932e94362ceb..13500f60bef6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c @@ -71,6 +71,7 @@ static void mlx5e_rep_bond_metadata_release(struct mlx5e_rep_bond_metadata *mdat netdev_dbg(mdata->lag_dev, "destroy rep_bond_metadata(%d)\n", mdata->metadata_reg_c_0); list_del(&mdata->list); + mlx5_esw_match_metadata_free(mdata->esw, mdata->metadata_reg_c_0); WARN_ON(!list_empty(&mdata->slaves_list)); kfree(mdata); } @@ -82,6 +83,8 @@ int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, struct mlx5e_rep_bond_slave_entry *s_entry; struct mlx5e_rep_bond_metadata *mdata; struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *priv; + int err; ASSERT_RTNL(); @@ -96,6 +99,11 @@ int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, mdata->lag_dev = lag_dev; mdata->esw = esw; INIT_LIST_HEAD(&mdata->slaves_list); + mdata->metadata_reg_c_0 = mlx5_esw_match_metadata_alloc(esw); + if (!mdata->metadata_reg_c_0) { + kfree(mdata); + return -ENOSPC; + } list_add(&mdata->list, &rpriv->uplink_priv.bond->metadata_list); netdev_dbg(lag_dev, "create rep_bond_metadata(%d)\n", @@ -103,14 +111,33 @@ int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, } s_entry = kzalloc(sizeof(*s_entry), GFP_KERNEL); - if (!s_entry) - return -ENOMEM; + if (!s_entry) { + err = -ENOMEM; + goto entry_alloc_err; + } s_entry->netdev = netdev; + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + + err = mlx5_esw_acl_ingress_vport_bond_update(esw, rpriv->rep->vport, + mdata->metadata_reg_c_0); + if (err) + goto ingress_err; + mdata->slaves++; list_add_tail(&s_entry->list, &mdata->slaves_list); + netdev_dbg(netdev, "enslave rep vport(%d) lag_dev(%s) metadata(0x%x)\n", + rpriv->rep->vport, lag_dev->name, mdata->metadata_reg_c_0); return 0; + +ingress_err: + kfree(s_entry); +entry_alloc_err: + if (!mdata->slaves) + mlx5e_rep_bond_metadata_release(mdata); + return err; } /* This must be called under rtnl_lock */ @@ -121,6 +148,7 @@ void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, struct mlx5e_rep_bond_slave_entry *s_entry; struct mlx5e_rep_bond_metadata *mdata; struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *priv; ASSERT_RTNL(); @@ -133,7 +161,16 @@ void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, if (!s_entry) return; + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + + mlx5_esw_acl_ingress_vport_bond_update(esw, rpriv->rep->vport, 0); + mlx5e_rep_bond_update(priv, false); list_del(&s_entry->list); + + netdev_dbg(netdev, "unslave rep vport(%d) lag_dev(%s) metadata(0x%x)\n", + rpriv->rep->vport, lag_dev->name, mdata->metadata_reg_c_0); + if (--mdata->slaves == 0) mlx5e_rep_bond_metadata_release(mdata); kfree(s_entry); @@ -163,6 +200,7 @@ static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *pt struct net_device *dev; u16 acl_vport_num; u16 fwd_vport_num; + int err; if (!mlx5e_rep_is_lag_netdev(netdev)) return; @@ -187,11 +225,28 @@ static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *pt rpriv = priv->ppriv; acl_vport_num = rpriv->rep->vport; if (acl_vport_num != fwd_vport_num) { - mlx5_esw_acl_egress_vport_bond(priv->mdev->priv.eswitch, - fwd_vport_num, - acl_vport_num); + /* Only single rx_rule for unique bond_metadata should be + * present, delete it if it's saved as passive vport's + * rx_rule with destination as passive vport's root_ft + */ + mlx5e_rep_bond_update(priv, true); + err = mlx5_esw_acl_egress_vport_bond(priv->mdev->priv.eswitch, + fwd_vport_num, + acl_vport_num); + if (err) + netdev_warn(dev, + "configure slave vport(%d) egress fwd, err(%d)", + acl_vport_num, err); } } + + /* Insert new rx_rule for unique bond_metadata, save it as active vport's + * rx_rule with new destination as active vport's root_ft + */ + err = mlx5e_rep_bond_update(netdev_priv(netdev), false); + if (err) + netdev_warn(netdev, "configure active slave vport(%d) rx_rule, err(%d)", + fwd_vport_num, err); } static void mlx5e_rep_changeupper_event(struct net_device *netdev, void *ptr) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 12593d75e885..af89a4803c7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -854,6 +854,24 @@ static int mlx5e_create_rep_vport_rx_rule(struct mlx5e_priv *priv) return 0; } +static void rep_vport_rx_rule_destroy(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + if (!rpriv->vport_rx_rule) + return; + + mlx5_del_flow_rules(rpriv->vport_rx_rule); + rpriv->vport_rx_rule = NULL; +} + +int mlx5e_rep_bond_update(struct mlx5e_priv *priv, bool cleanup) +{ + rep_vport_rx_rule_destroy(priv); + + return cleanup ? 0 : mlx5e_create_rep_vport_rx_rule(priv); +} + static int mlx5e_init_rep_rx(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; @@ -918,9 +936,7 @@ err_close_drop_rq: static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv) { - struct mlx5e_rep_priv *rpriv = priv->ppriv; - - mlx5_del_flow_rules(rpriv->vport_rx_rule); + rep_vport_rx_rule_destroy(priv); mlx5e_destroy_rep_root_ft(priv); mlx5e_destroy_ttc_table(priv, &priv->fs.ttc); mlx5e_destroy_direct_tirs(priv, priv->direct_tir); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index ed741b6e6af2..da9f1686d525 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -222,6 +222,7 @@ int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, const struct net_device *netdev, const struct net_device *lag_dev); +int mlx5e_rep_bond_update(struct mlx5e_priv *priv, bool cleanup); bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv); int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv); -- cgit v1.2.3-59-g8ed1b From 9728366f53d283be943ee99d5989f155e55fc077 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Thu, 12 Mar 2020 10:26:25 -0700 Subject: net/mlx5e: Use change upper event to setup representors' bond_metadata Use change upper event to detect slave representor from enslaving/unslaving to/from lag device. On enslaving event, call mlx5_enslave_rep() API to create, add this slave representor shadow entry to the slaves list of bond_metadata structure representing master lag device and use its metadata to setup ingress acl metadata header. On unslaving event, resetting the vport of unslaved representor to use its default ingress/egress acls and rx rules with its default_metadata. The last slave will free the shared bond_metadata and its unique metadata. Signed-off-by: Vu Pham Reviewed-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/rep/bond.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c index 13500f60bef6..bdb71332cbf2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c @@ -164,8 +164,13 @@ void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, priv = netdev_priv(netdev); rpriv = priv->ppriv; + /* Reset bond_metadata to zero first then reset all ingress/egress + * acls and rx rules of unslave representor's vport + */ mlx5_esw_acl_ingress_vport_bond_update(esw, rpriv->rep->vport, 0); + mlx5_esw_acl_egress_vport_unbond(esw, rpriv->rep->vport); mlx5e_rep_bond_update(priv, false); + list_del(&s_entry->list); netdev_dbg(netdev, "unslave rep vport(%d) lag_dev(%s) metadata(0x%x)\n", @@ -253,22 +258,23 @@ static void mlx5e_rep_changeupper_event(struct net_device *netdev, void *ptr) { struct netdev_notifier_changeupper_info *info = ptr; struct mlx5e_rep_priv *rpriv; + struct net_device *lag_dev; struct mlx5e_priv *priv; if (!mlx5e_rep_is_lag_netdev(netdev)) return; - /* Nothing to setup for new enslaved representor */ - if (info->linking) - return; - priv = netdev_priv(netdev); rpriv = priv->ppriv; - netdev_dbg(netdev, "Unslave, reset vport(%d) egress acl\n", rpriv->rep->vport); + lag_dev = info->upper_dev; - /* Reset all egress acl rules of unslave representor's vport */ - mlx5_esw_acl_egress_vport_unbond(priv->mdev->priv.eswitch, - rpriv->rep->vport); + netdev_dbg(netdev, "%sslave vport(%d) lag(%s)\n", + info->linking ? "en" : "un", rpriv->rep->vport, lag_dev->name); + + if (info->linking) + mlx5e_rep_bond_enslave(priv->mdev->priv.eswitch, netdev, lag_dev); + else + mlx5e_rep_bond_unslave(priv->mdev->priv.eswitch, netdev, lag_dev); } /* Bond device of representors and netdev events are used here in specific way -- cgit v1.2.3-59-g8ed1b From 810cbb25549b81f0c0848320f8a1614106d3a0e1 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 14 May 2020 23:42:45 -0500 Subject: net/mlx5: Add missing mutex destroy Add mutex destroy calls to balance with mutex_init() done in the init path. Signed-off-by: Parav Pandit Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 92f2395dd31a..30de3bf35c6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1272,7 +1272,7 @@ static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) mlx5_debugfs_root); if (!priv->dbg_root) { dev_err(dev->device, "mlx5_core: error, Cannot create debugfs dir, aborting\n"); - return -ENOMEM; + goto err_dbg_root; } err = mlx5_health_init(dev); @@ -1289,15 +1289,27 @@ err_pagealloc_init: mlx5_health_cleanup(dev); err_health_init: debugfs_remove(dev->priv.dbg_root); - +err_dbg_root: + mutex_destroy(&priv->pgdir_mutex); + mutex_destroy(&priv->alloc_mutex); + mutex_destroy(&priv->bfregs.wc_head.lock); + mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->intf_state_mutex); return err; } static void mlx5_mdev_uninit(struct mlx5_core_dev *dev) { + struct mlx5_priv *priv = &dev->priv; + mlx5_pagealloc_cleanup(dev); mlx5_health_cleanup(dev); debugfs_remove_recursive(dev->priv.dbg_root); + mutex_destroy(&priv->pgdir_mutex); + mutex_destroy(&priv->alloc_mutex); + mutex_destroy(&priv->bfregs.wc_head.lock); + mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->intf_state_mutex); } #define MLX5_IB_MOD "mlx5_ib" -- cgit v1.2.3-59-g8ed1b From 4a5d5d7392106a48c7db345a3843e854b66ea0ff Mon Sep 17 00:00:00 2001 From: Eli Britstein Date: Mon, 11 May 2020 19:20:29 +0000 Subject: net/mlx5e: Helper function to set ethertype Set ethertype match in a helper function as a pre-step towards optimizing it. Signed-off-by: Eli Britstein Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 5 +---- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 21 +++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 14 ++++++++++---- drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 3 +++ 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 995b2ef1fb3b..ba72410c55fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -134,10 +134,7 @@ mlx5_tc_ct_set_tuple_match(struct mlx5_flow_spec *spec, flow_rule_match_basic(rule, &match); - MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, - ntohs(match.mask->n_proto)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, - ntohs(match.key->n_proto)); + mlx5e_tc_set_ethertype(headers_c, headers_v, &match); MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, match.mask->ip_proto); MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index e99382f58807..6d7fded75264 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -512,6 +512,13 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev, } if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + struct flow_dissector_key_basic key_basic = {}; + struct flow_dissector_key_basic mask_basic = { + .n_proto = htons(0xFFFF), + }; + struct flow_match_basic match_basic = { + .key = &key_basic, .mask = &mask_basic, + }; struct flow_match_control match; u16 addr_type; @@ -537,10 +544,9 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev, dst_ipv4_dst_ipv6.ipv4_layout.ipv4, ntohl(match.key->dst)); - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, - ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, - ETH_P_IP); + key_basic.n_proto = htons(ETH_P_IP); + mlx5e_tc_set_ethertype(headers_c, headers_v, + &match_basic); } else if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { struct flow_match_ipv6_addrs match; @@ -563,10 +569,9 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev, &match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, - ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, - ETH_P_IPV6); + key_basic.n_proto = htons(ETH_P_IPV6); + mlx5e_tc_set_ethertype(headers_c, headers_v, + &match_basic); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 58f797da4d8d..680b9e090057 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -2020,6 +2020,15 @@ u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow) return flow->tunnel_id; } +void mlx5e_tc_set_ethertype(void *headers_c, void *headers_v, + struct flow_match_basic *match) +{ + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, + ntohs(match->mask->n_proto)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + ntohs(match->key->n_proto)); +} + static int parse_tunnel_attr(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec, @@ -2241,10 +2250,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, struct flow_match_basic match; flow_rule_match_basic(rule, &match); - MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, - ntohs(match.mask->n_proto)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, - ntohs(match.key->n_proto)); + mlx5e_tc_set_ethertype(headers_c, headers_v, &match); if (match.mask->n_proto) *match_level = MLX5_MATCH_L2; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index 037aa73bf9ab..144b71f571ea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -170,6 +170,9 @@ void dealloc_mod_hdr_actions(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts); struct mlx5e_tc_flow; u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow); +void mlx5e_tc_set_ethertype(void *headers_c, void *headers_v, + struct flow_match_basic *match); + #if IS_ENABLED(CONFIG_MLX5_CLS_ACT) int mlx5e_tc_nic_init(struct mlx5e_priv *priv); -- cgit v1.2.3-59-g8ed1b From fca533041aac0426f5b5618a564aeb588fc125e9 Mon Sep 17 00:00:00 2001 From: Eli Britstein Date: Tue, 19 May 2020 05:55:59 +0000 Subject: net/mlx5e: Optimize performance for IPv4/IPv6 ethertype The HW is optimized for IPv4/IPv6. For such cases, pending capability, avoid matching on ethertype, and use ip_version field instead. Signed-off-by: Eli Britstein Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/diag/fs_tracepoint.c | 85 +++++++++++----------- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 7 +- .../net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 8 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 40 +++++++--- drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 5 +- 5 files changed, 85 insertions(+), 60 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c index 8ecac81a385d..a700f3c86899 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c @@ -76,58 +76,59 @@ static void print_lyr_2_4_hdrs(struct trace_seq *p, .v = MLX5_GET(fte_match_set_lyr_2_4, value, dmac_47_16) << 16 | MLX5_GET(fte_match_set_lyr_2_4, value, dmac_15_0)}; MASK_VAL_L2(u16, ethertype, ethertype); + MASK_VAL_L2(u8, ip_version, ip_version); PRINT_MASKED_VALP(smac, u8 *, p, "%pM"); PRINT_MASKED_VALP(dmac, u8 *, p, "%pM"); PRINT_MASKED_VAL(ethertype, p, "%04x"); - if (ethertype.m == 0xffff) { - if (ethertype.v == ETH_P_IP) { + if ((ethertype.m == 0xffff && ethertype.v == ETH_P_IP) || + (ip_version.m == 0xf && ip_version.v == 4)) { #define MASK_VAL_L2_BE(type, name, fld) \ MASK_VAL_BE(type, fte_match_set_lyr_2_4, name, mask, value, fld) - MASK_VAL_L2_BE(u32, src_ipv4, - src_ipv4_src_ipv6.ipv4_layout.ipv4); - MASK_VAL_L2_BE(u32, dst_ipv4, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + MASK_VAL_L2_BE(u32, src_ipv4, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + MASK_VAL_L2_BE(u32, dst_ipv4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); - PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p, - "%pI4"); - PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p, - "%pI4"); - } else if (ethertype.v == ETH_P_IPV6) { - static const struct in6_addr full_ones = { - .in6_u.u6_addr32 = {__constant_htonl(0xffffffff), - __constant_htonl(0xffffffff), - __constant_htonl(0xffffffff), - __constant_htonl(0xffffffff)}, - }; - DECLARE_MASK_VAL(struct in6_addr, src_ipv6); - DECLARE_MASK_VAL(struct in6_addr, dst_ipv6); + PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p, + "%pI4"); + PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p, + "%pI4"); + } else if ((ethertype.m == 0xffff && ethertype.v == ETH_P_IPV6) || + (ip_version.m == 0xf && ip_version.v == 6)) { + static const struct in6_addr full_ones = { + .in6_u.u6_addr32 = {__constant_htonl(0xffffffff), + __constant_htonl(0xffffffff), + __constant_htonl(0xffffffff), + __constant_htonl(0xffffffff)}, + }; + DECLARE_MASK_VAL(struct in6_addr, src_ipv6); + DECLARE_MASK_VAL(struct in6_addr, dst_ipv6); - memcpy(src_ipv6.m.in6_u.u6_addr8, - MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - sizeof(src_ipv6.m)); - memcpy(dst_ipv6.m.in6_u.u6_addr8, - MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - sizeof(dst_ipv6.m)); - memcpy(src_ipv6.v.in6_u.u6_addr8, - MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - sizeof(src_ipv6.v)); - memcpy(dst_ipv6.v.in6_u.u6_addr8, - MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - sizeof(dst_ipv6.v)); + memcpy(src_ipv6.m.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(src_ipv6.m)); + memcpy(dst_ipv6.m.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(dst_ipv6.m)); + memcpy(src_ipv6.v.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(src_ipv6.v)); + memcpy(dst_ipv6.v.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(dst_ipv6.v)); - if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones))) - trace_seq_printf(p, "src_ipv6=%pI6 ", - &src_ipv6.v); - if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones))) - trace_seq_printf(p, "dst_ipv6=%pI6 ", - &dst_ipv6.v); - } + if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones))) + trace_seq_printf(p, "src_ipv6=%pI6 ", + &src_ipv6.v); + if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones))) + trace_seq_printf(p, "dst_ipv6=%pI6 ", + &dst_ipv6.v); } #define PRINT_MASKED_VAL_L2(type, name, fld, p, format) {\ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index ba72410c55fa..afc19dca1f5f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -119,7 +119,7 @@ mlx5_tc_ct_get_ct_priv(struct mlx5e_priv *priv) } static int -mlx5_tc_ct_set_tuple_match(struct mlx5_flow_spec *spec, +mlx5_tc_ct_set_tuple_match(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec, struct flow_rule *rule) { void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, @@ -134,7 +134,8 @@ mlx5_tc_ct_set_tuple_match(struct mlx5_flow_spec *spec, flow_rule_match_basic(rule, &match); - mlx5e_tc_set_ethertype(headers_c, headers_v, &match); + mlx5e_tc_set_ethertype(priv->mdev, &match, true, headers_c, + headers_v); MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, match.mask->ip_proto); MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, @@ -530,7 +531,7 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv, attr->counter = entry->counter; attr->flags |= MLX5_ESW_ATTR_FLAG_NO_IN_PORT; - mlx5_tc_ct_set_tuple_match(spec, flow_rule); + mlx5_tc_ct_set_tuple_match(netdev_priv(ct_priv->netdev), spec, flow_rule); mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, entry->zone & MLX5_CT_ZONE_MASK, MLX5_CT_ZONE_MASK); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 6d7fded75264..7cce85faa16f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -545,8 +545,8 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev, ntohl(match.key->dst)); key_basic.n_proto = htons(ETH_P_IP); - mlx5e_tc_set_ethertype(headers_c, headers_v, - &match_basic); + mlx5e_tc_set_ethertype(priv->mdev, &match_basic, true, + headers_c, headers_v); } else if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { struct flow_match_ipv6_addrs match; @@ -570,8 +570,8 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev, ipv6)); key_basic.n_proto = htons(ETH_P_IPV6); - mlx5e_tc_set_ethertype(headers_c, headers_v, - &match_basic); + mlx5e_tc_set_ethertype(priv->mdev, &match_basic, true, + headers_c, headers_v); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 680b9e090057..0f119c08b835 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -2020,13 +2020,30 @@ u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow) return flow->tunnel_id; } -void mlx5e_tc_set_ethertype(void *headers_c, void *headers_v, - struct flow_match_basic *match) -{ - MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, - ntohs(match->mask->n_proto)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, - ntohs(match->key->n_proto)); +void mlx5e_tc_set_ethertype(struct mlx5_core_dev *mdev, + struct flow_match_basic *match, bool outer, + void *headers_c, void *headers_v) +{ + bool ip_version_cap; + + ip_version_cap = outer ? + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version) : + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version); + + if (ip_version_cap && match->mask->n_proto == htons(0xFFFF) && + (match->key->n_proto == htons(ETH_P_IP) || + match->key->n_proto == htons(ETH_P_IPV6))) { + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_version); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, + match->key->n_proto == htons(ETH_P_IP) ? 4 : 6); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, + ntohs(match->mask->n_proto)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + ntohs(match->key->n_proto)); + } } static int parse_tunnel_attr(struct mlx5e_priv *priv, @@ -2250,7 +2267,9 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, struct flow_match_basic match; flow_rule_match_basic(rule, &match); - mlx5e_tc_set_ethertype(headers_c, headers_v, &match); + mlx5e_tc_set_ethertype(priv->mdev, &match, + match_level == outer_match_level, + headers_c, headers_v); if (match.mask->n_proto) *match_level = MLX5_MATCH_L2; @@ -3126,16 +3145,19 @@ static bool modify_header_match_supported(struct mlx5_flow_spec *spec, { const struct flow_action_entry *act; bool modify_ip_header; + void *headers_c; void *headers_v; u16 ethertype; u8 ip_proto; int i, err; + headers_c = get_match_headers_criteria(actions, spec); headers_v = get_match_headers_value(actions, spec); ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype); /* for non-IP we only re-write MACs, so we're okay */ - if (ethertype != ETH_P_IP && ethertype != ETH_P_IPV6) + if (MLX5_GET(fte_match_set_lyr_2_4, headers_c, ip_version) == 0 && + ethertype != ETH_P_IP && ethertype != ETH_P_IPV6) goto out_ok; modify_ip_header = false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index 144b71f571ea..5c330b0cae21 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -170,8 +170,9 @@ void dealloc_mod_hdr_actions(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts); struct mlx5e_tc_flow; u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow); -void mlx5e_tc_set_ethertype(void *headers_c, void *headers_v, - struct flow_match_basic *match); +void mlx5e_tc_set_ethertype(struct mlx5_core_dev *mdev, + struct flow_match_basic *match, bool outer, + void *headers_c, void *headers_v); #if IS_ENABLED(CONFIG_MLX5_CLS_ACT) -- cgit v1.2.3-59-g8ed1b From cedb28191fdfb4fc1da0a7612465624998de7da2 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Wed, 20 May 2020 18:09:14 +0300 Subject: net/mlx5: DR, Add a spinlock to protect the send ring Adding this lock will allow writing steering entries without locking the dr_domain and allow parallel insertion. Signed-off-by: Alex Vesker Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c | 13 +++++++++---- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index b8d97d44be7b..f421013b0b54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -357,9 +357,11 @@ static int dr_postsend_icm_data(struct mlx5dr_domain *dmn, u32 buff_offset; int ret; + spin_lock(&send_ring->lock); + ret = dr_handle_pending_wc(dmn, send_ring); if (ret) - return ret; + goto out_unlock; if (send_info->write.length > dmn->info.max_inline_size) { buff_offset = (send_ring->tx_head & @@ -377,7 +379,9 @@ static int dr_postsend_icm_data(struct mlx5dr_domain *dmn, dr_fill_data_segs(send_ring, send_info); dr_post_send(send_ring->qp, send_info); - return 0; +out_unlock: + spin_unlock(&send_ring->lock); + return ret; } static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn, @@ -563,9 +567,7 @@ int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn, send_info.remote_addr = action->rewrite.chunk->mr_addr; send_info.rkey = action->rewrite.chunk->rkey; - mutex_lock(&dmn->mutex); ret = dr_postsend_icm_data(dmn, &send_info); - mutex_unlock(&dmn->mutex); return ret; } @@ -886,6 +888,7 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn) init_attr.pdn = dmn->pdn; init_attr.uar = dmn->uar; init_attr.max_send_wr = QUEUE_SIZE; + spin_lock_init(&dmn->send_ring->lock); dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr); if (!dmn->send_ring->qp) { @@ -990,7 +993,9 @@ int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn) return ret; } + spin_lock(&send_ring->lock); ret = dr_handle_pending_wc(dmn, send_ring); + spin_unlock(&send_ring->lock); return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 984783238baa..b6061c639cb1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -1043,6 +1043,7 @@ struct mlx5dr_send_ring { struct ib_wc wc[MAX_SEND_CQE]; u8 sync_buff[MIN_READ_SYNC]; struct mlx5dr_mr *sync_mr; + spinlock_t lock; /* Protect the data path of the send ring */ }; int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn); -- cgit v1.2.3-59-g8ed1b From ed03a418abe8e5a3ba541a805314bbf8a9eadda3 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Wed, 20 May 2020 18:09:35 +0300 Subject: net/mlx5: DR, Split RX and TX lock for parallel insertion Change the locking flow to support RX and TX locks, splitting the single lock to two will allow inserting rules in parallel for RX and TX parts of the FDB. Locking the dr_domain will be done by locking the RX domain and the TX domain locks, this is mostly used for control operations on the dr_domain. When inserting rules for RX or TX the single nic_doamin RX or TX lock will be used. Splitting the lock is safe since RX and TX domains are logically separated from each other, shared objects such the send-ring and memory pool are protected by locks. Signed-off-by: Alex Vesker Reviewed-by: Mark Bloch Reviewed-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_domain.c | 14 +++++----- .../mellanox/mlx5/core/steering/dr_matcher.c | 10 +++---- .../ethernet/mellanox/mlx5/core/steering/dr_rule.c | 31 ++++++++++------------ .../mellanox/mlx5/core/steering/dr_table.c | 12 ++++----- .../mellanox/mlx5/core/steering/dr_types.h | 24 ++++++++++++++++- 5 files changed, 56 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c index 48b6358b6845..890767a2a7cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c @@ -297,7 +297,8 @@ mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type) dmn->mdev = mdev; dmn->type = type; refcount_set(&dmn->refcount, 1); - mutex_init(&dmn->mutex); + mutex_init(&dmn->info.rx.mutex); + mutex_init(&dmn->info.tx.mutex); if (dr_domain_caps_init(mdev, dmn)) { mlx5dr_err(dmn, "Failed init domain, no caps\n"); @@ -345,9 +346,9 @@ int mlx5dr_domain_sync(struct mlx5dr_domain *dmn, u32 flags) int ret = 0; if (flags & MLX5DR_DOMAIN_SYNC_FLAGS_SW) { - mutex_lock(&dmn->mutex); + mlx5dr_domain_lock(dmn); ret = mlx5dr_send_ring_force_drain(dmn); - mutex_unlock(&dmn->mutex); + mlx5dr_domain_unlock(dmn); if (ret) { mlx5dr_err(dmn, "Force drain failed flags: %d, ret: %d\n", flags, ret); @@ -371,7 +372,8 @@ int mlx5dr_domain_destroy(struct mlx5dr_domain *dmn) dr_domain_uninit_cache(dmn); dr_domain_uninit_resources(dmn); dr_domain_caps_uninit(dmn); - mutex_destroy(&dmn->mutex); + mutex_destroy(&dmn->info.tx.mutex); + mutex_destroy(&dmn->info.rx.mutex); kfree(dmn); return 0; } @@ -379,7 +381,7 @@ int mlx5dr_domain_destroy(struct mlx5dr_domain *dmn) void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, struct mlx5dr_domain *peer_dmn) { - mutex_lock(&dmn->mutex); + mlx5dr_domain_lock(dmn); if (dmn->peer_dmn) refcount_dec(&dmn->peer_dmn->refcount); @@ -389,5 +391,5 @@ void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, if (dmn->peer_dmn) refcount_inc(&dmn->peer_dmn->refcount); - mutex_unlock(&dmn->mutex); + mlx5dr_domain_unlock(dmn); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index a95938874798..31abcbb95ca2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -690,7 +690,7 @@ mlx5dr_matcher_create(struct mlx5dr_table *tbl, refcount_set(&matcher->refcount, 1); INIT_LIST_HEAD(&matcher->matcher_list); - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); ret = dr_matcher_init(matcher, mask); if (ret) @@ -700,14 +700,14 @@ mlx5dr_matcher_create(struct mlx5dr_table *tbl, if (ret) goto matcher_uninit; - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); return matcher; matcher_uninit: dr_matcher_uninit(matcher); free_matcher: - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); kfree(matcher); dec_ref: refcount_dec(&tbl->refcount); @@ -791,13 +791,13 @@ int mlx5dr_matcher_destroy(struct mlx5dr_matcher *matcher) if (refcount_read(&matcher->refcount) > 1) return -EBUSY; - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); dr_matcher_remove_from_tbl(matcher); dr_matcher_uninit(matcher); refcount_dec(&matcher->tbl->refcount); - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); kfree(matcher); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c index cce3ee7a6614..cd708dcc2e3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -938,7 +938,10 @@ static bool dr_rule_verify(struct mlx5dr_matcher *matcher, static int dr_rule_destroy_rule_nic(struct mlx5dr_rule *rule, struct mlx5dr_rule_rx_tx *nic_rule) { + mlx5dr_domain_nic_lock(nic_rule->nic_matcher->nic_tbl->nic_dmn); dr_rule_clean_rule_members(rule, nic_rule); + mlx5dr_domain_nic_unlock(nic_rule->nic_matcher->nic_tbl->nic_dmn); + return 0; } @@ -1039,18 +1042,18 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, if (dr_rule_skip(dmn->type, nic_dmn->ste_type, &matcher->mask, param)) return 0; + hw_ste_arr = kzalloc(DR_RULE_MAX_STE_CHAIN * DR_STE_SIZE, GFP_KERNEL); + if (!hw_ste_arr) + return -ENOMEM; + + mlx5dr_domain_nic_lock(nic_dmn); + ret = mlx5dr_matcher_select_builders(matcher, nic_matcher, dr_rule_get_ipv(¶m->outer), dr_rule_get_ipv(¶m->inner)); if (ret) - goto out_err; - - hw_ste_arr = kzalloc(DR_RULE_MAX_STE_CHAIN * DR_STE_SIZE, GFP_KERNEL); - if (!hw_ste_arr) { - ret = -ENOMEM; - goto out_err; - } + goto free_hw_ste; /* Set the tag values inside the ste array */ ret = mlx5dr_ste_build_ste_arr(matcher, nic_matcher, param, hw_ste_arr); @@ -1115,6 +1118,8 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, if (htbl) mlx5dr_htbl_put(htbl); + mlx5dr_domain_nic_unlock(nic_dmn); + kfree(hw_ste_arr); return 0; @@ -1129,8 +1134,8 @@ free_rule: kfree(ste_info); } free_hw_ste: + mlx5dr_domain_nic_unlock(nic_dmn); kfree(hw_ste_arr); -out_err: return ret; } @@ -1232,31 +1237,23 @@ struct mlx5dr_rule *mlx5dr_rule_create(struct mlx5dr_matcher *matcher, { struct mlx5dr_rule *rule; - mutex_lock(&matcher->tbl->dmn->mutex); refcount_inc(&matcher->refcount); rule = dr_rule_create_rule(matcher, value, num_actions, actions); if (!rule) refcount_dec(&matcher->refcount); - mutex_unlock(&matcher->tbl->dmn->mutex); - return rule; } int mlx5dr_rule_destroy(struct mlx5dr_rule *rule) { struct mlx5dr_matcher *matcher = rule->matcher; - struct mlx5dr_table *tbl = rule->matcher->tbl; int ret; - mutex_lock(&tbl->dmn->mutex); - ret = dr_rule_destroy_rule(rule); - - mutex_unlock(&tbl->dmn->mutex); - if (!ret) refcount_dec(&matcher->refcount); + return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c index c2fe48d7b75a..b599b6beb5b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c @@ -14,7 +14,7 @@ int mlx5dr_table_set_miss_action(struct mlx5dr_table *tbl, if (action && action->action_type != DR_ACTION_TYP_FT) return -EOPNOTSUPP; - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); if (!list_empty(&tbl->matcher_list)) last_matcher = list_last_entry(&tbl->matcher_list, @@ -78,7 +78,7 @@ int mlx5dr_table_set_miss_action(struct mlx5dr_table *tbl, refcount_inc(&action->refcount); out: - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); return ret; } @@ -95,7 +95,7 @@ static void dr_table_uninit_fdb(struct mlx5dr_table *tbl) static void dr_table_uninit(struct mlx5dr_table *tbl) { - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); switch (tbl->dmn->type) { case MLX5DR_DOMAIN_TYPE_NIC_RX: @@ -112,7 +112,7 @@ static void dr_table_uninit(struct mlx5dr_table *tbl) break; } - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); } static int dr_table_init_nic(struct mlx5dr_domain *dmn, @@ -177,7 +177,7 @@ static int dr_table_init(struct mlx5dr_table *tbl) INIT_LIST_HEAD(&tbl->matcher_list); - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); switch (tbl->dmn->type) { case MLX5DR_DOMAIN_TYPE_NIC_RX: @@ -201,7 +201,7 @@ static int dr_table_init(struct mlx5dr_table *tbl) break; } - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index b6061c639cb1..c6d5a81d138b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -636,6 +636,7 @@ struct mlx5dr_domain_rx_tx { u64 drop_icm_addr; u64 default_icm_addr; enum mlx5dr_ste_entry_type ste_type; + struct mutex mutex; /* protect rx/tx domain */ }; struct mlx5dr_domain_info { @@ -660,7 +661,6 @@ struct mlx5dr_domain { struct mlx5_uars_page *uar; enum mlx5dr_domain_type type; refcount_t refcount; - struct mutex mutex; /* protect domain */ struct mlx5dr_icm_pool *ste_icm_pool; struct mlx5dr_icm_pool *action_icm_pool; struct mlx5dr_send_ring *send_ring; @@ -814,6 +814,28 @@ struct mlx5dr_icm_chunk { struct list_head *miss_list; }; +static inline void mlx5dr_domain_nic_lock(struct mlx5dr_domain_rx_tx *nic_dmn) +{ + mutex_lock(&nic_dmn->mutex); +} + +static inline void mlx5dr_domain_nic_unlock(struct mlx5dr_domain_rx_tx *nic_dmn) +{ + mutex_unlock(&nic_dmn->mutex); +} + +static inline void mlx5dr_domain_lock(struct mlx5dr_domain *dmn) +{ + mlx5dr_domain_nic_lock(&dmn->info.rx); + mlx5dr_domain_nic_lock(&dmn->info.tx); +} + +static inline void mlx5dr_domain_unlock(struct mlx5dr_domain *dmn) +{ + mlx5dr_domain_nic_unlock(&dmn->info.tx); + mlx5dr_domain_nic_unlock(&dmn->info.rx); +} + static inline int mlx5dr_matcher_supp_flex_parser_icmp_v4(struct mlx5dr_cmd_caps *caps) { -- cgit v1.2.3-59-g8ed1b From 3f0d97cdfe6e900a5c71817b0dfb77247afee36d Mon Sep 17 00:00:00 2001 From: Krzysztof Kazimierczak Date: Fri, 15 May 2020 17:42:20 -0700 Subject: ice: Check UMEM FQ size when allocating bufs If a UMEM is present on a queue when an interface/queue pair is being enabled, the driver will try to prepare the Rx buffers in advance to improve performance. However, if fill queue is shorter than HW Rx ring, the driver will report failure after getting the last address from the fill queue. This still lets the driver process the packets correctly during the NAPI poll, but leads to a constant NAPI rescheduling. Not allocating the buffers in advance would result in a potential performance decrease. Commit d57d76428ae9 ("xsk: Add API to check for available entries in FQ") provides an API that lets drivers check the number of addresses that the fill queue holds. Notify the user if fill queue is not long enough to prepare all buffers before packet processing starts, and allocate the buffers during the NAPI poll. If the fill queue size is sufficient, prepare Rx buffers in advance. Signed-off-by: Krzysztof Kazimierczak Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_base.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 18076e0d12d0..a174911d8994 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -281,7 +281,9 @@ ice_setup_tx_ctx(struct ice_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf_q) */ int ice_setup_rx_ctx(struct ice_ring *ring) { + struct device *dev = ice_pf_to_dev(ring->vsi->back); int chain_len = ICE_MAX_CHAINED_RX_BUFS; + u16 num_bufs = ICE_DESC_UNUSED(ring); struct ice_vsi *vsi = ring->vsi; u32 rxdid = ICE_RXDID_FLEX_NIC; struct ice_rlan_ctx rlan_ctx; @@ -324,7 +326,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) return err; xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq); - dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", + dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", ring->q_index); } else { if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) @@ -408,7 +410,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) /* Absolute queue number out of 2K needs to be passed */ err = ice_write_rxq_ctx(hw, &rlan_ctx, pf_q); if (err) { - dev_err(ice_pf_to_dev(vsi->back), "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n", + dev_err(dev, "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n", pf_q, err); return -EIO; } @@ -426,13 +428,23 @@ int ice_setup_rx_ctx(struct ice_ring *ring) ring->tail = hw->hw_addr + QRX_TAIL(pf_q); writel(0, ring->tail); - err = ring->xsk_umem ? - ice_alloc_rx_bufs_zc(ring, ICE_DESC_UNUSED(ring)) : - ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring)); - if (err) - dev_info(ice_pf_to_dev(vsi->back), "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n", - ring->xsk_umem ? "UMEM enabled " : "", - ring->q_index, pf_q); + if (ring->xsk_umem) { + if (!xsk_buff_can_alloc(ring->xsk_umem, num_bufs)) { + dev_warn(dev, "UMEM does not provide enough addresses to fill %d buffers on Rx ring %d\n", + num_bufs, ring->q_index); + dev_warn(dev, "Change Rx ring/fill queue size to avoid performance issues\n"); + + return 0; + } + + err = ice_alloc_rx_bufs_zc(ring, num_bufs); + if (err) + dev_info(dev, "Failed to allocate some buffers on UMEM enabled Rx ring %d (pf_q %d)\n", + ring->q_index, pf_q); + return 0; + } + + ice_alloc_rx_bufs(ring, num_bufs); return 0; } -- cgit v1.2.3-59-g8ed1b From a7528198add88a6f51b8ade17a5cf86804b8f7ee Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Wed, 27 May 2020 18:03:34 +0200 Subject: mac80211: support control port TX status reporting Add support for TX status reporting for the control port TX API; this will be used by hostapd when it moves to the control port TX API. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200527160334.19224-1-markus.theil@tu-ilmenau.de [fix commit message, it was referring to nl80211] Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/main.c | 2 ++ net/mac80211/status.c | 9 ++++++- net/mac80211/tdls.c | 2 +- net/mac80211/tx.c | 62 ++++++++++++++++++++++++++++++++-------------- 5 files changed, 56 insertions(+), 22 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b87dc873825b..b7935f3d000d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1783,7 +1783,8 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, - u32 ctrl_flags); + u32 ctrl_flags, + u64 *cookie); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs); struct sk_buff * diff --git a/net/mac80211/main.c b/net/mac80211/main.c index ac74bd780b42..b4a2efe8e83a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -596,6 +596,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 22512805eafb..7b1bacac39c6 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -649,10 +649,17 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, info->status.ack_signal, info->status.is_valid_ack_signal, GFP_ATOMIC); - else + else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); + else + cfg80211_control_port_tx_status(&sdata->wdev, + cookie, + skb->data, + skb->len, + acked, + GFP_ATOMIC); } rcu_read_unlock(); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 8ad420db3766..4b0cff4a07bd 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1054,7 +1054,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, /* disable bottom halves when entering the Tx path */ local_bh_disable(); - __ieee80211_subif_start_xmit(skb, dev, flags, 0); + __ieee80211_subif_start_xmit(skb, dev, flags, 0, NULL); local_bh_enable(); return ret; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 5931128e1855..e9ce658141f5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2436,13 +2436,19 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, return 0; } -static int ieee80211_store_ack_skb(struct ieee80211_local *local, +static u16 ieee80211_store_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, - u32 *info_flags) + u32 *info_flags, + u64 *cookie) { - struct sk_buff *ack_skb = skb_clone_sk(skb); + struct sk_buff *ack_skb; u16 info_id = 0; + if (skb->sk) + ack_skb = skb_clone_sk(skb); + else + ack_skb = skb_clone(skb, GFP_ATOMIC); + if (ack_skb) { unsigned long flags; int id; @@ -2455,6 +2461,10 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local, if (id >= 0) { info_id = id; *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + if (cookie) { + *cookie = ieee80211_mgmt_tx_cookie(local); + IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; + } } else { kfree_skb(ack_skb); } @@ -2484,7 +2494,8 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local, */ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags, - struct sta_info *sta, u32 ctrl_flags) + struct sta_info *sta, u32 ctrl_flags, + u64 *cookie) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info; @@ -2755,9 +2766,11 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, goto free; } - if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - info_id = ieee80211_store_ack_skb(local, skb, &info_flags); + if (unlikely(!multicast && ((skb->sk && + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || + ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) + info_id = ieee80211_store_ack_skb(local, skb, &info_flags, + cookie); /* * If the skb is shared we need to obtain our own copy. @@ -3913,7 +3926,8 @@ EXPORT_SYMBOL(ieee80211_txq_schedule_start); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, - u32 ctrl_flags) + u32 ctrl_flags, + u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -3983,7 +3997,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, skb_mark_not_on_list(skb); skb = ieee80211_build_hdr(sdata, skb, info_flags, - sta, ctrl_flags); + sta, ctrl_flags, cookie); if (IS_ERR(skb)) { kfree_skb_list(next); goto out; @@ -4125,9 +4139,9 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, __skb_queue_head_init(&queue); ieee80211_convert_to_unicast(skb, dev, &queue); while ((skb = __skb_dequeue(&queue))) - __ieee80211_subif_start_xmit(skb, dev, 0, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } else { - __ieee80211_subif_start_xmit(skb, dev, 0, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } return NETDEV_TX_OK; @@ -4215,7 +4229,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, if (unlikely(!multicast && skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - ieee80211_store_ack_skb(local, skb, &info->flags); + ieee80211_store_ack_skb(local, skb, &info->flags, NULL); memset(info, 0, sizeof(*info)); @@ -4299,7 +4313,7 @@ ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, goto out; } - skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0); + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0, NULL); if (IS_ERR(skb)) goto out; @@ -5347,7 +5361,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, struct sk_buff *skb; struct ethhdr *ehdr; u32 ctrl_flags = 0; - u32 flags; + u32 flags = 0; /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE * or Pre-Authentication @@ -5360,9 +5374,13 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; if (unencrypted) - flags = IEEE80211_TX_INTFL_DONT_ENCRYPT; - else - flags = 0; + flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + if (cookie) + ctrl_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + + flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX | + IEEE80211_TX_CTL_INJECTED; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(struct ethhdr) + len); @@ -5383,10 +5401,15 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_network_header(skb); skb_reset_mac_header(skb); + /* mutex lock is only needed for incrementing the cookie counter */ + mutex_lock(&local->mtx); + local_bh_disable(); - __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags); + __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags, cookie); local_bh_enable(); + mutex_unlock(&local->mtx); + return 0; } @@ -5413,7 +5436,8 @@ int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, 0, - IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP); + IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP, + NULL); local_bh_enable(); return 0; -- cgit v1.2.3-59-g8ed1b From bf0ddd104167bfc08a5a169b3669f06c9052c1b0 Mon Sep 17 00:00:00 2001 From: "Azamat H. Hackimov" Date: Sun, 24 May 2020 20:41:29 +0300 Subject: Bluetooth: btbcm: Added 003.006.007, changed 001.003.015 Added new Broadcom device BCM4350C5, changed BCM4354A2 to BCM4356A2. Based on Broadcom Windows drivers 001.003.015 should be BCM4356A2. I have user report that firmware name is misplaced (https://github.com/winterheart/broadcom-bt-firmware/issues/3). Signed-off-by: Azamat H. Hackimov Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index df7a8a22e53c..1b9743b7f2ef 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -414,11 +414,12 @@ static const struct bcm_subver_table bcm_usb_subver_table[] = { { 0x2118, "BCM20702A0" }, /* 001.001.024 */ { 0x2126, "BCM4335A0" }, /* 001.001.038 */ { 0x220e, "BCM20702A1" }, /* 001.002.014 */ - { 0x230f, "BCM4354A2" }, /* 001.003.015 */ + { 0x230f, "BCM4356A2" }, /* 001.003.015 */ { 0x4106, "BCM4335B0" }, /* 002.001.006 */ { 0x410e, "BCM20702B0" }, /* 002.001.014 */ { 0x6109, "BCM4335C0" }, /* 003.001.009 */ { 0x610c, "BCM4354" }, /* 003.001.012 */ + { 0x6607, "BCM4350C5" }, /* 003.006.007 */ { } }; -- cgit v1.2.3-59-g8ed1b From 7307f29687fda5486fa3bf2f9a5abe7a352bbce3 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 15 May 2020 19:03:42 +0200 Subject: mt76: mt7615: introduce remain_on_channel support Introduce remain_on_channel support to mt7615 driver if the device is running offload firmware Co-developed-by: Sean Wang Signed-off-by: Sean Wang Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76.h | 1 + drivers/net/wireless/mediatek/mt76/mt7615/init.c | 10 +++ drivers/net/wireless/mediatek/mt76/mt7615/mac.c | 10 ++- drivers/net/wireless/mediatek/mt76/mt7615/main.c | 79 ++++++++++++++++++++++ drivers/net/wireless/mediatek/mt76/mt7615/mcu.c | 51 ++++++++++++++ drivers/net/wireless/mediatek/mt76/mt7615/mcu.h | 19 ++++++ drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h | 9 +++ drivers/net/wireless/mediatek/mt76/mt7615/usb.c | 2 + 8 files changed, 179 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index e6de4a1b8f26..e2926e091c0f 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -290,6 +290,7 @@ enum { MT76_STATE_POWER_OFF, MT76_STATE_PS, MT76_STATE_SUSPEND, + MT76_STATE_ROC, }; struct mt76_hw_cap { diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/init.c b/drivers/net/wireless/mediatek/mt76/mt7615/init.c index 1d8fdc7e062b..18a570ffb815 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/init.c @@ -139,8 +139,10 @@ void mt7615_check_offload_capability(struct mt7615_dev *dev) ieee80211_hw_set(hw, SUPPORTS_PS); ieee80211_hw_set(hw, SUPPORTS_DYNAMIC_PS); + wiphy->max_remain_on_channel_duration = 5000; wiphy->features |= NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR | NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR | + WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL | NL80211_FEATURE_P2P_GO_CTWIN | NL80211_FEATURE_P2P_GO_OPPPS; } else { @@ -149,6 +151,8 @@ void mt7615_check_offload_capability(struct mt7615_dev *dev) dev->ops->sched_scan_start = NULL; dev->ops->sched_scan_stop = NULL; dev->ops->set_rekey_data = NULL; + dev->ops->remain_on_channel = NULL; + dev->ops->cancel_remain_on_channel = NULL; wiphy->max_sched_scan_plan_interval = 0; wiphy->max_sched_scan_ie_len = 0; @@ -373,6 +377,9 @@ int mt7615_register_ext_phy(struct mt7615_dev *dev) skb_queue_head_init(&phy->scan_event_list); INIT_WORK(&phy->ps_work, mt7615_ps_work); + INIT_WORK(&phy->roc_work, mt7615_roc_work); + timer_setup(&phy->roc_timer, mt7615_roc_timer, 0); + init_waitqueue_head(&phy->roc_wait); mt7615_cap_dbdc_enable(dev); mphy = mt76_alloc_phy(&dev->mt76, sizeof(*phy), &mt7615_ops); @@ -437,9 +444,12 @@ void mt7615_init_device(struct mt7615_dev *dev) INIT_LIST_HEAD(&dev->sta_poll_list); spin_lock_init(&dev->sta_poll_lock); init_waitqueue_head(&dev->reset_wait); + init_waitqueue_head(&dev->phy.roc_wait); INIT_WORK(&dev->reset_work, mt7615_mac_reset_work); INIT_WORK(&dev->phy.ps_work, mt7615_ps_work); + INIT_WORK(&dev->phy.roc_work, mt7615_roc_work); + timer_setup(&dev->phy.roc_timer, mt7615_roc_timer, 0); mt7615_init_wiphy(hw); dev->mphy.sband_2g.sband.ht_cap.cap |= IEEE80211_HT_CAP_LDPC_CODING; diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c index 7d65a3fb0c23..6b5c38ab9f5d 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c @@ -175,7 +175,8 @@ mt7615_get_status_freq_info(struct mt7615_dev *dev, struct mt76_phy *mphy, struct mt76_rx_status *status, u8 chfreq) { if (!test_bit(MT76_HW_SCANNING, &mphy->state) && - !test_bit(MT76_HW_SCHED_SCANNING, &mphy->state)) { + !test_bit(MT76_HW_SCHED_SCANNING, &mphy->state) && + !test_bit(MT76_STATE_ROC, &mphy->state)) { status->freq = mphy->chandef.chan->center_freq; status->band = mphy->chandef.chan->band; return; @@ -1849,8 +1850,13 @@ void mt7615_mac_reset_work(struct work_struct *work) set_bit(MT76_MCU_RESET, &dev->mphy.state); wake_up(&dev->mt76.mcu.wait); cancel_delayed_work_sync(&dev->phy.mac_work); - if (phy2) + del_timer_sync(&dev->phy.roc_timer); + cancel_work_sync(&dev->phy.roc_work); + if (phy2) { cancel_delayed_work_sync(&phy2->mac_work); + del_timer_sync(&phy2->roc_timer); + cancel_work_sync(&phy2->roc_work); + } /* lock/unlock all queues to ensure that no tx is pending */ mt76_txq_schedule_all(&dev->mphy); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c index 2e9e9d3519d7..f8cbee1770ce 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c @@ -72,6 +72,8 @@ static void mt7615_stop(struct ieee80211_hw *hw) cancel_delayed_work_sync(&phy->mac_work); cancel_work_sync(&phy->ps_work); + del_timer_sync(&phy->roc_timer); + cancel_work_sync(&phy->roc_work); mutex_lock(&dev->mt76.mutex); @@ -791,6 +793,37 @@ mt7615_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) return 0; } +static void mt7615_roc_iter(void *priv, u8 *mac, + struct ieee80211_vif *vif) +{ + struct mt7615_phy *phy = priv; + + mt7615_mcu_set_roc(phy, vif, NULL, 0); +} + +void mt7615_roc_work(struct work_struct *work) +{ + struct mt7615_phy *phy; + + phy = (struct mt7615_phy *)container_of(work, struct mt7615_phy, + roc_work); + + if (!test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state)) + return; + + ieee80211_iterate_active_interfaces(phy->mt76->hw, + IEEE80211_IFACE_ITER_RESUME_ALL, + mt7615_roc_iter, phy); + ieee80211_remain_on_channel_expired(phy->mt76->hw); +} + +void mt7615_roc_timer(struct timer_list *timer) +{ + struct mt7615_phy *phy = from_timer(phy, timer, roc_timer); + + ieee80211_queue_work(phy->mt76->hw, &phy->roc_work); +} + void mt7615_scan_work(struct work_struct *work) { struct mt7615_phy *phy; @@ -864,6 +897,50 @@ mt7615_stop_sched_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) return mt7615_mcu_sched_scan_enable(mphy->priv, vif, false); } +static int mt7615_remain_on_channel(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_channel *chan, + int duration, + enum ieee80211_roc_type type) +{ + struct mt7615_phy *phy = mt7615_hw_phy(hw); + int err; + + if (test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state)) + return 0; + + err = mt7615_mcu_set_roc(phy, vif, chan, duration); + if (err < 0) { + clear_bit(MT76_STATE_ROC, &phy->mt76->state); + return err; + } + + if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, HZ)) { + mt7615_mcu_set_roc(phy, vif, NULL, 0); + clear_bit(MT76_STATE_ROC, &phy->mt76->state); + + return -ETIMEDOUT; + } + + return 0; +} + +static int mt7615_cancel_remain_on_channel(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct mt7615_phy *phy = mt7615_hw_phy(hw); + + if (!test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state)) + return 0; + + del_timer_sync(&phy->roc_timer); + cancel_work_sync(&phy->roc_work); + + mt7615_mcu_set_roc(phy, vif, NULL, 0); + + return 0; +} + #ifdef CONFIG_PM static int mt7615_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) @@ -978,6 +1055,8 @@ const struct ieee80211_ops mt7615_ops = { .cancel_hw_scan = mt7615_cancel_hw_scan, .sched_scan_start = mt7615_start_sched_scan, .sched_scan_stop = mt7615_stop_sched_scan, + .remain_on_channel = mt7615_remain_on_channel, + .cancel_remain_on_channel = mt7615_cancel_remain_on_channel, #ifdef CONFIG_PM .suspend = mt7615_suspend, .resume = mt7615_resume, diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c index b944f372738a..7eb99bde3394 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c @@ -359,6 +359,33 @@ mt7615_mcu_scan_event(struct mt7615_dev *dev, struct sk_buff *skb) MT7615_HW_SCAN_TIMEOUT); } +static void +mt7615_mcu_roc_event(struct mt7615_dev *dev, struct sk_buff *skb) +{ + struct mt7615_roc_tlv *event; + struct mt7615_phy *phy; + struct mt76_phy *mphy; + int duration; + + skb_pull(skb, sizeof(struct mt7615_mcu_rxd)); + event = (struct mt7615_roc_tlv *)skb->data; + + if (event->dbdc_band && dev->mt76.phy2) + mphy = dev->mt76.phy2; + else + mphy = &dev->mt76.phy; + + ieee80211_ready_on_channel(mphy->hw); + + phy = (struct mt7615_phy *)mphy->priv; + phy->roc_grant = true; + wake_up(&phy->roc_wait); + + duration = le32_to_cpu(event->max_interval); + mod_timer(&phy->roc_timer, + round_jiffies_up(jiffies + msecs_to_jiffies(duration))); +} + static void mt7615_mcu_beacon_loss_iter(void *priv, u8 *mac, struct ieee80211_vif *vif) { @@ -426,6 +453,9 @@ mt7615_mcu_rx_unsolicited_event(struct mt7615_dev *dev, struct sk_buff *skb) case MCU_EVENT_BSS_BEACON_LOSS: mt7615_mcu_beacon_loss_event(dev, skb); break; + case MCU_EVENT_ROC: + mt7615_mcu_roc_event(dev, skb); + break; case MCU_EVENT_SCHED_SCAN_DONE: case MCU_EVENT_SCAN_DONE: mt7615_mcu_scan_event(dev, skb); @@ -451,6 +481,7 @@ void mt7615_mcu_rx_event(struct mt7615_dev *dev, struct sk_buff *skb) rxd->eid == MCU_EVENT_SCHED_SCAN_DONE || rxd->eid == MCU_EVENT_BSS_ABSENCE || rxd->eid == MCU_EVENT_SCAN_DONE || + rxd->eid == MCU_EVENT_ROC || !rxd->seq) mt7615_mcu_rx_unsolicited_event(dev, skb); else @@ -3601,6 +3632,26 @@ int mt7615_mcu_update_gtk_rekey(struct ieee80211_hw *hw, } #endif /* CONFIG_PM */ +int mt7615_mcu_set_roc(struct mt7615_phy *phy, struct ieee80211_vif *vif, + struct ieee80211_channel *chan, int duration) +{ + struct mt7615_vif *mvif = (struct mt7615_vif *)vif->drv_priv; + struct mt7615_dev *dev = phy->dev; + struct mt7615_roc_tlv req = { + .bss_idx = mvif->idx, + .active = !chan, + .max_interval = cpu_to_le32(duration), + .primary_chan = chan ? chan->hw_value : 0, + .band = chan ? chan->band : 0, + .req_type = 2, + }; + + phy->roc_grant = false; + + return __mt76_mcu_send_msg(&dev->mt76, MCU_CMD_SET_ROC, &req, + sizeof(req), false); +} + int mt7615_mcu_set_p2p_oppps(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.h b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.h index 348521b0d44c..fd40d99f5a23 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.h +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.h @@ -82,6 +82,7 @@ enum { MCU_EVENT_ACCESS_REG = 0x02, MCU_EVENT_MT_PATCH_SEM = 0x04, MCU_EVENT_SCAN_DONE = 0x0d, + MCU_EVENT_ROC = 0x10, MCU_EVENT_BSS_ABSENCE = 0x11, MCU_EVENT_BSS_BEACON_LOSS = 0x13, MCU_EVENT_CH_PRIVILEGE = 0x18, @@ -525,6 +526,23 @@ struct mt7615_gtk_rekey_tlv { u8 reserverd[3]; } __packed; +struct mt7615_roc_tlv { + u8 bss_idx; + u8 token; + u8 active; + u8 primary_chan; + u8 sco; + u8 band; + u8 width; /* To support 80/160MHz bandwidth */ + u8 freq_seg1; /* To support 80/160MHz bandwidth */ + u8 freq_seg2; /* To support 80/160MHz bandwidth */ + u8 req_type; + u8 dbdc_band; + u8 rsv0; + __le32 max_interval; /* ms */ + u8 rsv1[8]; +} __packed; + /* offload mcu commands */ enum { MCU_CMD_START_HW_SCAN = MCU_CE_PREFIX | 0x03, @@ -533,6 +551,7 @@ enum { MCU_CMD_SET_BSS_CONNECTED = MCU_CE_PREFIX | 0x16, MCU_CMD_SET_BSS_ABORT = MCU_CE_PREFIX | 0x17, MCU_CMD_CANCEL_HW_SCAN = MCU_CE_PREFIX | 0x1b, + MCU_CMD_SET_ROC = MCU_CE_PREFIX | 0x1c, MCU_CMD_SET_P2P_OPPPS = MCU_CE_PREFIX | 0x33, MCU_CMD_SCHED_SCAN_ENABLE = MCU_CE_PREFIX | 0x61, MCU_CMD_SCHED_SCAN_REQ = MCU_CE_PREFIX | 0x62, diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h index ebdfca64b079..71d5d5973116 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h @@ -201,6 +201,11 @@ struct mt7615_phy { struct sk_buff_head scan_event_list; struct delayed_work scan_work; + struct work_struct roc_work; + struct timer_list roc_timer; + wait_queue_head_t roc_wait; + bool roc_grant; + struct work_struct ps_work; }; @@ -441,6 +446,8 @@ static inline u16 mt7615_wtbl_size(struct mt7615_dev *dev) void mt7615_dma_reset(struct mt7615_dev *dev); void mt7615_scan_work(struct work_struct *work); +void mt7615_roc_work(struct work_struct *work); +void mt7615_roc_timer(struct timer_list *timer); void mt7615_ps_work(struct work_struct *work); void mt7615_init_txpower(struct mt7615_dev *dev, struct ieee80211_supported_band *sband); @@ -532,6 +539,8 @@ int mt7615_dfs_init_radar_detector(struct mt7615_phy *phy); int mt7615_mcu_set_p2p_oppps(struct ieee80211_hw *hw, struct ieee80211_vif *vif); +int mt7615_mcu_set_roc(struct mt7615_phy *phy, struct ieee80211_vif *vif, + struct ieee80211_channel *chan, int duration); int mt7615_firmware_own(struct mt7615_dev *dev); int mt7615_driver_own(struct mt7615_dev *dev); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/usb.c b/drivers/net/wireless/mediatek/mt76/mt7615/usb.c index d74253319622..c292b41c76e3 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/usb.c @@ -52,6 +52,8 @@ static void mt7663u_stop(struct ieee80211_hw *hw) clear_bit(MT76_STATE_RUNNING, &dev->mphy.state); cancel_work_sync(&phy->ps_work); + del_timer_sync(&phy->roc_timer); + cancel_work_sync(&phy->roc_work); cancel_delayed_work_sync(&phy->scan_work); cancel_delayed_work_sync(&phy->mac_work); mt76u_stop_tx(&dev->mt76); -- cgit v1.2.3-59-g8ed1b From 802b836a01cf4a4c8a0ac67f2567a8f743b50701 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 15 May 2020 19:05:59 +0200 Subject: mt76: mt76x02: remove check in mt76x02_mcu_msg_send mt76x02_mcu_msg_send is run just by mmio code so get rid of mt76_is_mmio() check Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76x02_mcu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mcu.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mcu.c index 89a8992d84fa..267058086a90 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mcu.c @@ -20,7 +20,7 @@ int mt76x02_mcu_msg_send(struct mt76_dev *mdev, int cmd, const void *data, int ret; u8 seq; - if (mt76_is_mmio(&dev->mt76) && dev->mcu_timeout) + if (dev->mcu_timeout) return -EIO; skb = mt76_mcu_msg_alloc(mdev, data, len); -- cgit v1.2.3-59-g8ed1b From 06acdd380a7d3893a1115c6a6ef83961cee21f98 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Sat, 16 May 2020 03:33:28 +0800 Subject: mt76: mt7915: add spatial reuse support Enable or disable OBSS PD when the bss config changes or we assoc to an AP that broadcasts the IE. With this patch, we can get ~20% gain in OBSS OTA environment. Tested-by: Evelyn Tsai Signed-off-by: Ryder Lee Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7915/main.c | 7 +++++- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 25 ++++++++++++++++++++++ drivers/net/wireless/mediatek/mt76/mt7915/mcu.h | 1 + drivers/net/wireless/mediatek/mt76/mt7915/mt7915.h | 2 ++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c index 98567374c2c9..e045dc234100 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c @@ -437,8 +437,10 @@ static void mt7915_bss_info_changed(struct ieee80211_hw *hw, mt7915_mcu_add_sta(dev, vif, NULL, join); } - if (changed & BSS_CHANGED_ASSOC) + if (changed & BSS_CHANGED_ASSOC) { mt7915_mcu_add_bss_info(phy, vif, info->assoc); + mt7915_mcu_add_obss_spr(dev, vif, info->he_obss_pd.enable); + } if (changed & BSS_CHANGED_ERP_SLOT) { int slottime = info->use_short_slot ? 9 : 20; @@ -458,6 +460,9 @@ static void mt7915_bss_info_changed(struct ieee80211_hw *hw, if (changed & (BSS_CHANGED_QOS | BSS_CHANGED_BEACON_ENABLED)) mt7915_mcu_set_tx(dev, vif); + if (changed & BSS_CHANGED_HE_OBSS_PD) + mt7915_mcu_add_obss_spr(dev, vif, info->he_obss_pd.enable); + if (changed & (BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED)) mt7915_mcu_add_beacon(hw, vif, info->enable_beacon); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 99eeea42478f..2edff868b7c9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -3159,3 +3159,28 @@ int mt7915_mcu_set_txbf_sounding(struct mt7915_dev *dev) return __mt76_mcu_send_msg(&dev->mt76, MCU_EXT_CMD_TXBF_ACTION, &req, sizeof(req), true); } + +int mt7915_mcu_add_obss_spr(struct mt7915_dev *dev, struct ieee80211_vif *vif, + bool enable) +{ +#define MT_SPR_ENABLE 1 + struct mt7915_vif *mvif = (struct mt7915_vif *)vif->drv_priv; + struct { + u8 action; + u8 arg_num; + u8 band_idx; + u8 status; + u8 drop_tx_idx; + u8 sta_idx; /* 256 sta */ + u8 rsv[2]; + u32 val; + } __packed req = { + .action = MT_SPR_ENABLE, + .arg_num = 1, + .band_idx = mvif->band_idx, + .val = enable, + }; + + return __mt76_mcu_send_msg(&dev->mt76, MCU_EXT_CMD_SET_SPR, + &req, sizeof(req), true); +} diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.h b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.h index 34ace6e672d0..c241dd7c4c36 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.h +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.h @@ -212,6 +212,7 @@ enum { MCU_EXT_CMD_RATE_CTRL = 0x87, MCU_EXT_CMD_FW_DBG_CTRL = 0x95, MCU_EXT_CMD_SET_RDD_TH = 0x9d, + MCU_EXT_CMD_SET_SPR = 0xa8, }; enum { diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mt7915.h b/drivers/net/wireless/mediatek/mt76/mt7915/mt7915.h index 5392292a838e..85d74ecd0351 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mt7915.h +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mt7915.h @@ -301,6 +301,8 @@ int mt7915_mcu_add_key(struct mt7915_dev *dev, struct ieee80211_vif *vif, enum set_key_cmd cmd); int mt7915_mcu_add_beacon(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int enable); +int mt7915_mcu_add_obss_spr(struct mt7915_dev *dev, struct ieee80211_vif *vif, + bool enable); int mt7915_mcu_add_rate_ctrl(struct mt7915_dev *dev, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int mt7915_mcu_add_smps(struct mt7915_dev *dev, struct ieee80211_vif *vif, -- cgit v1.2.3-59-g8ed1b From f9a5c0561029c856afe5860495a9dfe8e9004b02 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Sat, 16 May 2020 17:05:18 +0800 Subject: mt76: mt7915: fix some sparse warnings This fixes the following sparse warning: drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:253:16: sparse: sparse: mixing different enum types: drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:253:16: sparse: unsigned int enum mt7915_txq_id drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:253:16: sparse: unsigned int enum mt76_txq_id drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:758:63: sparse: sparse: incorrect type in argument 2 (different address spaces) drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:758:63: sparse: expected unsigned char const [usertype] *ies drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:758:63: sparse: got unsigned char const [noderef] * drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:1390:23: sparse: sparse: incorrect type in argument 1 (different base types) drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:1390:23: sparse: expected unsigned int w drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:1390:23: sparse: got restricted __le32 [usertype] supp_ht_mcs drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:1390:23: sparse: sparse: restricted __le32 degrades to integer drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:1429:60: sparse: sparse: bad assignment (>>=) to restricted __le16 drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:1773:16: sparse: sparse: restricted __le32 degrades to integer Fixes: 6094f86fb371 ("mt76: mt7915: add HE bss_conf support for interfaces") Reported-by: kbuild test robot Signed-off-by: Ryder Lee Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 2edff868b7c9..695364d35eb2 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -220,7 +220,7 @@ static int __mt7915_mcu_msg_send(struct mt7915_dev *dev, struct sk_buff *skb, { struct mt7915_mcu_txd *mcu_txd; u8 seq, pkt_fmt, qidx; - enum mt7915_txq_id txq; + enum mt76_txq_id txq; __le32 *txd; u32 val; @@ -815,8 +815,7 @@ static void mt7915_check_he_obss_narrow_bw_ru_iter(struct wiphy *wiphy, struct mt7915_he_obss_narrow_bw_ru_data *data = _data; const struct element *elem; - elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, bss->ies->data, - bss->ies->len); + elem = ieee80211_bss_get_elem(bss, WLAN_EID_EXT_CAPABILITY); if (!elem || elem->datalen < 10 || !(elem->data[10] & @@ -1954,7 +1953,7 @@ mt7915_mcu_sta_rate_ctrl_tlv(struct sk_buff *skb, struct mt7915_dev *dev, ra->supp_ht_mcs = *(__le32 *)ra->ht_mcs; ra->supp_mode |= MODE_HT; - mcs = hweight32(ra->supp_ht_mcs) - 1; + mcs = hweight32(le32_to_cpu(ra->supp_ht_mcs)) - 1; ra->af = sta->ht_cap.ampdu_factor; ra->ht_gf = !!(sta->ht_cap.cap & IEEE80211_HT_CAP_GRN_FLD); @@ -1972,7 +1971,7 @@ mt7915_mcu_sta_rate_ctrl_tlv(struct sk_buff *skb, struct mt7915_dev *dev, } if (sta->vht_cap.vht_supported) { - __le16 mcs_map = sta->vht_cap.vht_mcs.rx_mcs_map; + u16 mcs_map = le16_to_cpu(sta->vht_cap.vht_mcs.rx_mcs_map); u16 vht_mcs; u8 af, mcs_prev; @@ -2399,7 +2398,7 @@ static int mt7915_mcu_init_download(struct mt7915_dev *dev, u32 addr, }; int attr; - if (req.addr == MCU_PATCH_ADDRESS) + if (req.addr == cpu_to_le32(MCU_PATCH_ADDRESS)) attr = -MCU_CMD_PATCH_START_REQ; else attr = -MCU_CMD_TARGET_ADDRESS_LEN_REQ; -- cgit v1.2.3-59-g8ed1b From 19e29c69cc4760c0d340ac1fa7f8c423fcd70a08 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Sat, 16 May 2020 17:05:19 +0800 Subject: mt76: mt7915: fix sparse warnings: incorrect type initializer drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:2317:31: sparse: sparse: incorrect type in initializer (different base types) Fixes: 5517f78b0063 ("mt76: mt7915: enable firmware module debug support") Reported-by: kbuild test robot Signed-off-by: Ryder Lee Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 695364d35eb2..8460cd453213 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -2671,7 +2671,7 @@ int mt7915_mcu_fw_dbg_ctrl(struct mt7915_dev *dev, u32 module, u8 level) u16 len; u8 level; u8 rsv[3]; - u32 module_idx; + __le32 module_idx; } data = { .module_idx = cpu_to_le32(module), .level = level, -- cgit v1.2.3-59-g8ed1b From 4c04f25dd449a825fbcd7610c7f20be1e51b088d Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 16 May 2020 14:56:33 +0200 Subject: mt76: mt7615: fix NULL pointer deref in mt7615_register_ext_phy Fix following NULL pointer dereference in mt7615_register_ext_phy routine [ 27.648860] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000060 [ 27.657697] Mem abort info: [ 27.660495] ESR = 0x96000046 [ 27.663549] EC = 0x25: DABT (current EL), IL = 32 bits [ 27.668857] SET = 0, FnV = 0 [ 27.671910] EA = 0, S1PTW = 0 [ 27.675040] Data abort info: [ 27.677918] ISV = 0, ISS = 0x00000046 [ 27.681751] CM = 0, WnR = 1 [ 27.684717] user pgtable: 4k pages, 39-bit VAs, pgdp=000000007d8cc000 [ 27.691156] [0000000000000060] pgd=000000007d281003, pud=000000007d281003, pmd=0000000000000000 [ 27.699857] Internal error: Oops: 96000046 [#1] SMP [ 27.774939] CPU: 1 PID: 701 Comm: ash Not tainted 5.4.41 #0 [ 27.780500] Hardware name: Bananapi BPI-R64 (DT) [ 27.785108] pstate: 60000005 (nZCv daif -PAN -UAO) [ 27.789897] pc : mt7615_register_ext_phy+0x60/0x2c8 [mt7615_common] [ 27.796156] lr : mt7615_init_debugfs+0x99c/0x18e0 [mt7615_common] [ 27.802237] sp : ffffffc0115dbcb0 [ 27.805541] x29: ffffffc0115dbcb0 x28: ffffff803e309600 [ 27.810843] x27: 0000000000000000 x26: 0000000000000000 [ 27.816144] x25: ffffff803d936928 x24: ffffff803d936950 [ 27.821447] x23: 0000000000000000 x22: 0000000fffffffe0 [ 27.826749] x21: 0000000000000002 x20: ffffff8001e82620 [ 27.832050] x19: 0000000000000000 x18: 0000000000000000 [ 27.837352] x17: 0000000000000000 x16: 0000000000000000 [ 27.842653] x15: 0000000000000000 x14: 0000000000000000 [ 27.847955] x13: 0000000000000000 x12: 0000000000000000 [ 27.853256] x11: 0000000000000000 x10: 0000000000000040 [ 27.858558] x9 : ffffffc0112b3eb0 x8 : ffffffc0112b3ea8 [ 27.863859] x7 : ffffff803e400048 x6 : 0000000000000000 [ 27.869161] x5 : ffffff803e400000 x4 : 0000000000000000 [ 27.874462] x3 : 0000000000000001 x2 : 0000000000007615 [ 27.879764] x1 : 0000000000000068 x0 : ffffffc0088ccc58 [ 27.885066] Call trace: [ 27.887505] mt7615_register_ext_phy+0x60/0x2c8 [mt7615_common] [ 27.893416] mt7615_init_debugfs+0x99c/0x18e0 [mt7615_common] [ 27.899156] simple_attr_write+0xf0/0x178 [ 27.903158] debugfs_attr_write+0x4c/0x70 [ 27.907159] full_proxy_write+0x60/0x90 [ 27.910987] __vfs_write+0x18/0x40 [ 27.914379] vfs_write+0xb0/0x1b8 [ 27.917685] ksys_write+0x4c/0xc8 [ 27.920989] __arm64_sys_write+0x18/0x20 Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7615/init.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/init.c b/drivers/net/wireless/mediatek/mt76/mt7615/init.c index 18a570ffb815..0d105e4abdfd 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/init.c @@ -372,15 +372,6 @@ int mt7615_register_ext_phy(struct mt7615_dev *dev) if (phy) return 0; - INIT_DELAYED_WORK(&phy->mac_work, mt7615_mac_work); - INIT_DELAYED_WORK(&phy->scan_work, mt7615_scan_work); - skb_queue_head_init(&phy->scan_event_list); - - INIT_WORK(&phy->ps_work, mt7615_ps_work); - INIT_WORK(&phy->roc_work, mt7615_roc_work); - timer_setup(&phy->roc_timer, mt7615_roc_timer, 0); - init_waitqueue_head(&phy->roc_wait); - mt7615_cap_dbdc_enable(dev); mphy = mt76_alloc_phy(&dev->mt76, sizeof(*phy), &mt7615_ops); if (!mphy) @@ -393,6 +384,15 @@ int mt7615_register_ext_phy(struct mt7615_dev *dev) mphy->antenna_mask = BIT(hweight8(phy->chainmask)) - 1; mt7615_init_wiphy(mphy->hw); + INIT_DELAYED_WORK(&phy->mac_work, mt7615_mac_work); + INIT_DELAYED_WORK(&phy->scan_work, mt7615_scan_work); + skb_queue_head_init(&phy->scan_event_list); + + INIT_WORK(&phy->ps_work, mt7615_ps_work); + INIT_WORK(&phy->roc_work, mt7615_roc_work); + timer_setup(&phy->roc_timer, mt7615_roc_timer, 0); + init_waitqueue_head(&phy->roc_wait); + mt7615_mac_set_scs(phy, true); /* -- cgit v1.2.3-59-g8ed1b From ae4027a798988ba001584d77cc57e6b4cd77ddec Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Wed, 13 May 2020 18:50:55 +0800 Subject: mt76: mt7915: fix decoded radiotap HE flags Move assignment of .data1 and .data2 to a single place and fix overwriting of values from the template Signed-off-by: Ryder Lee Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7915/mac.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index 7ad7c2b7afdc..bf96b389c813 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -235,9 +235,14 @@ mt7915_mac_decode_he_radiotap(struct sk_buff *skb, .data1 = HE_BITS(DATA1_DATA_MCS_KNOWN) | HE_BITS(DATA1_DATA_DCM_KNOWN) | HE_BITS(DATA1_STBC_KNOWN) | - HE_BITS(DATA1_CODING_KNOWN), + HE_BITS(DATA1_CODING_KNOWN) | + HE_BITS(DATA1_LDPC_XSYMSEG_KNOWN) | + HE_BITS(DATA1_DOPPLER_KNOWN) | + HE_BITS(DATA1_BSS_COLOR_KNOWN), .data2 = HE_BITS(DATA2_GI_KNOWN) | - HE_BITS(DATA2_TXBF_KNOWN), + HE_BITS(DATA2_TXBF_KNOWN) | + HE_BITS(DATA2_PE_DISAMBIG_KNOWN) | + HE_BITS(DATA2_TXOP_KNOWN), }; struct ieee80211_radiotap_he *he = NULL; __le32 v2 = rxv->v[2]; @@ -248,12 +253,6 @@ mt7915_mac_decode_he_radiotap(struct sk_buff *skb, he = skb_push(skb, sizeof(known)); memcpy(he, &known, sizeof(known)); - he->data1 = HE_BITS(DATA1_LDPC_XSYMSEG_KNOWN) | - HE_BITS(DATA1_DOPPLER_KNOWN) | - HE_BITS(DATA1_BSS_COLOR_KNOWN); - he->data2 = HE_BITS(DATA2_PE_DISAMBIG_KNOWN) | - HE_BITS(DATA2_TXOP_KNOWN); - he->data3 = HE_PREP(DATA3_BSS_COLOR, BSS_COLOR, v14) | HE_PREP(DATA3_LDPC_XSYMSEG, LDPC_EXT_SYM, v2); he->data5 = HE_PREP(DATA5_PE_DISAMBIG, PE_DISAMBIG, v2) | @@ -296,10 +295,10 @@ mt7915_mac_decode_he_radiotap(struct sk_buff *skb, HE_BITS(DATA1_SPTL_REUSE3_KNOWN) | HE_BITS(DATA1_SPTL_REUSE4_KNOWN); - he->data4 = HE_PREP(DATA4_TB_SPTL_REUSE1, SR_MASK, v11) | - HE_PREP(DATA4_TB_SPTL_REUSE2, SR1_MASK, v11) | - HE_PREP(DATA4_TB_SPTL_REUSE3, SR2_MASK, v11) | - HE_PREP(DATA4_TB_SPTL_REUSE4, SR3_MASK, v11); + he->data4 |= HE_PREP(DATA4_TB_SPTL_REUSE1, SR_MASK, v11) | + HE_PREP(DATA4_TB_SPTL_REUSE2, SR1_MASK, v11) | + HE_PREP(DATA4_TB_SPTL_REUSE3, SR2_MASK, v11) | + HE_PREP(DATA4_TB_SPTL_REUSE4, SR3_MASK, v11); mt7915_mac_decode_he_radiotap_ru(status, rxv, he); break; -- cgit v1.2.3-59-g8ed1b From 238f5d6fc0285053a1684cbb676b9f507080633d Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 20 May 2020 08:04:47 +0200 Subject: mt76: fix per-driver wcid range checks after wcid array size bump All drivers before MT7915 have a limit of 128 WCID entries. Stop relying on ARRAY_SIZE(dev->mt76.wcid), since it no longer reflects that limit. Fixes: 49e649c3e0a6 ("mt76: adjust wcid size to support new 802.11ax generation") Reported-by: kbuild test robot Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7603/mac.c | 4 ++-- drivers/net/wireless/mediatek/mt76/mt7615/mac.c | 4 ++-- drivers/net/wireless/mediatek/mt76/mt76x02.h | 3 ++- drivers/net/wireless/mediatek/mt76/mt76x02_mac.c | 2 +- drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c | 2 +- drivers/net/wireless/mediatek/mt76/mt76x02_util.c | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c index f8c0c957ca01..0f205ffe4905 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c @@ -473,7 +473,7 @@ mt7603_rx_get_wcid(struct mt7603_dev *dev, u8 idx, bool unicast) struct mt7603_sta *sta; struct mt76_wcid *wcid; - if (idx >= ARRAY_SIZE(dev->mt76.wcid)) + if (idx >= MT7603_WTBL_SIZE) return NULL; wcid = rcu_dereference(dev->mt76.wcid[idx]); @@ -1238,7 +1238,7 @@ void mt7603_mac_add_txs(struct mt7603_dev *dev, void *data) if (pid == MT_PACKET_ID_NO_ACK) return; - if (wcidx >= ARRAY_SIZE(dev->mt76.wcid)) + if (wcidx >= MT7603_WTBL_SIZE) return; rcu_read_lock(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c index 6b5c38ab9f5d..f1009c92ec1b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c @@ -61,7 +61,7 @@ static struct mt76_wcid *mt7615_rx_get_wcid(struct mt7615_dev *dev, struct mt7615_sta *sta; struct mt76_wcid *wcid; - if (idx >= ARRAY_SIZE(dev->mt76.wcid)) + if (idx >= MT7615_WTBL_SIZE) return NULL; wcid = rcu_dereference(dev->mt76.wcid[idx]); @@ -1303,7 +1303,7 @@ static void mt7615_mac_add_txs(struct mt7615_dev *dev, void *data) if (pid == MT_PACKET_ID_NO_ACK) return; - if (wcidx >= ARRAY_SIZE(dev->mt76.wcid)) + if (wcidx >= MT7615_WTBL_SIZE) return; rcu_read_lock(); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02.h b/drivers/net/wireless/mediatek/mt76/mt76x02.h index 6ea210bd3f07..4c9bbc7ce023 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x02.h @@ -15,6 +15,7 @@ #include "mt76x02_dfs.h" #include "mt76x02_dma.h" +#define MT76x02_N_WCIDS 128 #define MT_CALIBRATE_INTERVAL HZ #define MT_MAC_WORK_INTERVAL (HZ / 10) @@ -246,7 +247,7 @@ mt76x02_rx_get_sta(struct mt76_dev *dev, u8 idx) { struct mt76_wcid *wcid; - if (idx >= ARRAY_SIZE(dev->wcid)) + if (idx >= MT76x02_N_WCIDS) return NULL; wcid = rcu_dereference(dev->wcid[idx]); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c index a5a3bcd30d6f..e4e03beabe43 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c @@ -561,7 +561,7 @@ void mt76x02_send_tx_status(struct mt76x02_dev *dev, rcu_read_lock(); - if (stat->wcid < ARRAY_SIZE(dev->mt76.wcid)) + if (stat->wcid < MT76x02_N_WCIDS) wcid = rcu_dereference(dev->mt76.wcid[stat->wcid]); if (wcid && wcid->sta) { diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c index 7e389dbccfeb..18adedfbbb8e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c @@ -415,7 +415,7 @@ static void mt76x02_reset_state(struct mt76x02_dev *dev) ieee80211_iter_keys_rcu(dev->mt76.hw, NULL, mt76x02_key_sync, NULL); rcu_read_unlock(); - for (i = 0; i < ARRAY_SIZE(dev->mt76.wcid); i++) { + for (i = 0; i < MT76x02_N_WCIDS; i++) { struct ieee80211_sta *sta; struct ieee80211_vif *vif; struct mt76x02_sta *msta; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c index 9a2c9afa2fb5..44822a849eb1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c @@ -249,7 +249,7 @@ int mt76x02_sta_add(struct mt76_dev *mdev, struct ieee80211_vif *vif, memset(msta, 0, sizeof(*msta)); - idx = mt76_wcid_alloc(dev->mt76.wcid_mask, ARRAY_SIZE(dev->mt76.wcid)); + idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT76x02_N_WCIDS); if (idx < 0) return -ENOSPC; -- cgit v1.2.3-59-g8ed1b From b62db09aa81c148328843afac9fcbaefdaba6913 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Tue, 19 May 2020 02:07:38 +0800 Subject: mt76: mt7915: fix some sparse warnings drivers/net/wireless/mediatek/mt76/mt7915/main.c:694:1: sparse: sparse: context imbalance in 'mt7915_sta_rc_update' - wrong count at exit drivers/net/wireless/mediatek/mt76/mt7915/mac.c:303:43: sparse: sparse: cast to restricted __le32 drivers/net/wireless/mediatek/mt76/mt7915/mac.c:304:43: sparse: sparse: cast to restricted __le32 drivers/net/wireless/mediatek/mt76/mt7915/mac.c:305:43: sparse: sparse: cast to restricted __le32 drivers/net/wireless/mediatek/mt76/mt7915/mac.c:319:35: sparse: sparse: cast to restricted __le32 drivers/net/wireless/mediatek/mt76/mt7915/mac.c:327:35: sparse: sparse: cast to restricted __le32 drivers/net/wireless/mediatek/mt76/mt7915/mac.c:345:41: sparse: sparse: cast to restricted __le32 drivers/net/wireless/mediatek/mt76/mt7915/mac.c:355:33: sparse: sparse: cast to restricted __le32 drivers/net/wireless/mediatek/mt76/mt7915/mac.c:451:21: sparse: sparse: invalid assignment: |= drivers/net/wireless/mediatek/mt76/mt7915/mac.c:451:21: sparse: left side has type unsigned int drivers/net/wireless/mediatek/mt76/mt7915/mac.c:451:21: sparse: right side has type restricted __le32 Fixes: e57b7901469f ("mt76: add mac80211 driver for MT7915 PCIe-based chipsets") Reported-by: kbuild test robot Signed-off-by: Ryder Lee Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7915/mac.c | 32 ++++++++++++++---------- drivers/net/wireless/mediatek/mt76/mt7915/main.c | 2 +- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index bf96b389c813..ab20dfde94af 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -425,20 +425,26 @@ int mt7915_mac_fill_rx(struct mt7915_dev *dev, struct sk_buff *skb) /* RXD Group 3 - P-RXV */ if (rxd1 & MT_RXD1_NORMAL_GROUP_3) { + u32 v0, v1, v2; + memcpy(rxv.v, rxd, sizeof(rxv.v)); rxd += 2; if ((u8 *)rxd - skb->data >= skb->len) return -EINVAL; - if (rxv.v[0] & MT_PRXV_HT_AD_CODE) + v0 = le32_to_cpu(rxv.v[0]); + v1 = le32_to_cpu(rxv.v[1]); + v2 = le32_to_cpu(rxv.v[2]); + + if (v0 & MT_PRXV_HT_AD_CODE) status->enc_flags |= RX_ENC_FLAG_LDPC; status->chains = mphy->antenna_mask; - status->chain_signal[0] = to_rssi(MT_PRXV_RCPI0, rxv.v[1]); - status->chain_signal[1] = to_rssi(MT_PRXV_RCPI1, rxv.v[1]); - status->chain_signal[2] = to_rssi(MT_PRXV_RCPI2, rxv.v[1]); - status->chain_signal[3] = to_rssi(MT_PRXV_RCPI3, rxv.v[1]); + status->chain_signal[0] = to_rssi(MT_PRXV_RCPI0, v1); + status->chain_signal[1] = to_rssi(MT_PRXV_RCPI1, v1); + status->chain_signal[2] = to_rssi(MT_PRXV_RCPI2, v1); + status->chain_signal[3] = to_rssi(MT_PRXV_RCPI3, v1); status->signal = status->chain_signal[0]; for (i = 1; i < hweight8(mphy->antenna_mask); i++) { @@ -451,16 +457,16 @@ int mt7915_mac_fill_rx(struct mt7915_dev *dev, struct sk_buff *skb) /* RXD Group 5 - C-RXV */ if (rxd1 & MT_RXD1_NORMAL_GROUP_5) { - u8 stbc = FIELD_GET(MT_CRXV_HT_STBC, rxv.v[2]); - u8 gi = FIELD_GET(MT_CRXV_HT_SHORT_GI, rxv.v[2]); + u8 stbc = FIELD_GET(MT_CRXV_HT_STBC, v2); + u8 gi = FIELD_GET(MT_CRXV_HT_SHORT_GI, v2); bool cck = false; rxd += 18; if ((u8 *)rxd - skb->data >= skb->len) return -EINVAL; - idx = i = FIELD_GET(MT_PRXV_TX_RATE, rxv.v[0]); - rxv.phy = FIELD_GET(MT_CRXV_TX_MODE, rxv.v[2]); + idx = i = FIELD_GET(MT_PRXV_TX_RATE, v0); + rxv.phy = FIELD_GET(MT_CRXV_TX_MODE, v2); switch (rxv.phy) { case MT_PHY_TYPE_CCK: @@ -477,7 +483,7 @@ int mt7915_mac_fill_rx(struct mt7915_dev *dev, struct sk_buff *skb) break; case MT_PHY_TYPE_VHT: status->nss = - FIELD_GET(MT_PRXV_NSTS, rxv.v[0]) + 1; + FIELD_GET(MT_PRXV_NSTS, v0) + 1; status->encoding = RX_ENC_VHT; if (i > 9) return -EINVAL; @@ -489,7 +495,7 @@ int mt7915_mac_fill_rx(struct mt7915_dev *dev, struct sk_buff *skb) case MT_PHY_TYPE_HE_EXT_SU: case MT_PHY_TYPE_HE_TB: status->nss = - FIELD_GET(MT_PRXV_NSTS, rxv.v[0]) + 1; + FIELD_GET(MT_PRXV_NSTS, v0) + 1; status->encoding = RX_ENC_HE; status->flag |= RX_FLAG_RADIOTAP_HE; i &= GENMASK(3, 0); @@ -505,7 +511,7 @@ int mt7915_mac_fill_rx(struct mt7915_dev *dev, struct sk_buff *skb) } status->rate_idx = i; - switch (FIELD_GET(MT_CRXV_FRAME_MODE, rxv.v[2])) { + switch (FIELD_GET(MT_CRXV_FRAME_MODE, v2)) { case IEEE80211_STA_RX_BW_20: break; case IEEE80211_STA_RX_BW_40: @@ -611,7 +617,7 @@ void mt7915_mac_write_txwi(struct mt7915_dev *dev, __le32 *txwi, skb->priority & IEEE80211_QOS_CTL_TID_MASK) | FIELD_PREP(MT_TXD1_OWN_MAC, omac_idx); if (ext_phy && q_idx >= MT_LMAC_ALTX0 && q_idx <= MT_LMAC_BCN0) - val |= cpu_to_le32(MT_TXD1_TGID); + val |= MT_TXD1_TGID; txwi[1] = cpu_to_le32(val); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c index e045dc234100..0575c259f245 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c @@ -795,7 +795,7 @@ mt7915_sta_rc_update(struct ieee80211_hw *hw, rcu_read_unlock(); return; } - rcu_read_lock(); + rcu_read_unlock(); set_bit(changed, &msta->stats.changed); ieee80211_queue_work(hw, &msta->stats_work); -- cgit v1.2.3-59-g8ed1b From a5e0aa78f5c44e30562a33662eeb4a594a920ce9 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 19 May 2020 10:42:11 +0200 Subject: mt76: mt7615: switch to per-vif power_save support switch to per-vif ps support since mt7615 offload firmware can handle it properly. This patch allows enabling/disabling power-save support on p2p interface Tested-by: Sean Wang Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76.h | 1 - drivers/net/wireless/mediatek/mt76/mt7615/init.c | 2 -- drivers/net/wireless/mediatek/mt76/mt7615/main.c | 26 +++------------------- drivers/net/wireless/mediatek/mt76/mt7615/mcu.c | 14 ++++++------ drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h | 5 +---- drivers/net/wireless/mediatek/mt76/mt7615/usb.c | 1 - 6 files changed, 11 insertions(+), 38 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index e2926e091c0f..5c9195f59ae1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -288,7 +288,6 @@ enum { MT76_REMOVED, MT76_READING_STATS, MT76_STATE_POWER_OFF, - MT76_STATE_PS, MT76_STATE_SUSPEND, MT76_STATE_ROC, }; diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/init.c b/drivers/net/wireless/mediatek/mt76/mt7615/init.c index 0d105e4abdfd..e2d80518e5af 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/init.c @@ -388,7 +388,6 @@ int mt7615_register_ext_phy(struct mt7615_dev *dev) INIT_DELAYED_WORK(&phy->scan_work, mt7615_scan_work); skb_queue_head_init(&phy->scan_event_list); - INIT_WORK(&phy->ps_work, mt7615_ps_work); INIT_WORK(&phy->roc_work, mt7615_roc_work); timer_setup(&phy->roc_timer, mt7615_roc_timer, 0); init_waitqueue_head(&phy->roc_wait); @@ -447,7 +446,6 @@ void mt7615_init_device(struct mt7615_dev *dev) init_waitqueue_head(&dev->phy.roc_wait); INIT_WORK(&dev->reset_work, mt7615_mac_reset_work); - INIT_WORK(&dev->phy.ps_work, mt7615_ps_work); INIT_WORK(&dev->phy.roc_work, mt7615_roc_work); timer_setup(&dev->phy.roc_timer, mt7615_roc_timer, 0); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c index f8cbee1770ce..320dfda6b4e5 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c @@ -71,7 +71,6 @@ static void mt7615_stop(struct ieee80211_hw *hw) struct mt7615_phy *phy = mt7615_hw_phy(hw); cancel_delayed_work_sync(&phy->mac_work); - cancel_work_sync(&phy->ps_work); del_timer_sync(&phy->roc_timer); cancel_work_sync(&phy->roc_work); @@ -362,20 +361,6 @@ static int mt7615_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, return mt7615_mac_wtbl_set_key(dev, wcid, key, cmd); } -void mt7615_ps_work(struct work_struct *work) -{ - struct mt7615_phy *phy; - - phy = (struct mt7615_phy *)container_of(work, struct mt7615_phy, - ps_work); - - mutex_lock(&phy->dev->mt76.mutex); - ieee80211_iterate_active_interfaces(phy->mt76->hw, - IEEE80211_IFACE_ITER_RESUME_ALL, - m7615_mcu_set_ps_iter, phy); - mutex_unlock(&phy->dev->mt76.mutex); -} - static int mt7615_config(struct ieee80211_hw *hw, u32 changed) { struct mt7615_dev *dev = mt7615_hw_dev(hw); @@ -401,14 +386,6 @@ static int mt7615_config(struct ieee80211_hw *hw, u32 changed) mt76_wr(dev, MT_WF_RFCR(band), phy->rxfilter); } - if (changed & IEEE80211_CONF_CHANGE_PS) { - if (hw->conf.flags & IEEE80211_CONF_PS) - set_bit(MT76_STATE_PS, &phy->mt76->state); - else - clear_bit(MT76_STATE_PS, &phy->mt76->state); - ieee80211_queue_work(hw, &phy->ps_work); - } - mutex_unlock(&dev->mt76.mutex); return ret; @@ -511,6 +488,9 @@ static void mt7615_bss_info_changed(struct ieee80211_hw *hw, BSS_CHANGED_BEACON_ENABLED)) mt7615_mcu_add_beacon(dev, hw, vif, info->enable_beacon); + if (changed & BSS_CHANGED_PS) + mt7615_mcu_set_vif_ps(dev, vif); + mutex_unlock(&dev->mt76.mutex); } diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c index 7eb99bde3394..14c2b5d7dbbd 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c @@ -2772,11 +2772,9 @@ int mt7615_mcu_set_sku_en(struct mt7615_phy *phy, bool enable) sizeof(req), true); } -void m7615_mcu_set_ps_iter(void *priv, u8 *mac, struct ieee80211_vif *vif) +int mt7615_mcu_set_vif_ps(struct mt7615_dev *dev, struct ieee80211_vif *vif) { struct mt7615_vif *mvif = (struct mt7615_vif *)vif->drv_priv; - struct mt7615_phy *phy = priv; - struct mt76_phy *mphy = phy->mt76; struct { u8 bss_idx; u8 ps_state; /* 0: device awake @@ -2785,12 +2783,14 @@ void m7615_mcu_set_ps_iter(void *priv, u8 *mac, struct ieee80211_vif *vif) */ } req = { .bss_idx = mvif->idx, - .ps_state = test_bit(MT76_STATE_PS, &mphy->state) ? 2 : 0, + .ps_state = vif->bss_conf.ps ? 2 : 0, }; - if (vif->type == NL80211_IFTYPE_STATION) - __mt76_mcu_send_msg(&phy->dev->mt76, MCU_CMD_SET_PS_PROFILE, - &req, sizeof(req), false); + if (vif->type != NL80211_IFTYPE_STATION) + return -ENOTSUPP; + + return __mt76_mcu_send_msg(&dev->mt76, MCU_CMD_SET_PS_PROFILE, + &req, sizeof(req), false); } int mt7615_mcu_set_channel_domain(struct mt7615_phy *phy) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h index 71d5d5973116..170d3c2bbbb4 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h @@ -205,8 +205,6 @@ struct mt7615_phy { struct timer_list roc_timer; wait_queue_head_t roc_wait; bool roc_grant; - - struct work_struct ps_work; }; #define mt7615_mcu_add_tx_ba(dev, ...) (dev)->mcu_ops->add_tx_ba((dev), __VA_ARGS__) @@ -448,7 +446,6 @@ void mt7615_dma_reset(struct mt7615_dev *dev); void mt7615_scan_work(struct work_struct *work); void mt7615_roc_work(struct work_struct *work); void mt7615_roc_timer(struct timer_list *timer); -void mt7615_ps_work(struct work_struct *work); void mt7615_init_txpower(struct mt7615_dev *dev, struct ieee80211_supported_band *sband); void mt7615_phy_init(struct mt7615_dev *dev); @@ -534,7 +531,7 @@ int mt7615_mcu_set_radar_th(struct mt7615_dev *dev, int index, int mt7615_mcu_set_sku_en(struct mt7615_phy *phy, bool enable); int mt7615_mcu_apply_rx_dcoc(struct mt7615_phy *phy); int mt7615_mcu_apply_tx_dpd(struct mt7615_phy *phy); -void m7615_mcu_set_ps_iter(void *priv, u8 *mac, struct ieee80211_vif *vif); +int mt7615_mcu_set_vif_ps(struct mt7615_dev *dev, struct ieee80211_vif *vif); int mt7615_dfs_init_radar_detector(struct mt7615_phy *phy); int mt7615_mcu_set_p2p_oppps(struct ieee80211_hw *hw, diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/usb.c b/drivers/net/wireless/mediatek/mt76/mt7615/usb.c index c292b41c76e3..a50077eb24d7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/usb.c @@ -51,7 +51,6 @@ static void mt7663u_stop(struct ieee80211_hw *hw) struct mt7615_dev *dev = hw->priv; clear_bit(MT76_STATE_RUNNING, &dev->mphy.state); - cancel_work_sync(&phy->ps_work); del_timer_sync(&phy->roc_timer); cancel_work_sync(&phy->roc_work); cancel_delayed_work_sync(&phy->scan_work); -- cgit v1.2.3-59-g8ed1b From 6f4bd8528c36e9abecddf698600696c7a5578f2e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 19 May 2020 23:48:20 +0100 Subject: mt76: mt7915: fix a handful of spelling mistakes There are some spelling mistakes in some literal strings. Fix these. Signed-off-by: Colin Ian King Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7915/debugfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/debugfs.c b/drivers/net/wireless/mediatek/mt76/mt7915/debugfs.c index ee0066fedd04..5278bee812f1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/debugfs.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/debugfs.c @@ -173,14 +173,14 @@ mt7915_txbf_stat_read_phy(struct mt7915_phy *phy, struct seq_file *s) /* Tx Beamformee Rx NDPA & Tx feedback report */ cnt = mt76_rr(dev, MT_ETBF_TX_NDP_BFRP(ext_phy)); - seq_printf(s, "Tx Beamformee sucessful feedback frames: %ld\n", + seq_printf(s, "Tx Beamformee successful feedback frames: %ld\n", FIELD_GET(MT_ETBF_TX_FB_CPL, cnt)); - seq_printf(s, "Tx Beamformee feedback triggerd counts: %ld\n", + seq_printf(s, "Tx Beamformee feedback triggered counts: %ld\n", FIELD_GET(MT_ETBF_TX_FB_TRI, cnt)); /* Tx SU counters */ cnt = mt76_rr(dev, MT_MIB_DR11(ext_phy)); - seq_printf(s, "Tx single-user sucessful MPDU counts: %d\n", cnt); + seq_printf(s, "Tx single-user successful MPDU counts: %d\n", cnt); seq_puts(s, "\n"); } -- cgit v1.2.3-59-g8ed1b From eca026555f01e16011eeb6a7f63ceabd8da4f4a8 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Fri, 22 May 2020 09:10:24 +0200 Subject: mt76: mt7615: fix hw_scan with ssid_type for specified SSID only Fix hw_scan with ssid_type for specified SSID only The definition for ssid_type in current firmware is that ssid_type BIT(2) set actually for specified SSID + wildcard SSID. ssid_type BIT(2) and ssid_type_ext BIT(0) both set actually for specified SSID only; Signed-off-by: Sean Wang Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7615/mcu.c | 1 + drivers/net/wireless/mediatek/mt76/mt7615/mcu.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c index 14c2b5d7dbbd..6e869b8c5e26 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c @@ -2898,6 +2898,7 @@ int mt7615_mcu_hw_scan(struct mt7615_phy *phy, struct ieee80211_vif *vif, n_ssids++; } req->ssid_type = n_ssids ? BIT(2) : BIT(0); + req->ssid_type_ext = n_ssids ? BIT(0) : 0; req->ssids_num = n_ssids; /* increase channel time for passive scan */ diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.h b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.h index fd40d99f5a23..2314d0b23af1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.h +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.h @@ -327,7 +327,8 @@ struct mt7615_hw_scan_req { */ u8 ssid_type; /* BIT(0) wildcard SSID * BIT(1) P2P wildcard SSID - * BIT(2) specified SSID + * BIT(2) specified SSID + wildcard SSID + * BIT(2) + ssid_type_ext BIT(0) specified SSID only */ u8 ssids_num; u8 probe_req_num; /* Number of probe request for each SSID */ @@ -362,7 +363,8 @@ struct mt7615_hw_scan_req { struct mt7615_mcu_scan_ssid ext_ssids[6]; u8 bssid[ETH_ALEN]; u8 random_mac[ETH_ALEN]; /* valid when BIT(1) in scan_func is set. */ - u8 pad[64]; + u8 pad[63]; + u8 ssid_type_ext; } __packed; #define SCAN_DONE_EVENT_MAX_CHANNEL_NUM 64 -- cgit v1.2.3-59-g8ed1b From ec2bb3a570ec5bfafb71113b3617929434de5ff0 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 22 May 2020 09:26:06 +0200 Subject: mt76: mt7915: fix possible NULL pointer dereference in mt7915_register_ext_phy Fix a NULL pointer dereference in mt7915_register_ext_phy since phy data structure is allocated by mt76_alloc_phy routine Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7915/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c index 6f200ab3ac28..aadf56e80bae 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c @@ -592,7 +592,6 @@ int mt7915_register_ext_phy(struct mt7915_dev *dev) if (phy) return 0; - INIT_DELAYED_WORK(&phy->mac_work, mt7915_mac_work); mt7915_cap_dbdc_enable(dev); mphy = mt76_alloc_phy(&dev->mt76, sizeof(*phy), &mt7915_ops); if (!mphy) @@ -605,6 +604,8 @@ int mt7915_register_ext_phy(struct mt7915_dev *dev) mphy->antenna_mask = BIT(hweight8(phy->chainmask)) - 1; mt7915_init_wiphy(mphy->hw); + INIT_DELAYED_WORK(&phy->mac_work, mt7915_mac_work); + /* * Make the secondary PHY MAC address local without overlapping with * the usual MAC address allocation scheme on multiple virtual interfaces -- cgit v1.2.3-59-g8ed1b From 5e616ad216ef560b2a856c858137c772351eee9f Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 23 May 2020 14:40:57 +0200 Subject: mt76: fix wcid allocation issues mt76 core uses ffs() to find the next free bit. This works well for 32 bit architectures where BITS_PER_LONG is 32. ffs only checks 32 bit values, so allocation fails on 64 bit architectures. Additionally, the wcid mask array was too small in cases where the array was not a multiple of BITS_PER_LONG. Fix this by making the wcid mask array u32 instead and use DIV_ROUND_UP for the size, just in case we ever bump it to a value that's not a multiple of 32. Reported-by: Ryder Lee Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76.h | 4 ++-- drivers/net/wireless/mediatek/mt76/util.c | 12 ++++++------ drivers/net/wireless/mediatek/mt76/util.h | 14 +++++++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index 5c9195f59ae1..afb1ccf61b74 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -537,8 +537,8 @@ struct mt76_dev { wait_queue_head_t tx_wait; struct sk_buff_head status_list; - unsigned long wcid_mask[MT76_N_WCIDS / BITS_PER_LONG]; - unsigned long wcid_phy_mask[MT76_N_WCIDS / BITS_PER_LONG]; + u32 wcid_mask[DIV_ROUND_UP(MT76_N_WCIDS, 32)]; + u32 wcid_phy_mask[DIV_ROUND_UP(MT76_N_WCIDS, 32)]; struct mt76_wcid global_wcid; struct mt76_wcid __rcu *wcid[MT76_N_WCIDS]; diff --git a/drivers/net/wireless/mediatek/mt76/util.c b/drivers/net/wireless/mediatek/mt76/util.c index 07cf71242d9e..ecde87465bf6 100644 --- a/drivers/net/wireless/mediatek/mt76/util.c +++ b/drivers/net/wireless/mediatek/mt76/util.c @@ -42,17 +42,17 @@ bool __mt76_poll_msec(struct mt76_dev *dev, u32 offset, u32 mask, u32 val, } EXPORT_SYMBOL_GPL(__mt76_poll_msec); -int mt76_wcid_alloc(unsigned long *mask, int size) +int mt76_wcid_alloc(u32 *mask, int size) { int i, idx = 0, cur; - for (i = 0; i < DIV_ROUND_UP(size, BITS_PER_LONG); i++) { + for (i = 0; i < DIV_ROUND_UP(size, 32); i++) { idx = ffs(~mask[i]); if (!idx) continue; idx--; - cur = i * BITS_PER_LONG + idx; + cur = i * 32 + idx; if (cur >= size) break; @@ -74,13 +74,13 @@ int mt76_get_min_avg_rssi(struct mt76_dev *dev, bool ext_phy) rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(dev->wcid_mask); i++) { - unsigned long mask = dev->wcid_mask[i]; - unsigned long phy_mask = dev->wcid_phy_mask[i]; + u32 mask = dev->wcid_mask[i]; + u32 phy_mask = dev->wcid_phy_mask[i]; if (!mask) continue; - for (j = i * BITS_PER_LONG; mask; j++, mask >>= 1, phy_mask >>= 1) { + for (j = i * 32; mask; j++, mask >>= 1, phy_mask >>= 1) { if (!(mask & 1)) continue; diff --git a/drivers/net/wireless/mediatek/mt76/util.h b/drivers/net/wireless/mediatek/mt76/util.h index 48a71e7479e5..fd1a68820e0a 100644 --- a/drivers/net/wireless/mediatek/mt76/util.h +++ b/drivers/net/wireless/mediatek/mt76/util.h @@ -14,24 +14,24 @@ #define MT76_INCR(_var, _size) \ (_var = (((_var) + 1) % (_size))) -int mt76_wcid_alloc(unsigned long *mask, int size); +int mt76_wcid_alloc(u32 *mask, int size); static inline bool -mt76_wcid_mask_test(unsigned long *mask, int idx) +mt76_wcid_mask_test(u32 *mask, int idx) { - return mask[idx / BITS_PER_LONG] & BIT(idx % BITS_PER_LONG); + return mask[idx / 32] & BIT(idx % 32); } static inline void -mt76_wcid_mask_set(unsigned long *mask, int idx) +mt76_wcid_mask_set(u32 *mask, int idx) { - mask[idx / BITS_PER_LONG] |= BIT(idx % BITS_PER_LONG); + mask[idx / 32] |= BIT(idx % 32); } static inline void -mt76_wcid_mask_clear(unsigned long *mask, int idx) +mt76_wcid_mask_clear(u32 *mask, int idx) { - mask[idx / BITS_PER_LONG] &= ~BIT(idx % BITS_PER_LONG); + mask[idx / 32] &= ~BIT(idx % 32); } static inline void -- cgit v1.2.3-59-g8ed1b From e47f2245375feef8f72ff119a939865fe5e830fd Mon Sep 17 00:00:00 2001 From: DENG Qingfang Date: Sun, 24 May 2020 11:41:10 +0800 Subject: mt76: mt7615: add support for MT7611N MT7611N is basically the same as MT7615N, except it only supports 5GHz It is used by some TP-Link and Mercury wireless routers Signed-off-by: DENG Qingfang Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c | 7 +++++++ drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h | 7 ++++++- drivers/net/wireless/mediatek/mt76/mt7615/pci.c | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c b/drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c index 6a5ae047c63b..edac37e7847b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c @@ -111,6 +111,12 @@ mt7615_eeprom_parse_hw_band_cap(struct mt7615_dev *dev) return; } + if (is_mt7611(&dev->mt76)) { + /* 5GHz only */ + dev->mt76.cap.has_5ghz = true; + return; + } + val = FIELD_GET(MT_EE_NIC_WIFI_CONF_BAND_SEL, eeprom[MT_EE_WIFI_CONF]); switch (val) { @@ -310,6 +316,7 @@ static void mt7615_cal_free_data(struct mt7615_dev *dev) mt7622_apply_cal_free_data(dev); break; case 0x7615: + case 0x7611: mt7615_apply_cal_free_data(dev); break; } diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h index 170d3c2bbbb4..d6176d316bee 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h @@ -414,7 +414,7 @@ static inline bool is_mt7622(struct mt76_dev *dev) static inline bool is_mt7615(struct mt76_dev *dev) { - return mt76_chip(dev) == 0x7615; + return mt76_chip(dev) == 0x7615 || mt76_chip(dev) == 0x7611; } static inline bool is_mt7663(struct mt76_dev *dev) @@ -422,6 +422,11 @@ static inline bool is_mt7663(struct mt76_dev *dev) return mt76_chip(dev) == 0x7663; } +static inline bool is_mt7611(struct mt76_dev *dev) +{ + return mt76_chip(dev) == 0x7611; +} + static inline void mt7615_irq_enable(struct mt7615_dev *dev, u32 mask) { mt76_set_irq_mask(&dev->mt76, 0, 0, mask); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/pci.c b/drivers/net/wireless/mediatek/mt76/mt7615/pci.c index 88ff14564521..b09d08d0dac9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/pci.c @@ -14,6 +14,7 @@ static const struct pci_device_id mt7615_pci_device_table[] = { { PCI_DEVICE(0x14c3, 0x7615) }, { PCI_DEVICE(0x14c3, 0x7663) }, + { PCI_DEVICE(0x14c3, 0x7611) }, { }, }; -- cgit v1.2.3-59-g8ed1b From f473b42ac516befcb3ba6b0a5ef16f865f7579c9 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 24 May 2020 14:44:52 +0200 Subject: mt76: only iterate over initialized rx queues Fixes the following reported crash: [ 2.361127] BUG: spinlock bad magic on CPU#0, modprobe/456 [ 2.361583] lock: 0xffffa1287525b3b8, .magic: 00000000, .owner: /-1, .owner_cpu: 0 [ 2.362250] CPU: 0 PID: 456 Comm: modprobe Not tainted 4.14.177 #5 [ 2.362751] Hardware name: HP Meep/Meep, BIOS Google_Meep.11297.75.0 06/17/2019 [ 2.363343] Call Trace: [ 2.363552] dump_stack+0x97/0xdb [ 2.363826] ? spin_bug+0xa6/0xb3 [ 2.364096] do_raw_spin_lock+0x6a/0x9a [ 2.364417] mt76_dma_rx_fill+0x44/0x1de [mt76] [ 2.364787] ? mt76_dma_kick_queue+0x18/0x18 [mt76] [ 2.365184] mt76_dma_init+0x53/0x85 [mt76] [ 2.365532] mt7615_dma_init+0x3d7/0x546 [mt7615e] [ 2.365928] mt7615_register_device+0xe6/0x1a0 [mt7615e] [ 2.366364] mt7615_mmio_probe+0x14b/0x171 [mt7615e] [ 2.366771] mt7615_pci_probe+0x118/0x13b [mt7615e] [ 2.367169] pci_device_probe+0xaf/0x13d [ 2.367491] driver_probe_device+0x284/0x2ca [ 2.367840] __driver_attach+0x7a/0x9e [ 2.368146] ? driver_attach+0x1f/0x1f [ 2.368451] bus_for_each_dev+0xa0/0xdb [ 2.368765] bus_add_driver+0x132/0x204 [ 2.369078] driver_register+0x8e/0xcd [ 2.369384] do_one_initcall+0x160/0x257 [ 2.369706] ? 0xffffffffc0240000 [ 2.369980] do_init_module+0x60/0x1bb [ 2.370286] load_module+0x18c2/0x1a2b [ 2.370596] ? kernel_read_file+0x141/0x1b9 [ 2.370937] ? kernel_read_file_from_fd+0x46/0x71 [ 2.371320] SyS_finit_module+0xcc/0xf0 [ 2.371636] do_syscall_64+0x6b/0xf7 [ 2.371930] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 2.372344] RIP: 0033:0x7da218ae4199 [ 2.372637] RSP: 002b:00007fffd0608398 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 2.373252] RAX: ffffffffffffffda RBX: 00005a705449df90 RCX: 00007da218ae4199 [ 2.373833] RDX: 0000000000000000 RSI: 00005a7052e73bd8 RDI: 0000000000000006 [ 2.374411] RBP: 00007fffd06083e0 R08: 0000000000000000 R09: 00005a705449d540 [ 2.374989] R10: 0000000000000006 R11: 0000000000000246 R12: 0000000000000000 [ 2.375569] R13: 00005a705449def0 R14: 00005a7052e73bd8 R15: 0000000000000000 Reported-by: Sean Wang Fixes: d3377b78cec6 ("mt76: add HE phy modes and hardware queue") Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/debugfs.c | 2 +- drivers/net/wireless/mediatek/mt76/dma.c | 4 ++-- drivers/net/wireless/mediatek/mt76/mt76.h | 4 ++++ drivers/net/wireless/mediatek/mt76/mt7603/mac.c | 3 ++- drivers/net/wireless/mediatek/mt76/mt7615/mac.c | 3 ++- drivers/net/wireless/mediatek/mt76/mt7615/pci.c | 8 +++++--- drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c | 3 ++- drivers/net/wireless/mediatek/mt76/mt7915/mac.c | 3 ++- 8 files changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/debugfs.c b/drivers/net/wireless/mediatek/mt76/debugfs.c index 0278e1b44576..3a5de1d1b121 100644 --- a/drivers/net/wireless/mediatek/mt76/debugfs.c +++ b/drivers/net/wireless/mediatek/mt76/debugfs.c @@ -51,7 +51,7 @@ static int mt76_rx_queues_read(struct seq_file *s, void *data) struct mt76_dev *dev = dev_get_drvdata(s->private); int i, queued; - for (i = 0; i < ARRAY_SIZE(dev->q_rx); i++) { + mt76_for_each_q_rx(dev, i) { struct mt76_queue *q = &dev->q_rx[i]; if (!q->ndesc) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 75e659774e07..f4d6074fe32a 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -576,7 +576,7 @@ mt76_dma_init(struct mt76_dev *dev) init_dummy_netdev(&dev->napi_dev); - for (i = 0; i < ARRAY_SIZE(dev->q_rx); i++) { + mt76_for_each_q_rx(dev, i) { netif_napi_add(&dev->napi_dev, &dev->napi[i], mt76_dma_rx_poll, 64); mt76_dma_rx_fill(dev, &dev->q_rx[i]); @@ -610,7 +610,7 @@ void mt76_dma_cleanup(struct mt76_dev *dev) for (i = 0; i < ARRAY_SIZE(dev->q_tx); i++) mt76_dma_tx_cleanup(dev, i, true); - for (i = 0; i < ARRAY_SIZE(dev->q_rx); i++) { + mt76_for_each_q_rx(dev, i) { netif_napi_del(&dev->napi[i]); mt76_dma_rx_cleanup(dev, &dev->q_rx[i]); } diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index afb1ccf61b74..dfe625a53c63 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -671,6 +671,10 @@ static inline u16 mt76_rev(struct mt76_dev *dev) #define mt76_queue_tx_cleanup(dev, ...) (dev)->mt76.queue_ops->tx_cleanup(&((dev)->mt76), __VA_ARGS__) #define mt76_queue_kick(dev, ...) (dev)->mt76.queue_ops->kick(&((dev)->mt76), __VA_ARGS__) +#define mt76_for_each_q_rx(dev, i) \ + for (i = 0; i < ARRAY_SIZE((dev)->q_rx) && \ + (dev)->q_rx[i].ndesc; i++) + struct mt76_dev *mt76_alloc_device(struct device *pdev, unsigned int size, const struct ieee80211_ops *ops, const struct mt76_driver_ops *drv_ops); diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c index 0f205ffe4905..8060c1514396 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c @@ -1438,8 +1438,9 @@ static void mt7603_mac_watchdog_reset(struct mt7603_dev *dev) for (i = 0; i < __MT_TXQ_MAX; i++) mt76_queue_tx_cleanup(dev, i, true); - for (i = 0; i < ARRAY_SIZE(dev->mt76.q_rx); i++) + mt76_for_each_q_rx(&dev->mt76, i) { mt76_queue_rx_reset(dev, i); + } mt7603_dma_sched_reset(dev); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c index f1009c92ec1b..9f1c6ca7a665 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c @@ -1820,8 +1820,9 @@ void mt7615_dma_reset(struct mt7615_dev *dev) for (i = 0; i < __MT_TXQ_MAX; i++) mt76_queue_tx_cleanup(dev, i, true); - for (i = 0; i < ARRAY_SIZE(dev->mt76.q_rx); i++) + mt76_for_each_q_rx(&dev->mt76, i) { mt76_queue_rx_reset(dev, i); + } mt76_set(dev, MT_WPDMA_GLO_CFG, MT_WPDMA_GLO_CFG_RX_DMA_EN | MT_WPDMA_GLO_CFG_TX_DMA_EN | diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/pci.c b/drivers/net/wireless/mediatek/mt76/mt7615/pci.c index b09d08d0dac9..ba12f199bce0 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/pci.c @@ -86,8 +86,9 @@ static int mt7615_pci_suspend(struct pci_dev *pdev, pm_message_t state) napi_disable(&mdev->tx_napi); tasklet_kill(&mdev->tx_tasklet); - for (i = 0; i < ARRAY_SIZE(mdev->q_rx); i++) + mt76_for_each_q_rx(mdev, i) { napi_disable(&mdev->napi[i]); + } tasklet_kill(&dev->irq_tasklet); mt7615_dma_reset(dev); @@ -120,8 +121,9 @@ static int mt7615_pci_suspend(struct pci_dev *pdev, pm_message_t state) return 0; restore: - for (i = 0; i < ARRAY_SIZE(mdev->q_rx); i++) + mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); + } napi_enable(&mdev->tx_napi); if (hif_suspend) mt7615_mcu_set_hif_suspend(dev, false); @@ -156,7 +158,7 @@ static int mt7615_pci_resume(struct pci_dev *pdev) if (pdma_reset) dev_err(mdev->dev, "PDMA engine must be reinitialized\n"); - for (i = 0; i < ARRAY_SIZE(mdev->q_rx); i++) { + mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); napi_schedule(&mdev->napi[i]); } diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c index 18adedfbbb8e..cbbe986655fe 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c @@ -489,8 +489,9 @@ static void mt76x02_watchdog_reset(struct mt76x02_dev *dev) for (i = 0; i < __MT_TXQ_MAX; i++) mt76_queue_tx_cleanup(dev, i, true); - for (i = 0; i < ARRAY_SIZE(dev->mt76.q_rx); i++) + mt76_for_each_q_rx(&dev->mt76, i) { mt76_queue_rx_reset(dev, i); + } mt76x02_mac_start(dev); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index ab20dfde94af..a264e304a3df 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -1146,8 +1146,9 @@ mt7915_dma_reset(struct mt7915_dev *dev) for (i = 0; i < __MT_TXQ_MAX; i++) mt76_queue_tx_cleanup(dev, i, true); - for (i = 0; i < ARRAY_SIZE(dev->mt76.q_rx); i++) + mt76_for_each_q_rx(&dev->mt76, i) { mt76_queue_rx_reset(dev, i); + } /* re-init prefetch settings after reset */ mt7915_dma_prefetch(dev); -- cgit v1.2.3-59-g8ed1b From 194a1508e082582159e312f818e11ab0f8e96e50 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 May 2020 07:48:56 +0000 Subject: mt76: mt7615: Use kmemdup in mt7615_queue_key_update() Use kmemdup rather than duplicating its implementation Signed-off-by: YueHaibing Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7615/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c index 320dfda6b4e5..c26f99b368d9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c @@ -290,12 +290,11 @@ mt7615_queue_key_update(struct mt7615_dev *dev, enum set_key_cmd cmd, wd->type = MT7615_WTBL_KEY_DESC; wd->sta = msta; - wd->key.key = kzalloc(key->keylen, GFP_KERNEL); + wd->key.key = kmemdup(key->key, key->keylen, GFP_KERNEL); if (!wd->key.key) { kfree(wd); return -ENOMEM; } - memcpy(wd->key.key, key->key, key->keylen); wd->key.cipher = key->cipher; wd->key.keyidx = key->keyidx; wd->key.keylen = key->keylen; -- cgit v1.2.3-59-g8ed1b From d9045b18cd445e0d0a53903ffd5d79793d9df59e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 May 2020 07:48:29 +0000 Subject: mt76: mt7915: remove set but not used variable 'msta' Cc: linux-wireless@vger.kernel.org, linux-arm-kernel@lists.infradead.org, linux-mediatek@lists.infradead.org, netdev@vger.kernel.org, kernel-janitors@vger.kernel.org Fixes gcc '-Wunused-but-set-variable' warning: drivers/net/wireless/mediatek/mt76/mt7915/mcu.c: In function 'mt7915_mcu_sta_txbf_type': drivers/net/wireless/mediatek/mt76/mt7915/mcu.c:1805:21: warning: variable 'msta' set but not used [-Wunused-but-set-variable] It is never used, so can be removed. Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 8460cd453213..c8c12c740c1a 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -1801,15 +1801,12 @@ static u8 mt7915_mcu_sta_txbf_type(struct mt7915_phy *phy, struct ieee80211_vif *vif, struct ieee80211_sta *sta) { - struct mt7915_sta *msta; u8 type = 0; if (vif->type != NL80211_IFTYPE_STATION && vif->type != NL80211_IFTYPE_AP) return 0; - msta = (struct mt7915_sta *)sta->drv_priv; - if (sta->he_cap.has_he) { struct ieee80211_he_cap_elem *pe; const struct ieee80211_he_cap_elem *ve; -- cgit v1.2.3-59-g8ed1b From 7c741868ceab825bb99cf6c72859e9364d54a07c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 May 2020 18:03:44 -0600 Subject: selftests: Add torture tests to nexthop tests Add Nik's torture tests as a new set to stress the replace and cleanup paths. Torture test created by Nikolay Aleksandrov and then I adapted to selftest and added IPv6 version. Signed-off-by: David Ahern Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 115 +++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 1e2f61262e4e..dee567f7576a 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal ipv4_torture" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -767,6 +767,62 @@ ipv6_large_grp() $IP nexthop flush >/dev/null 2>&1 } +ipv6_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 2001:db8:91::2 dev veth1 + done >/dev/null 2>&1 +} + +ipv6_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv6_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv6 runtime torture" + echo "--------------------" + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 2001:db8:101::1 nhid 102" + run_cmd "$IP route add 2001:db8:101::2 nhid 102" + + ipv6_del_add_loop1 & + pid1=$! + ipv6_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 2001:db8:101::1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv6 torture test" +} + + ipv4_fcnal() { local rc @@ -1313,6 +1369,61 @@ ipv4_compat_mode() sysctl_nexthop_compat_mode_set 1 "IPv4" } +ipv4_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 172.16.1.2 dev veth1 + done >/dev/null 2>&1 +} + +ipv4_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv4_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv4 runtime torture" + echo "--------------------" + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 101 via 172.16.2.2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 172.16.101.1 nhid 102" + run_cmd "$IP route add 172.16.101.2 nhid 102" + + ipv4_del_add_loop1 & + pid1=$! + ipv4_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 172.16.101.1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 172.16.101.2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 -B 172.16.101.2 -A 172.16.1.1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv4 torture test" +} + basic() { echo -- cgit v1.2.3-59-g8ed1b From 4d7525085a9ba86b9d78561d379b2ff8c0b30468 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 28 May 2020 03:27:58 +0300 Subject: net: dsa: sja1105: offload the Credit-Based Shaper qdisc SJA1105, being AVB/TSN switches, provide hardware assist for the Credit-Based Shaper as described in the IEEE 8021Q-2018 document. First generation has 10 shapers, freely assignable to any of the 4 external ports and 8 traffic classes, and second generation has 16 shapers. The Credit-Based Shaper tables are accessed through the dynamic reconfiguration interface, so we have to restore them manually after a switch reset. The tables are backed up by the static config only on P/Q/R/S, and we don't want to add custom code only for that family, since the procedure that is in place now works for both. Tested with the following commands: data_rate_kbps=67000 port_transmit_rate_kbps=1000000 idleslope=$data_rate_kbps sendslope=$(($idleslope - $port_transmit_rate_kbps)) locredit=$((-0x80000000)) hicredit=$((0x7fffffff)) tc qdisc add dev swp2 root handle 1: mqprio hw 0 num_tc 8 \ map 0 1 2 3 4 5 6 7 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 tc qdisc replace dev swp2 parent 1:1 cbs \ idleslope $idleslope \ sendslope $sendslope \ hicredit $hicredit \ locredit $locredit \ offload 1 Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105.h | 2 + drivers/net/dsa/sja1105/sja1105_dynamic_config.c | 76 +++++++++++++++++ drivers/net/dsa/sja1105/sja1105_main.c | 100 +++++++++++++++++++++++ drivers/net/dsa/sja1105/sja1105_spi.c | 6 ++ drivers/net/dsa/sja1105/sja1105_static_config.h | 15 ++++ 5 files changed, 199 insertions(+) diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 198d2a7d7f95..cb3c81a49fbc 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -84,6 +84,7 @@ struct sja1105_info { * the egress timestamps. */ int ptpegr_ts_bytes; + int num_cbs_shapers; const struct sja1105_dynamic_table_ops *dyn_ops; const struct sja1105_table_ops *static_ops; const struct sja1105_regs *regs; @@ -218,6 +219,7 @@ struct sja1105_private { struct mutex mgmt_lock; bool expect_dsa_8021q; enum sja1105_vlan_state vlan_state; + struct sja1105_cbs_entry *cbs; struct sja1105_tagger_data tagger_data; struct sja1105_ptp_data ptp_data; struct sja1105_tas_data tas_data; diff --git a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c index 2a8fbd7fdedc..7516f2ffdd4e 100644 --- a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c +++ b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c @@ -136,6 +136,12 @@ #define SJA1105_SIZE_RETAGGING_DYN_CMD \ (SJA1105_SIZE_DYN_CMD + SJA1105_SIZE_RETAGGING_ENTRY) +#define SJA1105ET_SIZE_CBS_DYN_CMD \ + (SJA1105_SIZE_DYN_CMD + SJA1105ET_SIZE_CBS_ENTRY) + +#define SJA1105PQRS_SIZE_CBS_DYN_CMD \ + (SJA1105_SIZE_DYN_CMD + SJA1105PQRS_SIZE_CBS_ENTRY) + #define SJA1105_MAX_DYN_CMD_SIZE \ SJA1105PQRS_SIZE_MAC_CONFIG_DYN_CMD @@ -542,6 +548,60 @@ sja1105_retagging_cmd_packing(void *buf, struct sja1105_dyn_cmd *cmd, sja1105_packing(p, &cmd->index, 5, 0, size, op); } +static void sja1105et_cbs_cmd_packing(void *buf, struct sja1105_dyn_cmd *cmd, + enum packing_op op) +{ + u8 *p = buf + SJA1105ET_SIZE_CBS_ENTRY; + const int size = SJA1105_SIZE_DYN_CMD; + + sja1105_packing(p, &cmd->valid, 31, 31, size, op); + sja1105_packing(p, &cmd->index, 19, 16, size, op); +} + +static size_t sja1105et_cbs_entry_packing(void *buf, void *entry_ptr, + enum packing_op op) +{ + const size_t size = SJA1105ET_SIZE_CBS_ENTRY; + struct sja1105_cbs_entry *entry = entry_ptr; + u8 *cmd = buf + size; + u32 *p = buf; + + sja1105_packing(cmd, &entry->port, 5, 3, SJA1105_SIZE_DYN_CMD, op); + sja1105_packing(cmd, &entry->prio, 2, 0, SJA1105_SIZE_DYN_CMD, op); + sja1105_packing(p + 3, &entry->credit_lo, 31, 0, size, op); + sja1105_packing(p + 2, &entry->credit_hi, 31, 0, size, op); + sja1105_packing(p + 1, &entry->send_slope, 31, 0, size, op); + sja1105_packing(p + 0, &entry->idle_slope, 31, 0, size, op); + return size; +} + +static void sja1105pqrs_cbs_cmd_packing(void *buf, struct sja1105_dyn_cmd *cmd, + enum packing_op op) +{ + u8 *p = buf + SJA1105PQRS_SIZE_CBS_ENTRY; + const int size = SJA1105_SIZE_DYN_CMD; + + sja1105_packing(p, &cmd->valid, 31, 31, size, op); + sja1105_packing(p, &cmd->rdwrset, 30, 30, size, op); + sja1105_packing(p, &cmd->errors, 29, 29, size, op); + sja1105_packing(p, &cmd->index, 3, 0, size, op); +} + +static size_t sja1105pqrs_cbs_entry_packing(void *buf, void *entry_ptr, + enum packing_op op) +{ + const size_t size = SJA1105PQRS_SIZE_CBS_ENTRY; + struct sja1105_cbs_entry *entry = entry_ptr; + + sja1105_packing(buf, &entry->port, 159, 157, size, op); + sja1105_packing(buf, &entry->prio, 156, 154, size, op); + sja1105_packing(buf, &entry->credit_lo, 153, 122, size, op); + sja1105_packing(buf, &entry->credit_hi, 121, 90, size, op); + sja1105_packing(buf, &entry->send_slope, 89, 58, size, op); + sja1105_packing(buf, &entry->idle_slope, 57, 26, size, op); + return size; +} + #define OP_READ BIT(0) #define OP_WRITE BIT(1) #define OP_DEL BIT(2) @@ -631,6 +691,14 @@ struct sja1105_dynamic_table_ops sja1105et_dyn_ops[BLK_IDX_MAX_DYN] = { .packed_size = SJA1105_SIZE_RETAGGING_DYN_CMD, .addr = 0x31, }, + [BLK_IDX_CBS] = { + .entry_packing = sja1105et_cbs_entry_packing, + .cmd_packing = sja1105et_cbs_cmd_packing, + .max_entry_count = SJA1105ET_MAX_CBS_COUNT, + .access = OP_WRITE, + .packed_size = SJA1105ET_SIZE_CBS_DYN_CMD, + .addr = 0x2c, + }, [BLK_IDX_XMII_PARAMS] = {0}, }; @@ -725,6 +793,14 @@ struct sja1105_dynamic_table_ops sja1105pqrs_dyn_ops[BLK_IDX_MAX_DYN] = { .packed_size = SJA1105_SIZE_RETAGGING_DYN_CMD, .addr = 0x38, }, + [BLK_IDX_CBS] = { + .entry_packing = sja1105pqrs_cbs_entry_packing, + .cmd_packing = sja1105pqrs_cbs_cmd_packing, + .max_entry_count = SJA1105PQRS_MAX_CBS_COUNT, + .access = OP_WRITE, + .packed_size = SJA1105PQRS_SIZE_CBS_DYN_CMD, + .addr = 0x32, + }, [BLK_IDX_XMII_PARAMS] = {0}, }; diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 44ce7882dfb1..36ab527449e6 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1640,6 +1640,92 @@ static void sja1105_bridge_leave(struct dsa_switch *ds, int port, sja1105_bridge_member(ds, port, br, false); } +#define BYTES_PER_KBIT (1000LL / 8) + +static int sja1105_find_unused_cbs_shaper(struct sja1105_private *priv) +{ + int i; + + for (i = 0; i < priv->info->num_cbs_shapers; i++) + if (!priv->cbs[i].idle_slope && !priv->cbs[i].send_slope) + return i; + + return -1; +} + +static int sja1105_delete_cbs_shaper(struct sja1105_private *priv, int port, + int prio) +{ + int i; + + for (i = 0; i < priv->info->num_cbs_shapers; i++) { + struct sja1105_cbs_entry *cbs = &priv->cbs[i]; + + if (cbs->port == port && cbs->prio == prio) { + memset(cbs, 0, sizeof(*cbs)); + return sja1105_dynamic_config_write(priv, BLK_IDX_CBS, + i, cbs, true); + } + } + + return 0; +} + +static int sja1105_setup_tc_cbs(struct dsa_switch *ds, int port, + struct tc_cbs_qopt_offload *offload) +{ + struct sja1105_private *priv = ds->priv; + struct sja1105_cbs_entry *cbs; + int index; + + if (!offload->enable) + return sja1105_delete_cbs_shaper(priv, port, offload->queue); + + index = sja1105_find_unused_cbs_shaper(priv); + if (index < 0) + return -ENOSPC; + + cbs = &priv->cbs[index]; + cbs->port = port; + cbs->prio = offload->queue; + /* locredit and sendslope are negative by definition. In hardware, + * positive values must be provided, and the negative sign is implicit. + */ + cbs->credit_hi = offload->hicredit; + cbs->credit_lo = abs(offload->locredit); + /* User space is in kbits/sec, hardware in bytes/sec */ + cbs->idle_slope = offload->idleslope * BYTES_PER_KBIT; + cbs->send_slope = abs(offload->sendslope * BYTES_PER_KBIT); + /* Convert the negative values from 64-bit 2's complement + * to 32-bit 2's complement (for the case of 0x80000000 whose + * negative is still negative). + */ + cbs->credit_lo &= GENMASK_ULL(31, 0); + cbs->send_slope &= GENMASK_ULL(31, 0); + + return sja1105_dynamic_config_write(priv, BLK_IDX_CBS, index, cbs, + true); +} + +static int sja1105_reload_cbs(struct sja1105_private *priv) +{ + int rc = 0, i; + + for (i = 0; i < priv->info->num_cbs_shapers; i++) { + struct sja1105_cbs_entry *cbs = &priv->cbs[i]; + + if (!cbs->idle_slope && !cbs->send_slope) + continue; + + rc = sja1105_dynamic_config_write(priv, BLK_IDX_CBS, i, cbs, + true); + if (rc) + break; + } + + return rc; +} + static const char * const sja1105_reset_reasons[] = { [SJA1105_VLAN_FILTERING] = "VLAN filtering", [SJA1105_RX_HWTSTAMPING] = "RX timestamping", @@ -1754,6 +1840,10 @@ out_unlock_ptp: sja1105_sgmii_pcs_force_speed(priv, speed); } } + + rc = sja1105_reload_cbs(priv); + if (rc < 0) + goto out; out: mutex_unlock(&priv->mgmt_lock); @@ -3131,6 +3221,8 @@ static int sja1105_port_setup_tc(struct dsa_switch *ds, int port, switch (type) { case TC_SETUP_QDISC_TAPRIO: return sja1105_setup_tc_taprio(ds, port, type_data); + case TC_SETUP_QDISC_CBS: + return sja1105_setup_tc_cbs(ds, port, type_data); default: return -EOPNOTSUPP; } @@ -3408,6 +3500,14 @@ static int sja1105_probe(struct spi_device *spi) if (rc) return rc; + if (IS_ENABLED(CONFIG_NET_SCH_CBS)) { + priv->cbs = devm_kcalloc(dev, priv->info->num_cbs_shapers, + sizeof(struct sja1105_cbs_entry), + GFP_KERNEL); + if (!priv->cbs) + return -ENOMEM; + } + /* Connections between dsa_port and sja1105_port */ for (port = 0; port < SJA1105_NUM_PORTS; port++) { struct sja1105_port *sp = &priv->ports[port]; diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index a0dacae803cc..bb52b9c841b2 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -515,6 +515,7 @@ struct sja1105_info sja1105e_info = { .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, + .num_cbs_shapers = SJA1105ET_MAX_CBS_COUNT, .reset_cmd = sja1105et_reset_cmd, .fdb_add_cmd = sja1105et_fdb_add, .fdb_del_cmd = sja1105et_fdb_del, @@ -530,6 +531,7 @@ struct sja1105_info sja1105t_info = { .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, + .num_cbs_shapers = SJA1105ET_MAX_CBS_COUNT, .reset_cmd = sja1105et_reset_cmd, .fdb_add_cmd = sja1105et_fdb_add, .fdb_del_cmd = sja1105et_fdb_del, @@ -545,6 +547,7 @@ struct sja1105_info sja1105p_info = { .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, + .num_cbs_shapers = SJA1105PQRS_MAX_CBS_COUNT, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, .reset_cmd = sja1105pqrs_reset_cmd, .fdb_add_cmd = sja1105pqrs_fdb_add, @@ -561,6 +564,7 @@ struct sja1105_info sja1105q_info = { .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, + .num_cbs_shapers = SJA1105PQRS_MAX_CBS_COUNT, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, .reset_cmd = sja1105pqrs_reset_cmd, .fdb_add_cmd = sja1105pqrs_fdb_add, @@ -577,6 +581,7 @@ struct sja1105_info sja1105r_info = { .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, + .num_cbs_shapers = SJA1105PQRS_MAX_CBS_COUNT, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, .reset_cmd = sja1105pqrs_reset_cmd, .fdb_add_cmd = sja1105pqrs_fdb_add, @@ -594,6 +599,7 @@ struct sja1105_info sja1105s_info = { .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, + .num_cbs_shapers = SJA1105PQRS_MAX_CBS_COUNT, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, .reset_cmd = sja1105pqrs_reset_cmd, .fdb_add_cmd = sja1105pqrs_fdb_add, diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.h b/drivers/net/dsa/sja1105/sja1105_static_config.h index 5946847bb5b9..9b62b9b5549d 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.h +++ b/drivers/net/dsa/sja1105/sja1105_static_config.h @@ -30,11 +30,13 @@ #define SJA1105ET_SIZE_L2_LOOKUP_PARAMS_ENTRY 4 #define SJA1105ET_SIZE_GENERAL_PARAMS_ENTRY 40 #define SJA1105ET_SIZE_AVB_PARAMS_ENTRY 12 +#define SJA1105ET_SIZE_CBS_ENTRY 16 #define SJA1105PQRS_SIZE_L2_LOOKUP_ENTRY 20 #define SJA1105PQRS_SIZE_MAC_CONFIG_ENTRY 32 #define SJA1105PQRS_SIZE_L2_LOOKUP_PARAMS_ENTRY 16 #define SJA1105PQRS_SIZE_GENERAL_PARAMS_ENTRY 44 #define SJA1105PQRS_SIZE_AVB_PARAMS_ENTRY 16 +#define SJA1105PQRS_SIZE_CBS_ENTRY 20 /* UM10944.pdf Page 11, Table 2. Configuration Blocks */ enum { @@ -56,6 +58,7 @@ enum { BLKID_AVB_PARAMS = 0x10, BLKID_GENERAL_PARAMS = 0x11, BLKID_RETAGGING = 0x12, + BLKID_CBS = 0x13, BLKID_XMII_PARAMS = 0x4E, }; @@ -78,6 +81,7 @@ enum sja1105_blk_idx { BLK_IDX_AVB_PARAMS, BLK_IDX_GENERAL_PARAMS, BLK_IDX_RETAGGING, + BLK_IDX_CBS, BLK_IDX_XMII_PARAMS, BLK_IDX_MAX, /* Fake block indices that are only valid for dynamic access */ @@ -105,6 +109,8 @@ enum sja1105_blk_idx { #define SJA1105_MAX_RETAGGING_COUNT 32 #define SJA1105_MAX_XMII_PARAMS_COUNT 1 #define SJA1105_MAX_AVB_PARAMS_COUNT 1 +#define SJA1105ET_MAX_CBS_COUNT 10 +#define SJA1105PQRS_MAX_CBS_COUNT 16 #define SJA1105_MAX_FRAME_MEMORY 929 #define SJA1105_MAX_FRAME_MEMORY_RETAGGING 910 @@ -289,6 +295,15 @@ struct sja1105_retagging_entry { u64 destports; }; +struct sja1105_cbs_entry { + u64 port; + u64 prio; + u64 credit_hi; + u64 credit_lo; + u64 send_slope; + u64 idle_slope; +}; + struct sja1105_xmii_params_entry { u64 phy_mac[5]; u64 xmii_mode[5]; -- cgit v1.2.3-59-g8ed1b From d29245692a44d71d5e2e0770463184a693696232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2020 17:34:58 -0700 Subject: tcp: ipv6: support RFC 6069 (TCP-LD) Make tcp_ld_RTO_revert() helper available to IPv6, and implement RFC 6069 : Quoting this RFC : 3. Connectivity Disruption Indication For Internet Protocol version 6 (IPv6) [RFC2460], the counterpart of the ICMP destination unreachable message of code 0 (net unreachable) and of code 1 (host unreachable) is the ICMPv6 destination unreachable message of code 0 (no route to destination) [RFC4443]. As with IPv4, a router should generate an ICMPv6 destination unreachable message of code 0 in response to a packet that cannot be delivered to its destination address because it lacks a matching entry in its routing table. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index b681338a8320..66e4b8331850 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -437,6 +437,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); +void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4eef5b84fff1..ad6435ba6d72 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -404,7 +404,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) EXPORT_SYMBOL(tcp_req_err); /* TCP-LD (RFC 6069) logic */ -static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +void tcp_ld_RTO_revert(struct sock *sk, u32 seq) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -441,6 +441,7 @@ static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) tcp_retransmit_timer(sk); } } +EXPORT_SYMBOL(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 01a6f5111a77..b7415ca75c2d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -473,6 +473,15 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else sk->sk_err_soft = err; goto out; + case TCP_LISTEN: + break; + default: + /* check if this ICMP message allows revert of backoff. + * (see RFC 6069) + */ + if (!fastopen && type == ICMPV6_DEST_UNREACH && + code == ICMPV6_NOROUTE) + tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && np->recverr) { -- cgit v1.2.3-59-g8ed1b From b58f0e8f38c0a44afa59601a115bd231f23471e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:09 +0200 Subject: net: add sock_set_reuseaddr Add a helper to directly set the SO_REUSEADDR sockopt from kernel space without going through a fake uaccess. For this the iscsi target now has to formally depend on inet to avoid a mostly theoretical compile failure. For actual operation it already did depend on having ipv4 or ipv6 support. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/infiniband/sw/siw/siw_cm.c | 18 +++++------------- drivers/nvme/target/tcp.c | 8 +------- drivers/target/iscsi/Kconfig | 2 +- drivers/target/iscsi/iscsi_target_login.c | 9 +-------- fs/dlm/lowcomms.c | 6 +----- include/net/sock.h | 2 ++ net/core/sock.c | 8 ++++++++ 7 files changed, 19 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 559e5fd3bad8..d1860f3e8740 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1312,17 +1312,14 @@ static void siw_cm_llp_state_change(struct sock *sk) static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr) { - int rv, flags = 0, s_val = 1; + int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. */ - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv < 0) - return rv; + sock_set_reuseaddr(s->sk); rv = s->ops->bind(s, laddr, size); if (rv < 0) @@ -1781,7 +1778,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct siw_cep *cep = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; - int rv = 0, s_val; + int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; @@ -1793,13 +1790,8 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) /* * Allow binding local port when still in TIME_WAIT from last close. */ - s_val = 1; - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv) { - siw_dbg(id->device, "setsockopt error: %d\n", rv); - goto error; - } + sock_set_reuseaddr(s->sk); + if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index f0da04e960f4..40757a63f455 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1632,6 +1632,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) port->sock->sk->sk_user_data = port; port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; + sock_set_reuseaddr(port->sock->sk); opt = 1; ret = kernel_setsockopt(port->sock, IPPROTO_TCP, @@ -1641,13 +1642,6 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret); - goto err_sock; - } - if (so_priority > 0) { ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, (char *)&so_priority, sizeof(so_priority)); diff --git a/drivers/target/iscsi/Kconfig b/drivers/target/iscsi/Kconfig index 1f93ea381353..922484ea4e30 100644 --- a/drivers/target/iscsi/Kconfig +++ b/drivers/target/iscsi/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISCSI_TARGET tristate "Linux-iSCSI.org iSCSI Target Mode Stack" - depends on NET + depends on INET select CRYPTO select CRYPTO_CRC32C select CRYPTO_CRC32C_INTEL if X86 diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 731ee67fe914..91acb3f07b4c 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -909,14 +909,7 @@ int iscsit_setup_np( } } - /* FIXME: Someone please explain why this is endian-safe */ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for SO_REUSEADDR" - " failed\n"); - goto fail; - } + sock_set_reuseaddr(sock->sk); ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, (char *)&opt, sizeof(opt)); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index f13dad0fd9ef..88f2574ca63a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1127,12 +1127,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, sizeof(one)); - result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&one, sizeof(one)); + sock_set_reuseaddr(sock->sk); - if (result < 0) { - log_print("Failed to set SO_REUSEADDR on socket: %d", result); - } write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = con; save_listen_callbacks(sock); diff --git a/include/net/sock.h b/include/net/sock.h index 3e8c6d4b4b59..2ec085044790 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,4 +2688,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +void sock_set_reuseaddr(struct sock *sk); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index fd85e651ce28..18eb84fdf5fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -712,6 +712,14 @@ bool sk_mc_loop(struct sock *sk) } EXPORT_SYMBOL(sk_mc_loop); +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3-59-g8ed1b From c433594c07457d2b2e41a87014bfad9bec279abf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:10 +0200 Subject: net: add sock_no_linger Add a helper to directly set the SO_LINGER sockopt from kernel space with onoff set to true and a linger time of 0 without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 9 +-------- drivers/nvme/target/tcp.c | 6 +----- include/net/sock.h | 1 + net/core/sock.c | 9 +++++++++ net/rds/tcp.h | 1 - net/rds/tcp_connect.c | 2 +- net/rds/tcp_listen.c | 13 +------------ net/sunrpc/svcsock.c | 12 ++---------- 8 files changed, 16 insertions(+), 37 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c15a92163c1f..e72d87482eb7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1313,7 +1313,6 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret, opt, rcv_pdu_size; queue->ctrl = ctrl; @@ -1361,13 +1360,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. */ - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) { - dev_err(nctrl->device, - "failed to set SO_LINGER sock opt %d\n", ret); - goto err_sock; - } + sock_no_linger(queue->sock->sk); if (so_priority > 0) { ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 40757a63f455..e0801494b097 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1429,7 +1429,6 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; struct inet_sock *inet = inet_sk(sock->sk); - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret; ret = kernel_getsockname(sock, @@ -1447,10 +1446,7 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. */ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) - return ret; + sock_no_linger(sock->sk); if (so_priority > 0) { ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, diff --git a/include/net/sock.h b/include/net/sock.h index 2ec085044790..6ed00bf009bb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,6 +2688,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +void sock_no_linger(struct sock *sk); void sock_set_reuseaddr(struct sock *sk); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 18eb84fdf5fb..f0f09524911c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -720,6 +720,15 @@ void sock_set_reuseaddr(struct sock *sk) } EXPORT_SYMBOL(sock_set_reuseaddr); +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + sk->sk_lingertime = 0; + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 3c69361d21c7..d640e210b97b 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -73,7 +73,6 @@ void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); int rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); -void rds_tcp_set_linger(struct socket *sock); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 008f50fb25dd..4e64598176b0 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -207,7 +207,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) if (sock) { if (rds_destroy_pending(cp->cp_conn)) - rds_tcp_set_linger(sock); + sock_no_linger(sock->sk); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 810a3a49e947..bbb31b9c0b39 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -111,17 +111,6 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) return NULL; } -void rds_tcp_set_linger(struct socket *sock) -{ - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; - - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); -} - int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; @@ -241,7 +230,7 @@ rst_nsk: * be pending on it. By setting linger, we achieve the side-effect * of avoiding TIME_WAIT state on new_sock. */ - rds_tcp_set_linger(new_sock); + sock_no_linger(new_sock->sk); kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 023514e392b3..6773dacc64d8 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -323,17 +323,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) { - struct svc_sock *svsk; - struct socket *sock; - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + sock_no_linger(svsk->sk_sock->sk); } /* -- cgit v1.2.3-59-g8ed1b From 6e43496745e75ac49d644df984d2f4ee5b5b6b4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:11 +0200 Subject: net: add sock_set_priority Add a helper to directly set the SO_PRIORITY sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 12 ++---------- drivers/nvme/target/tcp.c | 18 ++++-------------- include/net/sock.h | 1 + net/core/sock.c | 8 ++++++++ 4 files changed, 15 insertions(+), 24 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e72d87482eb7..a307972d33a0 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1362,16 +1362,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, */ sock_no_linger(queue->sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - dev_err(ctrl->ctrl.device, - "failed to set SO_PRIORITY sock opt, ret %d\n", - ret); - goto err_sock; - } - } + if (so_priority > 0) + sock_set_priority(queue->sock->sk, so_priority); /* Set socket type of service */ if (nctrl->opts->tos >= 0) { diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index e0801494b097..f3088156d01d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1448,12 +1448,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) */ sock_no_linger(sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) - return ret; - } + if (so_priority > 0) + sock_set_priority(sock->sk, so_priority); /* Set socket type of service */ if (inet->rcv_tos > 0) { @@ -1638,14 +1634,8 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - if (so_priority > 0) { - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - pr_err("failed to set SO_PRIORITY sock opt %d\n", ret); - goto err_sock; - } - } + if (so_priority > 0) + sock_set_priority(port->sock->sk, so_priority); ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, sizeof(port->addr)); diff --git a/include/net/sock.h b/include/net/sock.h index 6ed00bf009bb..a3a43141a4be 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2689,6 +2689,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); void sock_no_linger(struct sock *sk); +void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index f0f09524911c..ceda1a9248b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -729,6 +729,14 @@ void sock_no_linger(struct sock *sk) } EXPORT_SYMBOL(sock_no_linger); +void sock_set_priority(struct sock *sk, u32 priority) +{ + lock_sock(sk); + sk->sk_priority = priority; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_priority); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3-59-g8ed1b From 76ee0785f42afbc0418072b7179d95f450d3c9a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:12 +0200 Subject: net: add sock_set_sndtimeo Add a helper to directly set the SO_SNDTIMEO_NEW sockopt from kernel space without going through a fake uaccess. The interface is simplified to only pass the seconds value, as that is the only thing needed at the moment. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 8 ++------ include/net/sock.h | 1 + net/core/sock.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 88f2574ca63a..b79711d0aac7 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -918,7 +918,6 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; - struct __kernel_sock_timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -970,13 +969,10 @@ static void sctp_connect_to_sock(struct connection *con) * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. */ - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 5); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, 0); - memset(&tv, 0, sizeof(tv)); - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 0); if (result == -EINPROGRESS) result = 0; diff --git a/include/net/sock.h b/include/net/sock.h index a3a43141a4be..9a7b9e98685a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2691,5 +2691,6 @@ void sock_def_readable(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); +void sock_set_sndtimeo(struct sock *sk, s64 secs); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index ceda1a9248b3..d3b1d61e4f76 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -737,6 +737,17 @@ void sock_set_priority(struct sock *sk, u32 priority) } EXPORT_SYMBOL(sock_set_priority); +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + lock_sock(sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + sk->sk_sndtimeo = secs * HZ; + else + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3-59-g8ed1b From 7594888c782e735f8a7b110094307a4dbe7b3f03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:13 +0200 Subject: net: add sock_bindtoindex Add a helper to directly set the SO_BINDTOIFINDEX sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 21 +++++++++++++++------ net/ipv4/udp_tunnel.c | 4 +--- net/ipv6/ip6_udp_tunnel.c | 4 +--- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 9a7b9e98685a..cdec7bc055d5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,6 +2688,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +int sock_bindtoindex(struct sock *sk, int ifindex); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index d3b1d61e4f76..23f80880fbb2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -566,7 +566,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -594,6 +594,18 @@ out: return ret; } +int sock_bindtoindex(struct sock *sk, int ifindex) +{ + int ret; + + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + static int sock_setbindtodevice(struct sock *sk, char __user *optval, int optlen) { @@ -634,10 +646,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - lock_sock(sk); - ret = sock_setbindtodevice_locked(sk, index); - release_sock(sk); - + return sock_bindtoindex(sk, index); out: #endif @@ -1216,7 +1225,7 @@ set_rcvbuf: break; case SO_BINDTOIFINDEX: - ret = sock_setbindtodevice_locked(sk, val); + ret = sock_bindtoindex_locked(sk, val); break; default: diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 150e6f0fdbf5..2158e8bddf41 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,9 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 58956a6b66a2..6523609516d2 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -33,9 +33,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; } if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } -- cgit v1.2.3-59-g8ed1b From 783da70e83967efeacf3c02c9dcfdc2b17bd62eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:14 +0200 Subject: net: add sock_enable_timestamps Add a helper to directly enable timestamps instead of setting the SO_TIMESTAMP* sockopts from kernel space and going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 47 +++++++++++++++++++++++++++++------------------ net/rxrpc/local_object.c | 8 +------- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index cdec7bc055d5..99ef43508d2b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2689,6 +2689,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex); +void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 23f80880fbb2..e4a4dd2b3d8b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -757,6 +757,28 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs) } EXPORT_SYMBOL(sock_set_sndtimeo); +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + } +} + +void sock_enable_timestamps(struct sock *sk) +{ + lock_sock(sk); + __sock_set_timestamps(sk, true, false, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_enable_timestamps); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -948,28 +970,17 @@ set_rcvbuf: break; case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; case SO_TIMESTAMPNS_NEW: - if (valbool) { - if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) - sock_set_flag(sk, SOCK_TSTAMP_NEW); - else - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - - if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - else - sock_set_flag(sk, SOCK_RCVTSTAMPNS); - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - } + __sock_set_timestamps(sk, valbool, true, true); break; - case SO_TIMESTAMPING_NEW: sock_set_flag(sk, SOCK_TSTAMP_NEW); /* fall through */ diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 01135e54d95d..5ea2bd01fdd5 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -189,13 +189,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* We want receive timestamps. */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS_OLD, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + sock_enable_timestamps(local->socket->sk); break; default: -- cgit v1.2.3-59-g8ed1b From ce3d9544cecacd40389c399d2b7ca31acc533b70 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:15 +0200 Subject: net: add sock_set_keepalive Add a helper to directly set the SO_KEEPALIVE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 6 +----- include/net/sock.h | 1 + net/core/sock.c | 10 ++++++++++ net/rds/tcp_listen.c | 6 +----- net/sunrpc/xprtsock.c | 4 +--- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b79711d0aac7..b6e6dba28154 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1142,11 +1142,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, con->sock = NULL; goto create_out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&one, sizeof(one)); - if (result < 0) { - log_print("Set keepalive failed: %d", result); - } + sock_set_keepalive(sock->sk); result = sock->ops->listen(sock, 5); if (result < 0) { diff --git a/include/net/sock.h b/include/net/sock.h index 99ef43508d2b..dc08c176238f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2691,6 +2691,7 @@ void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); +void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); diff --git a/net/core/sock.c b/net/core/sock.c index e4a4dd2b3d8b..728f5fb156a0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -779,6 +779,16 @@ void sock_enable_timestamps(struct sock *sk) } EXPORT_SYMBOL(sock_enable_timestamps); +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index bbb31b9c0b39..d8bd13276959 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -43,13 +43,9 @@ int rds_tcp_keepalive(struct socket *sock) /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int keepalive = 1; int ret = 0; - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&keepalive, sizeof(keepalive)); - if (ret < 0) - goto bail; + sock_set_keepalive(sock->sk); ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 845d0be805ec..30082cd03996 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2110,7 +2110,6 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); unsigned int keepidle; unsigned int keepcnt; - unsigned int opt_on = 1; unsigned int timeo; spin_lock(&xprt->transport_lock); @@ -2122,8 +2121,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, spin_unlock(&xprt->transport_lock); /* TCP Keepalive options */ - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&opt_on, sizeof(opt_on)); + sock_set_keepalive(sock->sk); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keepidle, sizeof(keepidle)); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, -- cgit v1.2.3-59-g8ed1b From 26cfabf9cdd273650126d84a48a7f8dedbcded48 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:16 +0200 Subject: net: add sock_set_rcvbuf Add a helper to directly set the SO_RCVBUFFORCE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 7 +------ include/net/sock.h | 1 + net/core/sock.c | 59 +++++++++++++++++++++++++++++------------------------- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b6e6dba28154..2822a430a2b4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1180,7 +1180,6 @@ static int sctp_listen_for_all(void) struct socket *sock = NULL; int result = -EINVAL; struct connection *con = nodeid2con(0, GFP_NOFS); - int bufsize = NEEDED_RMEM; int one = 1; if (!con) @@ -1195,11 +1194,7 @@ static int sctp_listen_for_all(void) goto out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, - (char *)&bufsize, sizeof(bufsize)); - if (result) - log_print("Error increasing buffer space on socket %d", result); - + sock_set_rcvbuf(sock->sk, NEEDED_RMEM); result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); if (result < 0) diff --git a/include/net/sock.h b/include/net/sock.h index dc08c176238f..c997289aabbf 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2693,6 +2693,7 @@ void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); +void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_reuseaddr(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); diff --git a/net/core/sock.c b/net/core/sock.c index 728f5fb156a0..3c6ebf952e9a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -789,6 +789,35 @@ void sock_set_keepalive(struct sock *sk) } EXPORT_SYMBOL(sock_set_keepalive); +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. + */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -885,30 +914,7 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_rmem_max); -set_rcvbuf: - /* Ensure val * 2 fits into an int, to prevent max_t() - * from treating it as a negative value. - */ - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. - * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); + __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); break; case SO_RCVBUFFORCE: @@ -920,9 +926,8 @@ set_rcvbuf: /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ - if (val < 0) - val = 0; - goto set_rcvbuf; + __sock_set_rcvbuf(sk, max(val, 0)); + break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) -- cgit v1.2.3-59-g8ed1b From fe31a326a4aadb4a3ba2b21deacc380d06802737 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:17 +0200 Subject: net: add sock_set_reuseport Add a helper to directly set the SO_REUSEPORT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 8 ++++++++ net/sunrpc/xprtsock.c | 17 +---------------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index c997289aabbf..d994daa418ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2695,6 +2695,7 @@ void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_reuseaddr(struct sock *sk); +void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 3c6ebf952e9a..2ca3425b519c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -729,6 +729,14 @@ void sock_set_reuseaddr(struct sock *sk) } EXPORT_SYMBOL(sock_set_reuseaddr); +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + void sock_no_linger(struct sock *sk) { lock_sock(sk); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 30082cd03996..399848c2bcb2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1594,21 +1594,6 @@ static int xs_get_random_port(void) return rand + min; } -/** - * xs_set_reuseaddr_port - set the socket's port and address reuse options - * @sock: socket - * - * Note that this function has to be called on all sockets that share the - * same port, and it must be called before binding. - */ -static void xs_sock_set_reuseport(struct socket *sock) -{ - int opt = 1; - - kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - (char *)&opt, sizeof(opt)); -} - static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; @@ -1801,7 +1786,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, xs_reclassify_socket(family, sock); if (reuseport) - xs_sock_set_reuseport(sock); + sock_set_reuseport(sock->sk); err = xs_bind(transport, sock); if (err) { -- cgit v1.2.3-59-g8ed1b From db10538a4b997a77a1fd561adaaa58afc7dcfa2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:18 +0200 Subject: tcp: add tcp_sock_set_cork Add a helper to directly set the TCP_CORK sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_int.h | 14 ----------- drivers/block/drbd/drbd_receiver.c | 4 +-- drivers/block/drbd/drbd_worker.c | 6 ++--- fs/cifs/transport.c | 8 ++---- include/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 51 ++++++++++++++++++++++++-------------- net/rds/tcp_send.c | 9 ++----- 7 files changed, 43 insertions(+), 51 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index aae99a2d7bd4..3550adc93c68 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,20 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_cork(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_uncork(struct socket *sock) -{ - int val = 0; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - static inline void drbd_tcp_nodelay(struct socket *sock) { int val = 1; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c15e7083b13a..55ea907ad33c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -6162,7 +6162,7 @@ void drbd_send_acks_wf(struct work_struct *ws) rcu_read_unlock(); if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, true); err = drbd_finish_peer_reqs(device); kref_put(&device->kref, drbd_destroy_device); @@ -6175,7 +6175,7 @@ void drbd_send_acks_wf(struct work_struct *ws) } if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, false); return; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0dc019da1f8d..2b89c9f2ca70 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -2098,7 +2098,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * if (uncork) { mutex_lock(&connection->data.mutex); if (connection->data.socket) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); mutex_unlock(&connection->data.mutex); } @@ -2153,9 +2153,9 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * mutex_lock(&connection->data.mutex); if (connection->data.socket) { if (cork) - drbd_tcp_cork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, true); else if (!uncork) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); } mutex_unlock(&connection->data.mutex); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c97570eb2c18..99760063e000 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -325,7 +325,6 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; - int val = 1; __be32 rfc1002_marker; if (cifs_rdma_enabled(server)) { @@ -345,8 +344,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, } /* cork the socket */ - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, true); for (j = 0; j < num_rqst; j++) send_length += smb_rqst_len(server, &rqst[j]); @@ -435,9 +433,7 @@ unmask: } /* uncork it */ - val = 0; - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, false); if ((total_len > 0) && (total_len != send_length)) { cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bf44e85d709d..889eeb2256c2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -497,4 +497,6 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); +void tcp_sock_set_cork(struct sock *sk, bool on); + #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 970064996377..e6cf702e16d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2801,6 +2801,37 @@ static void tcp_enable_tx_delay(void) } } +/* When set indicates to always queue non-full frames. Later the user clears + * this option and we transmit any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly filled frames when the + * user (for example) must write out headers with a write() call first and then + * use sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is stronger than + * TCP_NODELAY. + */ +static void __tcp_sock_set_cork(struct sock *sk, bool on) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (on) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle & TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } +} + +void tcp_sock_set_cork(struct sock *sk, bool on) +{ + lock_sock(sk); + __tcp_sock_set_cork(sk, on); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_cork); + /* * Socket option code for TCP. */ @@ -2979,25 +3010,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_CORK: - /* When set indicates to always queue non-full frames. - * Later the user clears this option and we transmit - * any pending partial frames in the queue. This is - * meant to be used alongside sendfile() to get properly - * filled frames when the user (for example) must write - * out headers with a write() call first and then use - * sendfile to send out the data parts. - * - * TCP_CORK can be set together with TCP_NODELAY and it is - * stronger than TCP_NODELAY. - */ - if (val) { - tp->nonagle |= TCP_NAGLE_CORK; - } else { - tp->nonagle &= ~TCP_NAGLE_CORK; - if (tp->nonagle&TCP_NAGLE_OFF) - tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } + __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 78a2554a4497..8c4d1d6e9249 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -38,23 +38,18 @@ #include "rds.h" #include "tcp.h" -static void rds_tcp_cork(struct socket *sock, int val) -{ - kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val)); -} - void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 1); + tcp_sock_set_cork(tc->t_sock->sk, true); } void rds_tcp_xmit_path_complete(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 0); + tcp_sock_set_cork(tc->t_sock->sk, false); } /* the core send_sem serializes this with other xmit and shutdown */ -- cgit v1.2.3-59-g8ed1b From 12abc5ee7873a085cc280240822b8ac53c86fecd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:19 +0200 Subject: tcp: add tcp_sock_set_nodelay Add a helper to directly set the TCP_NODELAY sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Acked-by: Jason Gunthorpe Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_int.h | 7 ------ drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_receiver.c | 4 ++-- drivers/infiniband/sw/siw/siw_cm.c | 24 ++++--------------- drivers/nvme/host/tcp.c | 9 +------ drivers/nvme/target/tcp.c | 12 ++-------- drivers/target/iscsi/iscsi_target_login.c | 15 +++--------- fs/cifs/connect.c | 10 ++------ fs/dlm/lowcomms.c | 8 ++----- fs/ocfs2/cluster/tcp.c | 20 ++-------------- include/linux/tcp.h | 1 + net/ceph/messenger.c | 11 ++------- net/ipv4/tcp.c | 39 ++++++++++++++++++++----------- net/rds/tcp.c | 11 +-------- net/rds/tcp.h | 1 - net/rds/tcp_listen.c | 2 +- 16 files changed, 49 insertions(+), 127 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 3550adc93c68..e24bba87c8e0 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_nodelay(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char*)&val, sizeof(val)); -} - static inline void drbd_tcp_quickack(struct socket *sock) { int val = 2; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c094c3c2c5d4..45fbd526c453 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -660,7 +660,7 @@ static int __send_command(struct drbd_connection *connection, int vnr, /* DRBD protocol "pings" are latency critical. * This is supposed to trigger tcp_push_pending_frames() */ if (!err && (cmd == P_PING || cmd == P_PING_ACK)) - drbd_tcp_nodelay(sock->socket); + tcp_sock_set_nodelay(sock->socket->sk); return err; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 55ea907ad33c..20a5e94494ac 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1051,8 +1051,8 @@ randomize: /* we don't want delays. * we use TCP_CORK where appropriate, though */ - drbd_tcp_nodelay(sock.socket); - drbd_tcp_nodelay(msock.socket); + tcp_sock_set_nodelay(sock.socket->sk); + tcp_sock_set_nodelay(msock.socket->sk); connection->data.socket = sock.socket; connection->meta.socket = msock.socket; diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index d1860f3e8740..1662216be66d 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -947,16 +947,8 @@ static void siw_accept_newconn(struct siw_cep *cep) siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rv) { - siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); @@ -1386,16 +1378,8 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val, - sizeof(val)); - if (rv) { - siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a307972d33a0..4e4a750ecdb9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1346,14 +1346,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } /* Set TCP no delay */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_nodelay(queue->sock->sk); /* * Cleanup whatever is sitting in the TCP transmit queue on socket diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index f3088156d01d..55bc4c3c0a74 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1580,7 +1580,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port; __kernel_sa_family_t af; - int opt, ret; + int ret; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) @@ -1625,15 +1625,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; sock_set_reuseaddr(port->sock->sk); - - opt = 1; - ret = kernel_setsockopt(port->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } - + tcp_sock_set_nodelay(port->sock->sk); if (so_priority > 0) sock_set_priority(port->sock->sk, so_priority); diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 91acb3f07b4c..b561b07a869a 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -897,20 +897,11 @@ int iscsit_setup_np( /* * Set SO_REUSEADDR, and disable Nagel Algorithm with TCP_NODELAY. */ - /* FIXME: Someone please explain why this is endian-safe */ - opt = 1; - if (np->np_network_transport == ISCSI_TCP) { - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for TCP_NODELAY" - " failed: %d\n", ret); - goto fail; - } - } - + if (np->np_network_transport == ISCSI_TCP) + tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); + opt = 1; ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, (char *)&opt, sizeof(opt)); if (ret < 0) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 28268ed461b8..ad8fb53b3682 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3929,14 +3929,8 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_rcvbuf = 140 * 1024; } - if (server->tcp_nodelay) { - int val = 1; - rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rc) - cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n", - rc); - } + if (server->tcp_nodelay) + tcp_sock_set_nodelay(socket->sk); cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n", socket->sk->sk_sndbuf, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2822a430a2b4..69333728d871 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1011,7 +1011,6 @@ static void tcp_connect_to_sock(struct connection *con) struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock = NULL; - int one = 1; int result; if (con->nodeid == 0) { @@ -1060,8 +1059,7 @@ static void tcp_connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, O_NONBLOCK); @@ -1103,7 +1101,6 @@ static struct socket *tcp_create_listen_sock(struct connection *con, { struct socket *sock = NULL; int result = 0; - int one = 1; int addr_len; if (dlm_local_addr[0]->ss_family == AF_INET) @@ -1120,8 +1117,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, } /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2c512b40a940..4c70fe9d19ab 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_nodelay(struct socket *sock) -{ - int val = 1; - - return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (void *)&val, sizeof(val)); -} - static int o2net_set_usertimeout(struct socket *sock) { int user_timeout = O2NET_TCP_USER_TIMEOUT; @@ -1636,11 +1628,7 @@ static void o2net_start_connect(struct work_struct *work) goto out; } - ret = o2net_set_nodelay(sc->sc_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(sc->sc_sock->sk); ret = o2net_set_usertimeout(sock); if (ret) { @@ -1832,11 +1820,7 @@ static int o2net_accept_one(struct socket *sock, int *more) *more = 1; new_sock->sk->sk_allocation = GFP_ATOMIC; - ret = o2net_set_nodelay(new_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(new_sock->sk); ret = o2net_set_usertimeout(new_sock); if (ret) { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 889eeb2256c2..9e42c7fe50a8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,5 +498,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +void tcp_sock_set_nodelay(struct sock *sk); #endif /* _LINUX_TCP_H */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f8ca5edc5f2c..27d6ab11f9ee 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -490,15 +490,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } - if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) { - int optval = 1; - - ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)&optval, sizeof(optval)); - if (ret) - pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", - ret); - } + if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) + tcp_sock_set_nodelay(sock->sk); con->sock = sock; return 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e6cf702e16d6..a65f293a19fa 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2832,6 +2832,30 @@ void tcp_sock_set_cork(struct sock *sk, bool on) } EXPORT_SYMBOL(tcp_sock_set_cork); +/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is + * remembered, but it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make an explicit push, which overrides + * even TCP_CORK for currently queued segments. + */ +static void __tcp_sock_set_nodelay(struct sock *sk, bool on) +{ + if (on) { + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } else { + tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; + } +} + +void tcp_sock_set_nodelay(struct sock *sk) +{ + lock_sock(sk); + __tcp_sock_set_nodelay(sk, true); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_nodelay); + /* * Socket option code for TCP. */ @@ -2929,20 +2953,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_NODELAY: - if (val) { - /* TCP_NODELAY is weaker than TCP_CORK, so that - * this option on corked socket is remembered, but - * it is not activated until cork is cleared. - * - * However, when TCP_NODELAY is set we make - * an explicit push, which overrides even TCP_CORK - * for currently queued segments. - */ - tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } else { - tp->nonagle &= ~TCP_NAGLE_OFF; - } + __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 46782fac4c16..43db0eca911f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -89,15 +89,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = { { } }; -/* doing it this way avoids calling tcp_sk() */ -void rds_tcp_nonagle(struct socket *sock) -{ - int val = 1; - - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val, - sizeof(val)); -} - u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) { /* seq# of the last byte of data in tcp send buffer */ @@ -502,7 +493,7 @@ void rds_tcp_tune(struct socket *sock) struct net *net = sock_net(sk); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); lock_sock(sk); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index d640e210b97b..f6d75d8cb167 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -50,7 +50,6 @@ struct rds_tcp_statistics { /* tcp.c */ void rds_tcp_tune(struct socket *sock); -void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index d8bd13276959..6f90ea077adc 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -288,7 +288,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) } sock->sk->sk_reuse = SK_CAN_REUSE; - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = sock->sk->sk_data_ready; -- cgit v1.2.3-59-g8ed1b From ddd061b8daed3ce0c01109a69c9a2a9f9669f01a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:20 +0200 Subject: tcp: add tcp_sock_set_quickack Add a helper to directly set the TCP_QUICKACK sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_int.h | 7 ------- drivers/block/drbd/drbd_receiver.c | 5 ++--- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 39 +++++++++++++++++++++++++------------- 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e24bba87c8e0..14345a87c7cc 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_quickack(struct socket *sock) -{ - int val = 2; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, - (char*)&val, sizeof(val)); -} - /* sets the number of 512 byte sectors of our virtual device */ void drbd_set_my_capacity(struct drbd_device *device, sector_t size); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 20a5e94494ac..3a3f2b6a821f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1223,7 +1223,7 @@ static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, str * quickly as possible, and let remote TCP know what we have * received so far. */ if (err == -EAGAIN) { - drbd_tcp_quickack(connection->data.socket); + tcp_sock_set_quickack(connection->data.socket->sk, 2); drbd_unplug_all_devices(connection); } if (err > 0) { @@ -4959,8 +4959,7 @@ static int receive_UnplugRemote(struct drbd_connection *connection, struct packe { /* Make sure we've acked all the TCP data associated * with the data requests being unplugged */ - drbd_tcp_quickack(connection->data.socket); - + tcp_sock_set_quickack(connection->data.socket->sk, 2); return 0; } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9e42c7fe50a8..2eaf8320b9db 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -499,5 +499,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); +void tcp_sock_set_quickack(struct sock *sk, int val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a65f293a19fa..27b5e7a4e2ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2856,6 +2856,31 @@ void tcp_sock_set_nodelay(struct sock *sk) } EXPORT_SYMBOL(tcp_sock_set_nodelay); +static void __tcp_sock_set_quickack(struct sock *sk, int val) +{ + if (!val) { + inet_csk_enter_pingpong_mode(sk); + return; + } + + inet_csk_exit_pingpong_mode(sk); + if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + inet_csk_ack_scheduled(sk)) { + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; + tcp_cleanup_rbuf(sk, 1); + if (!(val & 1)) + inet_csk_enter_pingpong_mode(sk); + } +} + +void tcp_sock_set_quickack(struct sock *sk, int val) +{ + lock_sock(sk); + __tcp_sock_set_quickack(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_quickack); + /* * Socket option code for TCP. */ @@ -3096,19 +3121,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_QUICKACK: - if (!val) { - inet_csk_enter_pingpong_mode(sk); - } else { - inet_csk_exit_pingpong_mode(sk); - if ((1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - inet_csk_ack_scheduled(sk)) { - icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - tcp_cleanup_rbuf(sk, 1); - if (!(val & 1)) - inet_csk_enter_pingpong_mode(sk); - } - } + __tcp_sock_set_quickack(sk, val); break; #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3-59-g8ed1b From 557eadfcc5ee8f8fa98a795e05ed21db58a65db5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:21 +0200 Subject: tcp: add tcp_sock_set_syncnt Add a helper to directly set the TCP_SYNCNT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 9 +-------- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4e4a750ecdb9..2872584f52f6 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1336,14 +1336,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } /* Single syn retry */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_SYNCNT sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_syncnt(queue->sock->sk, 1); /* Set TCP no delay */ tcp_sock_set_nodelay(queue->sock->sk); diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2eaf8320b9db..6aa4ae5ebf3d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -500,5 +500,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); +int tcp_sock_set_syncnt(struct sock *sk, int val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27b5e7a4e2ef..d2c67ae1da07 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2881,6 +2881,18 @@ void tcp_sock_set_quickack(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_quickack); +int tcp_sock_set_syncnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_SYNCNT) + return -EINVAL; + + lock_sock(sk); + inet_csk(sk)->icsk_syn_retries = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_syncnt); + /* * Socket option code for TCP. */ -- cgit v1.2.3-59-g8ed1b From c488aeadcbd002a992593e6090d54e8ac27c4310 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:22 +0200 Subject: tcp: add tcp_sock_set_user_timeout Add a helper to directly set the TCP_USER_TIMEOUT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/ocfs2/cluster/tcp.c | 22 ++-------------------- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 8 ++++++++ net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 12 insertions(+), 22 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 4c70fe9d19ab..79a231719460 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_usertimeout(struct socket *sock) -{ - int user_timeout = O2NET_TCP_USER_TIMEOUT; - - return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (void *)&user_timeout, sizeof(user_timeout)); -} - static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1629,12 +1621,7 @@ static void o2net_start_connect(struct work_struct *work) } tcp_sock_set_nodelay(sc->sc_sock->sk); - - ret = o2net_set_usertimeout(sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_user_timeout(sock->sk, O2NET_TCP_USER_TIMEOUT); o2net_register_callbacks(sc->sc_sock->sk, sc); @@ -1821,12 +1808,7 @@ static int o2net_accept_one(struct socket *sock, int *more) new_sock->sk->sk_allocation = GFP_ATOMIC; tcp_sock_set_nodelay(new_sock->sk); - - ret = o2net_set_usertimeout(new_sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_user_timeout(new_sock->sk, O2NET_TCP_USER_TIMEOUT); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6aa4ae5ebf3d..de682143efe4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -501,5 +501,6 @@ void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); +void tcp_sock_set_user_timeout(struct sock *sk, u32 val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d2c67ae1da07..0004bd9ae7b0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2893,6 +2893,14 @@ int tcp_sock_set_syncnt(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_syncnt); +void tcp_sock_set_user_timeout(struct sock *sk, u32 val) +{ + lock_sock(sk); + inet_csk(sk)->icsk_user_timeout = val; + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_user_timeout); + /* * Socket option code for TCP. */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 399848c2bcb2..231fd6162f68 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2115,8 +2115,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, (char *)&keepcnt, sizeof(keepcnt)); /* TCP user timeout (see RFC5482) */ - kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&timeo, sizeof(timeo)); + tcp_sock_set_user_timeout(sock->sk, timeo); } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, -- cgit v1.2.3-59-g8ed1b From 71c48eb81c9ecb6fed49dc33e7c9b621fdcb7bf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:23 +0200 Subject: tcp: add tcp_sock_set_keepidle Add a helper to directly set the TCP_KEEP_IDLE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 49 ++++++++++++++++++++++++++++++++++--------------- net/rds/tcp_listen.c | 5 +---- net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 37 insertions(+), 21 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index de682143efe4..5724dd84a85e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,6 +498,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +int tcp_sock_set_keepidle(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0004bd9ae7b0..bdf0ff933351 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2901,6 +2901,39 @@ void tcp_sock_set_user_timeout(struct sock *sk, u32 val) } EXPORT_SYMBOL(tcp_sock_set_user_timeout); +static int __tcp_sock_set_keepidle(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (val < 1 || val > MAX_TCP_KEEPIDLE) + return -EINVAL; + + tp->keepalive_time = val * HZ; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + u32 elapsed = keepalive_time_elapsed(tp); + + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + inet_csk_reset_keepalive_timer(sk, elapsed); + } + + return 0; +} + +int tcp_sock_set_keepidle(struct sock *sk, int val) +{ + int err; + + lock_sock(sk); + err = __tcp_sock_set_keepidle(sk, val); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(tcp_sock_set_keepidle); + /* * Socket option code for TCP. */ @@ -3070,21 +3103,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_KEEPIDLE: - if (val < 1 || val > MAX_TCP_KEEPIDLE) - err = -EINVAL; - else { - tp->keepalive_time = val * HZ; - if (sock_flag(sk, SOCK_KEEPOPEN) && - !((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN))) { - u32 elapsed = keepalive_time_elapsed(tp); - if (tp->keepalive_time > elapsed) - elapsed = tp->keepalive_time - elapsed; - else - elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); - } - } + err = __tcp_sock_set_keepidle(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 6f90ea077adc..79f9adc00811 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -52,10 +52,7 @@ int rds_tcp_keepalive(struct socket *sock) if (ret < 0) goto bail; - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - if (ret < 0) - goto bail; + tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 231fd6162f68..473290f7c5c0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2107,8 +2107,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP Keepalive options */ sock_set_keepalive(sock->sk); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepidle(sock->sk, keepidle); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, (char *)&keepidle, sizeof(keepidle)); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, -- cgit v1.2.3-59-g8ed1b From d41ecaac903c9f4658a71d4e7a708673cfb5abba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:24 +0200 Subject: tcp: add tcp_sock_set_keepintvl Add a helper to directly set the TCP_KEEPINTVL sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/rds/tcp_listen.c | 4 +--- net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 5724dd84a85e..1f9bada00faa 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -499,6 +499,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepidle(struct sock *sk, int val); +int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bdf0ff933351..7eb083e09786 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2934,6 +2934,18 @@ int tcp_sock_set_keepidle(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepidle); +int tcp_sock_set_keepintvl(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPINTVL) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_intvl = val * HZ; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepintvl); + /* * Socket option code for TCP. */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 79f9adc00811..9ad555c48d15 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -53,12 +53,10 @@ int rds_tcp_keepalive(struct socket *sock) goto bail; tcp_sock_set_keepidle(sock->sk, keepidle); - /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. */ - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepintvl(sock->sk, keepidle); bail: return ret; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 473290f7c5c0..5ca64e12af0c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2108,8 +2108,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP Keepalive options */ sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepintvl(sock->sk, keepidle); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); -- cgit v1.2.3-59-g8ed1b From 480aeb9639d6a077c611b303a22f9b1e5937d081 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:25 +0200 Subject: tcp: add tcp_sock_set_keepcnt Add a helper to directly set the TCP_KEEPCNT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/rds/tcp.h | 2 +- net/rds/tcp_listen.c | 17 +++-------------- net/sunrpc/xprtsock.c | 3 +-- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1f9bada00faa..9aac824c523c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,6 +498,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7eb083e09786..15d47d5e7951 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2946,6 +2946,18 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepintvl); +int tcp_sock_set_keepcnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPCNT) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_probes = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepcnt); + /* * Socket option code for TCP. */ diff --git a/net/rds/tcp.h b/net/rds/tcp.h index f6d75d8cb167..bad9cf49d565 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -70,7 +70,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); -int rds_tcp_keepalive(struct socket *sock); +void rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); /* tcp_recv.c */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 9ad555c48d15..101cf14215a0 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -38,27 +38,19 @@ #include "rds.h" #include "tcp.h" -int rds_tcp_keepalive(struct socket *sock) +void rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int ret = 0; sock_set_keepalive(sock->sk); - - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); - if (ret < 0) - goto bail; - + tcp_sock_set_keepcnt(sock->sk, keepcnt); tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. */ tcp_sock_set_keepintvl(sock->sk, keepidle); -bail: - return ret; } /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the @@ -140,10 +132,7 @@ int rds_tcp_accept_one(struct socket *sock) new_sock->ops = sock->ops; __module_get(new_sock->ops->owner); - ret = rds_tcp_keepalive(new_sock); - if (ret < 0) - goto out; - + rds_tcp_keepalive(new_sock); rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5ca64e12af0c..0d3ec055bc12 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2109,8 +2109,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); tcp_sock_set_keepintvl(sock->sk, keepidle); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); + tcp_sock_set_keepcnt(sock->sk, keepcnt); /* TCP user timeout (see RFC5482) */ tcp_sock_set_user_timeout(sock->sk, timeo); -- cgit v1.2.3-59-g8ed1b From 6ebf71bab9fb476fc8132be4c12b88201278f0ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:26 +0200 Subject: ipv4: add ip_sock_set_tos Add a helper to directly set the IP_TOS sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 14 +++----------- drivers/nvme/target/tcp.c | 10 ++-------- include/net/ip.h | 2 ++ net/ipv4/ip_sockglue.c | 30 +++++++++++++++++++++--------- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2872584f52f6..4c972d8abf31 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1313,7 +1313,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - int ret, opt, rcv_pdu_size; + int ret, rcv_pdu_size; queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); @@ -1352,16 +1352,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, sock_set_priority(queue->sock->sk, so_priority); /* Set socket type of service */ - if (nctrl->opts->tos >= 0) { - opt = nctrl->opts->tos; - ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set IP_TOS sock opt %d\n", ret); - goto err_sock; - } - } + if (nctrl->opts->tos >= 0) + ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos); queue->sock->sk->sk_allocation = GFP_ATOMIC; nvme_tcp_set_queue_io_cpu(queue); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 55bc4c3c0a74..4546049a96b3 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1452,14 +1452,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) sock_set_priority(sock->sk, so_priority); /* Set socket type of service */ - if (inet->rcv_tos > 0) { - int tos = inet->rcv_tos; - - ret = kernel_setsockopt(sock, SOL_IP, IP_TOS, - (char *)&tos, sizeof(tos)); - if (ret) - return ret; - } + if (inet->rcv_tos > 0) + ip_sock_set_tos(sock->sk, inet->rcv_tos); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = queue; diff --git a/include/net/ip.h b/include/net/ip.h index 5b317c9f4470..2fc52e26fa88 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -765,4 +765,6 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) return likely(mtu >= IPV4_MIN_MTU); } +void ip_sock_set_tos(struct sock *sk, int val); + #endif /* _IP_H */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f43d5f12aa86..b43a29e11f4a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -560,6 +560,26 @@ out: return err; } +static void __ip_sock_set_tos(struct sock *sk, int val) +{ + if (sk->sk_type == SOCK_STREAM) { + val &= ~INET_ECN_MASK; + val |= inet_sk(sk)->tos & INET_ECN_MASK; + } + if (inet_sk(sk)->tos != val) { + inet_sk(sk)->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } +} + +void ip_sock_set_tos(struct sock *sk, int val) +{ + lock_sock(sk); + __ip_sock_set_tos(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_tos); /* * Socket option code for IP. This is the end of the line after any @@ -823,15 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE; break; case IP_TOS: /* This sets both TOS and Precedence */ - if (sk->sk_type == SOCK_STREAM) { - val &= ~INET_ECN_MASK; - val |= inet->tos & INET_ECN_MASK; - } - if (inet->tos != val) { - inet->tos = val; - sk->sk_priority = rt_tos2priority(val); - sk_dst_reset(sk); - } + __ip_sock_set_tos(sk, val); break; case IP_TTL: if (optlen < 1) -- cgit v1.2.3-59-g8ed1b From c4e446bf5a06a1db24b4f0115a89f0380a495c62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:27 +0200 Subject: ipv4: add ip_sock_set_freebind Add a helper to directly set the IP_FREEBIND sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- drivers/target/iscsi/iscsi_target_login.c | 13 +++---------- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 8 ++++++++ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index b561b07a869a..85748e338858 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -15,6 +15,7 @@ #include #include #include /* TCP_NODELAY */ +#include #include /* ipv6_addr_v4mapped() */ #include #include @@ -855,7 +856,7 @@ int iscsit_setup_np( struct sockaddr_storage *sockaddr) { struct socket *sock = NULL; - int backlog = ISCSIT_TCP_BACKLOG, ret, opt = 0, len; + int backlog = ISCSIT_TCP_BACKLOG, ret, len; switch (np->np_network_transport) { case ISCSI_TCP: @@ -900,15 +901,7 @@ int iscsit_setup_np( if (np->np_network_transport == ISCSI_TCP) tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); - - opt = 1; - ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for IP_FREEBIND" - " failed\n"); - goto fail; - } + ip_sock_set_freebind(sock->sk); ret = kernel_bind(sock, (struct sockaddr *)&np->np_sockaddr, len); if (ret < 0) { diff --git a/include/net/ip.h b/include/net/ip.h index 2fc52e26fa88..5f5d8226b6ab 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -765,6 +765,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) return likely(mtu >= IPV4_MIN_MTU); } +void ip_sock_set_freebind(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b43a29e11f4a..767838d2030d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -581,6 +581,14 @@ void ip_sock_set_tos(struct sock *sk, int val) } EXPORT_SYMBOL(ip_sock_set_tos); +void ip_sock_set_freebind(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->freebind = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_freebind); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. -- cgit v1.2.3-59-g8ed1b From db45c0ef258ef6c7ef3c1b8ea9e06e133e083c27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:28 +0200 Subject: ipv4: add ip_sock_set_recverr Add a helper to directly set the IP_RECVERR sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Reviewed-by: David Howells Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 8 ++++++++ net/rxrpc/local_object.c | 8 +------- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 5f5d8226b6ab..f063a491b906 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -766,6 +766,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) } void ip_sock_set_freebind(struct sock *sk); +void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 767838d2030d..aca6b81da9ba 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -589,6 +589,14 @@ void ip_sock_set_freebind(struct sock *sk) } EXPORT_SYMBOL(ip_sock_set_freebind); +void ip_sock_set_recverr(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->recverr = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_recverr); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 5ea2bd01fdd5..4c0e8fe5ec1f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -171,13 +171,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* Fall through */ case AF_INET: /* we want to receive ICMP errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_recverr(local->socket->sk); /* we want to set the don't fragment bit */ opt = IP_PMTUDISC_DO; -- cgit v1.2.3-59-g8ed1b From 2de569bda2a66d1308ad3f205bb29cf4f95f5636 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:29 +0200 Subject: ipv4: add ip_sock_set_mtu_discover Add a helper to directly set the IP_MTU_DISCOVER sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Reviewed-by: David Howells [rxrpc bits] Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 11 +++++++++++ net/rxrpc/local_object.c | 8 +------- net/rxrpc/output.c | 14 +++++--------- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index f063a491b906..d3649c49dd33 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -766,6 +766,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) } void ip_sock_set_freebind(struct sock *sk); +int ip_sock_set_mtu_discover(struct sock *sk, int val); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aca6b81da9ba..aa115be11dcf 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -597,6 +597,17 @@ void ip_sock_set_recverr(struct sock *sk) } EXPORT_SYMBOL(ip_sock_set_recverr); +int ip_sock_set_mtu_discover(struct sock *sk, int val) +{ + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) + return -EINVAL; + lock_sock(sk); + inet_sk(sk)->pmtudisc = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(ip_sock_set_mtu_discover); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 4c0e8fe5ec1f..6f4e6b4817cf 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -174,13 +174,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) ip_sock_set_recverr(local->socket->sk); /* we want to set the don't fragment bit */ - opt = IP_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); /* We want receive timestamps. */ sock_enable_timestamps(local->socket->sk); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f8b632a5c619..1ba43c3df4ad 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -321,7 +321,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, struct kvec iov[2]; rxrpc_serial_t serial; size_t len; - int ret, opt; + int ret; _enter(",{%d}", skb->len); @@ -473,18 +473,14 @@ send_fragmentable: switch (conn->params.local->srx.transport.family) { case AF_INET6: case AF_INET: - opt = IP_PMTUDISC_DONT; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DONT); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); - opt = IP_PMTUDISC_DO; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DO); break; default: -- cgit v1.2.3-59-g8ed1b From c1f9ec5776dd05eaf62cf6788ecdfc905dc8ec2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:30 +0200 Subject: ipv4: add ip_sock_set_pktinfo Add a helper to directly set the IP_PKTINFO sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 8 ++++++++ net/sunrpc/svcsock.c | 5 ++--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index d3649c49dd33..04ebe7bf54c6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -767,6 +767,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) void ip_sock_set_freebind(struct sock *sk); int ip_sock_set_mtu_discover(struct sock *sk, int val); +void ip_sock_set_pktinfo(struct sock *sk); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aa115be11dcf..84ec3703c909 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -608,6 +608,14 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val) } EXPORT_SYMBOL(ip_sock_set_mtu_discover); +void ip_sock_set_pktinfo(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->cmsg_flags |= IP_CMSG_PKTINFO; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_pktinfo); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 6773dacc64d8..7a805d165689 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -616,9 +616,8 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { case AF_INET: - level = SOL_IP; - optname = IP_PKTINFO; - break; + ip_sock_set_pktinfo(svsk->sk_sock->sk); + return; case AF_INET6: level = SOL_IPV6; optname = IPV6_RECVPKTINFO; -- cgit v1.2.3-59-g8ed1b From 9b115749acb24d11083ded4fe947ddd654a940e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:31 +0200 Subject: ipv6: add ip6_sock_set_v6only Add a helper to directly set the IPV6_V6ONLY sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ipv6.h | 11 +++++++++++ net/ipv6/ip6_udp_tunnel.c | 5 +---- net/sunrpc/svcsock.c | 6 +----- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 39a00d3ef5e2..9b91188c9a74 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1177,4 +1177,15 @@ int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); + +static inline int ip6_sock_set_v6only(struct sock *sk) +{ + if (inet_sk(sk)->inet_num) + return -EINVAL; + lock_sock(sk); + sk->sk_ipv6only = true; + release_sock(sk); + return 0; +} + #endif /* _NET_IPV6_H */ diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 6523609516d2..2e0ad1bc84a8 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -25,10 +25,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->ipv6_v6only) { - int val = 1; - - err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - (char *) &val, sizeof(val)); + err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7a805d165689..a391892977cd 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1328,7 +1328,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; int family; - int val; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("svc: svc_create_socket(%s, %d, %s)\n", @@ -1364,11 +1363,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, * getting requests from IPv4 remotes. Those should * be shunted to a PF_INET listener via rpcbind. */ - val = 1; if (family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - + ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ error = kernel_bind(sock, sin, len); -- cgit v1.2.3-59-g8ed1b From fce934949c0f0003c1777fbf8c0706ba82a8cf7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:32 +0200 Subject: ipv6: add ip6_sock_set_recverr Add a helper to directly set the IPV6_RECVERR sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Reviewed-by: David Howells Signed-off-by: David S. Miller --- include/net/ipv6.h | 7 +++++++ net/rxrpc/local_object.c | 10 ++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9b91188c9a74..49c4abf99148 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1188,4 +1188,11 @@ static inline int ip6_sock_set_v6only(struct sock *sk) return 0; } +static inline void ip6_sock_set_recverr(struct sock *sk) +{ + lock_sock(sk); + inet6_sk(sk)->recverr = true; + release_sock(sk); +} + #endif /* _NET_IPV6_H */ diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 6f4e6b4817cf..c8b2097f499c 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -107,7 +107,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct sock *usk; - int ret, opt; + int ret; _enter("%p{%d,%d}", local, local->srx.transport_type, local->srx.transport.family); @@ -157,13 +157,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) switch (local->srx.transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip6_sock_set_recverr(local->socket->sk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. -- cgit v1.2.3-59-g8ed1b From 18d5ad62327576cbb1e5b9938a59d63ac0c15832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:33 +0200 Subject: ipv6: add ip6_sock_set_addr_preferences Add a helper to directly set the IPV6_ADD_PREFERENCES sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ipv6.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/ipv6_sockglue.c | 59 +----------------------------------------- net/sunrpc/xprtsock.c | 7 ++--- 3 files changed, 72 insertions(+), 61 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 49c4abf99148..9a9075983016 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1195,4 +1195,71 @@ static inline void ip6_sock_set_recverr(struct sock *sk) release_sock(sk); } +static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) +{ + unsigned int pref = 0; + unsigned int prefmask = ~0; + + /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ + switch (val & (IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP | + IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { + case IPV6_PREFER_SRC_PUBLIC: + pref |= IPV6_PREFER_SRC_PUBLIC; + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case IPV6_PREFER_SRC_TMP: + pref |= IPV6_PREFER_SRC_TMP; + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case IPV6_PREFER_SRC_PUBTMP_DEFAULT: + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case 0: + break; + default: + return -EINVAL; + } + + /* check HOME/COA conflicts */ + switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { + case IPV6_PREFER_SRC_HOME: + prefmask &= ~IPV6_PREFER_SRC_COA; + break; + case IPV6_PREFER_SRC_COA: + pref |= IPV6_PREFER_SRC_COA; + break; + case 0: + break; + default: + return -EINVAL; + } + + /* check CGA/NONCGA conflicts */ + switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { + case IPV6_PREFER_SRC_CGA: + case IPV6_PREFER_SRC_NONCGA: + case 0: + break; + default: + return -EINVAL; + } + + inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref; + return 0; +} + +static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) +{ + int ret; + + lock_sock(sk); + ret = __ip6_sock_set_addr_preferences(sk, val); + release_sock(sk); + return ret; +} + #endif /* _NET_IPV6_H */ diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e10258c2210e..adbfed6adf11 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -845,67 +845,10 @@ done: break; case IPV6_ADDR_PREFERENCES: - { - unsigned int pref = 0; - unsigned int prefmask = ~0; - if (optlen < sizeof(int)) goto e_inval; - - retv = -EINVAL; - - /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ - switch (val & (IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP| - IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { - case IPV6_PREFER_SRC_PUBLIC: - pref |= IPV6_PREFER_SRC_PUBLIC; - break; - case IPV6_PREFER_SRC_TMP: - pref |= IPV6_PREFER_SRC_TMP; - break; - case IPV6_PREFER_SRC_PUBTMP_DEFAULT: - break; - case 0: - goto pref_skip_pubtmp; - default: - goto e_inval; - } - - prefmask &= ~(IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP); -pref_skip_pubtmp: - - /* check HOME/COA conflicts */ - switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) { - case IPV6_PREFER_SRC_HOME: - break; - case IPV6_PREFER_SRC_COA: - pref |= IPV6_PREFER_SRC_COA; - case 0: - goto pref_skip_coa; - default: - goto e_inval; - } - - prefmask &= ~IPV6_PREFER_SRC_COA; -pref_skip_coa: - - /* check CGA/NONCGA conflicts */ - switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { - case IPV6_PREFER_SRC_CGA: - case IPV6_PREFER_SRC_NONCGA: - case 0: - break; - default: - goto e_inval; - } - - np->srcprefs = (np->srcprefs & prefmask) | pref; - retv = 0; - + retv = __ip6_sock_set_addr_preferences(sk, val); break; - } case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) goto e_inval; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0d3ec055bc12..3a143e250b9a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2150,7 +2150,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!transport->inet) { struct sock *sk = sock->sk; - unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. @@ -2159,8 +2158,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) * knowledge about the normal duration of connections, * MAY override this as appropriate. */ - kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, - (char *)&addr_pref, sizeof(addr_pref)); + if (xs_addr(xprt)->sa_family == PF_INET6) { + ip6_sock_set_addr_preferences(sk, + IPV6_PREFER_SRC_PUBLIC); + } xs_tcp_set_socket_timeouts(xprt, sock); -- cgit v1.2.3-59-g8ed1b From 7d7207c2d57080af93fc323dc6a85bd79207b4c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:34 +0200 Subject: ipv6: add ip6_sock_set_recvpktinfo Add a helper to directly set the IPV6_RECVPKTINFO sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ipv6.h | 7 +++++++ net/sunrpc/svcsock.c | 10 ++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9a9075983016..5e65bf2fd32d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1262,4 +1262,11 @@ static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) return ret; } +static inline void ip6_sock_set_recvpktinfo(struct sock *sk) +{ + lock_sock(sk); + inet6_sk(sk)->rxopt.bits.rxinfo = true; + release_sock(sk); +} + #endif /* _NET_IPV6_H */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index a391892977cd..e7a0037d9b56 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -595,8 +595,6 @@ static struct svc_xprt_class svc_udp_class = { static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { - int err, level, optname, one = 1; - svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); @@ -617,17 +615,13 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) switch (svsk->sk_sk->sk_family) { case AF_INET: ip_sock_set_pktinfo(svsk->sk_sock->sk); - return; + break; case AF_INET6: - level = SOL_IPV6; - optname = IPV6_RECVPKTINFO; + ip6_sock_set_recvpktinfo(svsk->sk_sock->sk); break; default: BUG(); } - err = kernel_setsockopt(svsk->sk_sock, level, optname, - (char *)&one, sizeof(one)); - dprintk("svc: kernel_setsockopt returned %d\n", err); } /* -- cgit v1.2.3-59-g8ed1b From 298cd88a66a02c899772ffafbf648786ceb5ab95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:35 +0200 Subject: rxrpc: add rxrpc_sock_set_min_security_level Add a helper to directly set the RXRPC_MIN_SECURITY_LEVEL sockopt from kernel space without going through a fake uaccess. Thanks to David Howells for the documentation updates. Signed-off-by: Christoph Hellwig Acked-by: David Howells Signed-off-by: David S. Miller --- Documentation/networking/rxrpc.rst | 13 +++++++++++-- fs/afs/rxrpc.c | 6 ++---- include/net/af_rxrpc.h | 2 ++ net/rxrpc/af_rxrpc.c | 13 +++++++++++++ 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index 5ad35113d0f4..68552b92dc44 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -477,7 +477,7 @@ AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: Encrypted checksum plus packet padded and first eight bytes of packet encrypted - which includes the actual packet length. - (c) RXRPC_SECURITY_ENCRYPTED + (c) RXRPC_SECURITY_ENCRYPT Encrypted checksum plus entire packet padded and encrypted, including actual packet length. @@ -578,7 +578,7 @@ A client would issue an operation by: This issues a request_key() to get the key representing the security context. The minimum security level can be set:: - unsigned int sec = RXRPC_SECURITY_ENCRYPTED; + unsigned int sec = RXRPC_SECURITY_ENCRYPT; setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, &sec, sizeof(sec)); @@ -1090,6 +1090,15 @@ The kernel interface functions are as follows: jiffies). In the event of the timeout occurring, the call will be aborted and -ETIME or -ETIMEDOUT will be returned. + (#) Apply the RXRPC_MIN_SECURITY_LEVEL sockopt to a socket from within in the + kernel:: + + int rxrpc_sock_set_min_security_level(struct sock *sk, + unsigned int val); + + This specifies the minimum security level required for calls on this + socket. + Configurable Parameters ======================= diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 1ecc67da6c1a..e313dae01674 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -37,7 +37,6 @@ int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; - unsigned int min_level; int ret; _enter(""); @@ -57,9 +56,8 @@ int afs_open_socket(struct afs_net *net) srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); - min_level = RXRPC_SECURITY_ENCRYPT; - ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, - (void *)&min_level, sizeof(min_level)); + ret = rxrpc_sock_set_min_security_level(socket->sk, + RXRPC_SECURITY_ENCRYPT); if (ret < 0) goto error_2; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ab988940bf04..91eacbdcf33d 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -72,4 +72,6 @@ bool rxrpc_kernel_call_is_complete(struct rxrpc_call *); void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); + #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 15ee92d79581..394189b81849 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -571,6 +571,19 @@ out: return ret; } +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val) +{ + if (sk->sk_state != RXRPC_UNBOUND) + return -EISCONN; + if (val > RXRPC_SECURITY_MAX) + return -EINVAL; + lock_sock(sk); + rxrpc_sk(sk)->min_sec_level = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); + /* * set RxRPC socket options */ -- cgit v1.2.3-59-g8ed1b From 095ae612530c9465df6d372d688cb30c6abfc5f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:36 +0200 Subject: tipc: call tsk_set_importance from tipc_topsrv_create_listener Avoid using kernel_setsockopt for the TIPC_IMPORTANCE option when we can just use the internal helper. The only change needed is to pass a struct sock instead of tipc_sock, which is private to socket.c Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/tipc/socket.c | 18 +++++++++--------- net/tipc/socket.h | 2 ++ net/tipc/topsrv.c | 6 +++--- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d6b67d07d22e..3734cdbedc9c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -196,17 +196,17 @@ static int tsk_importance(struct tipc_sock *tsk) return msg_importance(&tsk->phdr); } -static int tsk_set_importance(struct tipc_sock *tsk, int imp) +static struct tipc_sock *tipc_sk(const struct sock *sk) { - if (imp > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL; - msg_set_importance(&tsk->phdr, (u32)imp); - return 0; + return container_of(sk, struct tipc_sock, sk); } -static struct tipc_sock *tipc_sk(const struct sock *sk) +int tsk_set_importance(struct sock *sk, int imp) { - return container_of(sk, struct tipc_sock, sk); + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; + msg_set_importance(&tipc_sk(sk)->phdr, (u32)imp); + return 0; } static bool tsk_conn_cong(struct tipc_sock *tsk) @@ -2721,7 +2721,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, /* Connect new socket to it's peer */ tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); - tsk_set_importance(new_tsock, msg_importance(msg)); + tsk_set_importance(new_sk, msg_importance(msg)); if (msg_named(msg)) { new_tsock->conn_type = msg_nametype(msg); new_tsock->conn_instance = msg_nameinst(msg); @@ -3139,7 +3139,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - res = tsk_set_importance(tsk, value); + res = tsk_set_importance(sk, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 235b9679acee..b11575afc66f 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -75,4 +75,6 @@ u32 tipc_sock_get_portid(struct sock *sk); bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb); bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb); +int tsk_set_importance(struct sock *sk, int imp); + #endif diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 446af7bbd13e..1489cfb941d8 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -497,7 +497,6 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk) static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) { - int imp = TIPC_CRITICAL_IMPORTANCE; struct socket *lsock = NULL; struct sockaddr_tipc saddr; struct sock *sk; @@ -514,8 +513,9 @@ static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) sk->sk_user_data = srv; write_unlock_bh(&sk->sk_callback_lock); - rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE, - (char *)&imp, sizeof(imp)); + lock_sock(sk); + rc = tsk_set_importance(sk, TIPC_CRITICAL_IMPORTANCE); + release_sock(sk); if (rc < 0) goto err; -- cgit v1.2.3-59-g8ed1b From b113cabd4378ddd98dccdd7748a16f9f1f094ef0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 27 May 2020 15:41:06 +0200 Subject: sfc: avoid an unused-variable warning 'nic_data' is no longer used outside of the #ifdef block in efx_ef10_set_mac_address: drivers/net/ethernet/sfc/ef10.c:3231:28: error: unused variable 'nic_data' [-Werror,-Wunused-variable] struct efx_ef10_nic_data *nic_data = efx->nic_data; Move the variable into a local scope. Fixes: dfcabb078847 ("sfc: move vport_id to struct efx_nic") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index e634e8110585..964c5e842cec 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -3228,7 +3228,6 @@ reset_nic: static int efx_ef10_set_mac_address(struct efx_nic *efx) { MCDI_DECLARE_BUF(inbuf, MC_CMD_VADAPTOR_SET_MAC_IN_LEN); - struct efx_ef10_nic_data *nic_data = efx->nic_data; bool was_enabled = efx->port_enabled; int rc; @@ -3256,6 +3255,7 @@ static int efx_ef10_set_mac_address(struct efx_nic *efx) #ifdef CONFIG_SFC_SRIOV if (efx->pci_dev->is_virtfn && efx->pci_dev->physfn) { + struct efx_ef10_nic_data *nic_data = efx->nic_data; struct pci_dev *pci_dev_pf = efx->pci_dev->physfn; if (rc == -EPERM) { -- cgit v1.2.3-59-g8ed1b From 9918f2d22fd3ff1e76693512d29e743eba3dc8cb Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Fri, 15 May 2020 17:42:22 -0700 Subject: ice: Poll for reset completion when DDP load fails There are certain cases where the DDP load fails and the FW issues a core reset. For these cases, wait for reset to complete before proceeding with reset of the driver init. Signed-off-by: Anirudh Venkataramanan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index b64c4e796636..6583acf32575 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3086,6 +3086,9 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) case ICE_AQ_RC_EBADMAN: case ICE_AQ_RC_EBADBUF: dev_err(dev, "An error occurred on the device while loading the DDP package. The device will be reset.\n"); + /* poll for reset to complete */ + if (ice_check_reset(hw)) + dev_err(dev, "Error resetting device. Please reload the driver\n"); return; default: break; -- cgit v1.2.3-59-g8ed1b From 072064a43ef38fab8559edfbf12f918f8acdd85b Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 15 May 2020 17:42:23 -0700 Subject: ice: cleanup VSI context initialization Remove an unnecessary copy of vsi->info into ctxt->info in ice_vsi_init. This line is essentially a no-op because ice_set_dflt_vsi_ctx performs a memset to clear the info from the context structure. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 6f3ee8ac11ce..89e8e4f7f56f 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -938,7 +938,6 @@ static int ice_vsi_init(struct ice_vsi *vsi, bool init_vsi) if (!ctxt) return -ENOMEM; - ctxt->info = vsi->info; switch (vsi->type) { case ICE_VSI_CTRL: case ICE_VSI_LB: -- cgit v1.2.3-59-g8ed1b From bc3a024101ca497bea4c69be4054c32a5c349f1d Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 15 May 2020 17:42:24 -0700 Subject: ice: fix potential double free in probe unrolling If ice_init_interrupt_scheme fails, ice_probe will jump to clearing up the interrupts. This can lead to some static analysis tools such as the compiler sanitizers complaining about double free problems. Since ice_init_interrupt_scheme already unrolls internally on failure, there is no need to call ice_clear_interrupt_scheme when it fails. Add a new unroll label and use that instead. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 6583acf32575..5cffaf360cb0 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3418,7 +3418,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) if (err) { dev_err(dev, "ice_init_interrupt_scheme failed: %d\n", err); err = -EIO; - goto err_init_interrupt_unroll; + goto err_init_vsi_unroll; } /* In case of MSIX we are going to setup the misc vector right here @@ -3511,6 +3511,7 @@ err_msix_misc_unroll: ice_free_irq_msix_misc(pf); err_init_interrupt_unroll: ice_clear_interrupt_scheme(pf); +err_init_vsi_unroll: devm_kfree(dev, pf->vsi); err_init_pf_unroll: ice_deinit_pf(pf); -- cgit v1.2.3-59-g8ed1b From c2b313b783e0441dab2441cc1ee216eb4b9447a6 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 15 May 2020 17:42:25 -0700 Subject: ice: fix kernel BUG if register_netdev fails If register_netdev() fails, the driver will attempt to cleanup the q_vectors and inadvertently trigger a kernel BUG due to a NULL pointer dereference. This occurs because cleaning up q_vectors attempts to call netif_napi_del on napi_structs which were never initialized. Resolve this by releasing the netdev in ice_cfg_netdev and setting vsi->netdev to NULL. This ensures that after ice_cfg_netdev fails the state is rewound to match as if ice_cfg_netdev was never called. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 5cffaf360cb0..69854b8644a6 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2428,7 +2428,7 @@ static int ice_cfg_netdev(struct ice_vsi *vsi) err = register_netdev(vsi->netdev); if (err) - goto err_destroy_devlink_port; + goto err_free_netdev; devlink_port_type_eth_set(&pf->devlink_port, vsi->netdev); @@ -2439,9 +2439,11 @@ static int ice_cfg_netdev(struct ice_vsi *vsi) return 0; +err_free_netdev: + free_netdev(vsi->netdev); + vsi->netdev = NULL; err_destroy_devlink_port: ice_devlink_destroy_port(pf); - return err; } -- cgit v1.2.3-59-g8ed1b From d3112cd1abec7a28bbe885c2151875bcff4e9092 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Fri, 15 May 2020 17:42:26 -0700 Subject: ice: Declare functions static ice_get_pfa_module_tlv() and ice_read_sr_word() are not being called outside of their file. Declare them as static. Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_nvm.c | 5 +++-- drivers/net/ethernet/intel/ice/ice_nvm.h | 4 ---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c index 7c2a06892bbb..b049c1c30c88 100644 --- a/drivers/net/ethernet/intel/ice/ice_nvm.c +++ b/drivers/net/ethernet/intel/ice/ice_nvm.c @@ -172,7 +172,8 @@ void ice_release_nvm(struct ice_hw *hw) * * Reads one 16 bit word from the Shadow RAM using the ice_read_sr_word_aq. */ -enum ice_status ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data) +static enum ice_status +ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data) { enum ice_status status; @@ -196,7 +197,7 @@ enum ice_status ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data) * Area (PFA) and returns the TLV pointer and length. The caller can * use these to read the variable length TLV value. */ -enum ice_status +static enum ice_status ice_get_pfa_module_tlv(struct ice_hw *hw, u16 *module_tlv, u16 *module_tlv_len, u16 module_type) { diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.h b/drivers/net/ethernet/intel/ice/ice_nvm.h index 999f273ba6ad..165eda07b93d 100644 --- a/drivers/net/ethernet/intel/ice/ice_nvm.h +++ b/drivers/net/ethernet/intel/ice/ice_nvm.h @@ -11,10 +11,6 @@ enum ice_status ice_read_flat_nvm(struct ice_hw *hw, u32 offset, u32 *length, u8 *data, bool read_shadow_ram); enum ice_status -ice_get_pfa_module_tlv(struct ice_hw *hw, u16 *module_tlv, u16 *module_tlv_len, - u16 module_type); -enum ice_status ice_read_pba_string(struct ice_hw *hw, u8 *pba_num, u32 pba_num_size); enum ice_status ice_init_nvm(struct ice_hw *hw); -enum ice_status ice_read_sr_word(struct ice_hw *hw, u16 offset, u16 *data); #endif /* _ICE_NVM_H_ */ -- cgit v1.2.3-59-g8ed1b From ac3716134a40b04e18a2dda78800797129138005 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:07 -0700 Subject: ice: Refactor ice_ena_vf_mappings to split MSIX and queue mappings Currently ice_ena_vf_mappings() does all of the VF's MSIX and queue mapping in one function. This makes it hard to digest. Fix this by creating a new function for enabling MSIX mappings and one for enabling queue mappings. Also, rename some variables in the functions for clarity. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 91 +++++++++++++++--------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index efd54299a220..621ec0cc6fff 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -651,55 +651,70 @@ ice_alloc_vf_res_exit: } /** - * ice_ena_vf_mappings - * @vf: pointer to the VF structure + * ice_ena_vf_msix_mappings - enable VF MSIX mappings in hardware + * @vf: VF to enable MSIX mappings for * - * Enable VF vectors and queues allocation by writing the details into - * respective registers. + * Some of the registers need to be indexed/configured using hardware global + * device values and other registers need 0-based values, which represent PF + * based values. */ -static void ice_ena_vf_mappings(struct ice_vf *vf) +static void ice_ena_vf_msix_mappings(struct ice_vf *vf) { - int abs_vf_id, abs_first, abs_last; + int device_based_first_msix, device_based_last_msix; + int pf_based_first_msix, pf_based_last_msix, v; struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; - struct device *dev; - int first, last, v; + int device_based_vf_id; struct ice_hw *hw; u32 reg; - dev = ice_pf_to_dev(pf); hw = &pf->hw; - vsi = pf->vsi[vf->lan_vsi_idx]; - first = vf->first_vector_idx; - last = (first + pf->num_msix_per_vf) - 1; - abs_first = first + pf->hw.func_caps.common_cap.msix_vector_first_id; - abs_last = (abs_first + pf->num_msix_per_vf) - 1; - abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; - - /* VF Vector allocation */ - reg = (((abs_first << VPINT_ALLOC_FIRST_S) & VPINT_ALLOC_FIRST_M) | - ((abs_last << VPINT_ALLOC_LAST_S) & VPINT_ALLOC_LAST_M) | - VPINT_ALLOC_VALID_M); + pf_based_first_msix = vf->first_vector_idx; + pf_based_last_msix = (pf_based_first_msix + pf->num_msix_per_vf) - 1; + + device_based_first_msix = pf_based_first_msix + + pf->hw.func_caps.common_cap.msix_vector_first_id; + device_based_last_msix = + (device_based_first_msix + pf->num_msix_per_vf) - 1; + device_based_vf_id = vf->vf_id + hw->func_caps.vf_base_id; + + reg = (((device_based_first_msix << VPINT_ALLOC_FIRST_S) & + VPINT_ALLOC_FIRST_M) | + ((device_based_last_msix << VPINT_ALLOC_LAST_S) & + VPINT_ALLOC_LAST_M) | VPINT_ALLOC_VALID_M); wr32(hw, VPINT_ALLOC(vf->vf_id), reg); - reg = (((abs_first << VPINT_ALLOC_PCI_FIRST_S) + reg = (((device_based_first_msix << VPINT_ALLOC_PCI_FIRST_S) & VPINT_ALLOC_PCI_FIRST_M) | - ((abs_last << VPINT_ALLOC_PCI_LAST_S) & VPINT_ALLOC_PCI_LAST_M) | - VPINT_ALLOC_PCI_VALID_M); + ((device_based_last_msix << VPINT_ALLOC_PCI_LAST_S) & + VPINT_ALLOC_PCI_LAST_M) | VPINT_ALLOC_PCI_VALID_M); wr32(hw, VPINT_ALLOC_PCI(vf->vf_id), reg); + /* map the interrupts to its functions */ - for (v = first; v <= last; v++) { - reg = (((abs_vf_id << GLINT_VECT2FUNC_VF_NUM_S) & + for (v = pf_based_first_msix; v <= pf_based_last_msix; v++) { + reg = (((device_based_vf_id << GLINT_VECT2FUNC_VF_NUM_S) & GLINT_VECT2FUNC_VF_NUM_M) | ((hw->pf_id << GLINT_VECT2FUNC_PF_NUM_S) & GLINT_VECT2FUNC_PF_NUM_M)); wr32(hw, GLINT_VECT2FUNC(v), reg); } - /* Map mailbox interrupt. We put an explicit 0 here to remind us that - * VF admin queue interrupts will go to VF MSI-X vector 0. - */ - wr32(hw, VPINT_MBX_CTL(abs_vf_id), VPINT_MBX_CTL_CAUSE_ENA_M | 0); + /* Map mailbox interrupt to VF MSI-X vector 0 */ + wr32(hw, VPINT_MBX_CTL(device_based_vf_id), VPINT_MBX_CTL_CAUSE_ENA_M); +} + +/** + * ice_ena_vf_q_mappings - enable Rx/Tx queue mappings for a VF + * @vf: VF to enable the mappings for + * @max_txq: max Tx queues allowed on the VF's VSI + * @max_rxq: max Rx queues allowed on the VF's VSI + */ +static void ice_ena_vf_q_mappings(struct ice_vf *vf, u16 max_txq, u16 max_rxq) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_hw *hw = &vf->pf->hw; + u32 reg; + /* set regardless of mapping mode */ wr32(hw, VPLAN_TXQ_MAPENA(vf->vf_id), VPLAN_TXQ_MAPENA_TX_ENA_M); @@ -711,7 +726,7 @@ static void ice_ena_vf_mappings(struct ice_vf *vf) */ reg = (((vsi->txq_map[0] << VPLAN_TX_QBASE_VFFIRSTQ_S) & VPLAN_TX_QBASE_VFFIRSTQ_M) | - (((vsi->alloc_txq - 1) << VPLAN_TX_QBASE_VFNUMQ_S) & + (((max_txq - 1) << VPLAN_TX_QBASE_VFNUMQ_S) & VPLAN_TX_QBASE_VFNUMQ_M)); wr32(hw, VPLAN_TX_QBASE(vf->vf_id), reg); } else { @@ -729,7 +744,7 @@ static void ice_ena_vf_mappings(struct ice_vf *vf) */ reg = (((vsi->rxq_map[0] << VPLAN_RX_QBASE_VFFIRSTQ_S) & VPLAN_RX_QBASE_VFFIRSTQ_M) | - (((vsi->alloc_txq - 1) << VPLAN_RX_QBASE_VFNUMQ_S) & + (((max_rxq - 1) << VPLAN_RX_QBASE_VFNUMQ_S) & VPLAN_RX_QBASE_VFNUMQ_M)); wr32(hw, VPLAN_RX_QBASE(vf->vf_id), reg); } else { @@ -737,6 +752,18 @@ static void ice_ena_vf_mappings(struct ice_vf *vf) } } +/** + * ice_ena_vf_mappings - enable VF MSIX and queue mapping + * @vf: pointer to the VF structure + */ +static void ice_ena_vf_mappings(struct ice_vf *vf) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + + ice_ena_vf_msix_mappings(vf); + ice_ena_vf_q_mappings(vf, vsi->alloc_txq, vsi->alloc_rxq); +} + /** * ice_determine_res * @pf: pointer to the PF structure -- cgit v1.2.3-59-g8ed1b From 02337f1f59148ad2d361feae0a11f6596b04632b Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:08 -0700 Subject: ice: Simplify ice_sriov_configure Add a new function for checking if SR-IOV can be configured based on the PF and/or device's state/capabilities. Also, simplify the flow in ice_sriov_configure(). Signed-off-by: Brett Creeley Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 72 ++++++++++++++++-------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 621ec0cc6fff..b699ca81d8c4 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1460,6 +1460,8 @@ static bool ice_pf_state_is_nominal(struct ice_pf *pf) * ice_pci_sriov_ena - Enable or change number of VFs * @pf: pointer to the PF structure * @num_vfs: number of VFs to allocate + * + * Returns 0 on success and negative on failure */ static int ice_pci_sriov_ena(struct ice_pf *pf, int num_vfs) { @@ -1467,20 +1469,10 @@ static int ice_pci_sriov_ena(struct ice_pf *pf, int num_vfs) struct device *dev = ice_pf_to_dev(pf); int err; - if (!ice_pf_state_is_nominal(pf)) { - dev_err(dev, "Cannot enable SR-IOV, device not ready\n"); - return -EBUSY; - } - - if (!test_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags)) { - dev_err(dev, "This device is not capable of SR-IOV\n"); - return -EOPNOTSUPP; - } - if (pre_existing_vfs && pre_existing_vfs != num_vfs) ice_free_vfs(pf); else if (pre_existing_vfs && pre_existing_vfs == num_vfs) - return num_vfs; + return 0; if (num_vfs > pf->num_vfs_supported) { dev_err(dev, "Can't enable %d VFs, max VFs supported is %d\n", @@ -1496,37 +1488,69 @@ static int ice_pci_sriov_ena(struct ice_pf *pf, int num_vfs) } set_bit(ICE_FLAG_SRIOV_ENA, pf->flags); - return num_vfs; + return 0; +} + +/** + * ice_check_sriov_allowed - check if SR-IOV is allowed based on various checks + * @pf: PF to enabled SR-IOV on + */ +static int ice_check_sriov_allowed(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + + if (!test_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags)) { + dev_err(dev, "This device is not capable of SR-IOV\n"); + return -EOPNOTSUPP; + } + + if (ice_is_safe_mode(pf)) { + dev_err(dev, "SR-IOV cannot be configured - Device is in Safe Mode\n"); + return -EOPNOTSUPP; + } + + if (!ice_pf_state_is_nominal(pf)) { + dev_err(dev, "Cannot enable SR-IOV, device not ready\n"); + return -EBUSY; + } + + return 0; } /** * ice_sriov_configure - Enable or change number of VFs via sysfs * @pdev: pointer to a pci_dev structure - * @num_vfs: number of VFs to allocate + * @num_vfs: number of VFs to allocate or 0 to free VFs * - * This function is called when the user updates the number of VFs in sysfs. + * This function is called when the user updates the number of VFs in sysfs. On + * success return whatever num_vfs was set to by the caller. Return negative on + * failure. */ int ice_sriov_configure(struct pci_dev *pdev, int num_vfs) { struct ice_pf *pf = pci_get_drvdata(pdev); struct device *dev = ice_pf_to_dev(pf); + int err; - if (ice_is_safe_mode(pf)) { - dev_err(dev, "SR-IOV cannot be configured - Device is in Safe Mode\n"); - return -EOPNOTSUPP; - } + err = ice_check_sriov_allowed(pf); + if (err) + return err; - if (num_vfs) - return ice_pci_sriov_ena(pf, num_vfs); + if (!num_vfs) { + if (!pci_vfs_assigned(pdev)) { + ice_free_vfs(pf); + return 0; + } - if (!pci_vfs_assigned(pdev)) { - ice_free_vfs(pf); - } else { dev_err(dev, "can't free VFs because some are assigned to VMs.\n"); return -EBUSY; } - return 0; + err = ice_pci_sriov_ena(pf, num_vfs); + if (err) + return err; + + return num_vfs; } /** -- cgit v1.2.3-59-g8ed1b From cfcee02b6c15e8866d03cae3b80edc4e9ad8cc7d Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:09 -0700 Subject: ice: Add helper function for clearing VPGEN_VFRTRIG Create a helper function for clearing VPGEN_VFRTRIG as this needs to be done on reset to notify the VF that we are done resetting it. Also, it needs to be done on SR-IOV initialization/creation in case it was left in a bad state after SR-IOV tear down. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 31 +++++++++++++++--------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index b699ca81d8c4..039f0b057603 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -961,6 +961,21 @@ static int ice_set_per_vf_res(struct ice_pf *pf) return 0; } +/** + * ice_clear_vf_reset_trigger - enable VF to access hardware + * @vf: VF to enabled hardware access for + */ +static void ice_clear_vf_reset_trigger(struct ice_vf *vf) +{ + struct ice_hw *hw = &vf->pf->hw; + u32 reg; + + reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_id)); + reg &= ~VPGEN_VFRTRIG_VFSWR_M; + wr32(hw, VPGEN_VFRTRIG(vf->vf_id), reg); + ice_flush(hw); +} + /** * ice_cleanup_and_realloc_vf - Clean up VF and reallocate resources after reset * @vf: pointer to the VF structure @@ -974,26 +989,20 @@ static void ice_cleanup_and_realloc_vf(struct ice_vf *vf) { struct ice_pf *pf = vf->pf; struct ice_hw *hw; - u32 reg; hw = &pf->hw; - /* PF software completes the flow by notifying VF that reset flow is - * completed. This is done by enabling hardware by clearing the reset - * bit in the VPGEN_VFRTRIG reg and setting VFR_STATE in the VFGEN_RSTAT - * register to VFR completed (done at the end of this function) - * By doing this we allow HW to access VF memory at any point. If we - * did it any sooner, HW could access memory while it was being freed - * in ice_free_vf_res(), causing an IOMMU fault. + /* Allow HW to access VF memory after calling + * ice_clear_vf_reset_trigger(). If we did it any sooner, HW could + * access memory while it was being freed in ice_free_vf_res(), causing + * an IOMMU fault. * * On the other hand, this needs to be done ASAP, because the VF driver * is waiting for this to happen and may report a timeout. It's * harmless, but it gets logged into Guest OS kernel log, so best avoid * it. */ - reg = rd32(hw, VPGEN_VFRTRIG(vf->vf_id)); - reg &= ~VPGEN_VFRTRIG_VFSWR_M; - wr32(hw, VPGEN_VFRTRIG(vf->vf_id), reg); + ice_clear_vf_reset_trigger(vf); /* reallocate VF resources to finish resetting the VSI state */ if (!ice_alloc_vf_res(vf)) { -- cgit v1.2.3-59-g8ed1b From 916c7fdf5e938052a869f78b35a0ca2214d25b12 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:10 -0700 Subject: ice: Separate VF VSI initialization/creation from reset flow Currently the same flow is used for VF VSI initialization/creation and VF VSI reset. This makes the initialization/creation flow unnecessarily complicated. Fix this by separating the initialization/creation of the VF VSI from the reset flow. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 110 ++++++++++++++++++++++- 1 file changed, 106 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 039f0b057603..72a9da3164d9 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1375,6 +1375,99 @@ static void ice_vc_notify_vf_reset(struct ice_vf *vf) NULL); } +/** + * ice_init_vf_vsi_res - initialize/setup VF VSI resources + * @vf: VF to initialize/setup the VSI for + * + * This function creates a VSI for the VF, adds a VLAN 0 filter, and sets up the + * VF VSI's broadcast filter and is only used during initial VF creation. + */ +static int ice_init_vf_vsi_res(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + u8 broadcast[ETH_ALEN]; + enum ice_status status; + struct ice_vsi *vsi; + struct device *dev; + int err; + + vf->first_vector_idx = ice_calc_vf_first_vector_idx(pf, vf); + + dev = ice_pf_to_dev(pf); + vsi = ice_vf_vsi_setup(pf, pf->hw.port_info, vf->vf_id); + if (!vsi) { + dev_err(dev, "Failed to create VF VSI\n"); + return -ENOMEM; + } + + vf->lan_vsi_idx = vsi->idx; + vf->lan_vsi_num = vsi->vsi_num; + + err = ice_vsi_add_vlan(vsi, 0, ICE_FWD_TO_VSI); + if (err) { + dev_warn(dev, "Failed to add VLAN 0 filter for VF %d\n", + vf->vf_id); + goto release_vsi; + } + + eth_broadcast_addr(broadcast); + status = ice_fltr_add_mac(vsi, broadcast, ICE_FWD_TO_VSI); + if (status) { + dev_err(dev, "Failed to add broadcast MAC filter for VF %d, status %s\n", + vf->vf_id, ice_stat_str(status)); + err = ice_status_to_errno(status); + goto release_vsi; + } + + vf->num_mac = 1; + + return 0; + +release_vsi: + ice_vsi_release(vsi); + return err; +} + +/** + * ice_start_vfs - start VFs so they are ready to be used by SR-IOV + * @pf: PF the VFs are associated with + */ +static int ice_start_vfs(struct ice_pf *pf) +{ + struct ice_hw *hw = &pf->hw; + int retval, i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + ice_clear_vf_reset_trigger(vf); + + retval = ice_init_vf_vsi_res(vf); + if (retval) { + dev_err(ice_pf_to_dev(pf), "Failed to initialize VSI resources for VF %d, error %d\n", + vf->vf_id, retval); + goto teardown; + } + + set_bit(ICE_VF_STATE_INIT, vf->vf_states); + ice_ena_vf_mappings(vf); + wr32(hw, VFGEN_RSTAT(vf->vf_id), VIRTCHNL_VFR_VFACTIVE); + } + + ice_flush(hw); + return 0; + +teardown: + for (i = i - 1; i >= 0; i--) { + struct ice_vf *vf = &pf->vf[i]; + + ice_dis_vf_mappings(vf); + ice_vsi_release(pf->vsi[vf->lan_vsi_idx]); + } + + return retval; +} + /** * ice_alloc_vfs - Allocate and set up VFs resources * @pf: pointer to the PF structure @@ -1407,6 +1500,13 @@ static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs) pf->vf = vfs; pf->num_alloc_vfs = num_alloc_vfs; + if (ice_set_per_vf_res(pf)) { + dev_err(dev, "Not enough resources for %d VFs, try with fewer number of VFs\n", + num_alloc_vfs); + ret = -ENOSPC; + goto err_unroll_sriov; + } + /* apply default profile */ ice_for_each_vf(pf, i) { vfs[i].pf = pf; @@ -1416,15 +1516,17 @@ static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs) /* assign default capabilities */ set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vfs[i].vf_caps); vfs[i].spoofchk = true; + vfs[i].num_vf_qs = pf->num_qps_per_vf; } - /* VF resources get allocated with initialization */ - if (!ice_config_res_vfs(pf)) { - ret = -EIO; + if (ice_start_vfs(pf)) { + dev_err(dev, "Failed to start VF(s)\n"); + ret = -EAGAIN; goto err_unroll_sriov; } - return ret; + clear_bit(__ICE_VF_DIS, pf->state); + return 0; err_unroll_sriov: pf->vf = NULL; -- cgit v1.2.3-59-g8ed1b From a06325a0901a887009d28e9fa06168d05c35c941 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:11 -0700 Subject: ice: Renaming and simplification in VF init path Some function names weren't very clear and some portions of VF creation could be moved into functions for clarity. Fix this by renaming some functions and move pieces of code into clearly name functions. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 83 +++++++++++++++--------- 1 file changed, 54 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 72a9da3164d9..92a442ec7314 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1469,16 +1469,56 @@ teardown: } /** - * ice_alloc_vfs - Allocate and set up VFs resources + * ice_set_dflt_settings - set VF defaults during initialization/creation + * @pf: PF holding reference to all VFs for default configuration + */ +static void ice_set_dflt_settings_vfs(struct ice_pf *pf) +{ + int i; + + ice_for_each_vf(pf, i) { + struct ice_vf *vf = &pf->vf[i]; + + vf->pf = pf; + vf->vf_id = i; + vf->vf_sw_id = pf->first_sw; + /* assign default capabilities */ + set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vf->vf_caps); + vf->spoofchk = true; + vf->num_vf_qs = pf->num_qps_per_vf; + } +} + +/** + * ice_alloc_vfs - allocate num_vfs in the PF structure + * @pf: PF to store the allocated VFs in + * @num_vfs: number of VFs to allocate + */ +static int ice_alloc_vfs(struct ice_pf *pf, int num_vfs) +{ + struct ice_vf *vfs; + + vfs = devm_kcalloc(ice_pf_to_dev(pf), num_vfs, sizeof(*vfs), + GFP_KERNEL); + if (!vfs) + return -ENOMEM; + + pf->vf = vfs; + pf->num_alloc_vfs = num_vfs; + + return 0; +} + +/** + * ice_ena_vfs - enable VFs so they are ready to be used * @pf: pointer to the PF structure - * @num_alloc_vfs: number of VFs to allocate + * @num_vfs: number of VFs to enable */ -static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs) +static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs) { struct device *dev = ice_pf_to_dev(pf); struct ice_hw *hw = &pf->hw; - struct ice_vf *vfs; - int i, ret; + int ret; /* Disable global interrupt 0 so we don't try to handle the VFLR. */ wr32(hw, GLINT_DYN_CTL(pf->oicr_idx), @@ -1486,38 +1526,24 @@ static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs) set_bit(__ICE_OICR_INTR_DIS, pf->state); ice_flush(hw); - ret = pci_enable_sriov(pf->pdev, num_alloc_vfs); + ret = pci_enable_sriov(pf->pdev, num_vfs); if (ret) { pf->num_alloc_vfs = 0; goto err_unroll_intr; } - /* allocate memory */ - vfs = devm_kcalloc(dev, num_alloc_vfs, sizeof(*vfs), GFP_KERNEL); - if (!vfs) { - ret = -ENOMEM; + + ret = ice_alloc_vfs(pf, num_vfs); + if (ret) goto err_pci_disable_sriov; - } - pf->vf = vfs; - pf->num_alloc_vfs = num_alloc_vfs; if (ice_set_per_vf_res(pf)) { dev_err(dev, "Not enough resources for %d VFs, try with fewer number of VFs\n", - num_alloc_vfs); + num_vfs); ret = -ENOSPC; goto err_unroll_sriov; } - /* apply default profile */ - ice_for_each_vf(pf, i) { - vfs[i].pf = pf; - vfs[i].vf_sw_id = pf->first_sw; - vfs[i].vf_id = i; - - /* assign default capabilities */ - set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vfs[i].vf_caps); - vfs[i].spoofchk = true; - vfs[i].num_vf_qs = pf->num_qps_per_vf; - } + ice_set_dflt_settings_vfs(pf); if (ice_start_vfs(pf)) { dev_err(dev, "Failed to start VF(s)\n"); @@ -1529,9 +1555,8 @@ static int ice_alloc_vfs(struct ice_pf *pf, u16 num_alloc_vfs) return 0; err_unroll_sriov: + devm_kfree(dev, pf->vf); pf->vf = NULL; - devm_kfree(dev, vfs); - vfs = NULL; pf->num_alloc_vfs = 0; err_pci_disable_sriov: pci_disable_sriov(pf->pdev); @@ -1591,8 +1616,8 @@ static int ice_pci_sriov_ena(struct ice_pf *pf, int num_vfs) return -EOPNOTSUPP; } - dev_info(dev, "Allocating %d VFs\n", num_vfs); - err = ice_alloc_vfs(pf, num_vfs); + dev_info(dev, "Enabling %d VFs\n", num_vfs); + err = ice_ena_vfs(pf, num_vfs); if (err) { dev_err(dev, "Failed to enable SR-IOV: %d\n", err); return err; -- cgit v1.2.3-59-g8ed1b From eb2af3ee94de7f493745adc34bb9d170ced8c5a9 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:12 -0700 Subject: ice: Add function to set trust mode bit on reset As the title says, use a function to set trust mode bit on reset. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 92a442ec7314..4005a4caf2f0 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -610,6 +610,18 @@ ice_alloc_vsi_res_exit: return status; } +/** + * ice_vf_set_host_trust_cfg - set trust setting based on pre-reset value + * @vf: VF to configure trust setting for + */ +static void ice_vf_set_host_trust_cfg(struct ice_vf *vf) +{ + if (vf->trusted) + set_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + else + clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); +} + /** * ice_alloc_vf_res - Allocate VF resources * @vf: pointer to the VF structure @@ -635,10 +647,7 @@ static int ice_alloc_vf_res(struct ice_vf *vf) if (status) goto ice_alloc_vf_res_exit; - if (vf->trusted) - set_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); - else - clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + ice_vf_set_host_trust_cfg(vf); /* VF is now completely initialized */ set_bit(ICE_VF_STATE_INIT, vf->vf_states); -- cgit v1.2.3-59-g8ed1b From 350e822cd54ff331bb421a59b6c096bb1739d22b Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:13 -0700 Subject: ice: Add functions to rebuild host VLAN/MAC config for a VF When resetting a VF the VLAN and MAC filter configurations need to be replayed. Add helper functions for this purpose. Signed-off-by: Brett Creeley Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 121 +++++++++++++++++------ 1 file changed, 89 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 4005a4caf2f0..3a714c81b5b2 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -540,6 +540,82 @@ static int ice_calc_vf_first_vector_idx(struct ice_pf *pf, struct ice_vf *vf) return pf->sriov_base_vector + vf->vf_id * pf->num_msix_per_vf; } +/** + * ice_vf_rebuild_host_vlan_cfg - add VLAN 0 filter or rebuild the Port VLAN + * @vf: VF to add MAC filters for + * + * Called after a VF VSI has been re-added/rebuilt during reset. The PF driver + * always re-adds either a VLAN 0 or port VLAN based filter after reset. + */ +static int ice_vf_rebuild_host_vlan_cfg(struct ice_vf *vf) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + struct device *dev = ice_pf_to_dev(vf->pf); + u16 vlan_id = 0; + int err; + + if (vf->port_vlan_info) { + err = ice_vsi_manage_pvid(vsi, vf->port_vlan_info, true); + if (err) { + dev_err(dev, "failed to configure port VLAN via VSI parameters for VF %u, error %d\n", + vf->vf_id, err); + return err; + } + + vlan_id = vf->port_vlan_info & VLAN_VID_MASK; + } + + /* vlan_id will either be 0 or the port VLAN number */ + err = ice_vsi_add_vlan(vsi, vlan_id, ICE_FWD_TO_VSI); + if (err) { + dev_err(dev, "failed to add %s VLAN %u filter for VF %u, error %d\n", + vf->port_vlan_info ? "port" : "", vlan_id, vf->vf_id, + err); + return err; + } + + return 0; +} + +/** + * ice_vf_rebuild_host_mac_cfg - add broadcast and the VF's perm_addr/LAA + * @vf: VF to add MAC filters for + * + * Called after a VF VSI has been re-added/rebuilt during reset. The PF driver + * always re-adds a broadcast filter and the VF's perm_addr/LAA after reset. + */ +static int ice_vf_rebuild_host_mac_cfg(struct ice_vf *vf) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + struct device *dev = ice_pf_to_dev(vf->pf); + enum ice_status status; + u8 broadcast[ETH_ALEN]; + + eth_broadcast_addr(broadcast); + status = ice_fltr_add_mac(vsi, broadcast, ICE_FWD_TO_VSI); + if (status) { + dev_err(dev, "failed to add broadcast MAC filter for VF %u, error %s\n", + vf->vf_id, ice_stat_str(status)); + return ice_status_to_errno(status); + } + + vf->num_mac++; + + if (is_valid_ether_addr(vf->dflt_lan_addr.addr)) { + status = ice_fltr_add_mac(vsi, vf->dflt_lan_addr.addr, + ICE_FWD_TO_VSI); + if (status) { + dev_err(dev, "failed to add default unicast MAC filter %pM for VF %u, error %s\n", + &vf->dflt_lan_addr.addr[0], vf->vf_id, + ice_stat_str(status)); + return ice_status_to_errno(status); + } + vf->num_mac++; + } + + return 0; +} + /** * ice_alloc_vsi_res - Setup VF VSI and its resources * @vf: pointer to the VF structure @@ -549,10 +625,9 @@ static int ice_calc_vf_first_vector_idx(struct ice_pf *pf, struct ice_vf *vf) static int ice_alloc_vsi_res(struct ice_vf *vf) { struct ice_pf *pf = vf->pf; - u8 broadcast[ETH_ALEN]; struct ice_vsi *vsi; struct device *dev; - int status = 0; + int ret; dev = ice_pf_to_dev(pf); /* first vector index is the VFs OICR index */ @@ -567,38 +642,20 @@ static int ice_alloc_vsi_res(struct ice_vf *vf) vf->lan_vsi_idx = vsi->idx; vf->lan_vsi_num = vsi->vsi_num; - /* Check if port VLAN exist before, and restore it accordingly */ - if (vf->port_vlan_info) { - ice_vsi_manage_pvid(vsi, vf->port_vlan_info, true); - if (ice_vsi_add_vlan(vsi, vf->port_vlan_info & VLAN_VID_MASK, - ICE_FWD_TO_VSI)) - dev_warn(ice_pf_to_dev(pf), "Failed to add Port VLAN %d filter for VF %d\n", - vf->port_vlan_info & VLAN_VID_MASK, vf->vf_id); - } else { - /* set VLAN 0 filter by default when no port VLAN is - * enabled. If a port VLAN is enabled we don't want - * untagged broadcast/multicast traffic seen on the VF - * interface. - */ - if (ice_vsi_add_vlan(vsi, 0, ICE_FWD_TO_VSI)) - dev_warn(ice_pf_to_dev(pf), "Failed to add VLAN 0 filter for VF %d, MDD events will trigger. Reset the VF, disable spoofchk, or enable 8021q module on the guest\n", - vf->vf_id); + ret = ice_vf_rebuild_host_vlan_cfg(vf); + if (ret) { + dev_err(dev, "failed to rebuild default MAC configuration for VF %d, error %d\n", + vf->vf_id, ret); + goto ice_alloc_vsi_res_exit; } - if (is_valid_ether_addr(vf->dflt_lan_addr.addr)) { - status = ice_fltr_add_mac(vsi, vf->dflt_lan_addr.addr, - ICE_FWD_TO_VSI); - if (status) - goto ice_alloc_vsi_res_exit; - } - eth_broadcast_addr(broadcast); - status = ice_fltr_add_mac(vsi, broadcast, ICE_FWD_TO_VSI); - if (status) - dev_err(dev, "could not add mac filters error %d\n", - status); - else - vf->num_mac = 1; + ret = ice_vf_rebuild_host_mac_cfg(vf); + if (ret) { + dev_err(dev, "failed to rebuild default MAC configuration for VF %d, error %d\n", + vf->vf_id, ret); + goto ice_alloc_vsi_res_exit; + } /* Clear this bit after VF initialization since we shouldn't reclaim * and reassign interrupts for synchronous or asynchronous VFR events. @@ -607,7 +664,7 @@ static int ice_alloc_vsi_res(struct ice_vf *vf) * more vectors. */ ice_alloc_vsi_res_exit: - return status; + return ret; } /** -- cgit v1.2.3-59-g8ed1b From a58e1d817475eb45471869190ff72dd1b493a936 Mon Sep 17 00:00:00 2001 From: Paul Greenwalt Date: Fri, 15 May 2020 17:51:14 -0700 Subject: ice: remove VM/VF disable command on CORER/GLOBR reset Remove VM/VF disable AQC (opcode 0x0C31) when resetting all VFs. This is not required for CORER/GLOBR reset. Signed-off-by: Paul Greenwalt Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 3a714c81b5b2..245310a52e1b 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1196,17 +1196,6 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) ice_for_each_vf(pf, v) ice_trigger_vf_reset(&pf->vf[v], is_vflr, true); - ice_for_each_vf(pf, v) { - struct ice_vsi *vsi; - - vf = &pf->vf[v]; - vsi = pf->vsi[vf->lan_vsi_idx]; - if (test_bit(ICE_VF_STATE_QS_ENA, vf->vf_states)) - ice_dis_vf_qs(vf); - ice_dis_vsi_txq(vsi->port_info, vsi->idx, 0, 0, NULL, NULL, - NULL, ICE_VF_RESET, vf->vf_id, NULL); - } - /* HW requires some time to make sure it can flush the FIFO for a VF * when it resets it. Poll the VPGEN_VFRSTAT register for each VF in * sequence to make sure that it has completed. We'll keep track of -- cgit v1.2.3-59-g8ed1b From 12bb018c538c3b9a050f69f62fa09fa6c9160bca Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:15 -0700 Subject: ice: Refactor VF reset Currently VF VSI are being reset twice during a PFR or greater. This is causing reset, specifically resetting all VFs, to take too long. This is causing various issues with VF drivers not being able to gracefully handle the VF reset timeout. Fix this by refactoring how VF reset is handled for the case mentioned previously and for the VFR/VFLR case. The refactor was done by doing the following: 1. Removing the call to ice_vsi_rebuild_by_type for ICE_VSI_VF VSI, which was causing the initial VSI rebuild. 2. Adding functions for pre/post VSI rebuild functions that can be called in both the reset all VFs case and reset individual VF case. 3. Adding VSI rebuild functions that are specific for the reset all VFs case and adding functions that are specific for the reset individual VF case. 4. Calling the pre-rebuild function, then the specific VSI rebuild function based on the reset type, and then calling the post-rebuild function to handle VF resets. This patch series makes some assumptions about how VSI are handling by FW during reset: 1. During a PFR or greater all VSI in FW will be cleared. 2. During a VFR/VFLR the VSI rebuild responsibility is in the hands of the PF software. 3. There is code in the ice_reset_all_vfs() case to amortize operations if possible. This was left intact. 4. PF software should not be replaying VSI based filters that were added other than host configured, PF software configured, or the VF's default/LAA MAC. This is the VF drivers job after it has been reset. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 13 +- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 304 ++++++++++------------- 2 files changed, 130 insertions(+), 187 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 69854b8644a6..bbf92d2f1ac1 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4897,6 +4897,11 @@ static void ice_update_pf_netdev_link(struct ice_pf *pf) * ice_rebuild - rebuild after reset * @pf: PF to rebuild * @reset_type: type of reset + * + * Do not rebuild VF VSI in this flow because that is already handled via + * ice_reset_all_vfs(). This is because requirements for resetting a VF after a + * PFR/CORER/GLOBER/etc. are different than the normal flow. Also, we don't want + * to reset/rebuild all the VF VSI twice. */ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) { @@ -4994,14 +4999,6 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) goto err_vsi_rebuild; } - if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) { - err = ice_vsi_rebuild_by_type(pf, ICE_VSI_VF); - if (err) { - dev_err(dev, "VF VSI rebuild failed: %d\n", err); - goto err_vsi_rebuild; - } - } - /* If Flow Director is active */ if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) { err = ice_vsi_rebuild_by_type(pf, ICE_VSI_CTRL); diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 245310a52e1b..727f371db465 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -413,10 +413,7 @@ static void ice_trigger_vf_reset(struct ice_vf *vf, bool is_vflr, bool is_pfr) clear_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); /* Disable VF's configuration API during reset. The flag is re-enabled - * in ice_alloc_vf_res(), when it's safe again to access VF's VSI. - * It's normally disabled in ice_free_vf_res(), but it's safer - * to do it earlier to give some time to finish to any VF config - * functions that may still be running at this point. + * when it's safe again to access VF's VSI. */ clear_bit(ICE_VF_STATE_INIT, vf->vf_states); @@ -616,57 +613,6 @@ static int ice_vf_rebuild_host_mac_cfg(struct ice_vf *vf) return 0; } -/** - * ice_alloc_vsi_res - Setup VF VSI and its resources - * @vf: pointer to the VF structure - * - * Returns 0 on success, negative value on failure - */ -static int ice_alloc_vsi_res(struct ice_vf *vf) -{ - struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; - struct device *dev; - int ret; - - dev = ice_pf_to_dev(pf); - /* first vector index is the VFs OICR index */ - vf->first_vector_idx = ice_calc_vf_first_vector_idx(pf, vf); - - vsi = ice_vf_vsi_setup(pf, pf->hw.port_info, vf->vf_id); - if (!vsi) { - dev_err(dev, "Failed to create VF VSI\n"); - return -ENOMEM; - } - - vf->lan_vsi_idx = vsi->idx; - vf->lan_vsi_num = vsi->vsi_num; - - ret = ice_vf_rebuild_host_vlan_cfg(vf); - if (ret) { - dev_err(dev, "failed to rebuild default MAC configuration for VF %d, error %d\n", - vf->vf_id, ret); - goto ice_alloc_vsi_res_exit; - } - - - ret = ice_vf_rebuild_host_mac_cfg(vf); - if (ret) { - dev_err(dev, "failed to rebuild default MAC configuration for VF %d, error %d\n", - vf->vf_id, ret); - goto ice_alloc_vsi_res_exit; - } - - /* Clear this bit after VF initialization since we shouldn't reclaim - * and reassign interrupts for synchronous or asynchronous VFR events. - * We don't want to reconfigure interrupts since AVF driver doesn't - * expect vector assignment to be changed unless there is a request for - * more vectors. - */ -ice_alloc_vsi_res_exit: - return ret; -} - /** * ice_vf_set_host_trust_cfg - set trust setting based on pre-reset value * @vf: VF to configure trust setting for @@ -679,43 +625,6 @@ static void ice_vf_set_host_trust_cfg(struct ice_vf *vf) clear_bit(ICE_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); } -/** - * ice_alloc_vf_res - Allocate VF resources - * @vf: pointer to the VF structure - */ -static int ice_alloc_vf_res(struct ice_vf *vf) -{ - struct ice_pf *pf = vf->pf; - int tx_rx_queue_left; - int status; - - /* Update number of VF queues, in case VF had requested for queue - * changes - */ - tx_rx_queue_left = min_t(int, ice_get_avail_txq_count(pf), - ice_get_avail_rxq_count(pf)); - tx_rx_queue_left += pf->num_qps_per_vf; - if (vf->num_req_qs && vf->num_req_qs <= tx_rx_queue_left && - vf->num_req_qs != vf->num_vf_qs) - vf->num_vf_qs = vf->num_req_qs; - - /* setup VF VSI and necessary resources */ - status = ice_alloc_vsi_res(vf); - if (status) - goto ice_alloc_vf_res_exit; - - ice_vf_set_host_trust_cfg(vf); - - /* VF is now completely initialized */ - set_bit(ICE_VF_STATE_INIT, vf->vf_states); - - return status; - -ice_alloc_vf_res_exit: - ice_free_vf_res(vf); - return status; -} - /** * ice_ena_vf_msix_mappings - enable VF MSIX mappings in hardware * @vf: VF to enable MSIX mappings for @@ -1042,48 +951,6 @@ static void ice_clear_vf_reset_trigger(struct ice_vf *vf) ice_flush(hw); } -/** - * ice_cleanup_and_realloc_vf - Clean up VF and reallocate resources after reset - * @vf: pointer to the VF structure - * - * Cleanup a VF after the hardware reset is finished. Expects the caller to - * have verified whether the reset is finished properly, and ensure the - * minimum amount of wait time has passed. Reallocate VF resources back to make - * VF state active - */ -static void ice_cleanup_and_realloc_vf(struct ice_vf *vf) -{ - struct ice_pf *pf = vf->pf; - struct ice_hw *hw; - - hw = &pf->hw; - - /* Allow HW to access VF memory after calling - * ice_clear_vf_reset_trigger(). If we did it any sooner, HW could - * access memory while it was being freed in ice_free_vf_res(), causing - * an IOMMU fault. - * - * On the other hand, this needs to be done ASAP, because the VF driver - * is waiting for this to happen and may report a timeout. It's - * harmless, but it gets logged into Guest OS kernel log, so best avoid - * it. - */ - ice_clear_vf_reset_trigger(vf); - - /* reallocate VF resources to finish resetting the VSI state */ - if (!ice_alloc_vf_res(vf)) { - ice_ena_vf_mappings(vf); - set_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); - clear_bit(ICE_VF_STATE_DIS, vf->vf_states); - } - - /* Tell the VF driver the reset is done. This needs to be done only - * after VF has been fully initialized, because the VF driver may - * request resources immediately after setting this flag. - */ - wr32(hw, VFGEN_RSTAT(vf->vf_id), VIRTCHNL_VFR_VFACTIVE); -} - /** * ice_vf_set_vsi_promisc - set given VF VSI to given promiscuous mode(s) * @vf: pointer to the VF info @@ -1125,44 +992,134 @@ ice_vf_set_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m, return status; } +static void ice_vf_clear_counters(struct ice_vf *vf) +{ + struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + + vf->num_mac = 0; + vsi->num_vlan = 0; + memset(&vf->mdd_tx_events, 0, sizeof(vf->mdd_tx_events)); + memset(&vf->mdd_rx_events, 0, sizeof(vf->mdd_rx_events)); +} + /** - * ice_config_res_vfs - Finalize allocation of VFs resources in one go - * @pf: pointer to the PF structure + * ice_vf_pre_vsi_rebuild - tasks to be done prior to VSI rebuild + * @vf: VF to perform pre VSI rebuild tasks * - * This function is being called as last part of resetting all VFs, or when - * configuring VFs for the first time, where there is no resource to be freed - * Returns true if resources were properly allocated for all VFs, and false - * otherwise. + * These tasks are items that don't need to be amortized since they are most + * likely called in a for loop with all VF(s) in the reset_all_vfs() case. */ -static bool ice_config_res_vfs(struct ice_pf *pf) +static void ice_vf_pre_vsi_rebuild(struct ice_vf *vf) { - struct device *dev = ice_pf_to_dev(pf); - struct ice_hw *hw = &pf->hw; - int v; + ice_vf_clear_counters(vf); + ice_clear_vf_reset_trigger(vf); +} - if (ice_set_per_vf_res(pf)) { - dev_err(dev, "Cannot allocate VF resources, try with fewer number of VFs\n"); - return false; +/** + * ice_vf_rebuild_host_cfg - host admin configuration is persistent across reset + * @vf: VF to rebuild host configuration on + */ +static void ice_vf_rebuild_host_cfg(struct ice_vf *vf) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + + ice_vf_set_host_trust_cfg(vf); + + if (ice_vf_rebuild_host_mac_cfg(vf)) + dev_err(dev, "failed to rebuild default MAC configuration for VF %d\n", + vf->vf_id); + + if (ice_vf_rebuild_host_vlan_cfg(vf)) + dev_err(dev, "failed to rebuild VLAN configuration for VF %u\n", + vf->vf_id); +} + +/** + * ice_vf_rebuild_vsi_with_release - release and setup the VF's VSI + * @vf: VF to release and setup the VSI for + * + * This is only called when a single VF is being reset (i.e. VFR, VFLR, host VF + * configuration change, etc.). + */ +static int ice_vf_rebuild_vsi_with_release(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + + vsi = pf->vsi[vf->lan_vsi_idx]; + ice_vsi_release(vsi); + vsi = ice_vf_vsi_setup(pf, pf->hw.port_info, vf->vf_id); + if (!vsi) { + dev_err(ice_pf_to_dev(pf), "Failed to create VF VSI\n"); + return -ENOMEM; } - /* rearm global interrupts */ - if (test_and_clear_bit(__ICE_OICR_INTR_DIS, pf->state)) - ice_irq_dynamic_ena(hw, NULL, NULL); + vf->lan_vsi_idx = vsi->idx; + vf->lan_vsi_num = vsi->vsi_num; - /* Finish resetting each VF and allocate resources */ - ice_for_each_vf(pf, v) { - struct ice_vf *vf = &pf->vf[v]; + return 0; +} - vf->num_vf_qs = pf->num_qps_per_vf; - dev_dbg(dev, "VF-id %d has %d queues configured\n", vf->vf_id, - vf->num_vf_qs); - ice_cleanup_and_realloc_vf(vf); +/** + * ice_vf_rebuild_vsi - rebuild the VF's VSI + * @vf: VF to rebuild the VSI for + * + * This is only called when all VF(s) are being reset (i.e. PCIe Reset on the + * host, PFR, CORER, etc.). + */ +static int ice_vf_rebuild_vsi(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + + vsi = pf->vsi[vf->lan_vsi_idx]; + + if (ice_vsi_rebuild(vsi, true)) { + dev_err(ice_pf_to_dev(pf), "failed to rebuild VF %d VSI\n", + vf->vf_id); + return -EIO; } + /* vsi->idx will remain the same in this case so don't update + * vf->lan_vsi_idx + */ + vsi->vsi_num = ice_get_hw_vsi_num(&pf->hw, vsi->idx); + vf->lan_vsi_num = vsi->vsi_num; - ice_flush(hw); - clear_bit(__ICE_VF_DIS, pf->state); + return 0; +} - return true; +/** + * ice_vf_set_initialized - VF is ready for VIRTCHNL communication + * @vf: VF to set in initialized state + * + * After this function the VF will be ready to receive/handle the + * VIRTCHNL_OP_GET_VF_RESOURCES message + */ +static void ice_vf_set_initialized(struct ice_vf *vf) +{ + ice_set_vf_state_qs_dis(vf); + clear_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states); + clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states); + clear_bit(ICE_VF_STATE_DIS, vf->vf_states); + set_bit(ICE_VF_STATE_INIT, vf->vf_states); +} + +/** + * ice_vf_post_vsi_rebuild - tasks to do after the VF's VSI have been rebuilt + * @vf: VF to perform tasks on + */ +static void ice_vf_post_vsi_rebuild(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + struct ice_hw *hw; + + hw = &pf->hw; + + ice_vf_rebuild_host_cfg(vf); + + ice_vf_set_initialized(vf); + ice_ena_vf_mappings(vf); + wr32(hw, VFGEN_RSTAT(vf->vf_id), VIRTCHNL_VFR_VFACTIVE); } /** @@ -1232,21 +1189,13 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) ice_for_each_vf(pf, v) { vf = &pf->vf[v]; - ice_free_vf_res(vf); - - /* Free VF queues as well, and reallocate later. - * If a given VF has different number of queues - * configured, the request for update will come - * via mailbox communication. - */ - vf->num_vf_qs = 0; + ice_vf_pre_vsi_rebuild(vf); + ice_vf_rebuild_vsi(vf); + ice_vf_post_vsi_rebuild(vf); } - if (ice_sriov_free_msix_res(pf)) - dev_err(dev, "Failed to free MSIX resources used by SR-IOV\n"); - - if (!ice_config_res_vfs(pf)) - return false; + ice_flush(hw); + clear_bit(__ICE_VF_DIS, pf->state); return true; } @@ -1358,12 +1307,9 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) dev_err(dev, "disabling promiscuous mode failed\n"); } - /* free VF resources to begin resetting the VSI state */ - ice_free_vf_res(vf); - - ice_cleanup_and_realloc_vf(vf); - - ice_flush(hw); + ice_vf_pre_vsi_rebuild(vf); + ice_vf_rebuild_vsi_with_release(vf); + ice_vf_post_vsi_rebuild(vf); return true; } -- cgit v1.2.3-59-g8ed1b From 3726cce258908ed6e30d52e2d4dfffe96ad2f962 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:16 -0700 Subject: ice: Refactor VF VSI release and setup functions Currently when a VF VSI calls ice_vsi_release() and ice_vsi_setup() it subsequently clears/sets the VF cached variables for lan_vsi_idx and lan_vsi_num. This works fine, but can be improved by handling this in the VF specific VSI release and setup functions. Also, when a VF VSI is setup too many parameters are passed that can be derived from the VF. Fix this by only calling VF VSI setup with the bare minimum parameters. Also, add functionality to invalidate a VF's VSI when it's released and/or setup fails. This will make it so a VF VSI cannot be accessed via its cached vsi_idx/vsi_num in these cases. Finally when a VF's VSI is invalidated set the lan_vsi_idx and lan_vsi_num to ICE_NO_VSI to clearly show that there is no valid VSI associated with this VF. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 86 +++++++++++++++--------- 1 file changed, 55 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 727f371db465..a126e7c7663d 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -181,6 +181,26 @@ static void ice_vc_notify_vf_link_state(struct ice_vf *vf) sizeof(pfe), NULL); } +/** + * ice_vf_invalidate_vsi - invalidate vsi_idx/vsi_num to remove VSI access + * @vf: VF to remove access to VSI for + */ +static void ice_vf_invalidate_vsi(struct ice_vf *vf) +{ + vf->lan_vsi_idx = ICE_NO_VSI; + vf->lan_vsi_num = ICE_NO_VSI; +} + +/** + * ice_vf_vsi_release - invalidate the VF's VSI after freeing it + * @vf: invalidate this VF's VSI after freeing it + */ +static void ice_vf_vsi_release(struct ice_vf *vf) +{ + ice_vsi_release(vf->pf->vsi[vf->lan_vsi_idx]); + ice_vf_invalidate_vsi(vf); +} + /** * ice_free_vf_res - Free a VF's resources * @vf: pointer to the VF info @@ -196,10 +216,8 @@ static void ice_free_vf_res(struct ice_vf *vf) clear_bit(ICE_VF_STATE_INIT, vf->vf_states); /* free VSI and disconnect it from the parent uplink */ - if (vf->lan_vsi_idx) { - ice_vsi_release(pf->vsi[vf->lan_vsi_idx]); - vf->lan_vsi_idx = 0; - vf->lan_vsi_num = 0; + if (vf->lan_vsi_idx != ICE_NO_VSI) { + ice_vf_vsi_release(vf); vf->num_mac = 0; } @@ -505,19 +523,40 @@ out: return ret; } +/** + * ice_vf_get_port_info - Get the VF's port info structure + * @vf: VF used to get the port info structure for + */ +static struct ice_port_info *ice_vf_get_port_info(struct ice_vf *vf) +{ + return vf->pf->hw.port_info; +} + /** * ice_vf_vsi_setup - Set up a VF VSI - * @pf: board private structure - * @pi: pointer to the port_info instance - * @vf_id: defines VF ID to which this VSI connects. + * @vf: VF to setup VSI for * * Returns pointer to the successfully allocated VSI struct on success, * otherwise returns NULL on failure. */ -static struct ice_vsi * -ice_vf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi, u16 vf_id) +static struct ice_vsi *ice_vf_vsi_setup(struct ice_vf *vf) { - return ice_vsi_setup(pf, pi, ICE_VSI_VF, vf_id); + struct ice_port_info *pi = ice_vf_get_port_info(vf); + struct ice_pf *pf = vf->pf; + struct ice_vsi *vsi; + + vsi = ice_vsi_setup(pf, pi, ICE_VSI_VF, vf->vf_id); + + if (!vsi) { + dev_err(ice_pf_to_dev(pf), "Failed to create VF VSI\n"); + ice_vf_invalidate_vsi(vf); + return NULL; + } + + vf->lan_vsi_idx = vsi->idx; + vf->lan_vsi_num = vsi->vsi_num; + + return vsi; } /** @@ -1043,19 +1082,9 @@ static void ice_vf_rebuild_host_cfg(struct ice_vf *vf) */ static int ice_vf_rebuild_vsi_with_release(struct ice_vf *vf) { - struct ice_pf *pf = vf->pf; - struct ice_vsi *vsi; - - vsi = pf->vsi[vf->lan_vsi_idx]; - ice_vsi_release(vsi); - vsi = ice_vf_vsi_setup(pf, pf->hw.port_info, vf->vf_id); - if (!vsi) { - dev_err(ice_pf_to_dev(pf), "Failed to create VF VSI\n"); + ice_vf_vsi_release(vf); + if (!ice_vf_vsi_setup(vf)) return -ENOMEM; - } - - vf->lan_vsi_idx = vsi->idx; - vf->lan_vsi_num = vsi->vsi_num; return 0; } @@ -1395,14 +1424,9 @@ static int ice_init_vf_vsi_res(struct ice_vf *vf) vf->first_vector_idx = ice_calc_vf_first_vector_idx(pf, vf); dev = ice_pf_to_dev(pf); - vsi = ice_vf_vsi_setup(pf, pf->hw.port_info, vf->vf_id); - if (!vsi) { - dev_err(dev, "Failed to create VF VSI\n"); + vsi = ice_vf_vsi_setup(vf); + if (!vsi) return -ENOMEM; - } - - vf->lan_vsi_idx = vsi->idx; - vf->lan_vsi_num = vsi->vsi_num; err = ice_vsi_add_vlan(vsi, 0, ICE_FWD_TO_VSI); if (err) { @@ -1425,7 +1449,7 @@ static int ice_init_vf_vsi_res(struct ice_vf *vf) return 0; release_vsi: - ice_vsi_release(vsi); + ice_vf_vsi_release(vf); return err; } @@ -1463,7 +1487,7 @@ teardown: struct ice_vf *vf = &pf->vf[i]; ice_dis_vf_mappings(vf); - ice_vsi_release(pf->vsi[vf->lan_vsi_idx]); + ice_vf_vsi_release(vf); } return retval; -- cgit v1.2.3-59-g8ed1b From 123db31d01219a4f794f3769e7bca6649d65ecb1 Mon Sep 17 00:00:00 2001 From: Ronak Doshi Date: Thu, 28 May 2020 14:53:19 -0700 Subject: vmxnet3: prepare for version 4 changes vmxnet3 is currently at version 3 and this patch initiates the preparation to accommodate changes for version 4. Introduced utility macros for vmxnet3 version 4 comparison and update Copyright information. Signed-off-by: Ronak Doshi Signed-off-by: David S. Miller --- drivers/net/vmxnet3/Makefile | 2 +- drivers/net/vmxnet3/upt1_defs.h | 2 +- drivers/net/vmxnet3/vmxnet3_defs.h | 2 +- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- drivers/net/vmxnet3/vmxnet3_ethtool.c | 2 +- drivers/net/vmxnet3/vmxnet3_int.h | 5 ++++- 6 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/vmxnet3/Makefile b/drivers/net/vmxnet3/Makefile index 8cdbb63d1bb0..c5a167a1c85c 100644 --- a/drivers/net/vmxnet3/Makefile +++ b/drivers/net/vmxnet3/Makefile @@ -2,7 +2,7 @@ # # Linux driver for VMware's vmxnet3 ethernet NIC. # -# Copyright (C) 2007-2016, VMware, Inc. All Rights Reserved. +# Copyright (C) 2007-2020, VMware, Inc. All Rights Reserved. # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the diff --git a/drivers/net/vmxnet3/upt1_defs.h b/drivers/net/vmxnet3/upt1_defs.h index db9f1fde3aac..65a203c842b2 100644 --- a/drivers/net/vmxnet3/upt1_defs.h +++ b/drivers/net/vmxnet3/upt1_defs.h @@ -1,7 +1,7 @@ /* * Linux driver for VMware's vmxnet3 ethernet NIC. * - * Copyright (C) 2008-2016, VMware, Inc. All Rights Reserved. + * Copyright (C) 2008-2020, VMware, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/drivers/net/vmxnet3/vmxnet3_defs.h b/drivers/net/vmxnet3/vmxnet3_defs.h index c3a31646189f..c77274228a3e 100644 --- a/drivers/net/vmxnet3/vmxnet3_defs.h +++ b/drivers/net/vmxnet3/vmxnet3_defs.h @@ -1,7 +1,7 @@ /* * Linux driver for VMware's vmxnet3 ethernet NIC. * - * Copyright (C) 2008-2016, VMware, Inc. All Rights Reserved. + * Copyright (C) 2008-2020, VMware, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 722cb054a5cd..ec2878f8c1f6 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1,7 +1,7 @@ /* * Linux driver for VMware's vmxnet3 ethernet NIC. * - * Copyright (C) 2008-2016, VMware, Inc. All Rights Reserved. + * Copyright (C) 2008-2020, VMware, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index 6528940ce5f3..1163eca7aba5 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -1,7 +1,7 @@ /* * Linux driver for VMware's vmxnet3 ethernet NIC. * - * Copyright (C) 2008-2016, VMware, Inc. All Rights Reserved. + * Copyright (C) 2008-2020, VMware, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h index 1cc1cd4aaa59..e803ffad75d6 100644 --- a/drivers/net/vmxnet3/vmxnet3_int.h +++ b/drivers/net/vmxnet3/vmxnet3_int.h @@ -1,7 +1,7 @@ /* * Linux driver for VMware's vmxnet3 ethernet NIC. * - * Copyright (C) 2008-2016, VMware, Inc. All Rights Reserved. + * Copyright (C) 2008-2020, VMware, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -81,6 +81,7 @@ #define VMXNET3_RSS #endif +#define VMXNET3_REV_4 3 /* Vmxnet3 Rev. 4 */ #define VMXNET3_REV_3 2 /* Vmxnet3 Rev. 3 */ #define VMXNET3_REV_2 1 /* Vmxnet3 Rev. 2 */ #define VMXNET3_REV_1 0 /* Vmxnet3 Rev. 1 */ @@ -412,6 +413,8 @@ struct vmxnet3_adapter { (adapter->version >= VMXNET3_REV_2 + 1) #define VMXNET3_VERSION_GE_3(adapter) \ (adapter->version >= VMXNET3_REV_3 + 1) +#define VMXNET3_VERSION_GE_4(adapter) \ + (adapter->version >= VMXNET3_REV_4 + 1) /* must be a multiple of VMXNET3_RING_SIZE_ALIGN */ #define VMXNET3_DEF_TX_RING_SIZE 512 -- cgit v1.2.3-59-g8ed1b From d3a8a9e5c3b334d443e97daa59bb95c0b69f4794 Mon Sep 17 00:00:00 2001 From: Ronak Doshi Date: Thu, 28 May 2020 14:53:20 -0700 Subject: vmxnet3: add support to get/set rx flow hash With vmxnet3 version 4, the emulation supports multiqueue(RSS) for UDP and ESP traffic. A guest can enable/disable RSS for UDP/ESP over IPv4/IPv6 by issuing commands introduced in this patch. ESP ipv6 is not yet supported in this patch. This patch implements get_rss_hash_opts and set_rss_hash_opts methods to allow querying and configuring different Rx flow hash configurations. Signed-off-by: Ronak Doshi Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_defs.h | 12 ++ drivers/net/vmxnet3/vmxnet3_drv.c | 39 ++++++ drivers/net/vmxnet3/vmxnet3_ethtool.c | 219 +++++++++++++++++++++++++++++++++- drivers/net/vmxnet3/vmxnet3_int.h | 4 + 4 files changed, 272 insertions(+), 2 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_defs.h b/drivers/net/vmxnet3/vmxnet3_defs.h index c77274228a3e..aac97fac1186 100644 --- a/drivers/net/vmxnet3/vmxnet3_defs.h +++ b/drivers/net/vmxnet3/vmxnet3_defs.h @@ -82,6 +82,7 @@ enum { VMXNET3_CMD_RESERVED3, VMXNET3_CMD_SET_COALESCE, VMXNET3_CMD_REGISTER_MEMREGS, + VMXNET3_CMD_SET_RSS_FIELDS, VMXNET3_CMD_FIRST_GET = 0xF00D0000, VMXNET3_CMD_GET_QUEUE_STATUS = VMXNET3_CMD_FIRST_GET, @@ -96,6 +97,7 @@ enum { VMXNET3_CMD_GET_RESERVED1, VMXNET3_CMD_GET_TXDATA_DESC_SIZE, VMXNET3_CMD_GET_COALESCE, + VMXNET3_CMD_GET_RSS_FIELDS, }; /* @@ -685,12 +687,22 @@ struct Vmxnet3_MemRegs { struct Vmxnet3_MemoryRegion memRegs[1]; }; +enum Vmxnet3_RSSField { + VMXNET3_RSS_FIELDS_TCPIP4 = 0x0001, + VMXNET3_RSS_FIELDS_TCPIP6 = 0x0002, + VMXNET3_RSS_FIELDS_UDPIP4 = 0x0004, + VMXNET3_RSS_FIELDS_UDPIP6 = 0x0008, + VMXNET3_RSS_FIELDS_ESPIP4 = 0x0010, + VMXNET3_RSS_FIELDS_ESPIP6 = 0x0020, +}; + /* If the command data <= 16 bytes, use the shared memory directly. * otherwise, use variable length configuration descriptor. */ union Vmxnet3_CmdInfo { struct Vmxnet3_VariableLenConfDesc varConf; struct Vmxnet3_SetPolling setPolling; + enum Vmxnet3_RSSField setRssFields; __le64 data[2]; }; diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index ec2878f8c1f6..4ea7a40ada88 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -2554,6 +2554,39 @@ vmxnet3_init_coalesce(struct vmxnet3_adapter *adapter) spin_unlock_irqrestore(&adapter->cmd_lock, flags); } +static void +vmxnet3_init_rssfields(struct vmxnet3_adapter *adapter) +{ + struct Vmxnet3_DriverShared *shared = adapter->shared; + union Vmxnet3_CmdInfo *cmdInfo = &shared->cu.cmdInfo; + unsigned long flags; + + if (!VMXNET3_VERSION_GE_4(adapter)) + return; + + spin_lock_irqsave(&adapter->cmd_lock, flags); + + if (adapter->default_rss_fields) { + VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, + VMXNET3_CMD_GET_RSS_FIELDS); + adapter->rss_fields = + VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD); + } else { + cmdInfo->setRssFields = adapter->rss_fields; + VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, + VMXNET3_CMD_SET_RSS_FIELDS); + /* Not all requested RSS may get applied, so get and + * cache what was actually applied. + */ + VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, + VMXNET3_CMD_GET_RSS_FIELDS); + adapter->rss_fields = + VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD); + } + + spin_unlock_irqrestore(&adapter->cmd_lock, flags); +} + int vmxnet3_activate_dev(struct vmxnet3_adapter *adapter) { @@ -2603,6 +2636,7 @@ vmxnet3_activate_dev(struct vmxnet3_adapter *adapter) } vmxnet3_init_coalesce(adapter); + vmxnet3_init_rssfields(adapter); for (i = 0; i < adapter->num_rx_queues; i++) { VMXNET3_WRITE_BAR0_REG(adapter, @@ -3430,6 +3464,11 @@ vmxnet3_probe_device(struct pci_dev *pdev, adapter->default_coal_mode = true; } + if (VMXNET3_VERSION_GE_4(adapter)) { + adapter->default_rss_fields = true; + adapter->rss_fields = VMXNET3_RSS_FIELDS_DEFAULT; + } + SET_NETDEV_DEV(netdev, &pdev->dev); vmxnet3_declare_features(adapter, dma64); diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index 1163eca7aba5..57460cf1967f 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -665,18 +665,232 @@ out: return err; } +static int +vmxnet3_get_rss_hash_opts(struct vmxnet3_adapter *adapter, + struct ethtool_rxnfc *info) +{ + enum Vmxnet3_RSSField rss_fields; + + if (netif_running(adapter->netdev)) { + unsigned long flags; + + spin_lock_irqsave(&adapter->cmd_lock, flags); + + VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, + VMXNET3_CMD_GET_RSS_FIELDS); + rss_fields = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD); + spin_unlock_irqrestore(&adapter->cmd_lock, flags); + } else { + rss_fields = adapter->rss_fields; + } + + info->data = 0; + + /* Report default options for RSS on vmxnet3 */ + switch (info->flow_type) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3 | + RXH_IP_SRC | RXH_IP_DST; + break; + case UDP_V4_FLOW: + if (rss_fields & VMXNET3_RSS_FIELDS_UDPIP4) + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + info->data |= RXH_IP_SRC | RXH_IP_DST; + break; + case AH_ESP_V4_FLOW: + case AH_V4_FLOW: + case ESP_V4_FLOW: + if (rss_fields & VMXNET3_RSS_FIELDS_ESPIP4) + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + /* fallthrough */ + case SCTP_V4_FLOW: + case IPV4_FLOW: + info->data |= RXH_IP_SRC | RXH_IP_DST; + break; + case UDP_V6_FLOW: + if (rss_fields & VMXNET3_RSS_FIELDS_UDPIP6) + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + info->data |= RXH_IP_SRC | RXH_IP_DST; + break; + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V6_FLOW: + case IPV6_FLOW: + info->data |= RXH_IP_SRC | RXH_IP_DST; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +vmxnet3_set_rss_hash_opt(struct net_device *netdev, + struct vmxnet3_adapter *adapter, + struct ethtool_rxnfc *nfc) +{ + enum Vmxnet3_RSSField rss_fields = adapter->rss_fields; + + /* RSS does not support anything other than hashing + * to queues on src and dst IPs and ports + */ + if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) + return -EINVAL; + + switch (nfc->flow_type) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + if (!(nfc->data & RXH_IP_SRC) || + !(nfc->data & RXH_IP_DST) || + !(nfc->data & RXH_L4_B_0_1) || + !(nfc->data & RXH_L4_B_2_3)) + return -EINVAL; + break; + case UDP_V4_FLOW: + if (!(nfc->data & RXH_IP_SRC) || + !(nfc->data & RXH_IP_DST)) + return -EINVAL; + switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { + case 0: + rss_fields &= ~VMXNET3_RSS_FIELDS_UDPIP4; + break; + case (RXH_L4_B_0_1 | RXH_L4_B_2_3): + rss_fields |= VMXNET3_RSS_FIELDS_UDPIP4; + break; + default: + return -EINVAL; + } + break; + case UDP_V6_FLOW: + if (!(nfc->data & RXH_IP_SRC) || + !(nfc->data & RXH_IP_DST)) + return -EINVAL; + switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { + case 0: + rss_fields &= ~VMXNET3_RSS_FIELDS_UDPIP6; + break; + case (RXH_L4_B_0_1 | RXH_L4_B_2_3): + rss_fields |= VMXNET3_RSS_FIELDS_UDPIP6; + break; + default: + return -EINVAL; + } + break; + case ESP_V4_FLOW: + case AH_V4_FLOW: + case AH_ESP_V4_FLOW: + if (!(nfc->data & RXH_IP_SRC) || + !(nfc->data & RXH_IP_DST)) + return -EINVAL; + switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { + case 0: + rss_fields &= ~VMXNET3_RSS_FIELDS_ESPIP4; + break; + case (RXH_L4_B_0_1 | RXH_L4_B_2_3): + rss_fields |= VMXNET3_RSS_FIELDS_ESPIP4; + break; + default: + return -EINVAL; + } + break; + case ESP_V6_FLOW: + case AH_V6_FLOW: + case AH_ESP_V6_FLOW: + case SCTP_V4_FLOW: + case SCTP_V6_FLOW: + if (!(nfc->data & RXH_IP_SRC) || + !(nfc->data & RXH_IP_DST) || + (nfc->data & RXH_L4_B_0_1) || + (nfc->data & RXH_L4_B_2_3)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + /* if we changed something we need to update flags */ + if (rss_fields != adapter->rss_fields) { + adapter->default_rss_fields = false; + if (netif_running(netdev)) { + struct Vmxnet3_DriverShared *shared = adapter->shared; + union Vmxnet3_CmdInfo *cmdInfo = &shared->cu.cmdInfo; + unsigned long flags; + + spin_lock_irqsave(&adapter->cmd_lock, flags); + cmdInfo->setRssFields = rss_fields; + VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, + VMXNET3_CMD_SET_RSS_FIELDS); + + /* Not all requested RSS may get applied, so get and + * cache what was actually applied. + */ + VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, + VMXNET3_CMD_GET_RSS_FIELDS); + adapter->rss_fields = + VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD); + spin_unlock_irqrestore(&adapter->cmd_lock, flags); + } else { + /* When the device is activated, we will try to apply + * these rules and cache the applied value later. + */ + adapter->rss_fields = rss_fields; + } + } + return 0; +} static int vmxnet3_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, u32 *rules) { struct vmxnet3_adapter *adapter = netdev_priv(netdev); + int err = 0; + switch (info->cmd) { case ETHTOOL_GRXRINGS: info->data = adapter->num_rx_queues; - return 0; + break; + case ETHTOOL_GRXFH: + if (!VMXNET3_VERSION_GE_4(adapter)) { + err = -EOPNOTSUPP; + break; + } + err = vmxnet3_get_rss_hash_opts(adapter, info); + break; + default: + err = -EOPNOTSUPP; + break; } - return -EOPNOTSUPP; + + return err; +} + +static int +vmxnet3_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info) +{ + struct vmxnet3_adapter *adapter = netdev_priv(netdev); + int err = 0; + + if (!VMXNET3_VERSION_GE_4(adapter)) { + err = -EOPNOTSUPP; + goto done; + } + + switch (info->cmd) { + case ETHTOOL_SRXFH: + err = vmxnet3_set_rss_hash_opt(netdev, adapter, info); + break; + default: + err = -EOPNOTSUPP; + break; + } + +done: + return err; } #ifdef VMXNET3_RSS @@ -887,6 +1101,7 @@ static const struct ethtool_ops vmxnet3_ethtool_ops = { .get_ringparam = vmxnet3_get_ringparam, .set_ringparam = vmxnet3_set_ringparam, .get_rxnfc = vmxnet3_get_rxnfc, + .set_rxnfc = vmxnet3_set_rxnfc, #ifdef VMXNET3_RSS .get_rxfh_indir_size = vmxnet3_get_rss_indir_size, .get_rxfh = vmxnet3_get_rss, diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h index e803ffad75d6..d52ccc3eeba2 100644 --- a/drivers/net/vmxnet3/vmxnet3_int.h +++ b/drivers/net/vmxnet3/vmxnet3_int.h @@ -377,6 +377,8 @@ struct vmxnet3_adapter { u16 rxdata_desc_size; bool rxdataring_enabled; + bool default_rss_fields; + enum Vmxnet3_RSSField rss_fields; struct work_struct work; @@ -438,6 +440,8 @@ struct vmxnet3_adapter { #define VMXNET3_COAL_RBC_RATE(usecs) (1000000 / usecs) #define VMXNET3_COAL_RBC_USECS(rbc_rate) (1000000 / rbc_rate) +#define VMXNET3_RSS_FIELDS_DEFAULT (VMXNET3_RSS_FIELDS_TCPIP4 | \ + VMXNET3_RSS_FIELDS_TCPIP6) int vmxnet3_quiesce_dev(struct vmxnet3_adapter *adapter); -- cgit v1.2.3-59-g8ed1b From dacce2be33124df3c71f979ac47e3d6354a41125 Mon Sep 17 00:00:00 2001 From: Ronak Doshi Date: Thu, 28 May 2020 14:53:21 -0700 Subject: vmxnet3: add geneve and vxlan tunnel offload support Vmxnet3 version 3 device supports checksum/TSO offload. Thus, vNIC to pNIC traffic can leverage hardware checksum/TSO offloads. However, vmxnet3 does not support checksum/TSO offload for Geneve/VXLAN encapsulated packets. Thus, for a vNIC configured with an overlay, the guest stack must first segment the inner packet, compute the inner checksum for each segment and encapsulate each segment before transmitting the packet via the vNIC. This results in significant performance penalty. This patch will enhance vmxnet3 to support Geneve/VXLAN TSO as well as checksum offload. Signed-off-by: Ronak Doshi Signed-off-by: David S. Miller --- drivers/net/vmxnet3/upt1_defs.h | 3 + drivers/net/vmxnet3/vmxnet3_defs.h | 17 +++-- drivers/net/vmxnet3/vmxnet3_drv.c | 120 +++++++++++++++++++++++++++------- drivers/net/vmxnet3/vmxnet3_ethtool.c | 42 +++++++++++- drivers/net/vmxnet3/vmxnet3_int.h | 12 +++- 5 files changed, 161 insertions(+), 33 deletions(-) diff --git a/drivers/net/vmxnet3/upt1_defs.h b/drivers/net/vmxnet3/upt1_defs.h index 65a203c842b2..8c014c98471c 100644 --- a/drivers/net/vmxnet3/upt1_defs.h +++ b/drivers/net/vmxnet3/upt1_defs.h @@ -92,5 +92,8 @@ enum { UPT1_F_RSS = cpu_to_le64(0x0002), UPT1_F_RXVLAN = cpu_to_le64(0x0004), /* VLAN tag stripping */ UPT1_F_LRO = cpu_to_le64(0x0008), + UPT1_F_RXINNEROFLD = cpu_to_le64(0x00010), /* Geneve/Vxlan rx csum + * offloading + */ }; #endif diff --git a/drivers/net/vmxnet3/vmxnet3_defs.h b/drivers/net/vmxnet3/vmxnet3_defs.h index aac97fac1186..a8d5ebd47c71 100644 --- a/drivers/net/vmxnet3/vmxnet3_defs.h +++ b/drivers/net/vmxnet3/vmxnet3_defs.h @@ -103,14 +103,14 @@ enum { /* * Little Endian layout of bitfields - * Byte 0 : 7.....len.....0 - * Byte 1 : rsvd gen 13.len.8 + * Byte 1 : oco gen 13.len.8 * Byte 2 : 5.msscof.0 ext1 dtype * Byte 3 : 13...msscof...6 * * Big Endian layout of bitfields - * Byte 0: 13...msscof...6 * Byte 1 : 5.msscof.0 ext1 dtype - * Byte 2 : rsvd gen 13.len.8 + * Byte 2 : oco gen 13.len.8 * Byte 3 : 7.....len.....0 * * Thus, le32_to_cpu on the dword will allow the big endian driver to read @@ -125,13 +125,13 @@ struct Vmxnet3_TxDesc { u32 msscof:14; /* MSS, checksum offset, flags */ u32 ext1:1; u32 dtype:1; /* descriptor type */ - u32 rsvd:1; + u32 oco:1; u32 gen:1; /* generation bit */ u32 len:14; #else u32 len:14; u32 gen:1; /* generation bit */ - u32 rsvd:1; + u32 oco:1; u32 dtype:1; /* descriptor type */ u32 ext1:1; u32 msscof:14; /* MSS, checksum offset, flags */ @@ -157,9 +157,10 @@ struct Vmxnet3_TxDesc { }; /* TxDesc.OM values */ -#define VMXNET3_OM_NONE 0 -#define VMXNET3_OM_CSUM 2 -#define VMXNET3_OM_TSO 3 +#define VMXNET3_OM_NONE 0 +#define VMXNET3_OM_ENCAP 1 +#define VMXNET3_OM_CSUM 2 +#define VMXNET3_OM_TSO 3 /* fields in TxDesc we access w/o using bit fields */ #define VMXNET3_TXD_EOP_SHIFT 12 @@ -226,6 +227,8 @@ struct Vmxnet3_RxDesc { #define VMXNET3_RXD_BTYPE_SHIFT 14 #define VMXNET3_RXD_GEN_SHIFT 31 +#define VMXNET3_RCD_HDR_INNER_SHIFT 13 + struct Vmxnet3_RxCompDesc { #ifdef __BIG_ENDIAN_BITFIELD u32 ext2:1; diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 4ea7a40ada88..171d4b1d1d04 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -842,12 +842,22 @@ vmxnet3_parse_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, u8 protocol = 0; if (ctx->mss) { /* TSO */ - ctx->eth_ip_hdr_size = skb_transport_offset(skb); - ctx->l4_hdr_size = tcp_hdrlen(skb); - ctx->copy_size = ctx->eth_ip_hdr_size + ctx->l4_hdr_size; + if (VMXNET3_VERSION_GE_4(adapter) && skb->encapsulation) { + ctx->l4_offset = skb_inner_transport_offset(skb); + ctx->l4_hdr_size = inner_tcp_hdrlen(skb); + ctx->copy_size = ctx->l4_offset + ctx->l4_hdr_size; + } else { + ctx->l4_offset = skb_transport_offset(skb); + ctx->l4_hdr_size = tcp_hdrlen(skb); + ctx->copy_size = ctx->l4_offset + ctx->l4_hdr_size; + } } else { if (skb->ip_summed == CHECKSUM_PARTIAL) { - ctx->eth_ip_hdr_size = skb_checksum_start_offset(skb); + /* For encap packets, skb_checksum_start_offset refers + * to inner L4 offset. Thus, below works for encap as + * well as non-encap case + */ + ctx->l4_offset = skb_checksum_start_offset(skb); if (ctx->ipv4) { const struct iphdr *iph = ip_hdr(skb); @@ -871,10 +881,10 @@ vmxnet3_parse_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, break; } - ctx->copy_size = min(ctx->eth_ip_hdr_size + + ctx->copy_size = min(ctx->l4_offset + ctx->l4_hdr_size, skb->len); } else { - ctx->eth_ip_hdr_size = 0; + ctx->l4_offset = 0; ctx->l4_hdr_size = 0; /* copy as much as allowed */ ctx->copy_size = min_t(unsigned int, @@ -929,6 +939,25 @@ vmxnet3_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, } +static void +vmxnet3_prepare_inner_tso(struct sk_buff *skb, + struct vmxnet3_tx_ctx *ctx) +{ + struct tcphdr *tcph = inner_tcp_hdr(skb); + struct iphdr *iph = inner_ip_hdr(skb); + + if (ctx->ipv4) { + iph->check = 0; + tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, 0, + IPPROTO_TCP, 0); + } else if (ctx->ipv6) { + struct ipv6hdr *iph = inner_ipv6_hdr(skb); + + tcph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, 0, + IPPROTO_TCP, 0); + } +} + static void vmxnet3_prepare_tso(struct sk_buff *skb, struct vmxnet3_tx_ctx *ctx) @@ -987,6 +1016,7 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, /* Use temporary descriptor to avoid touching bits multiple times */ union Vmxnet3_GenericDesc tempTxDesc; #endif + struct udphdr *udph; count = txd_estimate(skb); @@ -1003,7 +1033,11 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, } tq->stats.copy_skb_header++; } - vmxnet3_prepare_tso(skb, &ctx); + if (skb->encapsulation) { + vmxnet3_prepare_inner_tso(skb, &ctx); + } else { + vmxnet3_prepare_tso(skb, &ctx); + } } else { if (unlikely(count > VMXNET3_MAX_TXD_PER_PKT)) { @@ -1026,14 +1060,14 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, BUG_ON(ret <= 0 && ctx.copy_size != 0); /* hdrs parsed, check against other limits */ if (ctx.mss) { - if (unlikely(ctx.eth_ip_hdr_size + ctx.l4_hdr_size > + if (unlikely(ctx.l4_offset + ctx.l4_hdr_size > VMXNET3_MAX_TX_BUF_SIZE)) { tq->stats.drop_oversized_hdr++; goto drop_pkt; } } else { if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (unlikely(ctx.eth_ip_hdr_size + + if (unlikely(ctx.l4_offset + skb->csum_offset > VMXNET3_MAX_CSUM_OFFSET)) { tq->stats.drop_oversized_hdr++; @@ -1080,16 +1114,34 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, #endif tx_num_deferred = le32_to_cpu(tq->shared->txNumDeferred); if (ctx.mss) { - gdesc->txd.hlen = ctx.eth_ip_hdr_size + ctx.l4_hdr_size; - gdesc->txd.om = VMXNET3_OM_TSO; - gdesc->txd.msscof = ctx.mss; + if (VMXNET3_VERSION_GE_4(adapter) && skb->encapsulation) { + gdesc->txd.hlen = ctx.l4_offset + ctx.l4_hdr_size; + gdesc->txd.om = VMXNET3_OM_ENCAP; + gdesc->txd.msscof = ctx.mss; + + udph = udp_hdr(skb); + if (udph->check) + gdesc->txd.oco = 1; + } else { + gdesc->txd.hlen = ctx.l4_offset + ctx.l4_hdr_size; + gdesc->txd.om = VMXNET3_OM_TSO; + gdesc->txd.msscof = ctx.mss; + } num_pkts = (skb->len - gdesc->txd.hlen + ctx.mss - 1) / ctx.mss; } else { if (skb->ip_summed == CHECKSUM_PARTIAL) { - gdesc->txd.hlen = ctx.eth_ip_hdr_size; - gdesc->txd.om = VMXNET3_OM_CSUM; - gdesc->txd.msscof = ctx.eth_ip_hdr_size + - skb->csum_offset; + if (VMXNET3_VERSION_GE_4(adapter) && + skb->encapsulation) { + gdesc->txd.hlen = ctx.l4_offset + + ctx.l4_hdr_size; + gdesc->txd.om = VMXNET3_OM_ENCAP; + gdesc->txd.msscof = 0; /* Reserved */ + } else { + gdesc->txd.hlen = ctx.l4_offset; + gdesc->txd.om = VMXNET3_OM_CSUM; + gdesc->txd.msscof = ctx.l4_offset + + skb->csum_offset; + } } else { gdesc->txd.om = 0; gdesc->txd.msscof = 0; @@ -1168,13 +1220,21 @@ vmxnet3_rx_csum(struct vmxnet3_adapter *adapter, (le32_to_cpu(gdesc->dword[3]) & VMXNET3_RCD_CSUM_OK) == VMXNET3_RCD_CSUM_OK) { skb->ip_summed = CHECKSUM_UNNECESSARY; - BUG_ON(!(gdesc->rcd.tcp || gdesc->rcd.udp)); - BUG_ON(gdesc->rcd.frg); + WARN_ON_ONCE(!(gdesc->rcd.tcp || gdesc->rcd.udp) && + !(le32_to_cpu(gdesc->dword[0]) & + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); + WARN_ON_ONCE(gdesc->rcd.frg && + !(le32_to_cpu(gdesc->dword[0]) & + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); } else if (gdesc->rcd.v6 && (le32_to_cpu(gdesc->dword[3]) & (1 << VMXNET3_RCD_TUC_SHIFT))) { skb->ip_summed = CHECKSUM_UNNECESSARY; - BUG_ON(!(gdesc->rcd.tcp || gdesc->rcd.udp)); - BUG_ON(gdesc->rcd.frg); + WARN_ON_ONCE(!(gdesc->rcd.tcp || gdesc->rcd.udp) && + !(le32_to_cpu(gdesc->dword[0]) & + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); + WARN_ON_ONCE(gdesc->rcd.frg && + !(le32_to_cpu(gdesc->dword[0]) & + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); } else { if (gdesc->rcd.csum) { skb->csum = htons(gdesc->rcd.csum); @@ -2429,6 +2489,10 @@ vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter) if (adapter->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) devRead->misc.uptFeatures |= UPT1_F_RXVLAN; + if (adapter->netdev->features & (NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM)) + devRead->misc.uptFeatures |= UPT1_F_RXINNEROFLD; + devRead->misc.mtu = cpu_to_le32(adapter->netdev->mtu); devRead->misc.queueDescPA = cpu_to_le64(adapter->queue_desc_pa); devRead->misc.queueDescLen = cpu_to_le32( @@ -2561,8 +2625,8 @@ vmxnet3_init_rssfields(struct vmxnet3_adapter *adapter) union Vmxnet3_CmdInfo *cmdInfo = &shared->cu.cmdInfo; unsigned long flags; - if (!VMXNET3_VERSION_GE_4(adapter)) - return; + if (!VMXNET3_VERSION_GE_4(adapter)) + return; spin_lock_irqsave(&adapter->cmd_lock, flags); @@ -3073,6 +3137,18 @@ vmxnet3_declare_features(struct vmxnet3_adapter *adapter, bool dma64) NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_LRO; + + if (VMXNET3_VERSION_GE_4(adapter)) { + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + + netdev->hw_enc_features = NETIF_F_SG | NETIF_F_RXCSUM | + NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_LRO | NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + } + if (dma64) netdev->hw_features |= NETIF_F_HIGHDMA; netdev->vlan_features = netdev->hw_features & diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index 57460cf1967f..bfdda0f34b97 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -267,14 +267,43 @@ netdev_features_t vmxnet3_fix_features(struct net_device *netdev, return features; } +static void vmxnet3_enable_encap_offloads(struct net_device *netdev) +{ + struct vmxnet3_adapter *adapter = netdev_priv(netdev); + + if (VMXNET3_VERSION_GE_4(adapter)) { + netdev->hw_enc_features |= NETIF_F_SG | NETIF_F_RXCSUM | + NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_LRO | NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + } +} + +static void vmxnet3_disable_encap_offloads(struct net_device *netdev) +{ + struct vmxnet3_adapter *adapter = netdev_priv(netdev); + + if (VMXNET3_VERSION_GE_4(adapter)) { + netdev->hw_enc_features &= ~(NETIF_F_SG | NETIF_F_RXCSUM | + NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_LRO | NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM); + } +} + int vmxnet3_set_features(struct net_device *netdev, netdev_features_t features) { struct vmxnet3_adapter *adapter = netdev_priv(netdev); unsigned long flags; netdev_features_t changed = features ^ netdev->features; + netdev_features_t tun_offload_mask = NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + u8 udp_tun_enabled = (netdev->features & tun_offload_mask) != 0; if (changed & (NETIF_F_RXCSUM | NETIF_F_LRO | - NETIF_F_HW_VLAN_CTAG_RX)) { + NETIF_F_HW_VLAN_CTAG_RX | tun_offload_mask)) { if (features & NETIF_F_RXCSUM) adapter->shared->devRead.misc.uptFeatures |= UPT1_F_RXCSUM; @@ -297,6 +326,17 @@ int vmxnet3_set_features(struct net_device *netdev, netdev_features_t features) adapter->shared->devRead.misc.uptFeatures &= ~UPT1_F_RXVLAN; + if ((features & tun_offload_mask) != 0 && !udp_tun_enabled) { + vmxnet3_enable_encap_offloads(netdev); + adapter->shared->devRead.misc.uptFeatures |= + UPT1_F_RXINNEROFLD; + } else if ((features & tun_offload_mask) == 0 && + udp_tun_enabled) { + vmxnet3_disable_encap_offloads(netdev); + adapter->shared->devRead.misc.uptFeatures &= + ~UPT1_F_RXINNEROFLD; + } + spin_lock_irqsave(&adapter->cmd_lock, flags); VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_UPDATE_FEATURE); diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h index d52ccc3eeba2..86db809c7592 100644 --- a/drivers/net/vmxnet3/vmxnet3_int.h +++ b/drivers/net/vmxnet3/vmxnet3_int.h @@ -219,10 +219,16 @@ struct vmxnet3_tx_ctx { bool ipv4; bool ipv6; u16 mss; - u32 eth_ip_hdr_size; /* only valid for pkts requesting tso or csum - * offloading + u32 l4_offset; /* only valid for pkts requesting tso or csum + * offloading. For encap offload, it refers to + * inner L4 offset i.e. it includes outer header + * encap header and inner eth and ip header size + */ + + u32 l4_hdr_size; /* only valid if mss != 0 + * Refers to inner L4 hdr size for encap + * offload */ - u32 l4_hdr_size; /* only valid if mss != 0 */ u32 copy_size; /* # of bytes copied into the data ring */ union Vmxnet3_GenericDesc *sop_txd; union Vmxnet3_GenericDesc *eop_txd; -- cgit v1.2.3-59-g8ed1b From a31135e36eccd0d16e500d3041f23c3ece62096f Mon Sep 17 00:00:00 2001 From: Ronak Doshi Date: Thu, 28 May 2020 14:53:22 -0700 Subject: vmxnet3: update to version 4 With all vmxnet3 version 4 changes incorporated in the vmxnet3 driver, the driver can configure emulation to run at vmxnet3 version 4, provided the emulation advertises support for version 4. Signed-off-by: Ronak Doshi Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_drv.c | 7 ++++++- drivers/net/vmxnet3/vmxnet3_int.h | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 171d4b1d1d04..3d07ce6cb706 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -3492,7 +3492,12 @@ vmxnet3_probe_device(struct pci_dev *pdev, goto err_alloc_pci; ver = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_VRRS); - if (ver & (1 << VMXNET3_REV_3)) { + if (ver & (1 << VMXNET3_REV_4)) { + VMXNET3_WRITE_BAR1_REG(adapter, + VMXNET3_REG_VRRS, + 1 << VMXNET3_REV_4); + adapter->version = VMXNET3_REV_4 + 1; + } else if (ver & (1 << VMXNET3_REV_3)) { VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_VRRS, 1 << VMXNET3_REV_3); diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h index 86db809c7592..5d2b062215a2 100644 --- a/drivers/net/vmxnet3/vmxnet3_int.h +++ b/drivers/net/vmxnet3/vmxnet3_int.h @@ -69,12 +69,12 @@ /* * Version numbers */ -#define VMXNET3_DRIVER_VERSION_STRING "1.4.17.0-k" +#define VMXNET3_DRIVER_VERSION_STRING "1.5.0.0-k" /* Each byte of this 32-bit integer encodes a version number in * VMXNET3_DRIVER_VERSION_STRING. */ -#define VMXNET3_DRIVER_VERSION_NUM 0x01041100 +#define VMXNET3_DRIVER_VERSION_NUM 0x01050000 #if defined(CONFIG_PCI_MSI) /* RSS only makes sense if MSI-X is supported. */ -- cgit v1.2.3-59-g8ed1b From d320692d9f8502e819e511ea5294552e0a8d3d9b Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:28 +0200 Subject: net: ks8851: Factor out spi->dev in probe()/remove() Pull out the spi->dev into one common place in the function instead of having it repeated over and over again. This is done in preparation for unifying ks8851 and ks8851-mll drivers. No functional change. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 33305c9c5a62..e32ef9403803 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1413,6 +1413,7 @@ static SIMPLE_DEV_PM_OPS(ks8851_pm_ops, ks8851_suspend, ks8851_resume); static int ks8851_probe(struct spi_device *spi) { + struct device *dev = &spi->dev; struct net_device *ndev; struct ks8851_net *ks; int ret; @@ -1431,8 +1432,7 @@ static int ks8851_probe(struct spi_device *spi) ks->spidev = spi; ks->tx_space = 6144; - gpio = of_get_named_gpio_flags(spi->dev.of_node, "reset-gpios", - 0, NULL); + gpio = of_get_named_gpio_flags(dev->of_node, "reset-gpios", 0, NULL); if (gpio == -EPROBE_DEFER) { ret = gpio; goto err_gpio; @@ -1440,15 +1440,15 @@ static int ks8851_probe(struct spi_device *spi) ks->gpio = gpio; if (gpio_is_valid(gpio)) { - ret = devm_gpio_request_one(&spi->dev, gpio, + ret = devm_gpio_request_one(dev, gpio, GPIOF_OUT_INIT_LOW, "ks8851_rst_n"); if (ret) { - dev_err(&spi->dev, "reset gpio request failed\n"); + dev_err(dev, "reset gpio request failed\n"); goto err_gpio; } } - ks->vdd_io = devm_regulator_get(&spi->dev, "vdd-io"); + ks->vdd_io = devm_regulator_get(dev, "vdd-io"); if (IS_ERR(ks->vdd_io)) { ret = PTR_ERR(ks->vdd_io); goto err_reg_io; @@ -1456,12 +1456,11 @@ static int ks8851_probe(struct spi_device *spi) ret = regulator_enable(ks->vdd_io); if (ret) { - dev_err(&spi->dev, "regulator vdd_io enable fail: %d\n", - ret); + dev_err(dev, "regulator vdd_io enable fail: %d\n", ret); goto err_reg_io; } - ks->vdd_reg = devm_regulator_get(&spi->dev, "vdd"); + ks->vdd_reg = devm_regulator_get(dev, "vdd"); if (IS_ERR(ks->vdd_reg)) { ret = PTR_ERR(ks->vdd_reg); goto err_reg; @@ -1469,8 +1468,7 @@ static int ks8851_probe(struct spi_device *spi) ret = regulator_enable(ks->vdd_reg); if (ret) { - dev_err(&spi->dev, "regulator vdd enable fail: %d\n", - ret); + dev_err(dev, "regulator vdd enable fail: %d\n", ret); goto err_reg; } @@ -1509,7 +1507,7 @@ static int ks8851_probe(struct spi_device *spi) ks->mii.mdio_read = ks8851_phy_read; ks->mii.mdio_write = ks8851_phy_write; - dev_info(&spi->dev, "message enable is %d\n", msg_enable); + dev_info(dev, "message enable is %d\n", msg_enable); /* set the default message enable */ ks->msg_enable = netif_msg_init(msg_enable, (NETIF_MSG_DRV | @@ -1519,7 +1517,7 @@ static int ks8851_probe(struct spi_device *spi) skb_queue_head_init(&ks->txq); ndev->ethtool_ops = &ks8851_ethtool_ops; - SET_NETDEV_DEV(ndev, &spi->dev); + SET_NETDEV_DEV(ndev, dev); spi_set_drvdata(spi, ks); @@ -1534,7 +1532,7 @@ static int ks8851_probe(struct spi_device *spi) /* simple check for a valid chip being connected to the bus */ cider = ks8851_rdreg16(ks, KS_CIDER); if ((cider & ~CIDER_REV_MASK) != CIDER_ID) { - dev_err(&spi->dev, "failed to read device ID\n"); + dev_err(dev, "failed to read device ID\n"); ret = -ENODEV; goto err_id; } @@ -1547,7 +1545,7 @@ static int ks8851_probe(struct spi_device *spi) ret = register_netdev(ndev); if (ret) { - dev_err(&spi->dev, "failed to register network device\n"); + dev_err(dev, "failed to register network device\n"); goto err_netdev; } @@ -1573,9 +1571,10 @@ err_gpio: static int ks8851_remove(struct spi_device *spi) { struct ks8851_net *priv = spi_get_drvdata(spi); + struct device *dev = &spi->dev; if (netif_msg_drv(priv)) - dev_info(&spi->dev, "remove\n"); + dev_info(dev, "remove\n"); unregister_netdev(priv->netdev); if (gpio_is_valid(priv->gpio)) -- cgit v1.2.3-59-g8ed1b From bfd1e0eb08f6ac25659e3d9bfd80b8ddffe57c94 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:29 +0200 Subject: net: ks8851: Rename ndev to netdev in probe Rename ndev variable to netdev for the sake of consistency. No functional change. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index e32ef9403803..2b85072993c5 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1414,21 +1414,21 @@ static SIMPLE_DEV_PM_OPS(ks8851_pm_ops, ks8851_suspend, ks8851_resume); static int ks8851_probe(struct spi_device *spi) { struct device *dev = &spi->dev; - struct net_device *ndev; + struct net_device *netdev; struct ks8851_net *ks; int ret; unsigned cider; int gpio; - ndev = alloc_etherdev(sizeof(struct ks8851_net)); - if (!ndev) + netdev = alloc_etherdev(sizeof(struct ks8851_net)); + if (!netdev) return -ENOMEM; spi->bits_per_word = 8; - ks = netdev_priv(ndev); + ks = netdev_priv(netdev); - ks->netdev = ndev; + ks->netdev = netdev; ks->spidev = spi; ks->tx_space = 6144; @@ -1500,7 +1500,7 @@ static int ks8851_probe(struct spi_device *spi) ks->eeprom.register_write = ks8851_eeprom_regwrite; /* setup mii state */ - ks->mii.dev = ndev; + ks->mii.dev = netdev; ks->mii.phy_id = 1, ks->mii.phy_id_mask = 1; ks->mii.reg_num_mask = 0xf; @@ -1516,15 +1516,15 @@ static int ks8851_probe(struct spi_device *spi) skb_queue_head_init(&ks->txq); - ndev->ethtool_ops = &ks8851_ethtool_ops; - SET_NETDEV_DEV(ndev, dev); + netdev->ethtool_ops = &ks8851_ethtool_ops; + SET_NETDEV_DEV(netdev, dev); spi_set_drvdata(spi, ks); netif_carrier_off(ks->netdev); - ndev->if_port = IF_PORT_100BASET; - ndev->netdev_ops = &ks8851_netdev_ops; - ndev->irq = spi->irq; + netdev->if_port = IF_PORT_100BASET; + netdev->netdev_ops = &ks8851_netdev_ops; + netdev->irq = spi->irq; /* issue a global soft reset to reset the device. */ ks8851_soft_reset(ks, GRR_GSR); @@ -1543,14 +1543,14 @@ static int ks8851_probe(struct spi_device *spi) ks8851_read_selftest(ks); ks8851_init_mac(ks); - ret = register_netdev(ndev); + ret = register_netdev(netdev); if (ret) { dev_err(dev, "failed to register network device\n"); goto err_netdev; } - netdev_info(ndev, "revision %d, MAC %pM, IRQ %d, %s EEPROM\n", - CIDER_REV_GET(cider), ndev->dev_addr, ndev->irq, + netdev_info(netdev, "revision %d, MAC %pM, IRQ %d, %s EEPROM\n", + CIDER_REV_GET(cider), netdev->dev_addr, netdev->irq, ks->rc_ccr & CCR_EEPROM ? "has" : "no"); return 0; @@ -1564,7 +1564,7 @@ err_reg: regulator_disable(ks->vdd_io); err_reg_io: err_gpio: - free_netdev(ndev); + free_netdev(netdev); return ret; } -- cgit v1.2.3-59-g8ed1b From 2f3271c952bfa9486aa3746f591788c983fc280d Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:30 +0200 Subject: net: ks8851: Replace dev_err() with netdev_err() in IRQ handler Use netdev_err() instead of dev_err() to avoid accessing the spidev->dev in the interrupt handler. This is the only place which uses the spidev in this function, so replace it with netdev_err() to get rid of it. This is done in preparation for unifying the KS8851 SPI and parallel drivers. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 2b85072993c5..0088df970ad6 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -631,7 +631,7 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) handled |= IRQ_RXI; if (status & IRQ_SPIBEI) { - dev_err(&ks->spidev->dev, "%s: spi bus error\n", __func__); + netdev_err(ks->netdev, "%s: spi bus error\n", __func__); handled |= IRQ_SPIBEI; } -- cgit v1.2.3-59-g8ed1b From 848fc0ce6cb84430c4b1dd0a749015ad9af1b766 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:31 +0200 Subject: net: ks8851: Pass device node into ks8851_init_mac() Since the driver probe function already has a struct device *dev pointer and can easily derive of_node pointer from it, pass the of_node pointer as a parameter to ks8851_init_mac() to avoid fishing it out from ks->spidev. This is the only reference to spidev in the function, so get rid of it. This is done in preparation for unifying the KS8851 SPI and parallel bus drivers. No functional change. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 0088df970ad6..582092a95afc 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -409,6 +409,7 @@ static void ks8851_read_mac_addr(struct net_device *dev) /** * ks8851_init_mac - initialise the mac address * @ks: The device structure + * @np: The device node pointer * * Get or create the initial mac address for the device and then set that * into the station address register. A mac address supplied in the device @@ -416,12 +417,12 @@ static void ks8851_read_mac_addr(struct net_device *dev) * we try that. If no valid mac address is found we use eth_random_addr() * to create a new one. */ -static void ks8851_init_mac(struct ks8851_net *ks) +static void ks8851_init_mac(struct ks8851_net *ks, struct device_node *np) { struct net_device *dev = ks->netdev; const u8 *mac_addr; - mac_addr = of_get_mac_address(ks->spidev->dev.of_node); + mac_addr = of_get_mac_address(np); if (!IS_ERR(mac_addr)) { ether_addr_copy(dev->dev_addr, mac_addr); ks8851_write_mac_addr(dev); @@ -1541,7 +1542,7 @@ static int ks8851_probe(struct spi_device *spi) ks->rc_ccr = ks8851_rdreg16(ks, KS_CCR); ks8851_read_selftest(ks); - ks8851_init_mac(ks); + ks8851_init_mac(ks, dev->of_node); ret = register_netdev(netdev); if (ret) { -- cgit v1.2.3-59-g8ed1b From b6948e1b7b094ccc86b1ae768beab734e5355ced Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:32 +0200 Subject: net: ks8851: Use devm_alloc_etherdev() Use device managed version of alloc_etherdev() to simplify the code. No functional change intended. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 582092a95afc..86bfe55f346d 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1421,7 +1421,7 @@ static int ks8851_probe(struct spi_device *spi) unsigned cider; int gpio; - netdev = alloc_etherdev(sizeof(struct ks8851_net)); + netdev = devm_alloc_etherdev(dev, sizeof(struct ks8851_net)); if (!netdev) return -ENOMEM; @@ -1434,10 +1434,8 @@ static int ks8851_probe(struct spi_device *spi) ks->tx_space = 6144; gpio = of_get_named_gpio_flags(dev->of_node, "reset-gpios", 0, NULL); - if (gpio == -EPROBE_DEFER) { - ret = gpio; - goto err_gpio; - } + if (gpio == -EPROBE_DEFER) + return gpio; ks->gpio = gpio; if (gpio_is_valid(gpio)) { @@ -1445,7 +1443,7 @@ static int ks8851_probe(struct spi_device *spi) GPIOF_OUT_INIT_LOW, "ks8851_rst_n"); if (ret) { dev_err(dev, "reset gpio request failed\n"); - goto err_gpio; + return ret; } } @@ -1564,8 +1562,6 @@ err_id: err_reg: regulator_disable(ks->vdd_io); err_reg_io: -err_gpio: - free_netdev(netdev); return ret; } @@ -1582,7 +1578,6 @@ static int ks8851_remove(struct spi_device *spi) gpio_set_value(priv->gpio, 0); regulator_disable(priv->vdd_reg); regulator_disable(priv->vdd_io); - free_netdev(priv->netdev); return 0; } -- cgit v1.2.3-59-g8ed1b From 2c5b0a86ac54dae5bcf854a93dbb086191368014 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:33 +0200 Subject: net: ks8851: Use dev_{get,set}_drvdata() Replace spi_{get,set}_drvdata() with dev_{get,set}_drvdata(), which works for both SPI and platform drivers. This is done in preparation for unifying the KS8851 SPI and parallel bus drivers. There should be no functional change. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 86bfe55f346d..fe2037e166dc 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1518,7 +1518,7 @@ static int ks8851_probe(struct spi_device *spi) netdev->ethtool_ops = &ks8851_ethtool_ops; SET_NETDEV_DEV(netdev, dev); - spi_set_drvdata(spi, ks); + dev_set_drvdata(dev, ks); netif_carrier_off(ks->netdev); netdev->if_port = IF_PORT_100BASET; @@ -1567,8 +1567,10 @@ err_reg_io: static int ks8851_remove(struct spi_device *spi) { - struct ks8851_net *priv = spi_get_drvdata(spi); struct device *dev = &spi->dev; + struct ks8851_net *priv; + + priv = dev_get_drvdata(dev); if (netif_msg_drv(priv)) dev_info(dev, "remove\n"); -- cgit v1.2.3-59-g8ed1b From 806f66495e791eb349cd3ea286ee411bcf1df4d0 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:34 +0200 Subject: net: ks8851: Remove ks8851_rdreg32() The ks8851_rdreg32() is used only in one place, to read two registers using a single read. To make it easier to support 16-bit accesses via parallel bus later on, replace this single read with two 16-bit reads from each of the registers and drop the ks8851_rdreg32() altogether. If this has noticeable performance impact on the SPI variant of KS8851, then we should consider using regmap to abstract the SPI and parallel bus options and in case of SPI, permit regmap to merge register reads of neighboring registers into single, longer, read. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index fe2037e166dc..8df130efbde1 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -296,25 +296,6 @@ static unsigned ks8851_rdreg16(struct ks8851_net *ks, unsigned reg) return le16_to_cpu(rx); } -/** - * ks8851_rdreg32 - read 32 bit register from device - * @ks: The chip information - * @reg: The register address - * - * Read a 32bit register from the chip. - * - * Note, this read requires the address be aligned to 4 bytes. -*/ -static unsigned ks8851_rdreg32(struct ks8851_net *ks, unsigned reg) -{ - __le32 rx = 0; - - WARN_ON(reg & 3); - - ks8851_rdreg(ks, MK_OP(0xf, reg), (u8 *)&rx, 4); - return le32_to_cpu(rx); -} - /** * ks8851_soft_reset - issue one of the soft reset to the device * @ks: The device state. @@ -508,7 +489,6 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) unsigned rxfc; unsigned rxlen; unsigned rxstat; - u32 rxh; u8 *rxpkt; rxfc = ks8851_rdreg8(ks, KS_RXFC); @@ -527,9 +507,8 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) */ for (; rxfc != 0; rxfc--) { - rxh = ks8851_rdreg32(ks, KS_RXFHSR); - rxstat = rxh & 0xffff; - rxlen = (rxh >> 16) & 0xfff; + rxstat = ks8851_rdreg16(ks, KS_RXFHSR); + rxlen = ks8851_rdreg16(ks, KS_RXFHBCR) & RXFHBCR_CNT_MASK; netif_dbg(ks, rx_status, ks->netdev, "rx: stat 0x%04x, len 0x%04x\n", rxstat, rxlen); -- cgit v1.2.3-59-g8ed1b From 88cfedd0d7ab5ff047ac7eb8243fb9a69526fee7 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:35 +0200 Subject: net: ks8851: Use 16-bit writes to program MAC address On the SPI variant of KS8851, the MAC address can be programmed with either 8/16/32-bit writes. To make it easier to support the 16-bit parallel option of KS8851 too, switch both the MAC address programming and readout to 16-bit operations. Remove ks8851_wrreg8() as it is not used anywhere anymore. There should be no functional change. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 47 ++++++++++-------------------------- drivers/net/ethernet/micrel/ks8851.h | 2 +- 2 files changed, 14 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 8df130efbde1..1b81340e811f 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -185,36 +185,6 @@ static void ks8851_wrreg16(struct ks8851_net *ks, unsigned reg, unsigned val) netdev_err(ks->netdev, "spi_sync() failed\n"); } -/** - * ks8851_wrreg8 - write 8bit register value to chip - * @ks: The chip state - * @reg: The register address - * @val: The value to write - * - * Issue a write to put the value @val into the register specified in @reg. - */ -static void ks8851_wrreg8(struct ks8851_net *ks, unsigned reg, unsigned val) -{ - struct spi_transfer *xfer = &ks->spi_xfer1; - struct spi_message *msg = &ks->spi_msg1; - __le16 txb[2]; - int ret; - int bit; - - bit = 1 << (reg & 3); - - txb[0] = cpu_to_le16(MK_OP(bit, reg) | KS_SPIOP_WR); - txb[1] = val; - - xfer->tx_buf = txb; - xfer->rx_buf = NULL; - xfer->len = 3; - - ret = spi_sync(ks->spidev, msg); - if (ret < 0) - netdev_err(ks->netdev, "spi_sync() failed\n"); -} - /** * ks8851_rdreg - issue read register command and return the data * @ks: The device state @@ -349,6 +319,7 @@ static void ks8851_set_powermode(struct ks8851_net *ks, unsigned pwrmode) static int ks8851_write_mac_addr(struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); + u16 val; int i; mutex_lock(&ks->lock); @@ -358,8 +329,12 @@ static int ks8851_write_mac_addr(struct net_device *dev) * the first write to the MAC address does not take effect. */ ks8851_set_powermode(ks, PMECR_PM_NORMAL); - for (i = 0; i < ETH_ALEN; i++) - ks8851_wrreg8(ks, KS_MAR(i), dev->dev_addr[i]); + + for (i = 0; i < ETH_ALEN; i += 2) { + val = (dev->dev_addr[i] << 8) | dev->dev_addr[i + 1]; + ks8851_wrreg16(ks, KS_MAR(i), val); + } + if (!netif_running(dev)) ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN); @@ -377,12 +352,16 @@ static int ks8851_write_mac_addr(struct net_device *dev) static void ks8851_read_mac_addr(struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); + u16 reg; int i; mutex_lock(&ks->lock); - for (i = 0; i < ETH_ALEN; i++) - dev->dev_addr[i] = ks8851_rdreg8(ks, KS_MAR(i)); + for (i = 0; i < ETH_ALEN; i += 2) { + reg = ks8851_rdreg16(ks, KS_MAR(i)); + dev->dev_addr[i] = reg >> 8; + dev->dev_addr[i + 1] = reg & 0xff; + } mutex_unlock(&ks->lock); } diff --git a/drivers/net/ethernet/micrel/ks8851.h b/drivers/net/ethernet/micrel/ks8851.h index 8f834aef8e32..f210d18a10b5 100644 --- a/drivers/net/ethernet/micrel/ks8851.h +++ b/drivers/net/ethernet/micrel/ks8851.h @@ -19,7 +19,7 @@ #define CCR_32PIN (1 << 0) /* KSZ8851SNL */ /* MAC address registers */ -#define KS_MAR(_m) (0x15 - (_m)) +#define KS_MAR(_m) (0x14 - (_m)) #define KS_MARL 0x10 #define KS_MARM 0x12 #define KS_MARH 0x14 -- cgit v1.2.3-59-g8ed1b From aa39bf6730b72e495f711b0f06ca92e9673d648c Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:36 +0200 Subject: net: ks8851: Use 16-bit read of RXFC register The RXFC register is the only one being read using 8-bit accessors. To make it easier to support the 16-bit accesses used by the parallel bus variant of KS8851, use 16-bit accessor to read RXFC register as well as neighboring RXFCTR register. Remove ks8851_rdreg8() as it is not used anywhere anymore. There should be no functional change. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 1b81340e811f..e2e75041e931 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -236,21 +236,6 @@ static void ks8851_rdreg(struct ks8851_net *ks, unsigned op, memcpy(rxb, trx + 2, rxl); } -/** - * ks8851_rdreg8 - read 8 bit register from device - * @ks: The chip information - * @reg: The register address - * - * Read a 8bit register from the chip, returning the result -*/ -static unsigned ks8851_rdreg8(struct ks8851_net *ks, unsigned reg) -{ - u8 rxb[1]; - - ks8851_rdreg(ks, MK_OP(1 << (reg & 3), reg), rxb, 1); - return rxb[0]; -} - /** * ks8851_rdreg16 - read 16 bit register from device * @ks: The chip information @@ -470,7 +455,7 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) unsigned rxstat; u8 *rxpkt; - rxfc = ks8851_rdreg8(ks, KS_RXFC); + rxfc = (ks8851_rdreg16(ks, KS_RXFCTR) >> 8) & 0xff; netif_dbg(ks, rx_status, ks->netdev, "%s: %d packets\n", __func__, rxfc); -- cgit v1.2.3-59-g8ed1b From 22726020050beacc7a164eaaafef0048d302cf41 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:37 +0200 Subject: net: ks8851: Factor out bus lock handling Pull out bus access locking code into separate functions, this is done in preparation for unifying the driver with the parallel bus one. The parallel bus driver does not need heavy mutex locking of the bus and works better with spinlocks, hence prepare these locking functions to be overridden then. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 97 +++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index e2e75041e931..053d6d085539 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -151,6 +151,30 @@ static int msg_enable; /* turn register number and byte-enable mask into data for start of packet */ #define MK_OP(_byteen, _reg) (BYTE_EN(_byteen) | (_reg) << (8+2) | (_reg) >> 6) +/** + * ks8851_lock - register access lock + * @ks: The chip state + * @flags: Spinlock flags + * + * Claim chip register access lock + */ +static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags) +{ + mutex_lock(&ks->lock); +} + +/** + * ks8851_unlock - register access unlock + * @ks: The chip state + * @flags: Spinlock flags + * + * Release chip register access lock + */ +static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags) +{ + mutex_unlock(&ks->lock); +} + /* SPI register read/write calls. * * All these calls issue SPI transactions to access the chip's registers. They @@ -304,10 +328,11 @@ static void ks8851_set_powermode(struct ks8851_net *ks, unsigned pwrmode) static int ks8851_write_mac_addr(struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; u16 val; int i; - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); /* * Wake up chip in case it was powered off when stopped; otherwise, @@ -323,7 +348,7 @@ static int ks8851_write_mac_addr(struct net_device *dev) if (!netif_running(dev)) ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN); - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); return 0; } @@ -337,10 +362,11 @@ static int ks8851_write_mac_addr(struct net_device *dev) static void ks8851_read_mac_addr(struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; u16 reg; int i; - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); for (i = 0; i < ETH_ALEN; i += 2) { reg = ks8851_rdreg16(ks, KS_MAR(i)); @@ -348,7 +374,7 @@ static void ks8851_read_mac_addr(struct net_device *dev) dev->dev_addr[i + 1] = reg & 0xff; } - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); } /** @@ -534,10 +560,11 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) static irqreturn_t ks8851_irq(int irq, void *_ks) { struct ks8851_net *ks = _ks; - unsigned status; unsigned handled = 0; + unsigned long flags; + unsigned int status; - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); status = ks8851_rdreg16(ks, KS_ISR); @@ -606,7 +633,7 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) ks8851_wrreg16(ks, KS_RXCR1, rxc->rxcr1); } - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); if (status & IRQ_LCI) mii_check_link(&ks->mii); @@ -700,10 +727,11 @@ static void ks8851_done_tx(struct ks8851_net *ks, struct sk_buff *txb) static void ks8851_tx_work(struct work_struct *work) { struct ks8851_net *ks = container_of(work, struct ks8851_net, tx_work); + unsigned long flags; struct sk_buff *txb; bool last = skb_queue_empty(&ks->txq); - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); while (!last) { txb = skb_dequeue(&ks->txq); @@ -719,7 +747,7 @@ static void ks8851_tx_work(struct work_struct *work) } } - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); } /** @@ -732,6 +760,7 @@ static void ks8851_tx_work(struct work_struct *work) static int ks8851_net_open(struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; int ret; ret = request_threaded_irq(dev->irq, NULL, ks8851_irq, @@ -744,7 +773,7 @@ static int ks8851_net_open(struct net_device *dev) /* lock the card, even if we may not actually be doing anything * else at the moment */ - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); netif_dbg(ks, ifup, ks->netdev, "opening\n"); @@ -804,7 +833,7 @@ static int ks8851_net_open(struct net_device *dev) netif_dbg(ks, ifup, ks->netdev, "network device up\n"); - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); mii_check_link(&ks->mii); return 0; } @@ -820,22 +849,23 @@ static int ks8851_net_open(struct net_device *dev) static int ks8851_net_stop(struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; netif_info(ks, ifdown, dev, "shutting down\n"); netif_stop_queue(dev); - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); /* turn off the IRQs and ack any outstanding */ ks8851_wrreg16(ks, KS_IER, 0x0000); ks8851_wrreg16(ks, KS_ISR, 0xffff); - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); /* stop any outstanding work */ flush_work(&ks->tx_work); flush_work(&ks->rxctrl_work); - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); /* shutdown RX process */ ks8851_wrreg16(ks, KS_RXCR1, 0x0000); @@ -844,7 +874,7 @@ static int ks8851_net_stop(struct net_device *dev) /* set powermode to soft power down to save power */ ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN); - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); /* ensure any queued tx buffers are dumped */ while (!skb_queue_empty(&ks->txq)) { @@ -916,13 +946,14 @@ static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb, static void ks8851_rxctrl_work(struct work_struct *work) { struct ks8851_net *ks = container_of(work, struct ks8851_net, rxctrl_work); + unsigned long flags; - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); /* need to shutdown RXQ before modifying filter parameters */ ks8851_wrreg16(ks, KS_RXCR1, 0x00); - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); } static void ks8851_set_rx_mode(struct net_device *dev) @@ -1104,11 +1135,6 @@ static void ks8851_eeprom_regwrite(struct eeprom_93cx6 *ee) */ static int ks8851_eeprom_claim(struct ks8851_net *ks) { - if (!(ks->rc_ccr & CCR_EEPROM)) - return -ENOENT; - - mutex_lock(&ks->lock); - /* start with clock low, cs high */ ks8851_wrreg16(ks, KS_EEPCR, EEPCR_EESA | EEPCR_EECS); return 0; @@ -1125,7 +1151,6 @@ static void ks8851_eeprom_release(struct ks8851_net *ks) unsigned val = ks8851_rdreg16(ks, KS_EEPCR); ks8851_wrreg16(ks, KS_EEPCR, val & ~EEPCR_EESA); - mutex_unlock(&ks->lock); } #define KS_EEPROM_MAGIC (0x00008851) @@ -1135,6 +1160,7 @@ static int ks8851_set_eeprom(struct net_device *dev, { struct ks8851_net *ks = netdev_priv(dev); int offset = ee->offset; + unsigned long flags; int len = ee->len; u16 tmp; @@ -1145,9 +1171,13 @@ static int ks8851_set_eeprom(struct net_device *dev, if (ee->magic != KS_EEPROM_MAGIC) return -EINVAL; - if (ks8851_eeprom_claim(ks)) + if (!(ks->rc_ccr & CCR_EEPROM)) return -ENOENT; + ks8851_lock(ks, &flags); + + ks8851_eeprom_claim(ks); + eeprom_93cx6_wren(&ks->eeprom, true); /* ethtool currently only supports writing bytes, which means @@ -1167,6 +1197,7 @@ static int ks8851_set_eeprom(struct net_device *dev, eeprom_93cx6_wren(&ks->eeprom, false); ks8851_eeprom_release(ks); + ks8851_unlock(ks, &flags); return 0; } @@ -1176,19 +1207,25 @@ static int ks8851_get_eeprom(struct net_device *dev, { struct ks8851_net *ks = netdev_priv(dev); int offset = ee->offset; + unsigned long flags; int len = ee->len; /* must be 2 byte aligned */ if (len & 1 || offset & 1) return -EINVAL; - if (ks8851_eeprom_claim(ks)) + if (!(ks->rc_ccr & CCR_EEPROM)) return -ENOENT; + ks8851_lock(ks, &flags); + + ks8851_eeprom_claim(ks); + ee->magic = KS_EEPROM_MAGIC; eeprom_93cx6_multiread(&ks->eeprom, offset/2, (__le16 *)data, len/2); ks8851_eeprom_release(ks); + ks8851_unlock(ks, &flags); return 0; } @@ -1262,6 +1299,7 @@ static int ks8851_phy_reg(int reg) static int ks8851_phy_read(struct net_device *dev, int phy_addr, int reg) { struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; int ksreg; int result; @@ -1269,9 +1307,9 @@ static int ks8851_phy_read(struct net_device *dev, int phy_addr, int reg) if (!ksreg) return 0x0; /* no error return allowed, so use zero */ - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); result = ks8851_rdreg16(ks, ksreg); - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); return result; } @@ -1280,13 +1318,14 @@ static void ks8851_phy_write(struct net_device *dev, int phy, int reg, int value) { struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; int ksreg; ksreg = ks8851_phy_reg(reg); if (ksreg) { - mutex_lock(&ks->lock); + ks8851_lock(ks, &flags); ks8851_wrreg16(ks, ksreg, value); - mutex_unlock(&ks->lock); + ks8851_unlock(ks, &flags); } } -- cgit v1.2.3-59-g8ed1b From 18a3df73093287ed5ccf650e5d700b6afc3e0663 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:38 +0200 Subject: net: ks8851: Factor out SKB receive function Factor out this netif_rx_ni(), so it could be overridden by the parallel bus variant of the KS8851 driver. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 053d6d085539..087d2a39cdce 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -465,6 +465,15 @@ static void ks8851_dbg_dumpkkt(struct ks8851_net *ks, u8 *rxpkt) rxpkt[12], rxpkt[13], rxpkt[14], rxpkt[15]); } +/** + * ks8851_rx_skb - receive skbuff + * @skb: The skbuff + */ +static void ks8851_rx_skb(struct sk_buff *skb) +{ + netif_rx_ni(skb); +} + /** * ks8851_rx_pkts - receive packets from the host * @ks: The device information. @@ -533,7 +542,7 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) ks8851_dbg_dumpkkt(ks, rxpkt); skb->protocol = eth_type_trans(skb, ks->netdev); - netif_rx_ni(skb); + ks8851_rx_skb(skb); ks->netdev->stats.rx_packets++; ks->netdev->stats.rx_bytes += rxlen; -- cgit v1.2.3-59-g8ed1b From d48b7634c692bc3df1aec413b4b4c823d55d7899 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:39 +0200 Subject: net: ks8851: Split out SPI specific entries in struct ks8851_net Add a new struct ks8851_net_spi, which embeds the original struct ks8851_net and contains the entries specific only to the SPI variant of KS8851. There should be no functional change. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 128 +++++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 49 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 087d2a39cdce..482c65b1accf 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -64,16 +64,11 @@ union ks8851_tx_hdr { /** * struct ks8851_net - KS8851 driver private data * @netdev: The network device we're bound to - * @spidev: The spi device we're bound to. - * @lock: Lock to ensure that the device is not accessed when busy. * @statelock: Lock on this structure for tx list. * @mii: The MII state information for the mii calls. * @rxctrl: RX settings for @rxctrl_work. - * @tx_work: Work queue for tx packets * @rxctrl_work: Work queue for updating RX mode and multicast lists * @txq: Queue of packets for transmission. - * @spi_msg1: pre-setup SPI transfer with one message, @spi_xfer1. - * @spi_msg2: pre-setup SPI transfer with two messages, @spi_xfer2. * @txh: Space for generating packet TX header in DMA-able data * @rxd: Space for receiving SPI data, in DMA-able space. * @txd: Space for transmitting SPI data, in DMA-able space. @@ -87,11 +82,6 @@ union ks8851_tx_hdr { * @vdd_io: Optional digital power supply for IO * @gpio: Optional reset_n gpio * - * The @lock ensures that the chip is protected when certain operations are - * in progress. When the read or write packet transfer is in progress, most - * of the chip registers are not ccessible until the transfer is finished and - * the DMA has been de-asserted. - * * The @statelock is used to protect information in the structure which may * need to be accessed via several sources, such as the network driver layer * or one of the work queues. @@ -102,8 +92,6 @@ union ks8851_tx_hdr { */ struct ks8851_net { struct net_device *netdev; - struct spi_device *spidev; - struct mutex lock; spinlock_t statelock; union ks8851_tx_hdr txh ____cacheline_aligned; @@ -121,22 +109,43 @@ struct ks8851_net { struct mii_if_info mii; struct ks8851_rxctrl rxctrl; - struct work_struct tx_work; struct work_struct rxctrl_work; struct sk_buff_head txq; - struct spi_message spi_msg1; - struct spi_message spi_msg2; - struct spi_transfer spi_xfer1; - struct spi_transfer spi_xfer2[2]; - struct eeprom_93cx6 eeprom; struct regulator *vdd_reg; struct regulator *vdd_io; int gpio; }; +/** + * struct ks8851_net_spi - KS8851 SPI driver private data + * @ks8851: KS8851 driver common private data + * @lock: Lock to ensure that the device is not accessed when busy. + * @tx_work: Work queue for tx packets + * @spidev: The spi device we're bound to. + * @spi_msg1: pre-setup SPI transfer with one message, @spi_xfer1. + * @spi_msg2: pre-setup SPI transfer with two messages, @spi_xfer2. + * + * The @lock ensures that the chip is protected when certain operations are + * in progress. When the read or write packet transfer is in progress, most + * of the chip registers are not ccessible until the transfer is finished and + * the DMA has been de-asserted. + */ +struct ks8851_net_spi { + struct ks8851_net ks8851; + struct mutex lock; + struct work_struct tx_work; + struct spi_device *spidev; + struct spi_message spi_msg1; + struct spi_message spi_msg2; + struct spi_transfer spi_xfer1; + struct spi_transfer spi_xfer2[2]; +}; + +#define to_ks8851_spi(ks) container_of((ks), struct ks8851_net_spi, ks8851) + static int msg_enable; /* SPI frame opcodes */ @@ -160,7 +169,9 @@ static int msg_enable; */ static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags) { - mutex_lock(&ks->lock); + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + + mutex_lock(&kss->lock); } /** @@ -172,7 +183,9 @@ static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags) */ static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags) { - mutex_unlock(&ks->lock); + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + + mutex_unlock(&kss->lock); } /* SPI register read/write calls. @@ -192,8 +205,9 @@ static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags) */ static void ks8851_wrreg16(struct ks8851_net *ks, unsigned reg, unsigned val) { - struct spi_transfer *xfer = &ks->spi_xfer1; - struct spi_message *msg = &ks->spi_msg1; + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + struct spi_transfer *xfer = &kss->spi_xfer1; + struct spi_message *msg = &kss->spi_msg1; __le16 txb[2]; int ret; @@ -204,7 +218,7 @@ static void ks8851_wrreg16(struct ks8851_net *ks, unsigned reg, unsigned val) xfer->rx_buf = NULL; xfer->len = 4; - ret = spi_sync(ks->spidev, msg); + ret = spi_sync(kss->spidev, msg); if (ret < 0) netdev_err(ks->netdev, "spi_sync() failed\n"); } @@ -222,6 +236,7 @@ static void ks8851_wrreg16(struct ks8851_net *ks, unsigned reg, unsigned val) static void ks8851_rdreg(struct ks8851_net *ks, unsigned op, u8 *rxb, unsigned rxl) { + struct ks8851_net_spi *kss = to_ks8851_spi(ks); struct spi_transfer *xfer; struct spi_message *msg; __le16 *txb = (__le16 *)ks->txd; @@ -230,9 +245,9 @@ static void ks8851_rdreg(struct ks8851_net *ks, unsigned op, txb[0] = cpu_to_le16(op | KS_SPIOP_RD); - if (ks->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) { - msg = &ks->spi_msg2; - xfer = ks->spi_xfer2; + if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) { + msg = &kss->spi_msg2; + xfer = kss->spi_xfer2; xfer->tx_buf = txb; xfer->rx_buf = NULL; @@ -243,18 +258,18 @@ static void ks8851_rdreg(struct ks8851_net *ks, unsigned op, xfer->rx_buf = trx; xfer->len = rxl; } else { - msg = &ks->spi_msg1; - xfer = &ks->spi_xfer1; + msg = &kss->spi_msg1; + xfer = &kss->spi_xfer1; xfer->tx_buf = txb; xfer->rx_buf = trx; xfer->len = rxl + 2; } - ret = spi_sync(ks->spidev, msg); + ret = spi_sync(kss->spidev, msg); if (ret < 0) netdev_err(ks->netdev, "read: spi_sync() failed\n"); - else if (ks->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) + else if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) memcpy(rxb, trx, rxl); else memcpy(rxb, trx + 2, rxl); @@ -424,8 +439,9 @@ static void ks8851_init_mac(struct ks8851_net *ks, struct device_node *np) */ static void ks8851_rdfifo(struct ks8851_net *ks, u8 *buff, unsigned len) { - struct spi_transfer *xfer = ks->spi_xfer2; - struct spi_message *msg = &ks->spi_msg2; + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + struct spi_transfer *xfer = kss->spi_xfer2; + struct spi_message *msg = &kss->spi_msg2; u8 txb[1]; int ret; @@ -444,7 +460,7 @@ static void ks8851_rdfifo(struct ks8851_net *ks, u8 *buff, unsigned len) xfer->tx_buf = NULL; xfer->len = len; - ret = spi_sync(ks->spidev, msg); + ret = spi_sync(kss->spidev, msg); if (ret < 0) netdev_err(ks->netdev, "%s: spi_sync() failed\n", __func__); } @@ -678,8 +694,9 @@ static inline unsigned calc_txlen(unsigned len) */ static void ks8851_wrpkt(struct ks8851_net *ks, struct sk_buff *txp, bool irq) { - struct spi_transfer *xfer = ks->spi_xfer2; - struct spi_message *msg = &ks->spi_msg2; + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + struct spi_transfer *xfer = kss->spi_xfer2; + struct spi_message *msg = &kss->spi_msg2; unsigned fid = 0; int ret; @@ -706,7 +723,7 @@ static void ks8851_wrpkt(struct ks8851_net *ks, struct sk_buff *txp, bool irq) xfer->rx_buf = NULL; xfer->len = ALIGN(txp->len, 4); - ret = spi_sync(ks->spidev, msg); + ret = spi_sync(kss->spidev, msg); if (ret < 0) netdev_err(ks->netdev, "%s: spi_sync() failed\n", __func__); } @@ -735,10 +752,15 @@ static void ks8851_done_tx(struct ks8851_net *ks, struct sk_buff *txb) */ static void ks8851_tx_work(struct work_struct *work) { - struct ks8851_net *ks = container_of(work, struct ks8851_net, tx_work); + struct ks8851_net_spi *kss; + struct ks8851_net *ks; unsigned long flags; struct sk_buff *txb; - bool last = skb_queue_empty(&ks->txq); + bool last; + + kss = container_of(work, struct ks8851_net_spi, tx_work); + ks = &kss->ks8851; + last = skb_queue_empty(&ks->txq); ks8851_lock(ks, &flags); @@ -858,8 +880,11 @@ static int ks8851_net_open(struct net_device *dev) static int ks8851_net_stop(struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); + struct ks8851_net_spi *kss; unsigned long flags; + kss = to_ks8851_spi(ks); + netif_info(ks, ifdown, dev, "shutting down\n"); netif_stop_queue(dev); @@ -871,7 +896,7 @@ static int ks8851_net_stop(struct net_device *dev) ks8851_unlock(ks, &flags); /* stop any outstanding work */ - flush_work(&ks->tx_work); + flush_work(&kss->tx_work); flush_work(&ks->rxctrl_work); ks8851_lock(ks, &flags); @@ -919,6 +944,9 @@ static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb, struct ks8851_net *ks = netdev_priv(dev); unsigned needed = calc_txlen(skb->len); netdev_tx_t ret = NETDEV_TX_OK; + struct ks8851_net_spi *kss; + + kss = to_ks8851_spi(ks); netif_dbg(ks, tx_queued, ks->netdev, "%s: skb %p, %d@%p\n", __func__, skb, skb->len, skb->data); @@ -934,7 +962,7 @@ static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb, } spin_unlock(&ks->statelock); - schedule_work(&ks->tx_work); + schedule_work(&kss->tx_work); return ret; } @@ -1406,22 +1434,24 @@ static SIMPLE_DEV_PM_OPS(ks8851_pm_ops, ks8851_suspend, ks8851_resume); static int ks8851_probe(struct spi_device *spi) { struct device *dev = &spi->dev; + struct ks8851_net_spi *kss; struct net_device *netdev; struct ks8851_net *ks; int ret; unsigned cider; int gpio; - netdev = devm_alloc_etherdev(dev, sizeof(struct ks8851_net)); + netdev = devm_alloc_etherdev(dev, sizeof(struct ks8851_net_spi)); if (!netdev) return -ENOMEM; spi->bits_per_word = 8; ks = netdev_priv(netdev); + kss = to_ks8851_spi(ks); ks->netdev = netdev; - ks->spidev = spi; + kss->spidev = spi; ks->tx_space = 6144; gpio = of_get_named_gpio_flags(dev->of_node, "reset-gpios", 0, NULL); @@ -1467,20 +1497,20 @@ static int ks8851_probe(struct spi_device *spi) gpio_set_value(gpio, 1); } - mutex_init(&ks->lock); + mutex_init(&kss->lock); spin_lock_init(&ks->statelock); - INIT_WORK(&ks->tx_work, ks8851_tx_work); + INIT_WORK(&kss->tx_work, ks8851_tx_work); INIT_WORK(&ks->rxctrl_work, ks8851_rxctrl_work); /* initialise pre-made spi transfer messages */ - spi_message_init(&ks->spi_msg1); - spi_message_add_tail(&ks->spi_xfer1, &ks->spi_msg1); + spi_message_init(&kss->spi_msg1); + spi_message_add_tail(&kss->spi_xfer1, &kss->spi_msg1); - spi_message_init(&ks->spi_msg2); - spi_message_add_tail(&ks->spi_xfer2[0], &ks->spi_msg2); - spi_message_add_tail(&ks->spi_xfer2[1], &ks->spi_msg2); + spi_message_init(&kss->spi_msg2); + spi_message_add_tail(&kss->spi_xfer2[0], &kss->spi_msg2); + spi_message_add_tail(&kss->spi_xfer2[1], &kss->spi_msg2); /* setup EEPROM state */ -- cgit v1.2.3-59-g8ed1b From 24be72632c68e616c23ae01b08722d38242e552b Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:40 +0200 Subject: net: ks8851: Split out SPI specific code from probe() and remove() Factor out common code into ks8851_probe_common() and ks8851_remove_common() to permit both SPI and parallel bus driver variants to use the common code path for both probing and removal. There should be no functional change. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 86 ++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 482c65b1accf..4283ba5bee81 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1431,27 +1431,15 @@ static int ks8851_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(ks8851_pm_ops, ks8851_suspend, ks8851_resume); -static int ks8851_probe(struct spi_device *spi) +static int ks8851_probe_common(struct net_device *netdev, struct device *dev, + int msg_en) { - struct device *dev = &spi->dev; - struct ks8851_net_spi *kss; - struct net_device *netdev; - struct ks8851_net *ks; - int ret; + struct ks8851_net *ks = netdev_priv(netdev); unsigned cider; int gpio; - - netdev = devm_alloc_etherdev(dev, sizeof(struct ks8851_net_spi)); - if (!netdev) - return -ENOMEM; - - spi->bits_per_word = 8; - - ks = netdev_priv(netdev); - kss = to_ks8851_spi(ks); + int ret; ks->netdev = netdev; - kss->spidev = spi; ks->tx_space = 6144; gpio = of_get_named_gpio_flags(dev->of_node, "reset-gpios", 0, NULL); @@ -1497,23 +1485,11 @@ static int ks8851_probe(struct spi_device *spi) gpio_set_value(gpio, 1); } - mutex_init(&kss->lock); spin_lock_init(&ks->statelock); - INIT_WORK(&kss->tx_work, ks8851_tx_work); INIT_WORK(&ks->rxctrl_work, ks8851_rxctrl_work); - /* initialise pre-made spi transfer messages */ - - spi_message_init(&kss->spi_msg1); - spi_message_add_tail(&kss->spi_xfer1, &kss->spi_msg1); - - spi_message_init(&kss->spi_msg2); - spi_message_add_tail(&kss->spi_xfer2[0], &kss->spi_msg2); - spi_message_add_tail(&kss->spi_xfer2[1], &kss->spi_msg2); - /* setup EEPROM state */ - ks->eeprom.data = ks; ks->eeprom.width = PCI_EEPROM_WIDTH_93C46; ks->eeprom.register_read = ks8851_eeprom_regread; @@ -1527,12 +1503,12 @@ static int ks8851_probe(struct spi_device *spi) ks->mii.mdio_read = ks8851_phy_read; ks->mii.mdio_write = ks8851_phy_write; - dev_info(dev, "message enable is %d\n", msg_enable); + dev_info(dev, "message enable is %d\n", msg_en); /* set the default message enable */ - ks->msg_enable = netif_msg_init(msg_enable, (NETIF_MSG_DRV | - NETIF_MSG_PROBE | - NETIF_MSG_LINK)); + ks->msg_enable = netif_msg_init(msg_en, NETIF_MSG_DRV | + NETIF_MSG_PROBE | + NETIF_MSG_LINK); skb_queue_head_init(&ks->txq); @@ -1544,7 +1520,6 @@ static int ks8851_probe(struct spi_device *spi) netif_carrier_off(ks->netdev); netdev->if_port = IF_PORT_100BASET; netdev->netdev_ops = &ks8851_netdev_ops; - netdev->irq = spi->irq; /* issue a global soft reset to reset the device. */ ks8851_soft_reset(ks, GRR_GSR); @@ -1586,12 +1561,9 @@ err_reg_io: return ret; } -static int ks8851_remove(struct spi_device *spi) +static int ks8851_remove_common(struct device *dev) { - struct device *dev = &spi->dev; - struct ks8851_net *priv; - - priv = dev_get_drvdata(dev); + struct ks8851_net *priv = dev_get_drvdata(dev); if (netif_msg_drv(priv)) dev_info(dev, "remove\n"); @@ -1605,6 +1577,44 @@ static int ks8851_remove(struct spi_device *spi) return 0; } +static int ks8851_probe(struct spi_device *spi) +{ + struct device *dev = &spi->dev; + struct ks8851_net_spi *kss; + struct net_device *netdev; + struct ks8851_net *ks; + + netdev = devm_alloc_etherdev(dev, sizeof(struct ks8851_net_spi)); + if (!netdev) + return -ENOMEM; + + spi->bits_per_word = 8; + + ks = netdev_priv(netdev); + kss = to_ks8851_spi(ks); + + kss->spidev = spi; + mutex_init(&kss->lock); + INIT_WORK(&kss->tx_work, ks8851_tx_work); + + /* initialise pre-made spi transfer messages */ + spi_message_init(&kss->spi_msg1); + spi_message_add_tail(&kss->spi_xfer1, &kss->spi_msg1); + + spi_message_init(&kss->spi_msg2); + spi_message_add_tail(&kss->spi_xfer2[0], &kss->spi_msg2); + spi_message_add_tail(&kss->spi_xfer2[1], &kss->spi_msg2); + + netdev->irq = spi->irq; + + return ks8851_probe_common(netdev, dev, msg_enable); +} + +static int ks8851_remove(struct spi_device *spi) +{ + return ks8851_remove_common(&spi->dev); +} + static const struct of_device_id ks8851_match_table[] = { { .compatible = "micrel,ks8851" }, { } -- cgit v1.2.3-59-g8ed1b From 144ad36c3d3b53045a2b3f334bd735f469cb8ea8 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:41 +0200 Subject: net: ks8851: Factor out TX work flush function While the SPI version of the KS8851 requires a TX worker thread to pump data via SPI, the parallel bus version can write data into the TX FIFO directly in .ndo_start_xmit, as the parallel bus access is much faster and does not sleep. Factor out this TX work flush part, so it can be overridden by the parallel bus driver. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 4283ba5bee81..458c86903ac0 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -781,6 +781,17 @@ static void ks8851_tx_work(struct work_struct *work) ks8851_unlock(ks, &flags); } +/** + * ks8851_flush_tx_work - flush outstanding TX work + * @ks: The device state + */ +static void ks8851_flush_tx_work(struct ks8851_net *ks) +{ + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + + flush_work(&kss->tx_work); +} + /** * ks8851_net_open - open network device * @dev: The network device being opened. @@ -880,11 +891,8 @@ static int ks8851_net_open(struct net_device *dev) static int ks8851_net_stop(struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); - struct ks8851_net_spi *kss; unsigned long flags; - kss = to_ks8851_spi(ks); - netif_info(ks, ifdown, dev, "shutting down\n"); netif_stop_queue(dev); @@ -896,7 +904,7 @@ static int ks8851_net_stop(struct net_device *dev) ks8851_unlock(ks, &flags); /* stop any outstanding work */ - flush_work(&kss->tx_work); + ks8851_flush_tx_work(ks); flush_work(&ks->rxctrl_work); ks8851_lock(ks, &flags); -- cgit v1.2.3-59-g8ed1b From d2a1c643a00e881de3ceae7ebe70e90f791dbb22 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:42 +0200 Subject: net: ks8851: Permit overridding interrupt enable register The parallel bus variant does not need to use the TX interrupt at all as it writes the TX FIFO directly with in .ndo_start_xmit, permit the drivers to configure the interrupt enable bits. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 458c86903ac0..baf424f9893b 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -859,17 +859,8 @@ static int ks8851_net_open(struct net_device *dev) ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); /* clear then enable interrupts */ - -#define STD_IRQ (IRQ_LCI | /* Link Change */ \ - IRQ_TXI | /* TX done */ \ - IRQ_RXI | /* RX done */ \ - IRQ_SPIBEI | /* SPI bus error */ \ - IRQ_TXPSI | /* TX process stop */ \ - IRQ_RXPSI) /* RX process stop */ - - ks->rc_ier = STD_IRQ; - ks8851_wrreg16(ks, KS_ISR, STD_IRQ); - ks8851_wrreg16(ks, KS_IER, STD_IRQ); + ks8851_wrreg16(ks, KS_ISR, ks->rc_ier); + ks8851_wrreg16(ks, KS_IER, ks->rc_ier); netif_start_queue(ks->netdev); @@ -1599,6 +1590,15 @@ static int ks8851_probe(struct spi_device *spi) spi->bits_per_word = 8; ks = netdev_priv(netdev); + +#define STD_IRQ (IRQ_LCI | /* Link Change */ \ + IRQ_TXI | /* TX done */ \ + IRQ_RXI | /* RX done */ \ + IRQ_SPIBEI | /* SPI bus error */ \ + IRQ_TXPSI | /* TX process stop */ \ + IRQ_RXPSI) /* RX process stop */ + ks->rc_ier = STD_IRQ; + kss = to_ks8851_spi(ks); kss->spidev = spi; -- cgit v1.2.3-59-g8ed1b From 7a552c850c4581d698b8d47d9d03196fe572f32d Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:43 +0200 Subject: net: ks8851: Implement register, FIFO, lock accessor callbacks The register and FIFO accessors are bus specific, so is locking. Implement callbacks so that each variant of the KS8851 can implement matching accessors and locking, and use the rest of the common code. Reviewed-by: Andrew Lunn Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 178 ++++++++++++++++++++++++++++++----- 1 file changed, 156 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index baf424f9893b..1fa907d5dd5b 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -81,6 +81,15 @@ union ks8851_tx_hdr { * @vdd_reg: Optional regulator supplying the chip * @vdd_io: Optional digital power supply for IO * @gpio: Optional reset_n gpio + * @lock: Bus access lock callback + * @unlock: Bus access unlock callback + * @rdreg16: 16bit register read callback + * @wrreg16: 16bit register write callback + * @rdfifo: FIFO read callback + * @wrfifo: FIFO write callback + * @start_xmit: start_xmit() implementation callback + * @rx_skb: rx_skb() implementation callback + * @flush_tx_work: flush_tx_work() implementation callback * * The @statelock is used to protect information in the structure which may * need to be accessed via several sources, such as the network driver layer @@ -117,6 +126,24 @@ struct ks8851_net { struct regulator *vdd_reg; struct regulator *vdd_io; int gpio; + + void (*lock)(struct ks8851_net *ks, + unsigned long *flags); + void (*unlock)(struct ks8851_net *ks, + unsigned long *flags); + unsigned int (*rdreg16)(struct ks8851_net *ks, + unsigned int reg); + void (*wrreg16)(struct ks8851_net *ks, + unsigned int reg, unsigned int val); + void (*rdfifo)(struct ks8851_net *ks, u8 *buff, + unsigned int len); + void (*wrfifo)(struct ks8851_net *ks, + struct sk_buff *txp, bool irq); + netdev_tx_t (*start_xmit)(struct sk_buff *skb, + struct net_device *dev); + void (*rx_skb)(struct ks8851_net *ks, + struct sk_buff *skb); + void (*flush_tx_work)(struct ks8851_net *ks); }; /** @@ -161,13 +188,13 @@ static int msg_enable; #define MK_OP(_byteen, _reg) (BYTE_EN(_byteen) | (_reg) << (8+2) | (_reg) >> 6) /** - * ks8851_lock - register access lock + * ks8851_lock_spi - register access lock for SPI * @ks: The chip state * @flags: Spinlock flags * * Claim chip register access lock */ -static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags) +static void ks8851_lock_spi(struct ks8851_net *ks, unsigned long *flags) { struct ks8851_net_spi *kss = to_ks8851_spi(ks); @@ -175,19 +202,43 @@ static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags) } /** - * ks8851_unlock - register access unlock + * ks8851_unlock_spi - register access unlock for SPI * @ks: The chip state * @flags: Spinlock flags * * Release chip register access lock */ -static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags) +static void ks8851_unlock_spi(struct ks8851_net *ks, unsigned long *flags) { struct ks8851_net_spi *kss = to_ks8851_spi(ks); mutex_unlock(&kss->lock); } +/** + * ks8851_lock - register access lock + * @ks: The chip state + * @flags: Spinlock flags + * + * Claim chip register access lock + */ +static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags) +{ + ks->lock(ks, flags); +} + +/** + * ks8851_unlock - register access unlock + * @ks: The chip state + * @flags: Spinlock flags + * + * Release chip register access lock + */ +static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags) +{ + ks->unlock(ks, flags); +} + /* SPI register read/write calls. * * All these calls issue SPI transactions to access the chip's registers. They @@ -196,14 +247,15 @@ static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags) */ /** - * ks8851_wrreg16 - write 16bit register value to chip + * ks8851_wrreg16_spi - write 16bit register value to chip via SPI * @ks: The chip state * @reg: The register address * @val: The value to write * * Issue a write to put the value @val into the register specified in @reg. */ -static void ks8851_wrreg16(struct ks8851_net *ks, unsigned reg, unsigned val) +static void ks8851_wrreg16_spi(struct ks8851_net *ks, unsigned int reg, + unsigned int val) { struct ks8851_net_spi *kss = to_ks8851_spi(ks); struct spi_transfer *xfer = &kss->spi_xfer1; @@ -223,6 +275,20 @@ static void ks8851_wrreg16(struct ks8851_net *ks, unsigned reg, unsigned val) netdev_err(ks->netdev, "spi_sync() failed\n"); } +/** + * ks8851_wrreg16 - write 16bit register value to chip + * @ks: The chip state + * @reg: The register address + * @val: The value to write + * + * Issue a write to put the value @val into the register specified in @reg. + */ +static void ks8851_wrreg16(struct ks8851_net *ks, unsigned int reg, + unsigned int val) +{ + ks->wrreg16(ks, reg, val); +} + /** * ks8851_rdreg - issue read register command and return the data * @ks: The device state @@ -276,13 +342,14 @@ static void ks8851_rdreg(struct ks8851_net *ks, unsigned op, } /** - * ks8851_rdreg16 - read 16 bit register from device + * ks8851_rdreg16_spi - read 16 bit register from device via SPI * @ks: The chip information * @reg: The register address * * Read a 16bit register from the chip, returning the result */ -static unsigned ks8851_rdreg16(struct ks8851_net *ks, unsigned reg) +static unsigned int ks8851_rdreg16_spi(struct ks8851_net *ks, + unsigned int reg) { __le16 rx = 0; @@ -290,6 +357,19 @@ static unsigned ks8851_rdreg16(struct ks8851_net *ks, unsigned reg) return le16_to_cpu(rx); } +/** + * ks8851_rdreg16 - read 16 bit register from device + * @ks: The chip information + * @reg: The register address + * + * Read a 16bit register from the chip, returning the result + */ +static unsigned int ks8851_rdreg16(struct ks8851_net *ks, + unsigned int reg) +{ + return ks->rdreg16(ks, reg); +} + /** * ks8851_soft_reset - issue one of the soft reset to the device * @ks: The device state. @@ -429,7 +509,7 @@ static void ks8851_init_mac(struct ks8851_net *ks, struct device_node *np) } /** - * ks8851_rdfifo - read data from the receive fifo + * ks8851_rdfifo_spi - read data from the receive fifo via SPI * @ks: The device state. * @buff: The buffer address * @len: The length of the data to read @@ -437,7 +517,8 @@ static void ks8851_init_mac(struct ks8851_net *ks, struct device_node *np) * Issue an RXQ FIFO read command and read the @len amount of data from * the FIFO into the buffer specified by @buff. */ -static void ks8851_rdfifo(struct ks8851_net *ks, u8 *buff, unsigned len) +static void ks8851_rdfifo_spi(struct ks8851_net *ks, u8 *buff, + unsigned int len) { struct ks8851_net_spi *kss = to_ks8851_spi(ks); struct spi_transfer *xfer = kss->spi_xfer2; @@ -482,14 +563,25 @@ static void ks8851_dbg_dumpkkt(struct ks8851_net *ks, u8 *rxpkt) } /** - * ks8851_rx_skb - receive skbuff + * ks8851_rx_skb_spi - receive skbuff for SPI + * @ks: The device state * @skb: The skbuff */ -static void ks8851_rx_skb(struct sk_buff *skb) +static void ks8851_rx_skb_spi(struct ks8851_net *ks, struct sk_buff *skb) { netif_rx_ni(skb); } +/** + * ks8851_rx_skb - receive skbuff + * @ks: The device state + * @skb: The skbuff + */ +static void ks8851_rx_skb(struct ks8851_net *ks, struct sk_buff *skb) +{ + ks->rx_skb(ks, skb); +} + /** * ks8851_rx_pkts - receive packets from the host * @ks: The device information. @@ -552,13 +644,13 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) rxpkt = skb_put(skb, rxlen) - 8; - ks8851_rdfifo(ks, rxpkt, rxalign + 8); + ks->rdfifo(ks, rxpkt, rxalign + 8); if (netif_msg_pktdata(ks)) ks8851_dbg_dumpkkt(ks, rxpkt); skb->protocol = eth_type_trans(skb, ks->netdev); - ks8851_rx_skb(skb); + ks8851_rx_skb(ks, skb); ks->netdev->stats.rx_packets++; ks->netdev->stats.rx_bytes += rxlen; @@ -682,7 +774,7 @@ static inline unsigned calc_txlen(unsigned len) } /** - * ks8851_wrpkt - write packet to TX FIFO + * ks8851_wrpkt_spi - write packet to TX FIFO via SPI * @ks: The device state. * @txp: The sk_buff to transmit. * @irq: IRQ on completion of the packet. @@ -692,7 +784,8 @@ static inline unsigned calc_txlen(unsigned len) * needs, such as IRQ on completion. Send the header and the packet data to * the device. */ -static void ks8851_wrpkt(struct ks8851_net *ks, struct sk_buff *txp, bool irq) +static void ks8851_wrpkt_spi(struct ks8851_net *ks, struct sk_buff *txp, + bool irq) { struct ks8851_net_spi *kss = to_ks8851_spi(ks); struct spi_transfer *xfer = kss->spi_xfer2; @@ -770,7 +863,7 @@ static void ks8851_tx_work(struct work_struct *work) if (txb != NULL) { ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); - ks8851_wrpkt(ks, txb, last); + ks->wrfifo(ks, txb, last); ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); ks8851_wrreg16(ks, KS_TXQCR, TXQCR_METFE); @@ -782,16 +875,26 @@ static void ks8851_tx_work(struct work_struct *work) } /** - * ks8851_flush_tx_work - flush outstanding TX work + * ks8851_flush_tx_work_spi - flush outstanding TX work for SPI * @ks: The device state */ -static void ks8851_flush_tx_work(struct ks8851_net *ks) +static void ks8851_flush_tx_work_spi(struct ks8851_net *ks) { struct ks8851_net_spi *kss = to_ks8851_spi(ks); flush_work(&kss->tx_work); } +/** + * ks8851_flush_tx_work - flush outstanding TX work + * @ks: The device state + */ +static void ks8851_flush_tx_work(struct ks8851_net *ks) +{ + if (ks->flush_tx_work) + ks->flush_tx_work(ks); +} + /** * ks8851_net_open - open network device * @dev: The network device being opened. @@ -925,7 +1028,7 @@ static int ks8851_net_stop(struct net_device *dev) } /** - * ks8851_start_xmit - transmit packet + * ks8851_start_xmit_spi - transmit packet using SPI * @skb: The buffer to transmit * @dev: The device used to transmit the packet. * @@ -937,8 +1040,8 @@ static int ks8851_net_stop(struct net_device *dev) * and secondly so we can round up more than one packet to transmit which * means we can try and avoid generating too many transmit done interrupts. */ -static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb, - struct net_device *dev) +static netdev_tx_t ks8851_start_xmit_spi(struct sk_buff *skb, + struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); unsigned needed = calc_txlen(skb->len); @@ -966,6 +1069,27 @@ static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb, return ret; } +/** + * ks8851_start_xmit - transmit packet + * @skb: The buffer to transmit + * @dev: The device used to transmit the packet. + * + * Called by the network layer to transmit the @skb. Queue the packet for + * the device and schedule the necessary work to transmit the packet when + * it is free. + * + * We do this to firstly avoid sleeping with the network device locked, + * and secondly so we can round up more than one packet to transmit which + * means we can try and avoid generating too many transmit done interrupts. + */ +static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + + return ks->start_xmit(skb, dev); +} + /** * ks8851_rxctrl_work - work handler to change rx mode * @work: The work structure this belongs to. @@ -1591,6 +1715,16 @@ static int ks8851_probe(struct spi_device *spi) ks = netdev_priv(netdev); + ks->lock = ks8851_lock_spi; + ks->unlock = ks8851_unlock_spi; + ks->rdreg16 = ks8851_rdreg16_spi; + ks->wrreg16 = ks8851_wrreg16_spi; + ks->rdfifo = ks8851_rdfifo_spi; + ks->wrfifo = ks8851_wrpkt_spi; + ks->start_xmit = ks8851_start_xmit_spi; + ks->rx_skb = ks8851_rx_skb_spi; + ks->flush_tx_work = ks8851_flush_tx_work_spi; + #define STD_IRQ (IRQ_LCI | /* Link Change */ \ IRQ_TXI | /* TX done */ \ IRQ_RXI | /* RX done */ \ -- cgit v1.2.3-59-g8ed1b From b07f987a8d773c5c7dd5e5714e0468729abf132c Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:44 +0200 Subject: net: ks8851: Separate SPI operations into separate file Pull all the SPI bus specific code into a separate file, so that it is not mixed with the common code. Rename ks8851.c to ks8851_common.c. The ks8851_common.c is linked with ks8851_spi.c now, so it can call the accessors in the ks8851_spi.c without any pointer indirection. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/Makefile | 1 + drivers/net/ethernet/micrel/ks8851.c | 1783 --------------------------- drivers/net/ethernet/micrel/ks8851.h | 149 +++ drivers/net/ethernet/micrel/ks8851_common.c | 1193 ++++++++++++++++++ drivers/net/ethernet/micrel/ks8851_spi.c | 485 ++++++++ 5 files changed, 1828 insertions(+), 1783 deletions(-) delete mode 100644 drivers/net/ethernet/micrel/ks8851.c create mode 100644 drivers/net/ethernet/micrel/ks8851_common.c create mode 100644 drivers/net/ethernet/micrel/ks8851_spi.c diff --git a/drivers/net/ethernet/micrel/Makefile b/drivers/net/ethernet/micrel/Makefile index 6d8ac5527aef..c7a4725c2e95 100644 --- a/drivers/net/ethernet/micrel/Makefile +++ b/drivers/net/ethernet/micrel/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_KS8842) += ks8842.o obj-$(CONFIG_KS8851) += ks8851.o +ks8851-objs = ks8851_common.o ks8851_spi.o obj-$(CONFIG_KS8851_MLL) += ks8851_mll.o obj-$(CONFIG_KSZ884X_PCI) += ksz884x.o diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c deleted file mode 100644 index 1fa907d5dd5b..000000000000 --- a/drivers/net/ethernet/micrel/ks8851.c +++ /dev/null @@ -1,1783 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* drivers/net/ethernet/micrel/ks8851.c - * - * Copyright 2009 Simtec Electronics - * http://www.simtec.co.uk/ - * Ben Dooks - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#define DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "ks8851.h" - -/** - * struct ks8851_rxctrl - KS8851 driver rx control - * @mchash: Multicast hash-table data. - * @rxcr1: KS_RXCR1 register setting - * @rxcr2: KS_RXCR2 register setting - * - * Representation of the settings needs to control the receive filtering - * such as the multicast hash-filter and the receive register settings. This - * is used to make the job of working out if the receive settings change and - * then issuing the new settings to the worker that will send the necessary - * commands. - */ -struct ks8851_rxctrl { - u16 mchash[4]; - u16 rxcr1; - u16 rxcr2; -}; - -/** - * union ks8851_tx_hdr - tx header data - * @txb: The header as bytes - * @txw: The header as 16bit, little-endian words - * - * A dual representation of the tx header data to allow - * access to individual bytes, and to allow 16bit accesses - * with 16bit alignment. - */ -union ks8851_tx_hdr { - u8 txb[6]; - __le16 txw[3]; -}; - -/** - * struct ks8851_net - KS8851 driver private data - * @netdev: The network device we're bound to - * @statelock: Lock on this structure for tx list. - * @mii: The MII state information for the mii calls. - * @rxctrl: RX settings for @rxctrl_work. - * @rxctrl_work: Work queue for updating RX mode and multicast lists - * @txq: Queue of packets for transmission. - * @txh: Space for generating packet TX header in DMA-able data - * @rxd: Space for receiving SPI data, in DMA-able space. - * @txd: Space for transmitting SPI data, in DMA-able space. - * @msg_enable: The message flags controlling driver output (see ethtool). - * @fid: Incrementing frame id tag. - * @rc_ier: Cached copy of KS_IER. - * @rc_ccr: Cached copy of KS_CCR. - * @rc_rxqcr: Cached copy of KS_RXQCR. - * @eeprom: 93CX6 EEPROM state for accessing on-board EEPROM. - * @vdd_reg: Optional regulator supplying the chip - * @vdd_io: Optional digital power supply for IO - * @gpio: Optional reset_n gpio - * @lock: Bus access lock callback - * @unlock: Bus access unlock callback - * @rdreg16: 16bit register read callback - * @wrreg16: 16bit register write callback - * @rdfifo: FIFO read callback - * @wrfifo: FIFO write callback - * @start_xmit: start_xmit() implementation callback - * @rx_skb: rx_skb() implementation callback - * @flush_tx_work: flush_tx_work() implementation callback - * - * The @statelock is used to protect information in the structure which may - * need to be accessed via several sources, such as the network driver layer - * or one of the work queues. - * - * We align the buffers we may use for rx/tx to ensure that if the SPI driver - * wants to DMA map them, it will not have any problems with data the driver - * modifies. - */ -struct ks8851_net { - struct net_device *netdev; - spinlock_t statelock; - - union ks8851_tx_hdr txh ____cacheline_aligned; - u8 rxd[8]; - u8 txd[8]; - - u32 msg_enable ____cacheline_aligned; - u16 tx_space; - u8 fid; - - u16 rc_ier; - u16 rc_rxqcr; - u16 rc_ccr; - - struct mii_if_info mii; - struct ks8851_rxctrl rxctrl; - - struct work_struct rxctrl_work; - - struct sk_buff_head txq; - - struct eeprom_93cx6 eeprom; - struct regulator *vdd_reg; - struct regulator *vdd_io; - int gpio; - - void (*lock)(struct ks8851_net *ks, - unsigned long *flags); - void (*unlock)(struct ks8851_net *ks, - unsigned long *flags); - unsigned int (*rdreg16)(struct ks8851_net *ks, - unsigned int reg); - void (*wrreg16)(struct ks8851_net *ks, - unsigned int reg, unsigned int val); - void (*rdfifo)(struct ks8851_net *ks, u8 *buff, - unsigned int len); - void (*wrfifo)(struct ks8851_net *ks, - struct sk_buff *txp, bool irq); - netdev_tx_t (*start_xmit)(struct sk_buff *skb, - struct net_device *dev); - void (*rx_skb)(struct ks8851_net *ks, - struct sk_buff *skb); - void (*flush_tx_work)(struct ks8851_net *ks); -}; - -/** - * struct ks8851_net_spi - KS8851 SPI driver private data - * @ks8851: KS8851 driver common private data - * @lock: Lock to ensure that the device is not accessed when busy. - * @tx_work: Work queue for tx packets - * @spidev: The spi device we're bound to. - * @spi_msg1: pre-setup SPI transfer with one message, @spi_xfer1. - * @spi_msg2: pre-setup SPI transfer with two messages, @spi_xfer2. - * - * The @lock ensures that the chip is protected when certain operations are - * in progress. When the read or write packet transfer is in progress, most - * of the chip registers are not ccessible until the transfer is finished and - * the DMA has been de-asserted. - */ -struct ks8851_net_spi { - struct ks8851_net ks8851; - struct mutex lock; - struct work_struct tx_work; - struct spi_device *spidev; - struct spi_message spi_msg1; - struct spi_message spi_msg2; - struct spi_transfer spi_xfer1; - struct spi_transfer spi_xfer2[2]; -}; - -#define to_ks8851_spi(ks) container_of((ks), struct ks8851_net_spi, ks8851) - -static int msg_enable; - -/* SPI frame opcodes */ -#define KS_SPIOP_RD (0x00) -#define KS_SPIOP_WR (0x40) -#define KS_SPIOP_RXFIFO (0x80) -#define KS_SPIOP_TXFIFO (0xC0) - -/* shift for byte-enable data */ -#define BYTE_EN(_x) ((_x) << 2) - -/* turn register number and byte-enable mask into data for start of packet */ -#define MK_OP(_byteen, _reg) (BYTE_EN(_byteen) | (_reg) << (8+2) | (_reg) >> 6) - -/** - * ks8851_lock_spi - register access lock for SPI - * @ks: The chip state - * @flags: Spinlock flags - * - * Claim chip register access lock - */ -static void ks8851_lock_spi(struct ks8851_net *ks, unsigned long *flags) -{ - struct ks8851_net_spi *kss = to_ks8851_spi(ks); - - mutex_lock(&kss->lock); -} - -/** - * ks8851_unlock_spi - register access unlock for SPI - * @ks: The chip state - * @flags: Spinlock flags - * - * Release chip register access lock - */ -static void ks8851_unlock_spi(struct ks8851_net *ks, unsigned long *flags) -{ - struct ks8851_net_spi *kss = to_ks8851_spi(ks); - - mutex_unlock(&kss->lock); -} - -/** - * ks8851_lock - register access lock - * @ks: The chip state - * @flags: Spinlock flags - * - * Claim chip register access lock - */ -static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags) -{ - ks->lock(ks, flags); -} - -/** - * ks8851_unlock - register access unlock - * @ks: The chip state - * @flags: Spinlock flags - * - * Release chip register access lock - */ -static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags) -{ - ks->unlock(ks, flags); -} - -/* SPI register read/write calls. - * - * All these calls issue SPI transactions to access the chip's registers. They - * all require that the necessary lock is held to prevent accesses when the - * chip is busy transferring packet data (RX/TX FIFO accesses). - */ - -/** - * ks8851_wrreg16_spi - write 16bit register value to chip via SPI - * @ks: The chip state - * @reg: The register address - * @val: The value to write - * - * Issue a write to put the value @val into the register specified in @reg. - */ -static void ks8851_wrreg16_spi(struct ks8851_net *ks, unsigned int reg, - unsigned int val) -{ - struct ks8851_net_spi *kss = to_ks8851_spi(ks); - struct spi_transfer *xfer = &kss->spi_xfer1; - struct spi_message *msg = &kss->spi_msg1; - __le16 txb[2]; - int ret; - - txb[0] = cpu_to_le16(MK_OP(reg & 2 ? 0xC : 0x03, reg) | KS_SPIOP_WR); - txb[1] = cpu_to_le16(val); - - xfer->tx_buf = txb; - xfer->rx_buf = NULL; - xfer->len = 4; - - ret = spi_sync(kss->spidev, msg); - if (ret < 0) - netdev_err(ks->netdev, "spi_sync() failed\n"); -} - -/** - * ks8851_wrreg16 - write 16bit register value to chip - * @ks: The chip state - * @reg: The register address - * @val: The value to write - * - * Issue a write to put the value @val into the register specified in @reg. - */ -static void ks8851_wrreg16(struct ks8851_net *ks, unsigned int reg, - unsigned int val) -{ - ks->wrreg16(ks, reg, val); -} - -/** - * ks8851_rdreg - issue read register command and return the data - * @ks: The device state - * @op: The register address and byte enables in message format. - * @rxb: The RX buffer to return the result into - * @rxl: The length of data expected. - * - * This is the low level read call that issues the necessary spi message(s) - * to read data from the register specified in @op. - */ -static void ks8851_rdreg(struct ks8851_net *ks, unsigned op, - u8 *rxb, unsigned rxl) -{ - struct ks8851_net_spi *kss = to_ks8851_spi(ks); - struct spi_transfer *xfer; - struct spi_message *msg; - __le16 *txb = (__le16 *)ks->txd; - u8 *trx = ks->rxd; - int ret; - - txb[0] = cpu_to_le16(op | KS_SPIOP_RD); - - if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) { - msg = &kss->spi_msg2; - xfer = kss->spi_xfer2; - - xfer->tx_buf = txb; - xfer->rx_buf = NULL; - xfer->len = 2; - - xfer++; - xfer->tx_buf = NULL; - xfer->rx_buf = trx; - xfer->len = rxl; - } else { - msg = &kss->spi_msg1; - xfer = &kss->spi_xfer1; - - xfer->tx_buf = txb; - xfer->rx_buf = trx; - xfer->len = rxl + 2; - } - - ret = spi_sync(kss->spidev, msg); - if (ret < 0) - netdev_err(ks->netdev, "read: spi_sync() failed\n"); - else if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) - memcpy(rxb, trx, rxl); - else - memcpy(rxb, trx + 2, rxl); -} - -/** - * ks8851_rdreg16_spi - read 16 bit register from device via SPI - * @ks: The chip information - * @reg: The register address - * - * Read a 16bit register from the chip, returning the result -*/ -static unsigned int ks8851_rdreg16_spi(struct ks8851_net *ks, - unsigned int reg) -{ - __le16 rx = 0; - - ks8851_rdreg(ks, MK_OP(reg & 2 ? 0xC : 0x3, reg), (u8 *)&rx, 2); - return le16_to_cpu(rx); -} - -/** - * ks8851_rdreg16 - read 16 bit register from device - * @ks: The chip information - * @reg: The register address - * - * Read a 16bit register from the chip, returning the result - */ -static unsigned int ks8851_rdreg16(struct ks8851_net *ks, - unsigned int reg) -{ - return ks->rdreg16(ks, reg); -} - -/** - * ks8851_soft_reset - issue one of the soft reset to the device - * @ks: The device state. - * @op: The bit(s) to set in the GRR - * - * Issue the relevant soft-reset command to the device's GRR register - * specified by @op. - * - * Note, the delays are in there as a caution to ensure that the reset - * has time to take effect and then complete. Since the datasheet does - * not currently specify the exact sequence, we have chosen something - * that seems to work with our device. - */ -static void ks8851_soft_reset(struct ks8851_net *ks, unsigned op) -{ - ks8851_wrreg16(ks, KS_GRR, op); - mdelay(1); /* wait a short time to effect reset */ - ks8851_wrreg16(ks, KS_GRR, 0); - mdelay(1); /* wait for condition to clear */ -} - -/** - * ks8851_set_powermode - set power mode of the device - * @ks: The device state - * @pwrmode: The power mode value to write to KS_PMECR. - * - * Change the power mode of the chip. - */ -static void ks8851_set_powermode(struct ks8851_net *ks, unsigned pwrmode) -{ - unsigned pmecr; - - netif_dbg(ks, hw, ks->netdev, "setting power mode %d\n", pwrmode); - - pmecr = ks8851_rdreg16(ks, KS_PMECR); - pmecr &= ~PMECR_PM_MASK; - pmecr |= pwrmode; - - ks8851_wrreg16(ks, KS_PMECR, pmecr); -} - -/** - * ks8851_write_mac_addr - write mac address to device registers - * @dev: The network device - * - * Update the KS8851 MAC address registers from the address in @dev. - * - * This call assumes that the chip is not running, so there is no need to - * shutdown the RXQ process whilst setting this. -*/ -static int ks8851_write_mac_addr(struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - unsigned long flags; - u16 val; - int i; - - ks8851_lock(ks, &flags); - - /* - * Wake up chip in case it was powered off when stopped; otherwise, - * the first write to the MAC address does not take effect. - */ - ks8851_set_powermode(ks, PMECR_PM_NORMAL); - - for (i = 0; i < ETH_ALEN; i += 2) { - val = (dev->dev_addr[i] << 8) | dev->dev_addr[i + 1]; - ks8851_wrreg16(ks, KS_MAR(i), val); - } - - if (!netif_running(dev)) - ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN); - - ks8851_unlock(ks, &flags); - - return 0; -} - -/** - * ks8851_read_mac_addr - read mac address from device registers - * @dev: The network device - * - * Update our copy of the KS8851 MAC address from the registers of @dev. -*/ -static void ks8851_read_mac_addr(struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - unsigned long flags; - u16 reg; - int i; - - ks8851_lock(ks, &flags); - - for (i = 0; i < ETH_ALEN; i += 2) { - reg = ks8851_rdreg16(ks, KS_MAR(i)); - dev->dev_addr[i] = reg >> 8; - dev->dev_addr[i + 1] = reg & 0xff; - } - - ks8851_unlock(ks, &flags); -} - -/** - * ks8851_init_mac - initialise the mac address - * @ks: The device structure - * @np: The device node pointer - * - * Get or create the initial mac address for the device and then set that - * into the station address register. A mac address supplied in the device - * tree takes precedence. Otherwise, if there is an EEPROM present, then - * we try that. If no valid mac address is found we use eth_random_addr() - * to create a new one. - */ -static void ks8851_init_mac(struct ks8851_net *ks, struct device_node *np) -{ - struct net_device *dev = ks->netdev; - const u8 *mac_addr; - - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) { - ether_addr_copy(dev->dev_addr, mac_addr); - ks8851_write_mac_addr(dev); - return; - } - - if (ks->rc_ccr & CCR_EEPROM) { - ks8851_read_mac_addr(dev); - if (is_valid_ether_addr(dev->dev_addr)) - return; - - netdev_err(ks->netdev, "invalid mac address read %pM\n", - dev->dev_addr); - } - - eth_hw_addr_random(dev); - ks8851_write_mac_addr(dev); -} - -/** - * ks8851_rdfifo_spi - read data from the receive fifo via SPI - * @ks: The device state. - * @buff: The buffer address - * @len: The length of the data to read - * - * Issue an RXQ FIFO read command and read the @len amount of data from - * the FIFO into the buffer specified by @buff. - */ -static void ks8851_rdfifo_spi(struct ks8851_net *ks, u8 *buff, - unsigned int len) -{ - struct ks8851_net_spi *kss = to_ks8851_spi(ks); - struct spi_transfer *xfer = kss->spi_xfer2; - struct spi_message *msg = &kss->spi_msg2; - u8 txb[1]; - int ret; - - netif_dbg(ks, rx_status, ks->netdev, - "%s: %d@%p\n", __func__, len, buff); - - /* set the operation we're issuing */ - txb[0] = KS_SPIOP_RXFIFO; - - xfer->tx_buf = txb; - xfer->rx_buf = NULL; - xfer->len = 1; - - xfer++; - xfer->rx_buf = buff; - xfer->tx_buf = NULL; - xfer->len = len; - - ret = spi_sync(kss->spidev, msg); - if (ret < 0) - netdev_err(ks->netdev, "%s: spi_sync() failed\n", __func__); -} - -/** - * ks8851_dbg_dumpkkt - dump initial packet contents to debug - * @ks: The device state - * @rxpkt: The data for the received packet - * - * Dump the initial data from the packet to dev_dbg(). -*/ -static void ks8851_dbg_dumpkkt(struct ks8851_net *ks, u8 *rxpkt) -{ - netdev_dbg(ks->netdev, - "pkt %02x%02x%02x%02x %02x%02x%02x%02x %02x%02x%02x%02x\n", - rxpkt[4], rxpkt[5], rxpkt[6], rxpkt[7], - rxpkt[8], rxpkt[9], rxpkt[10], rxpkt[11], - rxpkt[12], rxpkt[13], rxpkt[14], rxpkt[15]); -} - -/** - * ks8851_rx_skb_spi - receive skbuff for SPI - * @ks: The device state - * @skb: The skbuff - */ -static void ks8851_rx_skb_spi(struct ks8851_net *ks, struct sk_buff *skb) -{ - netif_rx_ni(skb); -} - -/** - * ks8851_rx_skb - receive skbuff - * @ks: The device state - * @skb: The skbuff - */ -static void ks8851_rx_skb(struct ks8851_net *ks, struct sk_buff *skb) -{ - ks->rx_skb(ks, skb); -} - -/** - * ks8851_rx_pkts - receive packets from the host - * @ks: The device information. - * - * This is called from the IRQ work queue when the system detects that there - * are packets in the receive queue. Find out how many packets there are and - * read them from the FIFO. - */ -static void ks8851_rx_pkts(struct ks8851_net *ks) -{ - struct sk_buff *skb; - unsigned rxfc; - unsigned rxlen; - unsigned rxstat; - u8 *rxpkt; - - rxfc = (ks8851_rdreg16(ks, KS_RXFCTR) >> 8) & 0xff; - - netif_dbg(ks, rx_status, ks->netdev, - "%s: %d packets\n", __func__, rxfc); - - /* Currently we're issuing a read per packet, but we could possibly - * improve the code by issuing a single read, getting the receive - * header, allocating the packet and then reading the packet data - * out in one go. - * - * This form of operation would require us to hold the SPI bus' - * chipselect low during the entie transaction to avoid any - * reset to the data stream coming from the chip. - */ - - for (; rxfc != 0; rxfc--) { - rxstat = ks8851_rdreg16(ks, KS_RXFHSR); - rxlen = ks8851_rdreg16(ks, KS_RXFHBCR) & RXFHBCR_CNT_MASK; - - netif_dbg(ks, rx_status, ks->netdev, - "rx: stat 0x%04x, len 0x%04x\n", rxstat, rxlen); - - /* the length of the packet includes the 32bit CRC */ - - /* set dma read address */ - ks8851_wrreg16(ks, KS_RXFDPR, RXFDPR_RXFPAI | 0x00); - - /* start DMA access */ - ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); - - if (rxlen > 4) { - unsigned int rxalign; - - rxlen -= 4; - rxalign = ALIGN(rxlen, 4); - skb = netdev_alloc_skb_ip_align(ks->netdev, rxalign); - if (skb) { - - /* 4 bytes of status header + 4 bytes of - * garbage: we put them before ethernet - * header, so that they are copied, - * but ignored. - */ - - rxpkt = skb_put(skb, rxlen) - 8; - - ks->rdfifo(ks, rxpkt, rxalign + 8); - - if (netif_msg_pktdata(ks)) - ks8851_dbg_dumpkkt(ks, rxpkt); - - skb->protocol = eth_type_trans(skb, ks->netdev); - ks8851_rx_skb(ks, skb); - - ks->netdev->stats.rx_packets++; - ks->netdev->stats.rx_bytes += rxlen; - } - } - - /* end DMA access and dequeue packet */ - ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_RRXEF); - } -} - -/** - * ks8851_irq - IRQ handler for dealing with interrupt requests - * @irq: IRQ number - * @_ks: cookie - * - * This handler is invoked when the IRQ line asserts to find out what happened. - * As we cannot allow ourselves to sleep in HARDIRQ context, this handler runs - * in thread context. - * - * Read the interrupt status, work out what needs to be done and then clear - * any of the interrupts that are not needed. - */ -static irqreturn_t ks8851_irq(int irq, void *_ks) -{ - struct ks8851_net *ks = _ks; - unsigned handled = 0; - unsigned long flags; - unsigned int status; - - ks8851_lock(ks, &flags); - - status = ks8851_rdreg16(ks, KS_ISR); - - netif_dbg(ks, intr, ks->netdev, - "%s: status 0x%04x\n", __func__, status); - - if (status & IRQ_LCI) - handled |= IRQ_LCI; - - if (status & IRQ_LDI) { - u16 pmecr = ks8851_rdreg16(ks, KS_PMECR); - pmecr &= ~PMECR_WKEVT_MASK; - ks8851_wrreg16(ks, KS_PMECR, pmecr | PMECR_WKEVT_LINK); - - handled |= IRQ_LDI; - } - - if (status & IRQ_RXPSI) - handled |= IRQ_RXPSI; - - if (status & IRQ_TXI) { - handled |= IRQ_TXI; - - /* no lock here, tx queue should have been stopped */ - - /* update our idea of how much tx space is available to the - * system */ - ks->tx_space = ks8851_rdreg16(ks, KS_TXMIR); - - netif_dbg(ks, intr, ks->netdev, - "%s: txspace %d\n", __func__, ks->tx_space); - } - - if (status & IRQ_RXI) - handled |= IRQ_RXI; - - if (status & IRQ_SPIBEI) { - netdev_err(ks->netdev, "%s: spi bus error\n", __func__); - handled |= IRQ_SPIBEI; - } - - ks8851_wrreg16(ks, KS_ISR, handled); - - if (status & IRQ_RXI) { - /* the datasheet says to disable the rx interrupt during - * packet read-out, however we're masking the interrupt - * from the device so do not bother masking just the RX - * from the device. */ - - ks8851_rx_pkts(ks); - } - - /* if something stopped the rx process, probably due to wanting - * to change the rx settings, then do something about restarting - * it. */ - if (status & IRQ_RXPSI) { - struct ks8851_rxctrl *rxc = &ks->rxctrl; - - /* update the multicast hash table */ - ks8851_wrreg16(ks, KS_MAHTR0, rxc->mchash[0]); - ks8851_wrreg16(ks, KS_MAHTR1, rxc->mchash[1]); - ks8851_wrreg16(ks, KS_MAHTR2, rxc->mchash[2]); - ks8851_wrreg16(ks, KS_MAHTR3, rxc->mchash[3]); - - ks8851_wrreg16(ks, KS_RXCR2, rxc->rxcr2); - ks8851_wrreg16(ks, KS_RXCR1, rxc->rxcr1); - } - - ks8851_unlock(ks, &flags); - - if (status & IRQ_LCI) - mii_check_link(&ks->mii); - - if (status & IRQ_TXI) - netif_wake_queue(ks->netdev); - - return IRQ_HANDLED; -} - -/** - * calc_txlen - calculate size of message to send packet - * @len: Length of data - * - * Returns the size of the TXFIFO message needed to send - * this packet. - */ -static inline unsigned calc_txlen(unsigned len) -{ - return ALIGN(len + 4, 4); -} - -/** - * ks8851_wrpkt_spi - write packet to TX FIFO via SPI - * @ks: The device state. - * @txp: The sk_buff to transmit. - * @irq: IRQ on completion of the packet. - * - * Send the @txp to the chip. This means creating the relevant packet header - * specifying the length of the packet and the other information the chip - * needs, such as IRQ on completion. Send the header and the packet data to - * the device. - */ -static void ks8851_wrpkt_spi(struct ks8851_net *ks, struct sk_buff *txp, - bool irq) -{ - struct ks8851_net_spi *kss = to_ks8851_spi(ks); - struct spi_transfer *xfer = kss->spi_xfer2; - struct spi_message *msg = &kss->spi_msg2; - unsigned fid = 0; - int ret; - - netif_dbg(ks, tx_queued, ks->netdev, "%s: skb %p, %d@%p, irq %d\n", - __func__, txp, txp->len, txp->data, irq); - - fid = ks->fid++; - fid &= TXFR_TXFID_MASK; - - if (irq) - fid |= TXFR_TXIC; /* irq on completion */ - - /* start header at txb[1] to align txw entries */ - ks->txh.txb[1] = KS_SPIOP_TXFIFO; - ks->txh.txw[1] = cpu_to_le16(fid); - ks->txh.txw[2] = cpu_to_le16(txp->len); - - xfer->tx_buf = &ks->txh.txb[1]; - xfer->rx_buf = NULL; - xfer->len = 5; - - xfer++; - xfer->tx_buf = txp->data; - xfer->rx_buf = NULL; - xfer->len = ALIGN(txp->len, 4); - - ret = spi_sync(kss->spidev, msg); - if (ret < 0) - netdev_err(ks->netdev, "%s: spi_sync() failed\n", __func__); -} - -/** - * ks8851_done_tx - update and then free skbuff after transmitting - * @ks: The device state - * @txb: The buffer transmitted - */ -static void ks8851_done_tx(struct ks8851_net *ks, struct sk_buff *txb) -{ - struct net_device *dev = ks->netdev; - - dev->stats.tx_bytes += txb->len; - dev->stats.tx_packets++; - - dev_kfree_skb(txb); -} - -/** - * ks8851_tx_work - process tx packet(s) - * @work: The work strucutre what was scheduled. - * - * This is called when a number of packets have been scheduled for - * transmission and need to be sent to the device. - */ -static void ks8851_tx_work(struct work_struct *work) -{ - struct ks8851_net_spi *kss; - struct ks8851_net *ks; - unsigned long flags; - struct sk_buff *txb; - bool last; - - kss = container_of(work, struct ks8851_net_spi, tx_work); - ks = &kss->ks8851; - last = skb_queue_empty(&ks->txq); - - ks8851_lock(ks, &flags); - - while (!last) { - txb = skb_dequeue(&ks->txq); - last = skb_queue_empty(&ks->txq); - - if (txb != NULL) { - ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); - ks->wrfifo(ks, txb, last); - ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); - ks8851_wrreg16(ks, KS_TXQCR, TXQCR_METFE); - - ks8851_done_tx(ks, txb); - } - } - - ks8851_unlock(ks, &flags); -} - -/** - * ks8851_flush_tx_work_spi - flush outstanding TX work for SPI - * @ks: The device state - */ -static void ks8851_flush_tx_work_spi(struct ks8851_net *ks) -{ - struct ks8851_net_spi *kss = to_ks8851_spi(ks); - - flush_work(&kss->tx_work); -} - -/** - * ks8851_flush_tx_work - flush outstanding TX work - * @ks: The device state - */ -static void ks8851_flush_tx_work(struct ks8851_net *ks) -{ - if (ks->flush_tx_work) - ks->flush_tx_work(ks); -} - -/** - * ks8851_net_open - open network device - * @dev: The network device being opened. - * - * Called when the network device is marked active, such as a user executing - * 'ifconfig up' on the device. - */ -static int ks8851_net_open(struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - unsigned long flags; - int ret; - - ret = request_threaded_irq(dev->irq, NULL, ks8851_irq, - IRQF_TRIGGER_LOW | IRQF_ONESHOT, - dev->name, ks); - if (ret < 0) { - netdev_err(dev, "failed to get irq\n"); - return ret; - } - - /* lock the card, even if we may not actually be doing anything - * else at the moment */ - ks8851_lock(ks, &flags); - - netif_dbg(ks, ifup, ks->netdev, "opening\n"); - - /* bring chip out of any power saving mode it was in */ - ks8851_set_powermode(ks, PMECR_PM_NORMAL); - - /* issue a soft reset to the RX/TX QMU to put it into a known - * state. */ - ks8851_soft_reset(ks, GRR_QMU); - - /* setup transmission parameters */ - - ks8851_wrreg16(ks, KS_TXCR, (TXCR_TXE | /* enable transmit process */ - TXCR_TXPE | /* pad to min length */ - TXCR_TXCRC | /* add CRC */ - TXCR_TXFCE)); /* enable flow control */ - - /* auto-increment tx data, reset tx pointer */ - ks8851_wrreg16(ks, KS_TXFDPR, TXFDPR_TXFPAI); - - /* setup receiver control */ - - ks8851_wrreg16(ks, KS_RXCR1, (RXCR1_RXPAFMA | /* from mac filter */ - RXCR1_RXFCE | /* enable flow control */ - RXCR1_RXBE | /* broadcast enable */ - RXCR1_RXUE | /* unicast enable */ - RXCR1_RXE)); /* enable rx block */ - - /* transfer entire frames out in one go */ - ks8851_wrreg16(ks, KS_RXCR2, RXCR2_SRDBL_FRAME); - - /* set receive counter timeouts */ - ks8851_wrreg16(ks, KS_RXDTTR, 1000); /* 1ms after first frame to IRQ */ - ks8851_wrreg16(ks, KS_RXDBCTR, 4096); /* >4Kbytes in buffer to IRQ */ - ks8851_wrreg16(ks, KS_RXFCTR, 10); /* 10 frames to IRQ */ - - ks->rc_rxqcr = (RXQCR_RXFCTE | /* IRQ on frame count exceeded */ - RXQCR_RXDBCTE | /* IRQ on byte count exceeded */ - RXQCR_RXDTTE); /* IRQ on time exceeded */ - - ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); - - /* clear then enable interrupts */ - ks8851_wrreg16(ks, KS_ISR, ks->rc_ier); - ks8851_wrreg16(ks, KS_IER, ks->rc_ier); - - netif_start_queue(ks->netdev); - - netif_dbg(ks, ifup, ks->netdev, "network device up\n"); - - ks8851_unlock(ks, &flags); - mii_check_link(&ks->mii); - return 0; -} - -/** - * ks8851_net_stop - close network device - * @dev: The device being closed. - * - * Called to close down a network device which has been active. Cancell any - * work, shutdown the RX and TX process and then place the chip into a low - * power state whilst it is not being used. - */ -static int ks8851_net_stop(struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - unsigned long flags; - - netif_info(ks, ifdown, dev, "shutting down\n"); - - netif_stop_queue(dev); - - ks8851_lock(ks, &flags); - /* turn off the IRQs and ack any outstanding */ - ks8851_wrreg16(ks, KS_IER, 0x0000); - ks8851_wrreg16(ks, KS_ISR, 0xffff); - ks8851_unlock(ks, &flags); - - /* stop any outstanding work */ - ks8851_flush_tx_work(ks); - flush_work(&ks->rxctrl_work); - - ks8851_lock(ks, &flags); - /* shutdown RX process */ - ks8851_wrreg16(ks, KS_RXCR1, 0x0000); - - /* shutdown TX process */ - ks8851_wrreg16(ks, KS_TXCR, 0x0000); - - /* set powermode to soft power down to save power */ - ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN); - ks8851_unlock(ks, &flags); - - /* ensure any queued tx buffers are dumped */ - while (!skb_queue_empty(&ks->txq)) { - struct sk_buff *txb = skb_dequeue(&ks->txq); - - netif_dbg(ks, ifdown, ks->netdev, - "%s: freeing txb %p\n", __func__, txb); - - dev_kfree_skb(txb); - } - - free_irq(dev->irq, ks); - - return 0; -} - -/** - * ks8851_start_xmit_spi - transmit packet using SPI - * @skb: The buffer to transmit - * @dev: The device used to transmit the packet. - * - * Called by the network layer to transmit the @skb. Queue the packet for - * the device and schedule the necessary work to transmit the packet when - * it is free. - * - * We do this to firstly avoid sleeping with the network device locked, - * and secondly so we can round up more than one packet to transmit which - * means we can try and avoid generating too many transmit done interrupts. - */ -static netdev_tx_t ks8851_start_xmit_spi(struct sk_buff *skb, - struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - unsigned needed = calc_txlen(skb->len); - netdev_tx_t ret = NETDEV_TX_OK; - struct ks8851_net_spi *kss; - - kss = to_ks8851_spi(ks); - - netif_dbg(ks, tx_queued, ks->netdev, - "%s: skb %p, %d@%p\n", __func__, skb, skb->len, skb->data); - - spin_lock(&ks->statelock); - - if (needed > ks->tx_space) { - netif_stop_queue(dev); - ret = NETDEV_TX_BUSY; - } else { - ks->tx_space -= needed; - skb_queue_tail(&ks->txq, skb); - } - - spin_unlock(&ks->statelock); - schedule_work(&kss->tx_work); - - return ret; -} - -/** - * ks8851_start_xmit - transmit packet - * @skb: The buffer to transmit - * @dev: The device used to transmit the packet. - * - * Called by the network layer to transmit the @skb. Queue the packet for - * the device and schedule the necessary work to transmit the packet when - * it is free. - * - * We do this to firstly avoid sleeping with the network device locked, - * and secondly so we can round up more than one packet to transmit which - * means we can try and avoid generating too many transmit done interrupts. - */ -static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - - return ks->start_xmit(skb, dev); -} - -/** - * ks8851_rxctrl_work - work handler to change rx mode - * @work: The work structure this belongs to. - * - * Lock the device and issue the necessary changes to the receive mode from - * the network device layer. This is done so that we can do this without - * having to sleep whilst holding the network device lock. - * - * Since the recommendation from Micrel is that the RXQ is shutdown whilst the - * receive parameters are programmed, we issue a write to disable the RXQ and - * then wait for the interrupt handler to be triggered once the RXQ shutdown is - * complete. The interrupt handler then writes the new values into the chip. - */ -static void ks8851_rxctrl_work(struct work_struct *work) -{ - struct ks8851_net *ks = container_of(work, struct ks8851_net, rxctrl_work); - unsigned long flags; - - ks8851_lock(ks, &flags); - - /* need to shutdown RXQ before modifying filter parameters */ - ks8851_wrreg16(ks, KS_RXCR1, 0x00); - - ks8851_unlock(ks, &flags); -} - -static void ks8851_set_rx_mode(struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - struct ks8851_rxctrl rxctrl; - - memset(&rxctrl, 0, sizeof(rxctrl)); - - if (dev->flags & IFF_PROMISC) { - /* interface to receive everything */ - - rxctrl.rxcr1 = RXCR1_RXAE | RXCR1_RXINVF; - } else if (dev->flags & IFF_ALLMULTI) { - /* accept all multicast packets */ - - rxctrl.rxcr1 = (RXCR1_RXME | RXCR1_RXAE | - RXCR1_RXPAFMA | RXCR1_RXMAFMA); - } else if (dev->flags & IFF_MULTICAST && !netdev_mc_empty(dev)) { - struct netdev_hw_addr *ha; - u32 crc; - - /* accept some multicast */ - - netdev_for_each_mc_addr(ha, dev) { - crc = ether_crc(ETH_ALEN, ha->addr); - crc >>= (32 - 6); /* get top six bits */ - - rxctrl.mchash[crc >> 4] |= (1 << (crc & 0xf)); - } - - rxctrl.rxcr1 = RXCR1_RXME | RXCR1_RXPAFMA; - } else { - /* just accept broadcast / unicast */ - rxctrl.rxcr1 = RXCR1_RXPAFMA; - } - - rxctrl.rxcr1 |= (RXCR1_RXUE | /* unicast enable */ - RXCR1_RXBE | /* broadcast enable */ - RXCR1_RXE | /* RX process enable */ - RXCR1_RXFCE); /* enable flow control */ - - rxctrl.rxcr2 |= RXCR2_SRDBL_FRAME; - - /* schedule work to do the actual set of the data if needed */ - - spin_lock(&ks->statelock); - - if (memcmp(&rxctrl, &ks->rxctrl, sizeof(rxctrl)) != 0) { - memcpy(&ks->rxctrl, &rxctrl, sizeof(ks->rxctrl)); - schedule_work(&ks->rxctrl_work); - } - - spin_unlock(&ks->statelock); -} - -static int ks8851_set_mac_address(struct net_device *dev, void *addr) -{ - struct sockaddr *sa = addr; - - if (netif_running(dev)) - return -EBUSY; - - if (!is_valid_ether_addr(sa->sa_data)) - return -EADDRNOTAVAIL; - - memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN); - return ks8851_write_mac_addr(dev); -} - -static int ks8851_net_ioctl(struct net_device *dev, struct ifreq *req, int cmd) -{ - struct ks8851_net *ks = netdev_priv(dev); - - if (!netif_running(dev)) - return -EINVAL; - - return generic_mii_ioctl(&ks->mii, if_mii(req), cmd, NULL); -} - -static const struct net_device_ops ks8851_netdev_ops = { - .ndo_open = ks8851_net_open, - .ndo_stop = ks8851_net_stop, - .ndo_do_ioctl = ks8851_net_ioctl, - .ndo_start_xmit = ks8851_start_xmit, - .ndo_set_mac_address = ks8851_set_mac_address, - .ndo_set_rx_mode = ks8851_set_rx_mode, - .ndo_validate_addr = eth_validate_addr, -}; - -/* ethtool support */ - -static void ks8851_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *di) -{ - strlcpy(di->driver, "KS8851", sizeof(di->driver)); - strlcpy(di->version, "1.00", sizeof(di->version)); - strlcpy(di->bus_info, dev_name(dev->dev.parent), sizeof(di->bus_info)); -} - -static u32 ks8851_get_msglevel(struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - return ks->msg_enable; -} - -static void ks8851_set_msglevel(struct net_device *dev, u32 to) -{ - struct ks8851_net *ks = netdev_priv(dev); - ks->msg_enable = to; -} - -static int ks8851_get_link_ksettings(struct net_device *dev, - struct ethtool_link_ksettings *cmd) -{ - struct ks8851_net *ks = netdev_priv(dev); - - mii_ethtool_get_link_ksettings(&ks->mii, cmd); - - return 0; -} - -static int ks8851_set_link_ksettings(struct net_device *dev, - const struct ethtool_link_ksettings *cmd) -{ - struct ks8851_net *ks = netdev_priv(dev); - return mii_ethtool_set_link_ksettings(&ks->mii, cmd); -} - -static u32 ks8851_get_link(struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - return mii_link_ok(&ks->mii); -} - -static int ks8851_nway_reset(struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - return mii_nway_restart(&ks->mii); -} - -/* EEPROM support */ - -static void ks8851_eeprom_regread(struct eeprom_93cx6 *ee) -{ - struct ks8851_net *ks = ee->data; - unsigned val; - - val = ks8851_rdreg16(ks, KS_EEPCR); - - ee->reg_data_out = (val & EEPCR_EESB) ? 1 : 0; - ee->reg_data_clock = (val & EEPCR_EESCK) ? 1 : 0; - ee->reg_chip_select = (val & EEPCR_EECS) ? 1 : 0; -} - -static void ks8851_eeprom_regwrite(struct eeprom_93cx6 *ee) -{ - struct ks8851_net *ks = ee->data; - unsigned val = EEPCR_EESA; /* default - eeprom access on */ - - if (ee->drive_data) - val |= EEPCR_EESRWA; - if (ee->reg_data_in) - val |= EEPCR_EEDO; - if (ee->reg_data_clock) - val |= EEPCR_EESCK; - if (ee->reg_chip_select) - val |= EEPCR_EECS; - - ks8851_wrreg16(ks, KS_EEPCR, val); -} - -/** - * ks8851_eeprom_claim - claim device EEPROM and activate the interface - * @ks: The network device state. - * - * Check for the presence of an EEPROM, and then activate software access - * to the device. - */ -static int ks8851_eeprom_claim(struct ks8851_net *ks) -{ - /* start with clock low, cs high */ - ks8851_wrreg16(ks, KS_EEPCR, EEPCR_EESA | EEPCR_EECS); - return 0; -} - -/** - * ks8851_eeprom_release - release the EEPROM interface - * @ks: The device state - * - * Release the software access to the device EEPROM - */ -static void ks8851_eeprom_release(struct ks8851_net *ks) -{ - unsigned val = ks8851_rdreg16(ks, KS_EEPCR); - - ks8851_wrreg16(ks, KS_EEPCR, val & ~EEPCR_EESA); -} - -#define KS_EEPROM_MAGIC (0x00008851) - -static int ks8851_set_eeprom(struct net_device *dev, - struct ethtool_eeprom *ee, u8 *data) -{ - struct ks8851_net *ks = netdev_priv(dev); - int offset = ee->offset; - unsigned long flags; - int len = ee->len; - u16 tmp; - - /* currently only support byte writing */ - if (len != 1) - return -EINVAL; - - if (ee->magic != KS_EEPROM_MAGIC) - return -EINVAL; - - if (!(ks->rc_ccr & CCR_EEPROM)) - return -ENOENT; - - ks8851_lock(ks, &flags); - - ks8851_eeprom_claim(ks); - - eeprom_93cx6_wren(&ks->eeprom, true); - - /* ethtool currently only supports writing bytes, which means - * we have to read/modify/write our 16bit EEPROMs */ - - eeprom_93cx6_read(&ks->eeprom, offset/2, &tmp); - - if (offset & 1) { - tmp &= 0xff; - tmp |= *data << 8; - } else { - tmp &= 0xff00; - tmp |= *data; - } - - eeprom_93cx6_write(&ks->eeprom, offset/2, tmp); - eeprom_93cx6_wren(&ks->eeprom, false); - - ks8851_eeprom_release(ks); - ks8851_unlock(ks, &flags); - - return 0; -} - -static int ks8851_get_eeprom(struct net_device *dev, - struct ethtool_eeprom *ee, u8 *data) -{ - struct ks8851_net *ks = netdev_priv(dev); - int offset = ee->offset; - unsigned long flags; - int len = ee->len; - - /* must be 2 byte aligned */ - if (len & 1 || offset & 1) - return -EINVAL; - - if (!(ks->rc_ccr & CCR_EEPROM)) - return -ENOENT; - - ks8851_lock(ks, &flags); - - ks8851_eeprom_claim(ks); - - ee->magic = KS_EEPROM_MAGIC; - - eeprom_93cx6_multiread(&ks->eeprom, offset/2, (__le16 *)data, len/2); - ks8851_eeprom_release(ks); - ks8851_unlock(ks, &flags); - - return 0; -} - -static int ks8851_get_eeprom_len(struct net_device *dev) -{ - struct ks8851_net *ks = netdev_priv(dev); - - /* currently, we assume it is an 93C46 attached, so return 128 */ - return ks->rc_ccr & CCR_EEPROM ? 128 : 0; -} - -static const struct ethtool_ops ks8851_ethtool_ops = { - .get_drvinfo = ks8851_get_drvinfo, - .get_msglevel = ks8851_get_msglevel, - .set_msglevel = ks8851_set_msglevel, - .get_link = ks8851_get_link, - .nway_reset = ks8851_nway_reset, - .get_eeprom_len = ks8851_get_eeprom_len, - .get_eeprom = ks8851_get_eeprom, - .set_eeprom = ks8851_set_eeprom, - .get_link_ksettings = ks8851_get_link_ksettings, - .set_link_ksettings = ks8851_set_link_ksettings, -}; - -/* MII interface controls */ - -/** - * ks8851_phy_reg - convert MII register into a KS8851 register - * @reg: MII register number. - * - * Return the KS8851 register number for the corresponding MII PHY register - * if possible. Return zero if the MII register has no direct mapping to the - * KS8851 register set. - */ -static int ks8851_phy_reg(int reg) -{ - switch (reg) { - case MII_BMCR: - return KS_P1MBCR; - case MII_BMSR: - return KS_P1MBSR; - case MII_PHYSID1: - return KS_PHY1ILR; - case MII_PHYSID2: - return KS_PHY1IHR; - case MII_ADVERTISE: - return KS_P1ANAR; - case MII_LPA: - return KS_P1ANLPR; - } - - return 0x0; -} - -/** - * ks8851_phy_read - MII interface PHY register read. - * @dev: The network device the PHY is on. - * @phy_addr: Address of PHY (ignored as we only have one) - * @reg: The register to read. - * - * This call reads data from the PHY register specified in @reg. Since the - * device does not support all the MII registers, the non-existent values - * are always returned as zero. - * - * We return zero for unsupported registers as the MII code does not check - * the value returned for any error status, and simply returns it to the - * caller. The mii-tool that the driver was tested with takes any -ve error - * as real PHY capabilities, thus displaying incorrect data to the user. - */ -static int ks8851_phy_read(struct net_device *dev, int phy_addr, int reg) -{ - struct ks8851_net *ks = netdev_priv(dev); - unsigned long flags; - int ksreg; - int result; - - ksreg = ks8851_phy_reg(reg); - if (!ksreg) - return 0x0; /* no error return allowed, so use zero */ - - ks8851_lock(ks, &flags); - result = ks8851_rdreg16(ks, ksreg); - ks8851_unlock(ks, &flags); - - return result; -} - -static void ks8851_phy_write(struct net_device *dev, - int phy, int reg, int value) -{ - struct ks8851_net *ks = netdev_priv(dev); - unsigned long flags; - int ksreg; - - ksreg = ks8851_phy_reg(reg); - if (ksreg) { - ks8851_lock(ks, &flags); - ks8851_wrreg16(ks, ksreg, value); - ks8851_unlock(ks, &flags); - } -} - -/** - * ks8851_read_selftest - read the selftest memory info. - * @ks: The device state - * - * Read and check the TX/RX memory selftest information. - */ -static int ks8851_read_selftest(struct ks8851_net *ks) -{ - unsigned both_done = MBIR_TXMBF | MBIR_RXMBF; - int ret = 0; - unsigned rd; - - rd = ks8851_rdreg16(ks, KS_MBIR); - - if ((rd & both_done) != both_done) { - netdev_warn(ks->netdev, "Memory selftest not finished\n"); - return 0; - } - - if (rd & MBIR_TXMBFA) { - netdev_err(ks->netdev, "TX memory selftest fail\n"); - ret |= 1; - } - - if (rd & MBIR_RXMBFA) { - netdev_err(ks->netdev, "RX memory selftest fail\n"); - ret |= 2; - } - - return 0; -} - -/* driver bus management functions */ - -#ifdef CONFIG_PM_SLEEP - -static int ks8851_suspend(struct device *dev) -{ - struct ks8851_net *ks = dev_get_drvdata(dev); - struct net_device *netdev = ks->netdev; - - if (netif_running(netdev)) { - netif_device_detach(netdev); - ks8851_net_stop(netdev); - } - - return 0; -} - -static int ks8851_resume(struct device *dev) -{ - struct ks8851_net *ks = dev_get_drvdata(dev); - struct net_device *netdev = ks->netdev; - - if (netif_running(netdev)) { - ks8851_net_open(netdev); - netif_device_attach(netdev); - } - - return 0; -} -#endif - -static SIMPLE_DEV_PM_OPS(ks8851_pm_ops, ks8851_suspend, ks8851_resume); - -static int ks8851_probe_common(struct net_device *netdev, struct device *dev, - int msg_en) -{ - struct ks8851_net *ks = netdev_priv(netdev); - unsigned cider; - int gpio; - int ret; - - ks->netdev = netdev; - ks->tx_space = 6144; - - gpio = of_get_named_gpio_flags(dev->of_node, "reset-gpios", 0, NULL); - if (gpio == -EPROBE_DEFER) - return gpio; - - ks->gpio = gpio; - if (gpio_is_valid(gpio)) { - ret = devm_gpio_request_one(dev, gpio, - GPIOF_OUT_INIT_LOW, "ks8851_rst_n"); - if (ret) { - dev_err(dev, "reset gpio request failed\n"); - return ret; - } - } - - ks->vdd_io = devm_regulator_get(dev, "vdd-io"); - if (IS_ERR(ks->vdd_io)) { - ret = PTR_ERR(ks->vdd_io); - goto err_reg_io; - } - - ret = regulator_enable(ks->vdd_io); - if (ret) { - dev_err(dev, "regulator vdd_io enable fail: %d\n", ret); - goto err_reg_io; - } - - ks->vdd_reg = devm_regulator_get(dev, "vdd"); - if (IS_ERR(ks->vdd_reg)) { - ret = PTR_ERR(ks->vdd_reg); - goto err_reg; - } - - ret = regulator_enable(ks->vdd_reg); - if (ret) { - dev_err(dev, "regulator vdd enable fail: %d\n", ret); - goto err_reg; - } - - if (gpio_is_valid(gpio)) { - usleep_range(10000, 11000); - gpio_set_value(gpio, 1); - } - - spin_lock_init(&ks->statelock); - - INIT_WORK(&ks->rxctrl_work, ks8851_rxctrl_work); - - /* setup EEPROM state */ - ks->eeprom.data = ks; - ks->eeprom.width = PCI_EEPROM_WIDTH_93C46; - ks->eeprom.register_read = ks8851_eeprom_regread; - ks->eeprom.register_write = ks8851_eeprom_regwrite; - - /* setup mii state */ - ks->mii.dev = netdev; - ks->mii.phy_id = 1, - ks->mii.phy_id_mask = 1; - ks->mii.reg_num_mask = 0xf; - ks->mii.mdio_read = ks8851_phy_read; - ks->mii.mdio_write = ks8851_phy_write; - - dev_info(dev, "message enable is %d\n", msg_en); - - /* set the default message enable */ - ks->msg_enable = netif_msg_init(msg_en, NETIF_MSG_DRV | - NETIF_MSG_PROBE | - NETIF_MSG_LINK); - - skb_queue_head_init(&ks->txq); - - netdev->ethtool_ops = &ks8851_ethtool_ops; - SET_NETDEV_DEV(netdev, dev); - - dev_set_drvdata(dev, ks); - - netif_carrier_off(ks->netdev); - netdev->if_port = IF_PORT_100BASET; - netdev->netdev_ops = &ks8851_netdev_ops; - - /* issue a global soft reset to reset the device. */ - ks8851_soft_reset(ks, GRR_GSR); - - /* simple check for a valid chip being connected to the bus */ - cider = ks8851_rdreg16(ks, KS_CIDER); - if ((cider & ~CIDER_REV_MASK) != CIDER_ID) { - dev_err(dev, "failed to read device ID\n"); - ret = -ENODEV; - goto err_id; - } - - /* cache the contents of the CCR register for EEPROM, etc. */ - ks->rc_ccr = ks8851_rdreg16(ks, KS_CCR); - - ks8851_read_selftest(ks); - ks8851_init_mac(ks, dev->of_node); - - ret = register_netdev(netdev); - if (ret) { - dev_err(dev, "failed to register network device\n"); - goto err_netdev; - } - - netdev_info(netdev, "revision %d, MAC %pM, IRQ %d, %s EEPROM\n", - CIDER_REV_GET(cider), netdev->dev_addr, netdev->irq, - ks->rc_ccr & CCR_EEPROM ? "has" : "no"); - - return 0; - -err_netdev: -err_id: - if (gpio_is_valid(gpio)) - gpio_set_value(gpio, 0); - regulator_disable(ks->vdd_reg); -err_reg: - regulator_disable(ks->vdd_io); -err_reg_io: - return ret; -} - -static int ks8851_remove_common(struct device *dev) -{ - struct ks8851_net *priv = dev_get_drvdata(dev); - - if (netif_msg_drv(priv)) - dev_info(dev, "remove\n"); - - unregister_netdev(priv->netdev); - if (gpio_is_valid(priv->gpio)) - gpio_set_value(priv->gpio, 0); - regulator_disable(priv->vdd_reg); - regulator_disable(priv->vdd_io); - - return 0; -} - -static int ks8851_probe(struct spi_device *spi) -{ - struct device *dev = &spi->dev; - struct ks8851_net_spi *kss; - struct net_device *netdev; - struct ks8851_net *ks; - - netdev = devm_alloc_etherdev(dev, sizeof(struct ks8851_net_spi)); - if (!netdev) - return -ENOMEM; - - spi->bits_per_word = 8; - - ks = netdev_priv(netdev); - - ks->lock = ks8851_lock_spi; - ks->unlock = ks8851_unlock_spi; - ks->rdreg16 = ks8851_rdreg16_spi; - ks->wrreg16 = ks8851_wrreg16_spi; - ks->rdfifo = ks8851_rdfifo_spi; - ks->wrfifo = ks8851_wrpkt_spi; - ks->start_xmit = ks8851_start_xmit_spi; - ks->rx_skb = ks8851_rx_skb_spi; - ks->flush_tx_work = ks8851_flush_tx_work_spi; - -#define STD_IRQ (IRQ_LCI | /* Link Change */ \ - IRQ_TXI | /* TX done */ \ - IRQ_RXI | /* RX done */ \ - IRQ_SPIBEI | /* SPI bus error */ \ - IRQ_TXPSI | /* TX process stop */ \ - IRQ_RXPSI) /* RX process stop */ - ks->rc_ier = STD_IRQ; - - kss = to_ks8851_spi(ks); - - kss->spidev = spi; - mutex_init(&kss->lock); - INIT_WORK(&kss->tx_work, ks8851_tx_work); - - /* initialise pre-made spi transfer messages */ - spi_message_init(&kss->spi_msg1); - spi_message_add_tail(&kss->spi_xfer1, &kss->spi_msg1); - - spi_message_init(&kss->spi_msg2); - spi_message_add_tail(&kss->spi_xfer2[0], &kss->spi_msg2); - spi_message_add_tail(&kss->spi_xfer2[1], &kss->spi_msg2); - - netdev->irq = spi->irq; - - return ks8851_probe_common(netdev, dev, msg_enable); -} - -static int ks8851_remove(struct spi_device *spi) -{ - return ks8851_remove_common(&spi->dev); -} - -static const struct of_device_id ks8851_match_table[] = { - { .compatible = "micrel,ks8851" }, - { } -}; -MODULE_DEVICE_TABLE(of, ks8851_match_table); - -static struct spi_driver ks8851_driver = { - .driver = { - .name = "ks8851", - .of_match_table = ks8851_match_table, - .pm = &ks8851_pm_ops, - }, - .probe = ks8851_probe, - .remove = ks8851_remove, -}; -module_spi_driver(ks8851_driver); - -MODULE_DESCRIPTION("KS8851 Network driver"); -MODULE_AUTHOR("Ben Dooks "); -MODULE_LICENSE("GPL"); - -module_param_named(message, msg_enable, int, 0); -MODULE_PARM_DESC(message, "Message verbosity level (0=none, 31=all)"); -MODULE_ALIAS("spi:ks8851"); diff --git a/drivers/net/ethernet/micrel/ks8851.h b/drivers/net/ethernet/micrel/ks8851.h index f210d18a10b5..2b319e451121 100644 --- a/drivers/net/ethernet/micrel/ks8851.h +++ b/drivers/net/ethernet/micrel/ks8851.h @@ -7,6 +7,11 @@ * KS8851 register definitions */ +#ifndef __KS8851_H__ +#define __KS8851_H__ + +#include + #define KS_CCR 0x08 #define CCR_LE (1 << 10) /* KSZ8851-16MLL */ #define CCR_EEPROM (1 << 9) @@ -300,3 +305,147 @@ #define TXFR_TXIC (1 << 15) #define TXFR_TXFID_MASK (0x3f << 0) #define TXFR_TXFID_SHIFT (0) + +/** + * struct ks8851_rxctrl - KS8851 driver rx control + * @mchash: Multicast hash-table data. + * @rxcr1: KS_RXCR1 register setting + * @rxcr2: KS_RXCR2 register setting + * + * Representation of the settings needs to control the receive filtering + * such as the multicast hash-filter and the receive register settings. This + * is used to make the job of working out if the receive settings change and + * then issuing the new settings to the worker that will send the necessary + * commands. + */ +struct ks8851_rxctrl { + u16 mchash[4]; + u16 rxcr1; + u16 rxcr2; +}; + +/** + * union ks8851_tx_hdr - tx header data + * @txb: The header as bytes + * @txw: The header as 16bit, little-endian words + * + * A dual representation of the tx header data to allow + * access to individual bytes, and to allow 16bit accesses + * with 16bit alignment. + */ +union ks8851_tx_hdr { + u8 txb[6]; + __le16 txw[3]; +}; + +/** + * struct ks8851_net - KS8851 driver private data + * @netdev: The network device we're bound to + * @statelock: Lock on this structure for tx list. + * @mii: The MII state information for the mii calls. + * @rxctrl: RX settings for @rxctrl_work. + * @rxctrl_work: Work queue for updating RX mode and multicast lists + * @txq: Queue of packets for transmission. + * @txh: Space for generating packet TX header in DMA-able data + * @rxd: Space for receiving SPI data, in DMA-able space. + * @txd: Space for transmitting SPI data, in DMA-able space. + * @msg_enable: The message flags controlling driver output (see ethtool). + * @fid: Incrementing frame id tag. + * @rc_ier: Cached copy of KS_IER. + * @rc_ccr: Cached copy of KS_CCR. + * @rc_rxqcr: Cached copy of KS_RXQCR. + * @eeprom: 93CX6 EEPROM state for accessing on-board EEPROM. + * @vdd_reg: Optional regulator supplying the chip + * @vdd_io: Optional digital power supply for IO + * @gpio: Optional reset_n gpio + * @lock: Bus access lock callback + * @unlock: Bus access unlock callback + * @rdreg16: 16bit register read callback + * @wrreg16: 16bit register write callback + * @rdfifo: FIFO read callback + * @wrfifo: FIFO write callback + * @start_xmit: start_xmit() implementation callback + * @rx_skb: rx_skb() implementation callback + * @flush_tx_work: flush_tx_work() implementation callback + * + * The @statelock is used to protect information in the structure which may + * need to be accessed via several sources, such as the network driver layer + * or one of the work queues. + * + * We align the buffers we may use for rx/tx to ensure that if the SPI driver + * wants to DMA map them, it will not have any problems with data the driver + * modifies. + */ +struct ks8851_net { + struct net_device *netdev; + spinlock_t statelock; + + union ks8851_tx_hdr txh ____cacheline_aligned; + u8 rxd[8]; + u8 txd[8]; + + u32 msg_enable ____cacheline_aligned; + u16 tx_space; + u8 fid; + + u16 rc_ier; + u16 rc_rxqcr; + u16 rc_ccr; + + struct mii_if_info mii; + struct ks8851_rxctrl rxctrl; + + struct work_struct rxctrl_work; + + struct sk_buff_head txq; + + struct eeprom_93cx6 eeprom; + struct regulator *vdd_reg; + struct regulator *vdd_io; + int gpio; + + void (*lock)(struct ks8851_net *ks, + unsigned long *flags); + void (*unlock)(struct ks8851_net *ks, + unsigned long *flags); + unsigned int (*rdreg16)(struct ks8851_net *ks, + unsigned int reg); + void (*wrreg16)(struct ks8851_net *ks, + unsigned int reg, unsigned int val); + void (*rdfifo)(struct ks8851_net *ks, u8 *buff, + unsigned int len); + void (*wrfifo)(struct ks8851_net *ks, + struct sk_buff *txp, bool irq); + netdev_tx_t (*start_xmit)(struct sk_buff *skb, + struct net_device *dev); + void (*rx_skb)(struct ks8851_net *ks, + struct sk_buff *skb); + void (*flush_tx_work)(struct ks8851_net *ks); +}; + +int ks8851_probe_common(struct net_device *netdev, struct device *dev, + int msg_en); +int ks8851_remove_common(struct device *dev); +int ks8851_suspend(struct device *dev); +int ks8851_resume(struct device *dev); + +static __maybe_unused SIMPLE_DEV_PM_OPS(ks8851_pm_ops, + ks8851_suspend, ks8851_resume); + +/** + * ks8851_done_tx - update and then free skbuff after transmitting + * @ks: The device state + * @txb: The buffer transmitted + */ +static void __maybe_unused ks8851_done_tx(struct ks8851_net *ks, + struct sk_buff *txb) +{ + struct net_device *dev = ks->netdev; + + dev->stats.tx_bytes += txb->len; + dev->stats.tx_packets++; + + dev_kfree_skb(txb); +} + +#endif /* __KS8851_H__ */ diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c new file mode 100644 index 000000000000..d65872172229 --- /dev/null +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -0,0 +1,1193 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* drivers/net/ethernet/micrel/ks8851.c + * + * Copyright 2009 Simtec Electronics + * http://www.simtec.co.uk/ + * Ben Dooks + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "ks8851.h" + +/** + * ks8851_lock - register access lock + * @ks: The chip state + * @flags: Spinlock flags + * + * Claim chip register access lock + */ +static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags) +{ + ks->lock(ks, flags); +} + +/** + * ks8851_unlock - register access unlock + * @ks: The chip state + * @flags: Spinlock flags + * + * Release chip register access lock + */ +static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags) +{ + ks->unlock(ks, flags); +} + +/** + * ks8851_wrreg16 - write 16bit register value to chip + * @ks: The chip state + * @reg: The register address + * @val: The value to write + * + * Issue a write to put the value @val into the register specified in @reg. + */ +static void ks8851_wrreg16(struct ks8851_net *ks, unsigned int reg, + unsigned int val) +{ + ks->wrreg16(ks, reg, val); +} + +/** + * ks8851_rdreg16 - read 16 bit register from device + * @ks: The chip information + * @reg: The register address + * + * Read a 16bit register from the chip, returning the result + */ +static unsigned int ks8851_rdreg16(struct ks8851_net *ks, + unsigned int reg) +{ + return ks->rdreg16(ks, reg); +} + +/** + * ks8851_soft_reset - issue one of the soft reset to the device + * @ks: The device state. + * @op: The bit(s) to set in the GRR + * + * Issue the relevant soft-reset command to the device's GRR register + * specified by @op. + * + * Note, the delays are in there as a caution to ensure that the reset + * has time to take effect and then complete. Since the datasheet does + * not currently specify the exact sequence, we have chosen something + * that seems to work with our device. + */ +static void ks8851_soft_reset(struct ks8851_net *ks, unsigned op) +{ + ks8851_wrreg16(ks, KS_GRR, op); + mdelay(1); /* wait a short time to effect reset */ + ks8851_wrreg16(ks, KS_GRR, 0); + mdelay(1); /* wait for condition to clear */ +} + +/** + * ks8851_set_powermode - set power mode of the device + * @ks: The device state + * @pwrmode: The power mode value to write to KS_PMECR. + * + * Change the power mode of the chip. + */ +static void ks8851_set_powermode(struct ks8851_net *ks, unsigned pwrmode) +{ + unsigned pmecr; + + netif_dbg(ks, hw, ks->netdev, "setting power mode %d\n", pwrmode); + + pmecr = ks8851_rdreg16(ks, KS_PMECR); + pmecr &= ~PMECR_PM_MASK; + pmecr |= pwrmode; + + ks8851_wrreg16(ks, KS_PMECR, pmecr); +} + +/** + * ks8851_write_mac_addr - write mac address to device registers + * @dev: The network device + * + * Update the KS8851 MAC address registers from the address in @dev. + * + * This call assumes that the chip is not running, so there is no need to + * shutdown the RXQ process whilst setting this. +*/ +static int ks8851_write_mac_addr(struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; + u16 val; + int i; + + ks8851_lock(ks, &flags); + + /* + * Wake up chip in case it was powered off when stopped; otherwise, + * the first write to the MAC address does not take effect. + */ + ks8851_set_powermode(ks, PMECR_PM_NORMAL); + + for (i = 0; i < ETH_ALEN; i += 2) { + val = (dev->dev_addr[i] << 8) | dev->dev_addr[i + 1]; + ks8851_wrreg16(ks, KS_MAR(i), val); + } + + if (!netif_running(dev)) + ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN); + + ks8851_unlock(ks, &flags); + + return 0; +} + +/** + * ks8851_read_mac_addr - read mac address from device registers + * @dev: The network device + * + * Update our copy of the KS8851 MAC address from the registers of @dev. +*/ +static void ks8851_read_mac_addr(struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; + u16 reg; + int i; + + ks8851_lock(ks, &flags); + + for (i = 0; i < ETH_ALEN; i += 2) { + reg = ks8851_rdreg16(ks, KS_MAR(i)); + dev->dev_addr[i] = reg >> 8; + dev->dev_addr[i + 1] = reg & 0xff; + } + + ks8851_unlock(ks, &flags); +} + +/** + * ks8851_init_mac - initialise the mac address + * @ks: The device structure + * @np: The device node pointer + * + * Get or create the initial mac address for the device and then set that + * into the station address register. A mac address supplied in the device + * tree takes precedence. Otherwise, if there is an EEPROM present, then + * we try that. If no valid mac address is found we use eth_random_addr() + * to create a new one. + */ +static void ks8851_init_mac(struct ks8851_net *ks, struct device_node *np) +{ + struct net_device *dev = ks->netdev; + const u8 *mac_addr; + + mac_addr = of_get_mac_address(np); + if (!IS_ERR(mac_addr)) { + ether_addr_copy(dev->dev_addr, mac_addr); + ks8851_write_mac_addr(dev); + return; + } + + if (ks->rc_ccr & CCR_EEPROM) { + ks8851_read_mac_addr(dev); + if (is_valid_ether_addr(dev->dev_addr)) + return; + + netdev_err(ks->netdev, "invalid mac address read %pM\n", + dev->dev_addr); + } + + eth_hw_addr_random(dev); + ks8851_write_mac_addr(dev); +} + +/** + * ks8851_dbg_dumpkkt - dump initial packet contents to debug + * @ks: The device state + * @rxpkt: The data for the received packet + * + * Dump the initial data from the packet to dev_dbg(). + */ +static void ks8851_dbg_dumpkkt(struct ks8851_net *ks, u8 *rxpkt) +{ + netdev_dbg(ks->netdev, + "pkt %02x%02x%02x%02x %02x%02x%02x%02x %02x%02x%02x%02x\n", + rxpkt[4], rxpkt[5], rxpkt[6], rxpkt[7], + rxpkt[8], rxpkt[9], rxpkt[10], rxpkt[11], + rxpkt[12], rxpkt[13], rxpkt[14], rxpkt[15]); +} + +/** + * ks8851_rx_skb - receive skbuff + * @ks: The device state. + * @skb: The skbuff + */ +static void ks8851_rx_skb(struct ks8851_net *ks, struct sk_buff *skb) +{ + ks->rx_skb(ks, skb); +} + +/** + * ks8851_rx_pkts - receive packets from the host + * @ks: The device information. + * + * This is called from the IRQ work queue when the system detects that there + * are packets in the receive queue. Find out how many packets there are and + * read them from the FIFO. + */ +static void ks8851_rx_pkts(struct ks8851_net *ks) +{ + struct sk_buff *skb; + unsigned rxfc; + unsigned rxlen; + unsigned rxstat; + u8 *rxpkt; + + rxfc = (ks8851_rdreg16(ks, KS_RXFCTR) >> 8) & 0xff; + + netif_dbg(ks, rx_status, ks->netdev, + "%s: %d packets\n", __func__, rxfc); + + /* Currently we're issuing a read per packet, but we could possibly + * improve the code by issuing a single read, getting the receive + * header, allocating the packet and then reading the packet data + * out in one go. + * + * This form of operation would require us to hold the SPI bus' + * chipselect low during the entie transaction to avoid any + * reset to the data stream coming from the chip. + */ + + for (; rxfc != 0; rxfc--) { + rxstat = ks8851_rdreg16(ks, KS_RXFHSR); + rxlen = ks8851_rdreg16(ks, KS_RXFHBCR) & RXFHBCR_CNT_MASK; + + netif_dbg(ks, rx_status, ks->netdev, + "rx: stat 0x%04x, len 0x%04x\n", rxstat, rxlen); + + /* the length of the packet includes the 32bit CRC */ + + /* set dma read address */ + ks8851_wrreg16(ks, KS_RXFDPR, RXFDPR_RXFPAI | 0x00); + + /* start DMA access */ + ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); + + if (rxlen > 4) { + unsigned int rxalign; + + rxlen -= 4; + rxalign = ALIGN(rxlen, 4); + skb = netdev_alloc_skb_ip_align(ks->netdev, rxalign); + if (skb) { + + /* 4 bytes of status header + 4 bytes of + * garbage: we put them before ethernet + * header, so that they are copied, + * but ignored. + */ + + rxpkt = skb_put(skb, rxlen) - 8; + + ks->rdfifo(ks, rxpkt, rxalign + 8); + + if (netif_msg_pktdata(ks)) + ks8851_dbg_dumpkkt(ks, rxpkt); + + skb->protocol = eth_type_trans(skb, ks->netdev); + ks8851_rx_skb(ks, skb); + + ks->netdev->stats.rx_packets++; + ks->netdev->stats.rx_bytes += rxlen; + } + } + + /* end DMA access and dequeue packet */ + ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_RRXEF); + } +} + +/** + * ks8851_irq - IRQ handler for dealing with interrupt requests + * @irq: IRQ number + * @_ks: cookie + * + * This handler is invoked when the IRQ line asserts to find out what happened. + * As we cannot allow ourselves to sleep in HARDIRQ context, this handler runs + * in thread context. + * + * Read the interrupt status, work out what needs to be done and then clear + * any of the interrupts that are not needed. + */ +static irqreturn_t ks8851_irq(int irq, void *_ks) +{ + struct ks8851_net *ks = _ks; + unsigned handled = 0; + unsigned long flags; + unsigned int status; + + ks8851_lock(ks, &flags); + + status = ks8851_rdreg16(ks, KS_ISR); + + netif_dbg(ks, intr, ks->netdev, + "%s: status 0x%04x\n", __func__, status); + + if (status & IRQ_LCI) + handled |= IRQ_LCI; + + if (status & IRQ_LDI) { + u16 pmecr = ks8851_rdreg16(ks, KS_PMECR); + pmecr &= ~PMECR_WKEVT_MASK; + ks8851_wrreg16(ks, KS_PMECR, pmecr | PMECR_WKEVT_LINK); + + handled |= IRQ_LDI; + } + + if (status & IRQ_RXPSI) + handled |= IRQ_RXPSI; + + if (status & IRQ_TXI) { + handled |= IRQ_TXI; + + /* no lock here, tx queue should have been stopped */ + + /* update our idea of how much tx space is available to the + * system */ + ks->tx_space = ks8851_rdreg16(ks, KS_TXMIR); + + netif_dbg(ks, intr, ks->netdev, + "%s: txspace %d\n", __func__, ks->tx_space); + } + + if (status & IRQ_RXI) + handled |= IRQ_RXI; + + if (status & IRQ_SPIBEI) { + netdev_err(ks->netdev, "%s: spi bus error\n", __func__); + handled |= IRQ_SPIBEI; + } + + ks8851_wrreg16(ks, KS_ISR, handled); + + if (status & IRQ_RXI) { + /* the datasheet says to disable the rx interrupt during + * packet read-out, however we're masking the interrupt + * from the device so do not bother masking just the RX + * from the device. */ + + ks8851_rx_pkts(ks); + } + + /* if something stopped the rx process, probably due to wanting + * to change the rx settings, then do something about restarting + * it. */ + if (status & IRQ_RXPSI) { + struct ks8851_rxctrl *rxc = &ks->rxctrl; + + /* update the multicast hash table */ + ks8851_wrreg16(ks, KS_MAHTR0, rxc->mchash[0]); + ks8851_wrreg16(ks, KS_MAHTR1, rxc->mchash[1]); + ks8851_wrreg16(ks, KS_MAHTR2, rxc->mchash[2]); + ks8851_wrreg16(ks, KS_MAHTR3, rxc->mchash[3]); + + ks8851_wrreg16(ks, KS_RXCR2, rxc->rxcr2); + ks8851_wrreg16(ks, KS_RXCR1, rxc->rxcr1); + } + + ks8851_unlock(ks, &flags); + + if (status & IRQ_LCI) + mii_check_link(&ks->mii); + + if (status & IRQ_TXI) + netif_wake_queue(ks->netdev); + + return IRQ_HANDLED; +} + +/** + * ks8851_flush_tx_work - flush outstanding TX work + * @ks: The device state + */ +static void ks8851_flush_tx_work(struct ks8851_net *ks) +{ + if (ks->flush_tx_work) + ks->flush_tx_work(ks); +} + +/** + * ks8851_net_open - open network device + * @dev: The network device being opened. + * + * Called when the network device is marked active, such as a user executing + * 'ifconfig up' on the device. + */ +static int ks8851_net_open(struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; + int ret; + + ret = request_threaded_irq(dev->irq, NULL, ks8851_irq, + IRQF_TRIGGER_LOW | IRQF_ONESHOT, + dev->name, ks); + if (ret < 0) { + netdev_err(dev, "failed to get irq\n"); + return ret; + } + + /* lock the card, even if we may not actually be doing anything + * else at the moment */ + ks8851_lock(ks, &flags); + + netif_dbg(ks, ifup, ks->netdev, "opening\n"); + + /* bring chip out of any power saving mode it was in */ + ks8851_set_powermode(ks, PMECR_PM_NORMAL); + + /* issue a soft reset to the RX/TX QMU to put it into a known + * state. */ + ks8851_soft_reset(ks, GRR_QMU); + + /* setup transmission parameters */ + + ks8851_wrreg16(ks, KS_TXCR, (TXCR_TXE | /* enable transmit process */ + TXCR_TXPE | /* pad to min length */ + TXCR_TXCRC | /* add CRC */ + TXCR_TXFCE)); /* enable flow control */ + + /* auto-increment tx data, reset tx pointer */ + ks8851_wrreg16(ks, KS_TXFDPR, TXFDPR_TXFPAI); + + /* setup receiver control */ + + ks8851_wrreg16(ks, KS_RXCR1, (RXCR1_RXPAFMA | /* from mac filter */ + RXCR1_RXFCE | /* enable flow control */ + RXCR1_RXBE | /* broadcast enable */ + RXCR1_RXUE | /* unicast enable */ + RXCR1_RXE)); /* enable rx block */ + + /* transfer entire frames out in one go */ + ks8851_wrreg16(ks, KS_RXCR2, RXCR2_SRDBL_FRAME); + + /* set receive counter timeouts */ + ks8851_wrreg16(ks, KS_RXDTTR, 1000); /* 1ms after first frame to IRQ */ + ks8851_wrreg16(ks, KS_RXDBCTR, 4096); /* >4Kbytes in buffer to IRQ */ + ks8851_wrreg16(ks, KS_RXFCTR, 10); /* 10 frames to IRQ */ + + ks->rc_rxqcr = (RXQCR_RXFCTE | /* IRQ on frame count exceeded */ + RXQCR_RXDBCTE | /* IRQ on byte count exceeded */ + RXQCR_RXDTTE); /* IRQ on time exceeded */ + + ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); + + /* clear then enable interrupts */ + ks8851_wrreg16(ks, KS_ISR, ks->rc_ier); + ks8851_wrreg16(ks, KS_IER, ks->rc_ier); + + netif_start_queue(ks->netdev); + + netif_dbg(ks, ifup, ks->netdev, "network device up\n"); + + ks8851_unlock(ks, &flags); + mii_check_link(&ks->mii); + return 0; +} + +/** + * ks8851_net_stop - close network device + * @dev: The device being closed. + * + * Called to close down a network device which has been active. Cancell any + * work, shutdown the RX and TX process and then place the chip into a low + * power state whilst it is not being used. + */ +static int ks8851_net_stop(struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; + + netif_info(ks, ifdown, dev, "shutting down\n"); + + netif_stop_queue(dev); + + ks8851_lock(ks, &flags); + /* turn off the IRQs and ack any outstanding */ + ks8851_wrreg16(ks, KS_IER, 0x0000); + ks8851_wrreg16(ks, KS_ISR, 0xffff); + ks8851_unlock(ks, &flags); + + /* stop any outstanding work */ + ks8851_flush_tx_work(ks); + flush_work(&ks->rxctrl_work); + + ks8851_lock(ks, &flags); + /* shutdown RX process */ + ks8851_wrreg16(ks, KS_RXCR1, 0x0000); + + /* shutdown TX process */ + ks8851_wrreg16(ks, KS_TXCR, 0x0000); + + /* set powermode to soft power down to save power */ + ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN); + ks8851_unlock(ks, &flags); + + /* ensure any queued tx buffers are dumped */ + while (!skb_queue_empty(&ks->txq)) { + struct sk_buff *txb = skb_dequeue(&ks->txq); + + netif_dbg(ks, ifdown, ks->netdev, + "%s: freeing txb %p\n", __func__, txb); + + dev_kfree_skb(txb); + } + + free_irq(dev->irq, ks); + + return 0; +} + +/** + * ks8851_start_xmit - transmit packet + * @skb: The buffer to transmit + * @dev: The device used to transmit the packet. + * + * Called by the network layer to transmit the @skb. Queue the packet for + * the device and schedule the necessary work to transmit the packet when + * it is free. + * + * We do this to firstly avoid sleeping with the network device locked, + * and secondly so we can round up more than one packet to transmit which + * means we can try and avoid generating too many transmit done interrupts. + */ +static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + + return ks->start_xmit(skb, dev); +} + +/** + * ks8851_rxctrl_work - work handler to change rx mode + * @work: The work structure this belongs to. + * + * Lock the device and issue the necessary changes to the receive mode from + * the network device layer. This is done so that we can do this without + * having to sleep whilst holding the network device lock. + * + * Since the recommendation from Micrel is that the RXQ is shutdown whilst the + * receive parameters are programmed, we issue a write to disable the RXQ and + * then wait for the interrupt handler to be triggered once the RXQ shutdown is + * complete. The interrupt handler then writes the new values into the chip. + */ +static void ks8851_rxctrl_work(struct work_struct *work) +{ + struct ks8851_net *ks = container_of(work, struct ks8851_net, rxctrl_work); + unsigned long flags; + + ks8851_lock(ks, &flags); + + /* need to shutdown RXQ before modifying filter parameters */ + ks8851_wrreg16(ks, KS_RXCR1, 0x00); + + ks8851_unlock(ks, &flags); +} + +static void ks8851_set_rx_mode(struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + struct ks8851_rxctrl rxctrl; + + memset(&rxctrl, 0, sizeof(rxctrl)); + + if (dev->flags & IFF_PROMISC) { + /* interface to receive everything */ + + rxctrl.rxcr1 = RXCR1_RXAE | RXCR1_RXINVF; + } else if (dev->flags & IFF_ALLMULTI) { + /* accept all multicast packets */ + + rxctrl.rxcr1 = (RXCR1_RXME | RXCR1_RXAE | + RXCR1_RXPAFMA | RXCR1_RXMAFMA); + } else if (dev->flags & IFF_MULTICAST && !netdev_mc_empty(dev)) { + struct netdev_hw_addr *ha; + u32 crc; + + /* accept some multicast */ + + netdev_for_each_mc_addr(ha, dev) { + crc = ether_crc(ETH_ALEN, ha->addr); + crc >>= (32 - 6); /* get top six bits */ + + rxctrl.mchash[crc >> 4] |= (1 << (crc & 0xf)); + } + + rxctrl.rxcr1 = RXCR1_RXME | RXCR1_RXPAFMA; + } else { + /* just accept broadcast / unicast */ + rxctrl.rxcr1 = RXCR1_RXPAFMA; + } + + rxctrl.rxcr1 |= (RXCR1_RXUE | /* unicast enable */ + RXCR1_RXBE | /* broadcast enable */ + RXCR1_RXE | /* RX process enable */ + RXCR1_RXFCE); /* enable flow control */ + + rxctrl.rxcr2 |= RXCR2_SRDBL_FRAME; + + /* schedule work to do the actual set of the data if needed */ + + spin_lock(&ks->statelock); + + if (memcmp(&rxctrl, &ks->rxctrl, sizeof(rxctrl)) != 0) { + memcpy(&ks->rxctrl, &rxctrl, sizeof(ks->rxctrl)); + schedule_work(&ks->rxctrl_work); + } + + spin_unlock(&ks->statelock); +} + +static int ks8851_set_mac_address(struct net_device *dev, void *addr) +{ + struct sockaddr *sa = addr; + + if (netif_running(dev)) + return -EBUSY; + + if (!is_valid_ether_addr(sa->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN); + return ks8851_write_mac_addr(dev); +} + +static int ks8851_net_ioctl(struct net_device *dev, struct ifreq *req, int cmd) +{ + struct ks8851_net *ks = netdev_priv(dev); + + if (!netif_running(dev)) + return -EINVAL; + + return generic_mii_ioctl(&ks->mii, if_mii(req), cmd, NULL); +} + +static const struct net_device_ops ks8851_netdev_ops = { + .ndo_open = ks8851_net_open, + .ndo_stop = ks8851_net_stop, + .ndo_do_ioctl = ks8851_net_ioctl, + .ndo_start_xmit = ks8851_start_xmit, + .ndo_set_mac_address = ks8851_set_mac_address, + .ndo_set_rx_mode = ks8851_set_rx_mode, + .ndo_validate_addr = eth_validate_addr, +}; + +/* ethtool support */ + +static void ks8851_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *di) +{ + strlcpy(di->driver, "KS8851", sizeof(di->driver)); + strlcpy(di->version, "1.00", sizeof(di->version)); + strlcpy(di->bus_info, dev_name(dev->dev.parent), sizeof(di->bus_info)); +} + +static u32 ks8851_get_msglevel(struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + return ks->msg_enable; +} + +static void ks8851_set_msglevel(struct net_device *dev, u32 to) +{ + struct ks8851_net *ks = netdev_priv(dev); + ks->msg_enable = to; +} + +static int ks8851_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct ks8851_net *ks = netdev_priv(dev); + + mii_ethtool_get_link_ksettings(&ks->mii, cmd); + + return 0; +} + +static int ks8851_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) +{ + struct ks8851_net *ks = netdev_priv(dev); + return mii_ethtool_set_link_ksettings(&ks->mii, cmd); +} + +static u32 ks8851_get_link(struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + return mii_link_ok(&ks->mii); +} + +static int ks8851_nway_reset(struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + return mii_nway_restart(&ks->mii); +} + +/* EEPROM support */ + +static void ks8851_eeprom_regread(struct eeprom_93cx6 *ee) +{ + struct ks8851_net *ks = ee->data; + unsigned val; + + val = ks8851_rdreg16(ks, KS_EEPCR); + + ee->reg_data_out = (val & EEPCR_EESB) ? 1 : 0; + ee->reg_data_clock = (val & EEPCR_EESCK) ? 1 : 0; + ee->reg_chip_select = (val & EEPCR_EECS) ? 1 : 0; +} + +static void ks8851_eeprom_regwrite(struct eeprom_93cx6 *ee) +{ + struct ks8851_net *ks = ee->data; + unsigned val = EEPCR_EESA; /* default - eeprom access on */ + + if (ee->drive_data) + val |= EEPCR_EESRWA; + if (ee->reg_data_in) + val |= EEPCR_EEDO; + if (ee->reg_data_clock) + val |= EEPCR_EESCK; + if (ee->reg_chip_select) + val |= EEPCR_EECS; + + ks8851_wrreg16(ks, KS_EEPCR, val); +} + +/** + * ks8851_eeprom_claim - claim device EEPROM and activate the interface + * @ks: The network device state. + * + * Check for the presence of an EEPROM, and then activate software access + * to the device. + */ +static int ks8851_eeprom_claim(struct ks8851_net *ks) +{ + /* start with clock low, cs high */ + ks8851_wrreg16(ks, KS_EEPCR, EEPCR_EESA | EEPCR_EECS); + return 0; +} + +/** + * ks8851_eeprom_release - release the EEPROM interface + * @ks: The device state + * + * Release the software access to the device EEPROM + */ +static void ks8851_eeprom_release(struct ks8851_net *ks) +{ + unsigned val = ks8851_rdreg16(ks, KS_EEPCR); + + ks8851_wrreg16(ks, KS_EEPCR, val & ~EEPCR_EESA); +} + +#define KS_EEPROM_MAGIC (0x00008851) + +static int ks8851_set_eeprom(struct net_device *dev, + struct ethtool_eeprom *ee, u8 *data) +{ + struct ks8851_net *ks = netdev_priv(dev); + int offset = ee->offset; + unsigned long flags; + int len = ee->len; + u16 tmp; + + /* currently only support byte writing */ + if (len != 1) + return -EINVAL; + + if (ee->magic != KS_EEPROM_MAGIC) + return -EINVAL; + + if (!(ks->rc_ccr & CCR_EEPROM)) + return -ENOENT; + + ks8851_lock(ks, &flags); + + ks8851_eeprom_claim(ks); + + eeprom_93cx6_wren(&ks->eeprom, true); + + /* ethtool currently only supports writing bytes, which means + * we have to read/modify/write our 16bit EEPROMs */ + + eeprom_93cx6_read(&ks->eeprom, offset/2, &tmp); + + if (offset & 1) { + tmp &= 0xff; + tmp |= *data << 8; + } else { + tmp &= 0xff00; + tmp |= *data; + } + + eeprom_93cx6_write(&ks->eeprom, offset/2, tmp); + eeprom_93cx6_wren(&ks->eeprom, false); + + ks8851_eeprom_release(ks); + ks8851_unlock(ks, &flags); + + return 0; +} + +static int ks8851_get_eeprom(struct net_device *dev, + struct ethtool_eeprom *ee, u8 *data) +{ + struct ks8851_net *ks = netdev_priv(dev); + int offset = ee->offset; + unsigned long flags; + int len = ee->len; + + /* must be 2 byte aligned */ + if (len & 1 || offset & 1) + return -EINVAL; + + if (!(ks->rc_ccr & CCR_EEPROM)) + return -ENOENT; + + ks8851_lock(ks, &flags); + + ks8851_eeprom_claim(ks); + + ee->magic = KS_EEPROM_MAGIC; + + eeprom_93cx6_multiread(&ks->eeprom, offset/2, (__le16 *)data, len/2); + ks8851_eeprom_release(ks); + ks8851_unlock(ks, &flags); + + return 0; +} + +static int ks8851_get_eeprom_len(struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + + /* currently, we assume it is an 93C46 attached, so return 128 */ + return ks->rc_ccr & CCR_EEPROM ? 128 : 0; +} + +static const struct ethtool_ops ks8851_ethtool_ops = { + .get_drvinfo = ks8851_get_drvinfo, + .get_msglevel = ks8851_get_msglevel, + .set_msglevel = ks8851_set_msglevel, + .get_link = ks8851_get_link, + .nway_reset = ks8851_nway_reset, + .get_eeprom_len = ks8851_get_eeprom_len, + .get_eeprom = ks8851_get_eeprom, + .set_eeprom = ks8851_set_eeprom, + .get_link_ksettings = ks8851_get_link_ksettings, + .set_link_ksettings = ks8851_set_link_ksettings, +}; + +/* MII interface controls */ + +/** + * ks8851_phy_reg - convert MII register into a KS8851 register + * @reg: MII register number. + * + * Return the KS8851 register number for the corresponding MII PHY register + * if possible. Return zero if the MII register has no direct mapping to the + * KS8851 register set. + */ +static int ks8851_phy_reg(int reg) +{ + switch (reg) { + case MII_BMCR: + return KS_P1MBCR; + case MII_BMSR: + return KS_P1MBSR; + case MII_PHYSID1: + return KS_PHY1ILR; + case MII_PHYSID2: + return KS_PHY1IHR; + case MII_ADVERTISE: + return KS_P1ANAR; + case MII_LPA: + return KS_P1ANLPR; + } + + return 0x0; +} + +/** + * ks8851_phy_read - MII interface PHY register read. + * @dev: The network device the PHY is on. + * @phy_addr: Address of PHY (ignored as we only have one) + * @reg: The register to read. + * + * This call reads data from the PHY register specified in @reg. Since the + * device does not support all the MII registers, the non-existent values + * are always returned as zero. + * + * We return zero for unsupported registers as the MII code does not check + * the value returned for any error status, and simply returns it to the + * caller. The mii-tool that the driver was tested with takes any -ve error + * as real PHY capabilities, thus displaying incorrect data to the user. + */ +static int ks8851_phy_read(struct net_device *dev, int phy_addr, int reg) +{ + struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; + int ksreg; + int result; + + ksreg = ks8851_phy_reg(reg); + if (!ksreg) + return 0x0; /* no error return allowed, so use zero */ + + ks8851_lock(ks, &flags); + result = ks8851_rdreg16(ks, ksreg); + ks8851_unlock(ks, &flags); + + return result; +} + +static void ks8851_phy_write(struct net_device *dev, + int phy, int reg, int value) +{ + struct ks8851_net *ks = netdev_priv(dev); + unsigned long flags; + int ksreg; + + ksreg = ks8851_phy_reg(reg); + if (ksreg) { + ks8851_lock(ks, &flags); + ks8851_wrreg16(ks, ksreg, value); + ks8851_unlock(ks, &flags); + } +} + +/** + * ks8851_read_selftest - read the selftest memory info. + * @ks: The device state + * + * Read and check the TX/RX memory selftest information. + */ +static int ks8851_read_selftest(struct ks8851_net *ks) +{ + unsigned both_done = MBIR_TXMBF | MBIR_RXMBF; + int ret = 0; + unsigned rd; + + rd = ks8851_rdreg16(ks, KS_MBIR); + + if ((rd & both_done) != both_done) { + netdev_warn(ks->netdev, "Memory selftest not finished\n"); + return 0; + } + + if (rd & MBIR_TXMBFA) { + netdev_err(ks->netdev, "TX memory selftest fail\n"); + ret |= 1; + } + + if (rd & MBIR_RXMBFA) { + netdev_err(ks->netdev, "RX memory selftest fail\n"); + ret |= 2; + } + + return 0; +} + +/* driver bus management functions */ + +#ifdef CONFIG_PM_SLEEP + +int ks8851_suspend(struct device *dev) +{ + struct ks8851_net *ks = dev_get_drvdata(dev); + struct net_device *netdev = ks->netdev; + + if (netif_running(netdev)) { + netif_device_detach(netdev); + ks8851_net_stop(netdev); + } + + return 0; +} + +int ks8851_resume(struct device *dev) +{ + struct ks8851_net *ks = dev_get_drvdata(dev); + struct net_device *netdev = ks->netdev; + + if (netif_running(netdev)) { + ks8851_net_open(netdev); + netif_device_attach(netdev); + } + + return 0; +} +#endif + +int ks8851_probe_common(struct net_device *netdev, struct device *dev, + int msg_en) +{ + struct ks8851_net *ks = netdev_priv(netdev); + unsigned cider; + int gpio; + int ret; + + ks->netdev = netdev; + ks->tx_space = 6144; + + gpio = of_get_named_gpio_flags(dev->of_node, "reset-gpios", 0, NULL); + if (gpio == -EPROBE_DEFER) + return gpio; + + ks->gpio = gpio; + if (gpio_is_valid(gpio)) { + ret = devm_gpio_request_one(dev, gpio, + GPIOF_OUT_INIT_LOW, "ks8851_rst_n"); + if (ret) { + dev_err(dev, "reset gpio request failed\n"); + return ret; + } + } + + ks->vdd_io = devm_regulator_get(dev, "vdd-io"); + if (IS_ERR(ks->vdd_io)) { + ret = PTR_ERR(ks->vdd_io); + goto err_reg_io; + } + + ret = regulator_enable(ks->vdd_io); + if (ret) { + dev_err(dev, "regulator vdd_io enable fail: %d\n", ret); + goto err_reg_io; + } + + ks->vdd_reg = devm_regulator_get(dev, "vdd"); + if (IS_ERR(ks->vdd_reg)) { + ret = PTR_ERR(ks->vdd_reg); + goto err_reg; + } + + ret = regulator_enable(ks->vdd_reg); + if (ret) { + dev_err(dev, "regulator vdd enable fail: %d\n", ret); + goto err_reg; + } + + if (gpio_is_valid(gpio)) { + usleep_range(10000, 11000); + gpio_set_value(gpio, 1); + } + + spin_lock_init(&ks->statelock); + + INIT_WORK(&ks->rxctrl_work, ks8851_rxctrl_work); + + /* setup EEPROM state */ + ks->eeprom.data = ks; + ks->eeprom.width = PCI_EEPROM_WIDTH_93C46; + ks->eeprom.register_read = ks8851_eeprom_regread; + ks->eeprom.register_write = ks8851_eeprom_regwrite; + + /* setup mii state */ + ks->mii.dev = netdev; + ks->mii.phy_id = 1, + ks->mii.phy_id_mask = 1; + ks->mii.reg_num_mask = 0xf; + ks->mii.mdio_read = ks8851_phy_read; + ks->mii.mdio_write = ks8851_phy_write; + + dev_info(dev, "message enable is %d\n", msg_en); + + /* set the default message enable */ + ks->msg_enable = netif_msg_init(msg_en, NETIF_MSG_DRV | + NETIF_MSG_PROBE | + NETIF_MSG_LINK); + + skb_queue_head_init(&ks->txq); + + netdev->ethtool_ops = &ks8851_ethtool_ops; + SET_NETDEV_DEV(netdev, dev); + + dev_set_drvdata(dev, ks); + + netif_carrier_off(ks->netdev); + netdev->if_port = IF_PORT_100BASET; + netdev->netdev_ops = &ks8851_netdev_ops; + + /* issue a global soft reset to reset the device. */ + ks8851_soft_reset(ks, GRR_GSR); + + /* simple check for a valid chip being connected to the bus */ + cider = ks8851_rdreg16(ks, KS_CIDER); + if ((cider & ~CIDER_REV_MASK) != CIDER_ID) { + dev_err(dev, "failed to read device ID\n"); + ret = -ENODEV; + goto err_id; + } + + /* cache the contents of the CCR register for EEPROM, etc. */ + ks->rc_ccr = ks8851_rdreg16(ks, KS_CCR); + + ks8851_read_selftest(ks); + ks8851_init_mac(ks, dev->of_node); + + ret = register_netdev(netdev); + if (ret) { + dev_err(dev, "failed to register network device\n"); + goto err_netdev; + } + + netdev_info(netdev, "revision %d, MAC %pM, IRQ %d, %s EEPROM\n", + CIDER_REV_GET(cider), netdev->dev_addr, netdev->irq, + ks->rc_ccr & CCR_EEPROM ? "has" : "no"); + + return 0; + +err_netdev: +err_id: + if (gpio_is_valid(gpio)) + gpio_set_value(gpio, 0); + regulator_disable(ks->vdd_reg); +err_reg: + regulator_disable(ks->vdd_io); +err_reg_io: + return ret; +} + +int ks8851_remove_common(struct device *dev) +{ + struct ks8851_net *priv = dev_get_drvdata(dev); + + if (netif_msg_drv(priv)) + dev_info(dev, "remove\n"); + + unregister_netdev(priv->netdev); + if (gpio_is_valid(priv->gpio)) + gpio_set_value(priv->gpio, 0); + regulator_disable(priv->vdd_reg); + regulator_disable(priv->vdd_io); + + return 0; +} diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c new file mode 100644 index 000000000000..4ec7f1615977 --- /dev/null +++ b/drivers/net/ethernet/micrel/ks8851_spi.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* drivers/net/ethernet/micrel/ks8851.c + * + * Copyright 2009 Simtec Electronics + * http://www.simtec.co.uk/ + * Ben Dooks + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "ks8851.h" + +static int msg_enable; + +/** + * struct ks8851_net_spi - KS8851 SPI driver private data + * @lock: Lock to ensure that the device is not accessed when busy. + * @tx_work: Work queue for tx packets + * @ks8851: KS8851 driver common private data + * @spidev: The spi device we're bound to. + * @spi_msg1: pre-setup SPI transfer with one message, @spi_xfer1. + * @spi_msg2: pre-setup SPI transfer with two messages, @spi_xfer2. + * @spi_xfer1: @spi_msg1 SPI transfer structure + * @spi_xfer2: @spi_msg2 SPI transfer structure + * + * The @lock ensures that the chip is protected when certain operations are + * in progress. When the read or write packet transfer is in progress, most + * of the chip registers are not ccessible until the transfer is finished and + * the DMA has been de-asserted. + */ +struct ks8851_net_spi { + struct ks8851_net ks8851; + struct mutex lock; + struct work_struct tx_work; + struct spi_device *spidev; + struct spi_message spi_msg1; + struct spi_message spi_msg2; + struct spi_transfer spi_xfer1; + struct spi_transfer spi_xfer2[2]; +}; + +#define to_ks8851_spi(ks) container_of((ks), struct ks8851_net_spi, ks8851) + +/* SPI frame opcodes */ +#define KS_SPIOP_RD 0x00 +#define KS_SPIOP_WR 0x40 +#define KS_SPIOP_RXFIFO 0x80 +#define KS_SPIOP_TXFIFO 0xC0 + +/* shift for byte-enable data */ +#define BYTE_EN(_x) ((_x) << 2) + +/* turn register number and byte-enable mask into data for start of packet */ +#define MK_OP(_byteen, _reg) \ + (BYTE_EN(_byteen) | (_reg) << (8 + 2) | (_reg) >> 6) + +/** + * ks8851_lock_spi - register access lock + * @ks: The chip state + * @flags: Spinlock flags + * + * Claim chip register access lock + */ +static void ks8851_lock_spi(struct ks8851_net *ks, unsigned long *flags) +{ + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + + mutex_lock(&kss->lock); +} + +/** + * ks8851_unlock_spi - register access unlock + * @ks: The chip state + * @flags: Spinlock flags + * + * Release chip register access lock + */ +static void ks8851_unlock_spi(struct ks8851_net *ks, unsigned long *flags) +{ + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + + mutex_unlock(&kss->lock); +} + +/* SPI register read/write calls. + * + * All these calls issue SPI transactions to access the chip's registers. They + * all require that the necessary lock is held to prevent accesses when the + * chip is busy transferring packet data (RX/TX FIFO accesses). + */ + +/** + * ks8851_wrreg16_spi - write 16bit register value to chip via SPI + * @ks: The chip state + * @reg: The register address + * @val: The value to write + * + * Issue a write to put the value @val into the register specified in @reg. + */ +static void ks8851_wrreg16_spi(struct ks8851_net *ks, unsigned int reg, + unsigned int val) +{ + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + struct spi_transfer *xfer = &kss->spi_xfer1; + struct spi_message *msg = &kss->spi_msg1; + __le16 txb[2]; + int ret; + + txb[0] = cpu_to_le16(MK_OP(reg & 2 ? 0xC : 0x03, reg) | KS_SPIOP_WR); + txb[1] = cpu_to_le16(val); + + xfer->tx_buf = txb; + xfer->rx_buf = NULL; + xfer->len = 4; + + ret = spi_sync(kss->spidev, msg); + if (ret < 0) + netdev_err(ks->netdev, "spi_sync() failed\n"); +} + +/** + * ks8851_rdreg - issue read register command and return the data + * @ks: The device state + * @op: The register address and byte enables in message format. + * @rxb: The RX buffer to return the result into + * @rxl: The length of data expected. + * + * This is the low level read call that issues the necessary spi message(s) + * to read data from the register specified in @op. + */ +static void ks8851_rdreg(struct ks8851_net *ks, unsigned int op, + u8 *rxb, unsigned int rxl) +{ + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + struct spi_transfer *xfer; + struct spi_message *msg; + __le16 *txb = (__le16 *)ks->txd; + u8 *trx = ks->rxd; + int ret; + + txb[0] = cpu_to_le16(op | KS_SPIOP_RD); + + if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) { + msg = &kss->spi_msg2; + xfer = kss->spi_xfer2; + + xfer->tx_buf = txb; + xfer->rx_buf = NULL; + xfer->len = 2; + + xfer++; + xfer->tx_buf = NULL; + xfer->rx_buf = trx; + xfer->len = rxl; + } else { + msg = &kss->spi_msg1; + xfer = &kss->spi_xfer1; + + xfer->tx_buf = txb; + xfer->rx_buf = trx; + xfer->len = rxl + 2; + } + + ret = spi_sync(kss->spidev, msg); + if (ret < 0) + netdev_err(ks->netdev, "read: spi_sync() failed\n"); + else if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) + memcpy(rxb, trx, rxl); + else + memcpy(rxb, trx + 2, rxl); +} + +/** + * ks8851_rdreg16_spi - read 16 bit register from device via SPI + * @ks: The chip information + * @reg: The register address + * + * Read a 16bit register from the chip, returning the result + */ +static unsigned int ks8851_rdreg16_spi(struct ks8851_net *ks, unsigned int reg) +{ + __le16 rx = 0; + + ks8851_rdreg(ks, MK_OP(reg & 2 ? 0xC : 0x3, reg), (u8 *)&rx, 2); + return le16_to_cpu(rx); +} + +/** + * ks8851_rdfifo_spi - read data from the receive fifo via SPI + * @ks: The device state. + * @buff: The buffer address + * @len: The length of the data to read + * + * Issue an RXQ FIFO read command and read the @len amount of data from + * the FIFO into the buffer specified by @buff. + */ +static void ks8851_rdfifo_spi(struct ks8851_net *ks, u8 *buff, unsigned int len) +{ + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + struct spi_transfer *xfer = kss->spi_xfer2; + struct spi_message *msg = &kss->spi_msg2; + u8 txb[1]; + int ret; + + netif_dbg(ks, rx_status, ks->netdev, + "%s: %d@%p\n", __func__, len, buff); + + /* set the operation we're issuing */ + txb[0] = KS_SPIOP_RXFIFO; + + xfer->tx_buf = txb; + xfer->rx_buf = NULL; + xfer->len = 1; + + xfer++; + xfer->rx_buf = buff; + xfer->tx_buf = NULL; + xfer->len = len; + + ret = spi_sync(kss->spidev, msg); + if (ret < 0) + netdev_err(ks->netdev, "%s: spi_sync() failed\n", __func__); +} + +/** + * ks8851_wrfifo_spi - write packet to TX FIFO via SPI + * @ks: The device state. + * @txp: The sk_buff to transmit. + * @irq: IRQ on completion of the packet. + * + * Send the @txp to the chip. This means creating the relevant packet header + * specifying the length of the packet and the other information the chip + * needs, such as IRQ on completion. Send the header and the packet data to + * the device. + */ +static void ks8851_wrfifo_spi(struct ks8851_net *ks, struct sk_buff *txp, + bool irq) +{ + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + struct spi_transfer *xfer = kss->spi_xfer2; + struct spi_message *msg = &kss->spi_msg2; + unsigned int fid = 0; + int ret; + + netif_dbg(ks, tx_queued, ks->netdev, "%s: skb %p, %d@%p, irq %d\n", + __func__, txp, txp->len, txp->data, irq); + + fid = ks->fid++; + fid &= TXFR_TXFID_MASK; + + if (irq) + fid |= TXFR_TXIC; /* irq on completion */ + + /* start header at txb[1] to align txw entries */ + ks->txh.txb[1] = KS_SPIOP_TXFIFO; + ks->txh.txw[1] = cpu_to_le16(fid); + ks->txh.txw[2] = cpu_to_le16(txp->len); + + xfer->tx_buf = &ks->txh.txb[1]; + xfer->rx_buf = NULL; + xfer->len = 5; + + xfer++; + xfer->tx_buf = txp->data; + xfer->rx_buf = NULL; + xfer->len = ALIGN(txp->len, 4); + + ret = spi_sync(kss->spidev, msg); + if (ret < 0) + netdev_err(ks->netdev, "%s: spi_sync() failed\n", __func__); +} + +/** + * ks8851_rx_skb_spi - receive skbuff + * @ks: The device state + * @skb: The skbuff + */ +static void ks8851_rx_skb_spi(struct ks8851_net *ks, struct sk_buff *skb) +{ + netif_rx_ni(skb); +} + +/** + * ks8851_tx_work - process tx packet(s) + * @work: The work strucutre what was scheduled. + * + * This is called when a number of packets have been scheduled for + * transmission and need to be sent to the device. + */ +static void ks8851_tx_work(struct work_struct *work) +{ + struct ks8851_net_spi *kss; + struct ks8851_net *ks; + unsigned long flags; + struct sk_buff *txb; + bool last; + + kss = container_of(work, struct ks8851_net_spi, tx_work); + ks = &kss->ks8851; + last = skb_queue_empty(&ks->txq); + + ks8851_lock_spi(ks, &flags); + + while (!last) { + txb = skb_dequeue(&ks->txq); + last = skb_queue_empty(&ks->txq); + + if (txb) { + ks8851_wrreg16_spi(ks, KS_RXQCR, + ks->rc_rxqcr | RXQCR_SDA); + ks8851_wrfifo_spi(ks, txb, last); + ks8851_wrreg16_spi(ks, KS_RXQCR, ks->rc_rxqcr); + ks8851_wrreg16_spi(ks, KS_TXQCR, TXQCR_METFE); + + ks8851_done_tx(ks, txb); + } + } + + ks8851_unlock_spi(ks, &flags); +} + +/** + * ks8851_flush_tx_work_spi - flush outstanding TX work + * @ks: The device state + */ +static void ks8851_flush_tx_work_spi(struct ks8851_net *ks) +{ + struct ks8851_net_spi *kss = to_ks8851_spi(ks); + + flush_work(&kss->tx_work); +} + +/** + * calc_txlen - calculate size of message to send packet + * @len: Length of data + * + * Returns the size of the TXFIFO message needed to send + * this packet. + */ +static unsigned int calc_txlen(unsigned int len) +{ + return ALIGN(len + 4, 4); +} + +/** + * ks8851_start_xmit_spi - transmit packet using SPI + * @skb: The buffer to transmit + * @dev: The device used to transmit the packet. + * + * Called by the network layer to transmit the @skb. Queue the packet for + * the device and schedule the necessary work to transmit the packet when + * it is free. + * + * We do this to firstly avoid sleeping with the network device locked, + * and secondly so we can round up more than one packet to transmit which + * means we can try and avoid generating too many transmit done interrupts. + */ +static netdev_tx_t ks8851_start_xmit_spi(struct sk_buff *skb, + struct net_device *dev) +{ + unsigned int needed = calc_txlen(skb->len); + struct ks8851_net *ks = netdev_priv(dev); + netdev_tx_t ret = NETDEV_TX_OK; + struct ks8851_net_spi *kss; + + kss = to_ks8851_spi(ks); + + netif_dbg(ks, tx_queued, ks->netdev, + "%s: skb %p, %d@%p\n", __func__, skb, skb->len, skb->data); + + spin_lock(&ks->statelock); + + if (needed > ks->tx_space) { + netif_stop_queue(dev); + ret = NETDEV_TX_BUSY; + } else { + ks->tx_space -= needed; + skb_queue_tail(&ks->txq, skb); + } + + spin_unlock(&ks->statelock); + schedule_work(&kss->tx_work); + + return ret; +} + +static int ks8851_probe_spi(struct spi_device *spi) +{ + struct device *dev = &spi->dev; + struct ks8851_net_spi *kss; + struct net_device *netdev; + struct ks8851_net *ks; + + netdev = devm_alloc_etherdev(dev, sizeof(struct ks8851_net_spi)); + if (!netdev) + return -ENOMEM; + + spi->bits_per_word = 8; + + ks = netdev_priv(netdev); + + ks->lock = ks8851_lock_spi; + ks->unlock = ks8851_unlock_spi; + ks->rdreg16 = ks8851_rdreg16_spi; + ks->wrreg16 = ks8851_wrreg16_spi; + ks->rdfifo = ks8851_rdfifo_spi; + ks->wrfifo = ks8851_wrfifo_spi; + ks->start_xmit = ks8851_start_xmit_spi; + ks->rx_skb = ks8851_rx_skb_spi; + ks->flush_tx_work = ks8851_flush_tx_work_spi; + +#define STD_IRQ (IRQ_LCI | /* Link Change */ \ + IRQ_TXI | /* TX done */ \ + IRQ_RXI | /* RX done */ \ + IRQ_SPIBEI | /* SPI bus error */ \ + IRQ_TXPSI | /* TX process stop */ \ + IRQ_RXPSI) /* RX process stop */ + ks->rc_ier = STD_IRQ; + + kss = to_ks8851_spi(ks); + + kss->spidev = spi; + mutex_init(&kss->lock); + INIT_WORK(&kss->tx_work, ks8851_tx_work); + + /* initialise pre-made spi transfer messages */ + spi_message_init(&kss->spi_msg1); + spi_message_add_tail(&kss->spi_xfer1, &kss->spi_msg1); + + spi_message_init(&kss->spi_msg2); + spi_message_add_tail(&kss->spi_xfer2[0], &kss->spi_msg2); + spi_message_add_tail(&kss->spi_xfer2[1], &kss->spi_msg2); + + netdev->irq = spi->irq; + + return ks8851_probe_common(netdev, dev, msg_enable); +} + +static int ks8851_remove_spi(struct spi_device *spi) +{ + return ks8851_remove_common(&spi->dev); +} + +static const struct of_device_id ks8851_match_table[] = { + { .compatible = "micrel,ks8851" }, + { } +}; +MODULE_DEVICE_TABLE(of, ks8851_match_table); + +static struct spi_driver ks8851_driver = { + .driver = { + .name = "ks8851", + .of_match_table = ks8851_match_table, + .pm = &ks8851_pm_ops, + }, + .probe = ks8851_probe_spi, + .remove = ks8851_remove_spi, +}; +module_spi_driver(ks8851_driver); + +MODULE_DESCRIPTION("KS8851 Network driver"); +MODULE_AUTHOR("Ben Dooks "); +MODULE_LICENSE("GPL"); + +module_param_named(message, msg_enable, int, 0); +MODULE_PARM_DESC(message, "Message verbosity level (0=none, 31=all)"); +MODULE_ALIAS("spi:ks8851"); -- cgit v1.2.3-59-g8ed1b From 797047f875b5463719cc70ba213eb691d453c946 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:45 +0200 Subject: net: ks8851: Implement Parallel bus operations Implement accessors for KS8851-16MLL/MLLI/MLLU parallel bus variant of the KS8851. This is based off the ks8851_mll.c , which is a driver for exactly the same hardware, however the ks8851.c code is much higher quality. Hence, this patch pulls out the relevant information from the ks8851_mll.c on how to access the bus, but uses the common ks8851.c code. To make this patch reviewable, instead of rewriting ks8851_mll.c, ks8851_mll.c is removed in a separate subsequent patch. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/Kconfig | 2 + drivers/net/ethernet/micrel/Makefile | 1 + drivers/net/ethernet/micrel/ks8851_par.c | 357 +++++++++++++++++++++++++++++++ 3 files changed, 360 insertions(+) create mode 100644 drivers/net/ethernet/micrel/ks8851_par.c diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig index b9c4d48e28e4..09f35209d43d 100644 --- a/drivers/net/ethernet/micrel/Kconfig +++ b/drivers/net/ethernet/micrel/Kconfig @@ -38,6 +38,8 @@ config KS8851_MLL tristate "Micrel KS8851 MLL" depends on HAS_IOMEM select MII + select CRC32 + select EEPROM_93CX6 ---help--- This platform driver is for Micrel KS8851 Address/data bus multiplexed network chip. diff --git a/drivers/net/ethernet/micrel/Makefile b/drivers/net/ethernet/micrel/Makefile index c7a4725c2e95..5cc00d22c708 100644 --- a/drivers/net/ethernet/micrel/Makefile +++ b/drivers/net/ethernet/micrel/Makefile @@ -7,4 +7,5 @@ obj-$(CONFIG_KS8842) += ks8842.o obj-$(CONFIG_KS8851) += ks8851.o ks8851-objs = ks8851_common.o ks8851_spi.o obj-$(CONFIG_KS8851_MLL) += ks8851_mll.o +ks8851_mll-objs = ks8851_common.o ks8851_par.o obj-$(CONFIG_KSZ884X_PCI) += ksz884x.o diff --git a/drivers/net/ethernet/micrel/ks8851_par.c b/drivers/net/ethernet/micrel/ks8851_par.c new file mode 100644 index 000000000000..3bab0cb2b1a5 --- /dev/null +++ b/drivers/net/ethernet/micrel/ks8851_par.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* drivers/net/ethernet/micrel/ks8851.c + * + * Copyright 2009 Simtec Electronics + * http://www.simtec.co.uk/ + * Ben Dooks + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ks8851.h" + +static int msg_enable; + +#define BE3 0x8000 /* Byte Enable 3 */ +#define BE2 0x4000 /* Byte Enable 2 */ +#define BE1 0x2000 /* Byte Enable 1 */ +#define BE0 0x1000 /* Byte Enable 0 */ + +/** + * struct ks8851_net_par - KS8851 Parallel driver private data + * @ks8851: KS8851 driver common private data + * @lock: Lock to ensure that the device is not accessed when busy. + * @hw_addr : start address of data register. + * @hw_addr_cmd : start address of command register. + * @cmd_reg_cache : command register cached. + * + * The @lock ensures that the chip is protected when certain operations are + * in progress. When the read or write packet transfer is in progress, most + * of the chip registers are not accessible until the transfer is finished + * and the DMA has been de-asserted. + */ +struct ks8851_net_par { + struct ks8851_net ks8851; + spinlock_t lock; + void __iomem *hw_addr; + void __iomem *hw_addr_cmd; + u16 cmd_reg_cache; +}; + +#define to_ks8851_par(ks) container_of((ks), struct ks8851_net_par, ks8851) + +/** + * ks8851_lock_par - register access lock + * @ks: The chip state + * @flags: Spinlock flags + * + * Claim chip register access lock + */ +static void ks8851_lock_par(struct ks8851_net *ks, unsigned long *flags) +{ + struct ks8851_net_par *ksp = to_ks8851_par(ks); + + spin_lock_irqsave(&ksp->lock, *flags); +} + +/** + * ks8851_unlock_par - register access unlock + * @ks: The chip state + * @flags: Spinlock flags + * + * Release chip register access lock + */ +static void ks8851_unlock_par(struct ks8851_net *ks, unsigned long *flags) +{ + struct ks8851_net_par *ksp = to_ks8851_par(ks); + + spin_unlock_irqrestore(&ksp->lock, *flags); +} + +/** + * ks_check_endian - Check whether endianness of the bus is correct + * @ks : The chip information + * + * The KS8851-16MLL EESK pin allows selecting the endianness of the 16bit + * bus. To maintain optimum performance, the bus endianness should be set + * such that it matches the endianness of the CPU. + */ +static int ks_check_endian(struct ks8851_net *ks) +{ + struct ks8851_net_par *ksp = to_ks8851_par(ks); + u16 cider; + + /* + * Read CIDER register first, however read it the "wrong" way around. + * If the endian strap on the KS8851-16MLL in incorrect and the chip + * is operating in different endianness than the CPU, then the meaning + * of BE[3:0] byte-enable bits is also swapped such that: + * BE[3,2,1,0] becomes BE[1,0,3,2] + * + * Luckily for us, the byte-enable bits are the top four MSbits of + * the address register and the CIDER register is at offset 0xc0. + * Hence, by reading address 0xc0c0, which is not impacted by endian + * swapping, we assert either BE[3:2] or BE[1:0] while reading the + * CIDER register. + * + * If the bus configuration is correct, reading 0xc0c0 asserts + * BE[3:2] and this read returns 0x0000, because to read register + * with bottom two LSbits of address set to 0, BE[1:0] must be + * asserted. + * + * If the bus configuration is NOT correct, reading 0xc0c0 asserts + * BE[1:0] and this read returns non-zero 0x8872 value. + */ + iowrite16(BE3 | BE2 | KS_CIDER, ksp->hw_addr_cmd); + cider = ioread16(ksp->hw_addr); + if (!cider) + return 0; + + netdev_err(ks->netdev, "incorrect EESK endian strap setting\n"); + + return -EINVAL; +} + +/** + * ks8851_wrreg16_par - write 16bit register value to chip + * @ks: The chip state + * @reg: The register address + * @val: The value to write + * + * Issue a write to put the value @val into the register specified in @reg. + */ +static void ks8851_wrreg16_par(struct ks8851_net *ks, unsigned int reg, + unsigned int val) +{ + struct ks8851_net_par *ksp = to_ks8851_par(ks); + + ksp->cmd_reg_cache = (u16)reg | ((BE1 | BE0) << (reg & 0x02)); + iowrite16(ksp->cmd_reg_cache, ksp->hw_addr_cmd); + iowrite16(val, ksp->hw_addr); +} + +/** + * ks8851_rdreg16_par - read 16 bit register from chip + * @ks: The chip information + * @reg: The register address + * + * Read a 16bit register from the chip, returning the result + */ +static unsigned int ks8851_rdreg16_par(struct ks8851_net *ks, unsigned int reg) +{ + struct ks8851_net_par *ksp = to_ks8851_par(ks); + + ksp->cmd_reg_cache = (u16)reg | ((BE1 | BE0) << (reg & 0x02)); + iowrite16(ksp->cmd_reg_cache, ksp->hw_addr_cmd); + return ioread16(ksp->hw_addr); +} + +/** + * ks8851_rdfifo_par - read data from the receive fifo + * @ks: The device state. + * @buff: The buffer address + * @len: The length of the data to read + * + * Issue an RXQ FIFO read command and read the @len amount of data from + * the FIFO into the buffer specified by @buff. + */ +static void ks8851_rdfifo_par(struct ks8851_net *ks, u8 *buff, unsigned int len) +{ + struct ks8851_net_par *ksp = to_ks8851_par(ks); + + netif_dbg(ks, rx_status, ks->netdev, + "%s: %d@%p\n", __func__, len, buff); + + ioread16_rep(ksp->hw_addr, (u16 *)buff + 1, len / 2); +} + +/** + * ks8851_wrfifo_par - write packet to TX FIFO + * @ks: The device state. + * @txp: The sk_buff to transmit. + * @irq: IRQ on completion of the packet. + * + * Send the @txp to the chip. This means creating the relevant packet header + * specifying the length of the packet and the other information the chip + * needs, such as IRQ on completion. Send the header and the packet data to + * the device. + */ +static void ks8851_wrfifo_par(struct ks8851_net *ks, struct sk_buff *txp, + bool irq) +{ + struct ks8851_net_par *ksp = to_ks8851_par(ks); + unsigned int len = ALIGN(txp->len, 4); + unsigned int fid = 0; + + netif_dbg(ks, tx_queued, ks->netdev, "%s: skb %p, %d@%p, irq %d\n", + __func__, txp, txp->len, txp->data, irq); + + fid = ks->fid++; + fid &= TXFR_TXFID_MASK; + + if (irq) + fid |= TXFR_TXIC; /* irq on completion */ + + iowrite16(fid, ksp->hw_addr); + iowrite16(txp->len, ksp->hw_addr); + + iowrite16_rep(ksp->hw_addr, txp->data, len / 2); +} + +/** + * ks8851_rx_skb_par - receive skbuff + * @ks: The device state. + * @skb: The skbuff + */ +static void ks8851_rx_skb_par(struct ks8851_net *ks, struct sk_buff *skb) +{ + netif_rx(skb); +} + +static unsigned int ks8851_rdreg16_par_txqcr(struct ks8851_net *ks) +{ + return ks8851_rdreg16_par(ks, KS_TXQCR); +} + +/** + * ks8851_start_xmit_par - transmit packet + * @skb: The buffer to transmit + * @dev: The device used to transmit the packet. + * + * Called by the network layer to transmit the @skb. Queue the packet for + * the device and schedule the necessary work to transmit the packet when + * it is free. + * + * We do this to firstly avoid sleeping with the network device locked, + * and secondly so we can round up more than one packet to transmit which + * means we can try and avoid generating too many transmit done interrupts. + */ +static netdev_tx_t ks8851_start_xmit_par(struct sk_buff *skb, + struct net_device *dev) +{ + struct ks8851_net *ks = netdev_priv(dev); + netdev_tx_t ret = NETDEV_TX_OK; + unsigned long flags; + unsigned int txqcr; + u16 txmir; + int err; + + netif_dbg(ks, tx_queued, ks->netdev, + "%s: skb %p, %d@%p\n", __func__, skb, skb->len, skb->data); + + ks8851_lock_par(ks, &flags); + + txmir = ks8851_rdreg16_par(ks, KS_TXMIR) & 0x1fff; + + if (likely(txmir >= skb->len + 12)) { + ks8851_wrreg16_par(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); + ks8851_wrfifo_par(ks, skb, false); + ks8851_wrreg16_par(ks, KS_RXQCR, ks->rc_rxqcr); + ks8851_wrreg16_par(ks, KS_TXQCR, TXQCR_METFE); + + err = readx_poll_timeout_atomic(ks8851_rdreg16_par_txqcr, ks, + txqcr, !(txqcr & TXQCR_METFE), + 5, 1000000); + if (err) + ret = NETDEV_TX_BUSY; + + ks8851_done_tx(ks, skb); + } else { + ret = NETDEV_TX_BUSY; + } + + ks8851_unlock_par(ks, &flags); + + return ret; +} + +static int ks8851_probe_par(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct ks8851_net_par *ksp; + struct net_device *netdev; + struct ks8851_net *ks; + int ret; + + netdev = devm_alloc_etherdev(dev, sizeof(struct ks8851_net_par)); + if (!netdev) + return -ENOMEM; + + ks = netdev_priv(netdev); + + ks->lock = ks8851_lock_par; + ks->unlock = ks8851_unlock_par; + ks->rdreg16 = ks8851_rdreg16_par; + ks->wrreg16 = ks8851_wrreg16_par; + ks->rdfifo = ks8851_rdfifo_par; + ks->wrfifo = ks8851_wrfifo_par; + ks->start_xmit = ks8851_start_xmit_par; + ks->rx_skb = ks8851_rx_skb_par; + +#define STD_IRQ (IRQ_LCI | /* Link Change */ \ + IRQ_RXI | /* RX done */ \ + IRQ_RXPSI) /* RX process stop */ + ks->rc_ier = STD_IRQ; + + ksp = to_ks8851_par(ks); + spin_lock_init(&ksp->lock); + + ksp->hw_addr = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(ksp->hw_addr)) + return PTR_ERR(ksp->hw_addr); + + ksp->hw_addr_cmd = devm_platform_ioremap_resource(pdev, 1); + if (IS_ERR(ksp->hw_addr_cmd)) + return PTR_ERR(ksp->hw_addr_cmd); + + ret = ks_check_endian(ks); + if (ret) + return ret; + + netdev->irq = platform_get_irq(pdev, 0); + + return ks8851_probe_common(netdev, dev, msg_enable); +} + +static int ks8851_remove_par(struct platform_device *pdev) +{ + return ks8851_remove_common(&pdev->dev); +} + +static const struct of_device_id ks8851_match_table[] = { + { .compatible = "micrel,ks8851-mll" }, + { } +}; +MODULE_DEVICE_TABLE(of, ks8851_match_table); + +static struct platform_driver ks8851_driver = { + .driver = { + .name = "ks8851", + .of_match_table = ks8851_match_table, + .pm = &ks8851_pm_ops, + }, + .probe = ks8851_probe_par, + .remove = ks8851_remove_par, +}; +module_platform_driver(ks8851_driver); + +MODULE_DESCRIPTION("KS8851 Network driver"); +MODULE_AUTHOR("Ben Dooks "); +MODULE_LICENSE("GPL"); + +module_param_named(message, msg_enable, int, 0); +MODULE_PARM_DESC(message, "Message verbosity level (0=none, 31=all)"); -- cgit v1.2.3-59-g8ed1b From 72628da6d63413299a6369e71f5c97f0edea6a8b Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 29 May 2020 00:21:46 +0200 Subject: net: ks8851: Remove ks8851_mll.c The ks8851_mll.c is replaced by ks8851_par.c, which is using common code from ks8851.c, just like ks8851_spi.c . Remove this old ad-hoc driver. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851_mll.c | 1393 ------------------------------ 1 file changed, 1393 deletions(-) delete mode 100644 drivers/net/ethernet/micrel/ks8851_mll.c diff --git a/drivers/net/ethernet/micrel/ks8851_mll.c b/drivers/net/ethernet/micrel/ks8851_mll.c deleted file mode 100644 index 45cc840d8e2e..000000000000 --- a/drivers/net/ethernet/micrel/ks8851_mll.c +++ /dev/null @@ -1,1393 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/** - * drivers/net/ethernet/micrel/ks8851_mll.c - * Copyright (c) 2009 Micrel Inc. - */ - -/* Supports: - * KS8851 16bit MLL chip from Micrel Inc. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ks8851.h" - -#define DRV_NAME "ks8851_mll" - -static u8 KS_DEFAULT_MAC_ADDRESS[] = { 0x00, 0x10, 0xA1, 0x86, 0x95, 0x11 }; -#define MAX_RECV_FRAMES 255 -#define MAX_BUF_SIZE 2048 -#define TX_BUF_SIZE 2000 -#define RX_BUF_SIZE 2000 - -#define RXCR1_FILTER_MASK (RXCR1_RXINVF | RXCR1_RXAE | \ - RXCR1_RXMAFMA | RXCR1_RXPAFMA) -#define RXQCR_CMD_CNTL (RXQCR_RXFCTE|RXQCR_ADRFE) - -#define ENUM_BUS_NONE 0 -#define ENUM_BUS_8BIT 1 -#define ENUM_BUS_16BIT 2 -#define ENUM_BUS_32BIT 3 - -#define MAX_MCAST_LST 32 -#define HW_MCAST_SIZE 8 - -/** - * union ks_tx_hdr - tx header data - * @txb: The header as bytes - * @txw: The header as 16bit, little-endian words - * - * A dual representation of the tx header data to allow - * access to individual bytes, and to allow 16bit accesses - * with 16bit alignment. - */ -union ks_tx_hdr { - u8 txb[4]; - __le16 txw[2]; -}; - -/** - * struct ks_net - KS8851 driver private data - * @net_device : The network device we're bound to - * @hw_addr : start address of data register. - * @hw_addr_cmd : start address of command register. - * @txh : temporaly buffer to save status/length. - * @lock : Lock to ensure that the device is not accessed when busy. - * @pdev : Pointer to platform device. - * @mii : The MII state information for the mii calls. - * @frame_head_info : frame header information for multi-pkt rx. - * @statelock : Lock on this structure for tx list. - * @msg_enable : The message flags controlling driver output (see ethtool). - * @frame_cnt : number of frames received. - * @bus_width : i/o bus width. - * @rc_rxqcr : Cached copy of KS_RXQCR. - * @rc_txcr : Cached copy of KS_TXCR. - * @rc_ier : Cached copy of KS_IER. - * @sharedbus : Multipex(addr and data bus) mode indicator. - * @cmd_reg_cache : command register cached. - * @cmd_reg_cache_int : command register cached. Used in the irq handler. - * @promiscuous : promiscuous mode indicator. - * @all_mcast : mutlicast indicator. - * @mcast_lst_size : size of multicast list. - * @mcast_lst : multicast list. - * @mcast_bits : multicast enabed. - * @mac_addr : MAC address assigned to this device. - * @fid : frame id. - * @extra_byte : number of extra byte prepended rx pkt. - * @enabled : indicator this device works. - * - * The @lock ensures that the chip is protected when certain operations are - * in progress. When the read or write packet transfer is in progress, most - * of the chip registers are not accessible until the transfer is finished and - * the DMA has been de-asserted. - * - * The @statelock is used to protect information in the structure which may - * need to be accessed via several sources, such as the network driver layer - * or one of the work queues. - * - */ - -/* Receive multiplex framer header info */ -struct type_frame_head { - u16 sts; /* Frame status */ - u16 len; /* Byte count */ -}; - -struct ks_net { - struct net_device *netdev; - void __iomem *hw_addr; - void __iomem *hw_addr_cmd; - union ks_tx_hdr txh ____cacheline_aligned; - struct mutex lock; /* spinlock to be interrupt safe */ - struct platform_device *pdev; - struct mii_if_info mii; - struct type_frame_head *frame_head_info; - spinlock_t statelock; - u32 msg_enable; - u32 frame_cnt; - int bus_width; - - u16 rc_rxqcr; - u16 rc_txcr; - u16 rc_ier; - u16 sharedbus; - u16 cmd_reg_cache; - u16 cmd_reg_cache_int; - u16 promiscuous; - u16 all_mcast; - u16 mcast_lst_size; - u8 mcast_lst[MAX_MCAST_LST][ETH_ALEN]; - u8 mcast_bits[HW_MCAST_SIZE]; - u8 mac_addr[6]; - u8 fid; - u8 extra_byte; - u8 enabled; -}; - -static int msg_enable; - -#define BE3 0x8000 /* Byte Enable 3 */ -#define BE2 0x4000 /* Byte Enable 2 */ -#define BE1 0x2000 /* Byte Enable 1 */ -#define BE0 0x1000 /* Byte Enable 0 */ - -/* register read/write calls. - * - * All these calls issue transactions to access the chip's registers. They - * all require that the necessary lock is held to prevent accesses when the - * chip is busy transferring packet data (RX/TX FIFO accesses). - */ - -/** - * ks_check_endian - Check whether endianness of the bus is correct - * @ks : The chip information - * - * The KS8851-16MLL EESK pin allows selecting the endianness of the 16bit - * bus. To maintain optimum performance, the bus endianness should be set - * such that it matches the endianness of the CPU. - */ - -static int ks_check_endian(struct ks_net *ks) -{ - u16 cider; - - /* - * Read CIDER register first, however read it the "wrong" way around. - * If the endian strap on the KS8851-16MLL in incorrect and the chip - * is operating in different endianness than the CPU, then the meaning - * of BE[3:0] byte-enable bits is also swapped such that: - * BE[3,2,1,0] becomes BE[1,0,3,2] - * - * Luckily for us, the byte-enable bits are the top four MSbits of - * the address register and the CIDER register is at offset 0xc0. - * Hence, by reading address 0xc0c0, which is not impacted by endian - * swapping, we assert either BE[3:2] or BE[1:0] while reading the - * CIDER register. - * - * If the bus configuration is correct, reading 0xc0c0 asserts - * BE[3:2] and this read returns 0x0000, because to read register - * with bottom two LSbits of address set to 0, BE[1:0] must be - * asserted. - * - * If the bus configuration is NOT correct, reading 0xc0c0 asserts - * BE[1:0] and this read returns non-zero 0x8872 value. - */ - iowrite16(BE3 | BE2 | KS_CIDER, ks->hw_addr_cmd); - cider = ioread16(ks->hw_addr); - if (!cider) - return 0; - - netdev_err(ks->netdev, "incorrect EESK endian strap setting\n"); - - return -EINVAL; -} - -/** - * ks_rdreg16 - read 16 bit register from device - * @ks : The chip information - * @offset: The register address - * - * Read a 16bit register from the chip, returning the result - */ - -static u16 ks_rdreg16(struct ks_net *ks, int offset) -{ - ks->cmd_reg_cache = (u16)offset | ((BE1 | BE0) << (offset & 0x02)); - iowrite16(ks->cmd_reg_cache, ks->hw_addr_cmd); - return ioread16(ks->hw_addr); -} - -/** - * ks_wrreg16 - write 16bit register value to chip - * @ks: The chip information - * @offset: The register address - * @value: The value to write - * - */ - -static void ks_wrreg16(struct ks_net *ks, int offset, u16 value) -{ - ks->cmd_reg_cache = (u16)offset | ((BE1 | BE0) << (offset & 0x02)); - iowrite16(ks->cmd_reg_cache, ks->hw_addr_cmd); - iowrite16(value, ks->hw_addr); -} - -/** - * ks_inblk - read a block of data from QMU. This is called after sudo DMA mode enabled. - * @ks: The chip state - * @wptr: buffer address to save data - * @len: length in byte to read - * - */ -static inline void ks_inblk(struct ks_net *ks, u16 *wptr, u32 len) -{ - len >>= 1; - while (len--) - *wptr++ = (u16)ioread16(ks->hw_addr); -} - -/** - * ks_outblk - write data to QMU. This is called after sudo DMA mode enabled. - * @ks: The chip information - * @wptr: buffer address - * @len: length in byte to write - * - */ -static inline void ks_outblk(struct ks_net *ks, u16 *wptr, u32 len) -{ - len >>= 1; - while (len--) - iowrite16(*wptr++, ks->hw_addr); -} - -static void ks_disable_int(struct ks_net *ks) -{ - ks_wrreg16(ks, KS_IER, 0x0000); -} /* ks_disable_int */ - -static void ks_enable_int(struct ks_net *ks) -{ - ks_wrreg16(ks, KS_IER, ks->rc_ier); -} /* ks_enable_int */ - -/** - * ks_tx_fifo_space - return the available hardware buffer size. - * @ks: The chip information - * - */ -static inline u16 ks_tx_fifo_space(struct ks_net *ks) -{ - return ks_rdreg16(ks, KS_TXMIR) & 0x1fff; -} - -/** - * ks_save_cmd_reg - save the command register from the cache. - * @ks: The chip information - * - */ -static inline void ks_save_cmd_reg(struct ks_net *ks) -{ - /*ks8851 MLL has a bug to read back the command register. - * So rely on software to save the content of command register. - */ - ks->cmd_reg_cache_int = ks->cmd_reg_cache; -} - -/** - * ks_restore_cmd_reg - restore the command register from the cache and - * write to hardware register. - * @ks: The chip information - * - */ -static inline void ks_restore_cmd_reg(struct ks_net *ks) -{ - ks->cmd_reg_cache = ks->cmd_reg_cache_int; - iowrite16(ks->cmd_reg_cache, ks->hw_addr_cmd); -} - -/** - * ks_set_powermode - set power mode of the device - * @ks: The chip information - * @pwrmode: The power mode value to write to KS_PMECR. - * - * Change the power mode of the chip. - */ -static void ks_set_powermode(struct ks_net *ks, unsigned pwrmode) -{ - unsigned pmecr; - - netif_dbg(ks, hw, ks->netdev, "setting power mode %d\n", pwrmode); - - ks_rdreg16(ks, KS_GRR); - pmecr = ks_rdreg16(ks, KS_PMECR); - pmecr &= ~PMECR_PM_MASK; - pmecr |= pwrmode; - - ks_wrreg16(ks, KS_PMECR, pmecr); -} - -/** - * ks_read_config - read chip configuration of bus width. - * @ks: The chip information - * - */ -static void ks_read_config(struct ks_net *ks) -{ - u16 reg_data = 0; - - /* Regardless of bus width, 8 bit read should always work.*/ - reg_data = ks_rdreg16(ks, KS_CCR); - - /* addr/data bus are multiplexed */ - ks->sharedbus = (reg_data & CCR_SHARED) == CCR_SHARED; - - /* There are garbage data when reading data from QMU, - depending on bus-width. - */ - - if (reg_data & CCR_8BIT) { - ks->bus_width = ENUM_BUS_8BIT; - ks->extra_byte = 1; - } else if (reg_data & CCR_16BIT) { - ks->bus_width = ENUM_BUS_16BIT; - ks->extra_byte = 2; - } else { - ks->bus_width = ENUM_BUS_32BIT; - ks->extra_byte = 4; - } -} - -/** - * ks_soft_reset - issue one of the soft reset to the device - * @ks: The device state. - * @op: The bit(s) to set in the GRR - * - * Issue the relevant soft-reset command to the device's GRR register - * specified by @op. - * - * Note, the delays are in there as a caution to ensure that the reset - * has time to take effect and then complete. Since the datasheet does - * not currently specify the exact sequence, we have chosen something - * that seems to work with our device. - */ -static void ks_soft_reset(struct ks_net *ks, unsigned op) -{ - /* Disable interrupt first */ - ks_wrreg16(ks, KS_IER, 0x0000); - ks_wrreg16(ks, KS_GRR, op); - mdelay(10); /* wait a short time to effect reset */ - ks_wrreg16(ks, KS_GRR, 0); - mdelay(1); /* wait for condition to clear */ -} - - -static void ks_enable_qmu(struct ks_net *ks) -{ - u16 w; - - w = ks_rdreg16(ks, KS_TXCR); - /* Enables QMU Transmit (TXCR). */ - ks_wrreg16(ks, KS_TXCR, w | TXCR_TXE); - - /* - * RX Frame Count Threshold Enable and Auto-Dequeue RXQ Frame - * Enable - */ - - w = ks_rdreg16(ks, KS_RXQCR); - ks_wrreg16(ks, KS_RXQCR, w | RXQCR_RXFCTE); - - /* Enables QMU Receive (RXCR1). */ - w = ks_rdreg16(ks, KS_RXCR1); - ks_wrreg16(ks, KS_RXCR1, w | RXCR1_RXE); - ks->enabled = true; -} /* ks_enable_qmu */ - -static void ks_disable_qmu(struct ks_net *ks) -{ - u16 w; - - w = ks_rdreg16(ks, KS_TXCR); - - /* Disables QMU Transmit (TXCR). */ - w &= ~TXCR_TXE; - ks_wrreg16(ks, KS_TXCR, w); - - /* Disables QMU Receive (RXCR1). */ - w = ks_rdreg16(ks, KS_RXCR1); - w &= ~RXCR1_RXE ; - ks_wrreg16(ks, KS_RXCR1, w); - - ks->enabled = false; - -} /* ks_disable_qmu */ - -/** - * ks_read_qmu - read 1 pkt data from the QMU. - * @ks: The chip information - * @buf: buffer address to save 1 pkt - * @len: Pkt length - * Here is the sequence to read 1 pkt: - * 1. set sudo DMA mode - * 2. read prepend data - * 3. read pkt data - * 4. reset sudo DMA Mode - */ -static inline void ks_read_qmu(struct ks_net *ks, u16 *buf, u32 len) -{ - u32 r = ks->extra_byte & 0x1 ; - u32 w = ks->extra_byte - r; - - /* 1. set sudo DMA mode */ - ks_wrreg16(ks, KS_RXFDPR, RXFDPR_RXFPAI); - ks_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); - - /* 2. read prepend data */ - /** - * read 4 + extra bytes and discard them. - * extra bytes for dummy, 2 for status, 2 for len - */ - - /* use likely(r) for 8 bit access for performance */ - if (unlikely(r)) - ioread8(ks->hw_addr); - ks_inblk(ks, buf, w + 2 + 2); - - /* 3. read pkt data */ - ks_inblk(ks, buf, ALIGN(len, 4)); - - /* 4. reset sudo DMA Mode */ - ks_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); -} - -/** - * ks_rcv - read multiple pkts data from the QMU. - * @ks: The chip information - * @netdev: The network device being opened. - * - * Read all of header information before reading pkt content. - * It is not allowed only port of pkts in QMU after issuing - * interrupt ack. - */ -static void ks_rcv(struct ks_net *ks, struct net_device *netdev) -{ - u32 i; - struct type_frame_head *frame_hdr = ks->frame_head_info; - struct sk_buff *skb; - - ks->frame_cnt = ks_rdreg16(ks, KS_RXFCTR) >> 8; - - /* read all header information */ - for (i = 0; i < ks->frame_cnt; i++) { - /* Checking Received packet status */ - frame_hdr->sts = ks_rdreg16(ks, KS_RXFHSR); - /* Get packet len from hardware */ - frame_hdr->len = ks_rdreg16(ks, KS_RXFHBCR); - frame_hdr++; - } - - frame_hdr = ks->frame_head_info; - while (ks->frame_cnt--) { - if (unlikely(!(frame_hdr->sts & RXFSHR_RXFV) || - frame_hdr->len >= RX_BUF_SIZE || - frame_hdr->len <= 0)) { - - /* discard an invalid packet */ - ks_wrreg16(ks, KS_RXQCR, (ks->rc_rxqcr | RXQCR_RRXEF)); - netdev->stats.rx_dropped++; - if (!(frame_hdr->sts & RXFSHR_RXFV)) - netdev->stats.rx_frame_errors++; - else - netdev->stats.rx_length_errors++; - frame_hdr++; - continue; - } - - skb = netdev_alloc_skb(netdev, frame_hdr->len + 16); - if (likely(skb)) { - skb_reserve(skb, 2); - /* read data block including CRC 4 bytes */ - ks_read_qmu(ks, (u16 *)skb->data, frame_hdr->len); - skb_put(skb, frame_hdr->len - 4); - skb->protocol = eth_type_trans(skb, netdev); - netif_rx(skb); - /* exclude CRC size */ - netdev->stats.rx_bytes += frame_hdr->len - 4; - netdev->stats.rx_packets++; - } else { - ks_wrreg16(ks, KS_RXQCR, (ks->rc_rxqcr | RXQCR_RRXEF)); - netdev->stats.rx_dropped++; - } - frame_hdr++; - } -} - -/** - * ks_update_link_status - link status update. - * @netdev: The network device being opened. - * @ks: The chip information - * - */ - -static void ks_update_link_status(struct net_device *netdev, struct ks_net *ks) -{ - /* check the status of the link */ - u32 link_up_status; - if (ks_rdreg16(ks, KS_P1SR) & P1SR_LINK_GOOD) { - netif_carrier_on(netdev); - link_up_status = true; - } else { - netif_carrier_off(netdev); - link_up_status = false; - } - netif_dbg(ks, link, ks->netdev, - "%s: %s\n", __func__, link_up_status ? "UP" : "DOWN"); -} - -/** - * ks_irq - device interrupt handler - * @irq: Interrupt number passed from the IRQ handler. - * @pw: The private word passed to register_irq(), our struct ks_net. - * - * This is the handler invoked to find out what happened - * - * Read the interrupt status, work out what needs to be done and then clear - * any of the interrupts that are not needed. - */ - -static irqreturn_t ks_irq(int irq, void *pw) -{ - struct net_device *netdev = pw; - struct ks_net *ks = netdev_priv(netdev); - unsigned long flags; - u16 status; - - spin_lock_irqsave(&ks->statelock, flags); - /*this should be the first in IRQ handler */ - ks_save_cmd_reg(ks); - - status = ks_rdreg16(ks, KS_ISR); - if (unlikely(!status)) { - ks_restore_cmd_reg(ks); - spin_unlock_irqrestore(&ks->statelock, flags); - return IRQ_NONE; - } - - ks_wrreg16(ks, KS_ISR, status); - - if (likely(status & IRQ_RXI)) - ks_rcv(ks, netdev); - - if (unlikely(status & IRQ_LCI)) - ks_update_link_status(netdev, ks); - - if (unlikely(status & IRQ_TXI)) - netif_wake_queue(netdev); - - if (unlikely(status & IRQ_LDI)) { - - u16 pmecr = ks_rdreg16(ks, KS_PMECR); - pmecr &= ~PMECR_WKEVT_MASK; - ks_wrreg16(ks, KS_PMECR, pmecr | PMECR_WKEVT_LINK); - } - - if (unlikely(status & IRQ_RXOI)) - ks->netdev->stats.rx_over_errors++; - /* this should be the last in IRQ handler*/ - ks_restore_cmd_reg(ks); - spin_unlock_irqrestore(&ks->statelock, flags); - return IRQ_HANDLED; -} - - -/** - * ks_net_open - open network device - * @netdev: The network device being opened. - * - * Called when the network device is marked active, such as a user executing - * 'ifconfig up' on the device. - */ -static int ks_net_open(struct net_device *netdev) -{ - struct ks_net *ks = netdev_priv(netdev); - int err; - -#define KS_INT_FLAGS IRQF_TRIGGER_LOW - /* lock the card, even if we may not actually do anything - * else at the moment. - */ - - netif_dbg(ks, ifup, ks->netdev, "%s - entry\n", __func__); - - /* reset the HW */ - err = request_irq(netdev->irq, ks_irq, KS_INT_FLAGS, DRV_NAME, netdev); - - if (err) { - pr_err("Failed to request IRQ: %d: %d\n", netdev->irq, err); - return err; - } - - /* wake up powermode to normal mode */ - ks_set_powermode(ks, PMECR_PM_NORMAL); - mdelay(1); /* wait for normal mode to take effect */ - - ks_wrreg16(ks, KS_ISR, 0xffff); - ks_enable_int(ks); - ks_enable_qmu(ks); - netif_start_queue(ks->netdev); - - netif_dbg(ks, ifup, ks->netdev, "network device up\n"); - - return 0; -} - -/** - * ks_net_stop - close network device - * @netdev: The device being closed. - * - * Called to close down a network device which has been active. Cancell any - * work, shutdown the RX and TX process and then place the chip into a low - * power state whilst it is not being used. - */ -static int ks_net_stop(struct net_device *netdev) -{ - struct ks_net *ks = netdev_priv(netdev); - - netif_info(ks, ifdown, netdev, "shutting down\n"); - - netif_stop_queue(netdev); - - mutex_lock(&ks->lock); - - /* turn off the IRQs and ack any outstanding */ - ks_wrreg16(ks, KS_IER, 0x0000); - ks_wrreg16(ks, KS_ISR, 0xffff); - - /* shutdown RX/TX QMU */ - ks_disable_qmu(ks); - ks_disable_int(ks); - - /* set powermode to soft power down to save power */ - ks_set_powermode(ks, PMECR_PM_SOFTDOWN); - free_irq(netdev->irq, netdev); - mutex_unlock(&ks->lock); - return 0; -} - - -/** - * ks_write_qmu - write 1 pkt data to the QMU. - * @ks: The chip information - * @pdata: buffer address to save 1 pkt - * @len: Pkt length in byte - * Here is the sequence to write 1 pkt: - * 1. set sudo DMA mode - * 2. write status/length - * 3. write pkt data - * 4. reset sudo DMA Mode - * 5. reset sudo DMA mode - * 6. Wait until pkt is out - */ -static void ks_write_qmu(struct ks_net *ks, u8 *pdata, u16 len) -{ - /* start header at txb[0] to align txw entries */ - ks->txh.txw[0] = 0; - ks->txh.txw[1] = cpu_to_le16(len); - - /* 1. set sudo-DMA mode */ - ks_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); - /* 2. write status/lenth info */ - ks_outblk(ks, ks->txh.txw, 4); - /* 3. write pkt data */ - ks_outblk(ks, (u16 *)pdata, ALIGN(len, 4)); - /* 4. reset sudo-DMA mode */ - ks_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); - /* 5. Enqueue Tx(move the pkt from TX buffer into TXQ) */ - ks_wrreg16(ks, KS_TXQCR, TXQCR_METFE); - /* 6. wait until TXQCR_METFE is auto-cleared */ - while (ks_rdreg16(ks, KS_TXQCR) & TXQCR_METFE) - ; -} - -/** - * ks_start_xmit - transmit packet - * @skb : The buffer to transmit - * @netdev : The device used to transmit the packet. - * - * Called by the network layer to transmit the @skb. - * spin_lock_irqsave is required because tx and rx should be mutual exclusive. - * So while tx is in-progress, prevent IRQ interrupt from happenning. - */ -static netdev_tx_t ks_start_xmit(struct sk_buff *skb, struct net_device *netdev) -{ - netdev_tx_t retv = NETDEV_TX_OK; - struct ks_net *ks = netdev_priv(netdev); - unsigned long flags; - - spin_lock_irqsave(&ks->statelock, flags); - - /* Extra space are required: - * 4 byte for alignment, 4 for status/length, 4 for CRC - */ - - if (likely(ks_tx_fifo_space(ks) >= skb->len + 12)) { - ks_write_qmu(ks, skb->data, skb->len); - /* add tx statistics */ - netdev->stats.tx_bytes += skb->len; - netdev->stats.tx_packets++; - dev_kfree_skb(skb); - } else - retv = NETDEV_TX_BUSY; - spin_unlock_irqrestore(&ks->statelock, flags); - return retv; -} - -/** - * ks_start_rx - ready to serve pkts - * @ks : The chip information - * - */ -static void ks_start_rx(struct ks_net *ks) -{ - u16 cntl; - - /* Enables QMU Receive (RXCR1). */ - cntl = ks_rdreg16(ks, KS_RXCR1); - cntl |= RXCR1_RXE ; - ks_wrreg16(ks, KS_RXCR1, cntl); -} /* ks_start_rx */ - -/** - * ks_stop_rx - stop to serve pkts - * @ks : The chip information - * - */ -static void ks_stop_rx(struct ks_net *ks) -{ - u16 cntl; - - /* Disables QMU Receive (RXCR1). */ - cntl = ks_rdreg16(ks, KS_RXCR1); - cntl &= ~RXCR1_RXE ; - ks_wrreg16(ks, KS_RXCR1, cntl); - -} /* ks_stop_rx */ - -static unsigned long const ethernet_polynomial = CRC32_POLY_BE; - -static unsigned long ether_gen_crc(int length, u8 *data) -{ - long crc = -1; - while (--length >= 0) { - u8 current_octet = *data++; - int bit; - - for (bit = 0; bit < 8; bit++, current_octet >>= 1) { - crc = (crc << 1) ^ - ((crc < 0) ^ (current_octet & 1) ? - ethernet_polynomial : 0); - } - } - return (unsigned long)crc; -} /* ether_gen_crc */ - -/** -* ks_set_grpaddr - set multicast information -* @ks : The chip information -*/ - -static void ks_set_grpaddr(struct ks_net *ks) -{ - u8 i; - u32 index, position, value; - - memset(ks->mcast_bits, 0, sizeof(u8) * HW_MCAST_SIZE); - - for (i = 0; i < ks->mcast_lst_size; i++) { - position = (ether_gen_crc(6, ks->mcast_lst[i]) >> 26) & 0x3f; - index = position >> 3; - value = 1 << (position & 7); - ks->mcast_bits[index] |= (u8)value; - } - - for (i = 0; i < HW_MCAST_SIZE; i++) { - if (i & 1) { - ks_wrreg16(ks, (u16)((KS_MAHTR0 + i) & ~1), - (ks->mcast_bits[i] << 8) | - ks->mcast_bits[i - 1]); - } - } -} /* ks_set_grpaddr */ - -/** -* ks_clear_mcast - clear multicast information -* -* @ks : The chip information -* This routine removes all mcast addresses set in the hardware. -*/ - -static void ks_clear_mcast(struct ks_net *ks) -{ - u16 i, mcast_size; - for (i = 0; i < HW_MCAST_SIZE; i++) - ks->mcast_bits[i] = 0; - - mcast_size = HW_MCAST_SIZE >> 2; - for (i = 0; i < mcast_size; i++) - ks_wrreg16(ks, KS_MAHTR0 + (2*i), 0); -} - -static void ks_set_promis(struct ks_net *ks, u16 promiscuous_mode) -{ - u16 cntl; - ks->promiscuous = promiscuous_mode; - ks_stop_rx(ks); /* Stop receiving for reconfiguration */ - cntl = ks_rdreg16(ks, KS_RXCR1); - - cntl &= ~RXCR1_FILTER_MASK; - if (promiscuous_mode) - /* Enable Promiscuous mode */ - cntl |= RXCR1_RXAE | RXCR1_RXINVF; - else - /* Disable Promiscuous mode (default normal mode) */ - cntl |= RXCR1_RXPAFMA; - - ks_wrreg16(ks, KS_RXCR1, cntl); - - if (ks->enabled) - ks_start_rx(ks); - -} /* ks_set_promis */ - -static void ks_set_mcast(struct ks_net *ks, u16 mcast) -{ - u16 cntl; - - ks->all_mcast = mcast; - ks_stop_rx(ks); /* Stop receiving for reconfiguration */ - cntl = ks_rdreg16(ks, KS_RXCR1); - cntl &= ~RXCR1_FILTER_MASK; - if (mcast) - /* Enable "Perfect with Multicast address passed mode" */ - cntl |= (RXCR1_RXAE | RXCR1_RXMAFMA | RXCR1_RXPAFMA); - else - /** - * Disable "Perfect with Multicast address passed - * mode" (normal mode). - */ - cntl |= RXCR1_RXPAFMA; - - ks_wrreg16(ks, KS_RXCR1, cntl); - - if (ks->enabled) - ks_start_rx(ks); -} /* ks_set_mcast */ - -static void ks_set_rx_mode(struct net_device *netdev) -{ - struct ks_net *ks = netdev_priv(netdev); - struct netdev_hw_addr *ha; - - /* Turn on/off promiscuous mode. */ - if ((netdev->flags & IFF_PROMISC) == IFF_PROMISC) - ks_set_promis(ks, - (u16)((netdev->flags & IFF_PROMISC) == IFF_PROMISC)); - /* Turn on/off all mcast mode. */ - else if ((netdev->flags & IFF_ALLMULTI) == IFF_ALLMULTI) - ks_set_mcast(ks, - (u16)((netdev->flags & IFF_ALLMULTI) == IFF_ALLMULTI)); - else - ks_set_promis(ks, false); - - if ((netdev->flags & IFF_MULTICAST) && netdev_mc_count(netdev)) { - if (netdev_mc_count(netdev) <= MAX_MCAST_LST) { - int i = 0; - - netdev_for_each_mc_addr(ha, netdev) { - if (i >= MAX_MCAST_LST) - break; - memcpy(ks->mcast_lst[i++], ha->addr, ETH_ALEN); - } - ks->mcast_lst_size = (u8)i; - ks_set_grpaddr(ks); - } else { - /** - * List too big to support so - * turn on all mcast mode. - */ - ks->mcast_lst_size = MAX_MCAST_LST; - ks_set_mcast(ks, true); - } - } else { - ks->mcast_lst_size = 0; - ks_clear_mcast(ks); - } -} /* ks_set_rx_mode */ - -static void ks_set_mac(struct ks_net *ks, u8 *data) -{ - u16 *pw = (u16 *)data; - u16 w, u; - - ks_stop_rx(ks); /* Stop receiving for reconfiguration */ - - u = *pw++; - w = ((u & 0xFF) << 8) | ((u >> 8) & 0xFF); - ks_wrreg16(ks, KS_MARH, w); - - u = *pw++; - w = ((u & 0xFF) << 8) | ((u >> 8) & 0xFF); - ks_wrreg16(ks, KS_MARM, w); - - u = *pw; - w = ((u & 0xFF) << 8) | ((u >> 8) & 0xFF); - ks_wrreg16(ks, KS_MARL, w); - - memcpy(ks->mac_addr, data, ETH_ALEN); - - if (ks->enabled) - ks_start_rx(ks); -} - -static int ks_set_mac_address(struct net_device *netdev, void *paddr) -{ - struct ks_net *ks = netdev_priv(netdev); - struct sockaddr *addr = paddr; - u8 *da; - - memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len); - - da = (u8 *)netdev->dev_addr; - - ks_set_mac(ks, da); - return 0; -} - -static int ks_net_ioctl(struct net_device *netdev, struct ifreq *req, int cmd) -{ - struct ks_net *ks = netdev_priv(netdev); - - if (!netif_running(netdev)) - return -EINVAL; - - return generic_mii_ioctl(&ks->mii, if_mii(req), cmd, NULL); -} - -static const struct net_device_ops ks_netdev_ops = { - .ndo_open = ks_net_open, - .ndo_stop = ks_net_stop, - .ndo_do_ioctl = ks_net_ioctl, - .ndo_start_xmit = ks_start_xmit, - .ndo_set_mac_address = ks_set_mac_address, - .ndo_set_rx_mode = ks_set_rx_mode, - .ndo_validate_addr = eth_validate_addr, -}; - -/* ethtool support */ - -static void ks_get_drvinfo(struct net_device *netdev, - struct ethtool_drvinfo *di) -{ - strlcpy(di->driver, DRV_NAME, sizeof(di->driver)); - strlcpy(di->version, "1.00", sizeof(di->version)); - strlcpy(di->bus_info, dev_name(netdev->dev.parent), - sizeof(di->bus_info)); -} - -static u32 ks_get_msglevel(struct net_device *netdev) -{ - struct ks_net *ks = netdev_priv(netdev); - return ks->msg_enable; -} - -static void ks_set_msglevel(struct net_device *netdev, u32 to) -{ - struct ks_net *ks = netdev_priv(netdev); - ks->msg_enable = to; -} - -static int ks_get_link_ksettings(struct net_device *netdev, - struct ethtool_link_ksettings *cmd) -{ - struct ks_net *ks = netdev_priv(netdev); - - mii_ethtool_get_link_ksettings(&ks->mii, cmd); - - return 0; -} - -static int ks_set_link_ksettings(struct net_device *netdev, - const struct ethtool_link_ksettings *cmd) -{ - struct ks_net *ks = netdev_priv(netdev); - return mii_ethtool_set_link_ksettings(&ks->mii, cmd); -} - -static u32 ks_get_link(struct net_device *netdev) -{ - struct ks_net *ks = netdev_priv(netdev); - return mii_link_ok(&ks->mii); -} - -static int ks_nway_reset(struct net_device *netdev) -{ - struct ks_net *ks = netdev_priv(netdev); - return mii_nway_restart(&ks->mii); -} - -static const struct ethtool_ops ks_ethtool_ops = { - .get_drvinfo = ks_get_drvinfo, - .get_msglevel = ks_get_msglevel, - .set_msglevel = ks_set_msglevel, - .get_link = ks_get_link, - .nway_reset = ks_nway_reset, - .get_link_ksettings = ks_get_link_ksettings, - .set_link_ksettings = ks_set_link_ksettings, -}; - -/* MII interface controls */ - -/** - * ks_phy_reg - convert MII register into a KS8851 register - * @reg: MII register number. - * - * Return the KS8851 register number for the corresponding MII PHY register - * if possible. Return zero if the MII register has no direct mapping to the - * KS8851 register set. - */ -static int ks_phy_reg(int reg) -{ - switch (reg) { - case MII_BMCR: - return KS_P1MBCR; - case MII_BMSR: - return KS_P1MBSR; - case MII_PHYSID1: - return KS_PHY1ILR; - case MII_PHYSID2: - return KS_PHY1IHR; - case MII_ADVERTISE: - return KS_P1ANAR; - case MII_LPA: - return KS_P1ANLPR; - } - - return 0x0; -} - -/** - * ks_phy_read - MII interface PHY register read. - * @netdev: The network device the PHY is on. - * @phy_addr: Address of PHY (ignored as we only have one) - * @reg: The register to read. - * - * This call reads data from the PHY register specified in @reg. Since the - * device does not support all the MII registers, the non-existent values - * are always returned as zero. - * - * We return zero for unsupported registers as the MII code does not check - * the value returned for any error status, and simply returns it to the - * caller. The mii-tool that the driver was tested with takes any -ve error - * as real PHY capabilities, thus displaying incorrect data to the user. - */ -static int ks_phy_read(struct net_device *netdev, int phy_addr, int reg) -{ - struct ks_net *ks = netdev_priv(netdev); - int ksreg; - int result; - - ksreg = ks_phy_reg(reg); - if (!ksreg) - return 0x0; /* no error return allowed, so use zero */ - - mutex_lock(&ks->lock); - result = ks_rdreg16(ks, ksreg); - mutex_unlock(&ks->lock); - - return result; -} - -static void ks_phy_write(struct net_device *netdev, - int phy, int reg, int value) -{ - struct ks_net *ks = netdev_priv(netdev); - int ksreg; - - ksreg = ks_phy_reg(reg); - if (ksreg) { - mutex_lock(&ks->lock); - ks_wrreg16(ks, ksreg, value); - mutex_unlock(&ks->lock); - } -} - -/** - * ks_read_selftest - read the selftest memory info. - * @ks: The device state - * - * Read and check the TX/RX memory selftest information. - */ -static int ks_read_selftest(struct ks_net *ks) -{ - unsigned both_done = MBIR_TXMBF | MBIR_RXMBF; - int ret = 0; - unsigned rd; - - rd = ks_rdreg16(ks, KS_MBIR); - - if ((rd & both_done) != both_done) { - netdev_warn(ks->netdev, "Memory selftest not finished\n"); - return 0; - } - - if (rd & MBIR_TXMBFA) { - netdev_err(ks->netdev, "TX memory selftest fails\n"); - ret |= 1; - } - - if (rd & MBIR_RXMBFA) { - netdev_err(ks->netdev, "RX memory selftest fails\n"); - ret |= 2; - } - - netdev_info(ks->netdev, "the selftest passes\n"); - return ret; -} - -static void ks_setup(struct ks_net *ks) -{ - u16 w; - - /** - * Configure QMU Transmit - */ - - /* Setup Transmit Frame Data Pointer Auto-Increment (TXFDPR) */ - ks_wrreg16(ks, KS_TXFDPR, TXFDPR_TXFPAI); - - /* Setup Receive Frame Data Pointer Auto-Increment */ - ks_wrreg16(ks, KS_RXFDPR, RXFDPR_RXFPAI); - - /* Setup Receive Frame Threshold - 1 frame (RXFCTFC) */ - ks_wrreg16(ks, KS_RXFCTR, 1 & RXFCTR_RXFCT_MASK); - - /* Setup RxQ Command Control (RXQCR) */ - ks->rc_rxqcr = RXQCR_CMD_CNTL; - ks_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); - - /** - * set the force mode to half duplex, default is full duplex - * because if the auto-negotiation fails, most switch uses - * half-duplex. - */ - - w = ks_rdreg16(ks, KS_P1MBCR); - w &= ~BMCR_FULLDPLX; - ks_wrreg16(ks, KS_P1MBCR, w); - - w = TXCR_TXFCE | TXCR_TXPE | TXCR_TXCRC | TXCR_TCGIP; - ks_wrreg16(ks, KS_TXCR, w); - - w = RXCR1_RXFCE | RXCR1_RXBE | RXCR1_RXUE | RXCR1_RXME | RXCR1_RXIPFCC; - - if (ks->promiscuous) /* bPromiscuous */ - w |= (RXCR1_RXAE | RXCR1_RXINVF); - else if (ks->all_mcast) /* Multicast address passed mode */ - w |= (RXCR1_RXAE | RXCR1_RXMAFMA | RXCR1_RXPAFMA); - else /* Normal mode */ - w |= RXCR1_RXPAFMA; - - ks_wrreg16(ks, KS_RXCR1, w); -} /*ks_setup */ - - -static void ks_setup_int(struct ks_net *ks) -{ - ks->rc_ier = 0x00; - /* Clear the interrupts status of the hardware. */ - ks_wrreg16(ks, KS_ISR, 0xffff); - - /* Enables the interrupts of the hardware. */ - ks->rc_ier = (IRQ_LCI | IRQ_TXI | IRQ_RXI); -} /* ks_setup_int */ - -static int ks_hw_init(struct ks_net *ks) -{ -#define MHEADER_SIZE (sizeof(struct type_frame_head) * MAX_RECV_FRAMES) - ks->promiscuous = 0; - ks->all_mcast = 0; - ks->mcast_lst_size = 0; - - ks->frame_head_info = devm_kmalloc(&ks->pdev->dev, MHEADER_SIZE, - GFP_KERNEL); - if (!ks->frame_head_info) - return false; - - ks_set_mac(ks, KS_DEFAULT_MAC_ADDRESS); - return true; -} - -#if defined(CONFIG_OF) -static const struct of_device_id ks8851_ml_dt_ids[] = { - { .compatible = "micrel,ks8851-mll" }, - { /* sentinel */ } -}; -MODULE_DEVICE_TABLE(of, ks8851_ml_dt_ids); -#endif - -static int ks8851_probe(struct platform_device *pdev) -{ - int err; - struct net_device *netdev; - struct ks_net *ks; - u16 id, data; - const char *mac; - - netdev = alloc_etherdev(sizeof(struct ks_net)); - if (!netdev) - return -ENOMEM; - - SET_NETDEV_DEV(netdev, &pdev->dev); - - ks = netdev_priv(netdev); - ks->netdev = netdev; - - ks->hw_addr = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(ks->hw_addr)) { - err = PTR_ERR(ks->hw_addr); - goto err_free; - } - - ks->hw_addr_cmd = devm_platform_ioremap_resource(pdev, 1); - if (IS_ERR(ks->hw_addr_cmd)) { - err = PTR_ERR(ks->hw_addr_cmd); - goto err_free; - } - - err = ks_check_endian(ks); - if (err) - goto err_free; - - netdev->irq = platform_get_irq(pdev, 0); - - if ((int)netdev->irq < 0) { - err = netdev->irq; - goto err_free; - } - - ks->pdev = pdev; - - mutex_init(&ks->lock); - spin_lock_init(&ks->statelock); - - netdev->netdev_ops = &ks_netdev_ops; - netdev->ethtool_ops = &ks_ethtool_ops; - - /* setup mii state */ - ks->mii.dev = netdev; - ks->mii.phy_id = 1, - ks->mii.phy_id_mask = 1; - ks->mii.reg_num_mask = 0xf; - ks->mii.mdio_read = ks_phy_read; - ks->mii.mdio_write = ks_phy_write; - - netdev_info(netdev, "message enable is %d\n", msg_enable); - /* set the default message enable */ - ks->msg_enable = netif_msg_init(msg_enable, (NETIF_MSG_DRV | - NETIF_MSG_PROBE | - NETIF_MSG_LINK)); - ks_read_config(ks); - - /* simple check for a valid chip being connected to the bus */ - if ((ks_rdreg16(ks, KS_CIDER) & ~CIDER_REV_MASK) != CIDER_ID) { - netdev_err(netdev, "failed to read device ID\n"); - err = -ENODEV; - goto err_free; - } - - if (ks_read_selftest(ks)) { - netdev_err(netdev, "failed to read device ID\n"); - err = -ENODEV; - goto err_free; - } - - err = register_netdev(netdev); - if (err) - goto err_free; - - platform_set_drvdata(pdev, netdev); - - ks_soft_reset(ks, GRR_GSR); - ks_hw_init(ks); - ks_disable_qmu(ks); - ks_setup(ks); - ks_setup_int(ks); - - data = ks_rdreg16(ks, KS_OBCR); - ks_wrreg16(ks, KS_OBCR, data | OBCR_ODS_16mA); - - /* overwriting the default MAC address */ - if (pdev->dev.of_node) { - mac = of_get_mac_address(pdev->dev.of_node); - if (!IS_ERR(mac)) - ether_addr_copy(ks->mac_addr, mac); - } else { - struct ks8851_mll_platform_data *pdata; - - pdata = dev_get_platdata(&pdev->dev); - if (!pdata) { - netdev_err(netdev, "No platform data\n"); - err = -ENODEV; - goto err_pdata; - } - memcpy(ks->mac_addr, pdata->mac_addr, ETH_ALEN); - } - if (!is_valid_ether_addr(ks->mac_addr)) { - /* Use random MAC address if none passed */ - eth_random_addr(ks->mac_addr); - netdev_info(netdev, "Using random mac address\n"); - } - netdev_info(netdev, "Mac address is: %pM\n", ks->mac_addr); - - memcpy(netdev->dev_addr, ks->mac_addr, ETH_ALEN); - - ks_set_mac(ks, netdev->dev_addr); - - id = ks_rdreg16(ks, KS_CIDER); - - netdev_info(netdev, "Found chip, family: 0x%x, id: 0x%x, rev: 0x%x\n", - (id >> 8) & 0xff, (id >> 4) & 0xf, (id >> 1) & 0x7); - return 0; - -err_pdata: - unregister_netdev(netdev); -err_free: - free_netdev(netdev); - return err; -} - -static int ks8851_remove(struct platform_device *pdev) -{ - struct net_device *netdev = platform_get_drvdata(pdev); - - unregister_netdev(netdev); - free_netdev(netdev); - return 0; - -} - -static struct platform_driver ks8851_platform_driver = { - .driver = { - .name = DRV_NAME, - .of_match_table = of_match_ptr(ks8851_ml_dt_ids), - }, - .probe = ks8851_probe, - .remove = ks8851_remove, -}; - -module_platform_driver(ks8851_platform_driver); - -MODULE_DESCRIPTION("KS8851 MLL Network driver"); -MODULE_AUTHOR("David Choi "); -MODULE_LICENSE("GPL"); -module_param_named(message, msg_enable, int, 0); -MODULE_PARM_DESC(message, "Message verbosity level (0=none, 31=all)"); - -- cgit v1.2.3-59-g8ed1b From 2421ee24777e9f7effc4b6db29276eced7ca2114 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:08 +0800 Subject: net: hns3: remove an unnecessary 'goto' in hclge_init_ae_dev() Remove the redundant 'goto' and return -ENOMEM directly, when allocating memory for 'hdev' fails in hclge_init_ae_dev(). Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 7c9f2ba1f272..0e36f037f69c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9928,10 +9928,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) int ret; hdev = devm_kzalloc(&pdev->dev, sizeof(*hdev), GFP_KERNEL); - if (!hdev) { - ret = -ENOMEM; - goto out; - } + if (!hdev) + return -ENOMEM; hdev->pdev = pdev; hdev->ae_dev = ae_dev; -- cgit v1.2.3-59-g8ed1b From 9516352150c0cb896b3de3997b1dfe43fb96d8a5 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:09 +0800 Subject: net: hns3: add a missing mutex destroy in hclge_init_ad_dev() Add a mutex destroy call in hclge_init_ae_dev() when fails. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 0e36f037f69c..7d5c304bd6b1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10108,6 +10108,7 @@ err_pci_uninit: pci_release_regions(pdev); pci_disable_device(pdev); out: + mutex_destroy(&hdev->vport_lock); return ret; } -- cgit v1.2.3-59-g8ed1b From 9f5a9816065f92683fd5f23cd8ec98719f20144f Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:10 +0800 Subject: net: hns3: refactor hclge_config_tso() Since parameters 'tso_mss_min' and 'tso_mss_max' only indicate the minimum and maximum MSS, the hnae3_set_field() calls are meaningless, remove them and change the type of these two parameters to u16. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 7d5c304bd6b1..35e5cb87f48c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1429,26 +1429,17 @@ static int hclge_configure(struct hclge_dev *hdev) return ret; } -static int hclge_config_tso(struct hclge_dev *hdev, unsigned int tso_mss_min, - unsigned int tso_mss_max) +static int hclge_config_tso(struct hclge_dev *hdev, u16 tso_mss_min, + u16 tso_mss_max) { struct hclge_cfg_tso_status_cmd *req; struct hclge_desc desc; - u16 tso_mss; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TSO_GENERIC_CONFIG, false); req = (struct hclge_cfg_tso_status_cmd *)desc.data; - - tso_mss = 0; - hnae3_set_field(tso_mss, HCLGE_TSO_MSS_MIN_M, - HCLGE_TSO_MSS_MIN_S, tso_mss_min); - req->tso_mss_min = cpu_to_le16(tso_mss); - - tso_mss = 0; - hnae3_set_field(tso_mss, HCLGE_TSO_MSS_MIN_M, - HCLGE_TSO_MSS_MIN_S, tso_mss_max); - req->tso_mss_max = cpu_to_le16(tso_mss); + req->tso_mss_min = cpu_to_le16(tso_mss_min); + req->tso_mss_max = cpu_to_le16(tso_mss_max); return hclge_cmd_send(&hdev->hw, &desc, 1); } -- cgit v1.2.3-59-g8ed1b From 5caa039f320d023fb2a40c8c7ededfca3ce85501 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:11 +0800 Subject: net: hns3: refactor hclge_query_bd_num_cmd_send() In order to improve code maintainability and readability, rewrite the process of BDs' initialization in hclge_query_bd_num_cmd_send(). Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 35e5cb87f48c..e9b0e1c00970 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10723,16 +10723,19 @@ static int hclge_get_64_bit_regs(struct hclge_dev *hdev, u32 regs_num, int hclge_query_bd_num_cmd_send(struct hclge_dev *hdev, struct hclge_desc *desc) { - /*prepare 4 commands to query DFX BD number*/ - hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_DFX_BD_NUM, true); - desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT); - hclge_cmd_setup_basic_desc(&desc[1], HCLGE_OPC_DFX_BD_NUM, true); - desc[1].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT); - hclge_cmd_setup_basic_desc(&desc[2], HCLGE_OPC_DFX_BD_NUM, true); - desc[2].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT); - hclge_cmd_setup_basic_desc(&desc[3], HCLGE_OPC_DFX_BD_NUM, true); + int i; + + /* initialize command BD except the last one */ + for (i = 0; i < HCLGE_GET_DFX_REG_TYPE_CNT - 1; i++) { + hclge_cmd_setup_basic_desc(&desc[i], HCLGE_OPC_DFX_BD_NUM, + true); + desc[i].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT); + } + + /* initialize the last command BD */ + hclge_cmd_setup_basic_desc(&desc[i], HCLGE_OPC_DFX_BD_NUM, true); - return hclge_cmd_send(&hdev->hw, desc, 4); + return hclge_cmd_send(&hdev->hw, desc, HCLGE_GET_DFX_REG_TYPE_CNT); } static int hclge_get_dfx_reg_bd_num(struct hclge_dev *hdev, -- cgit v1.2.3-59-g8ed1b From 639d84d0c4281e6d8814bb2cc230bfe7ccf5019d Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:12 +0800 Subject: net: hns3: modify an incorrect type in struct hclge_cfg_gro_status_cmd Modify field .gro_en in struct hclge_cfg_gro_status_cmd to u8 according to the UM, otherwise, it will overwrite the reserved byte which may be used for other purpose. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 4 ++-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index e3bab8f3847f..463f29151ef0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -884,8 +884,8 @@ struct hclge_cfg_tso_status_cmd { #define HCLGE_GRO_EN_B 0 struct hclge_cfg_gro_status_cmd { - __le16 gro_en; - u8 rsv[22]; + u8 gro_en; + u8 rsv[23]; }; #define HCLGE_TSO_MSS_MIN 256 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e9b0e1c00970..1e4f28518d69 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1456,7 +1456,7 @@ static int hclge_config_gro(struct hclge_dev *hdev, bool en) hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_GRO_GENERIC_CONFIG, false); req = (struct hclge_cfg_gro_status_cmd *)desc.data; - req->gro_en = cpu_to_le16(en ? 1 : 0); + req->gro_en = en ? 1 : 0; ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) -- cgit v1.2.3-59-g8ed1b From fb9e44d63dc33b455a50b772a37faf43e793da91 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:13 +0800 Subject: net: hns3: modify an incorrect type in struct hclgevf_cfg_gro_status_cmd Modify field .gro_en in struct hclgevf_cfg_gro_status_cmd to u8 according to the UM, otherwise, it will overwrite the reserved byte which may be used for other purpose. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h | 4 ++-- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h index f830eef02e5c..40d6e602ab51 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h @@ -161,8 +161,8 @@ struct hclgevf_query_res_cmd { #define HCLGEVF_GRO_EN_B 0 struct hclgevf_cfg_gro_status_cmd { - __le16 gro_en; - u8 rsv[22]; + u8 gro_en; + u8 rsv[23]; }; #define HCLGEVF_RSS_DEFAULT_OUTPORT_B 4 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 59fcb80671c8..be5789bb5ddd 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2403,7 +2403,7 @@ static int hclgevf_config_gro(struct hclgevf_dev *hdev, bool en) false); req = (struct hclgevf_cfg_gro_status_cmd *)desc.data; - req->gro_en = cpu_to_le16(en ? 1 : 0); + req->gro_en = en ? 1 : 0; ret = hclgevf_cmd_send(&hdev->hw, &desc, 1); if (ret) -- cgit v1.2.3-59-g8ed1b From 5e86178dcead4941fcdadc963f31ed4e859e58ce Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:14 +0800 Subject: net: hns3: remove some unused fields in struct hns3_nic_priv Remove some fileds which defined in struct hns3_nic_priv, but not used, and remove the related definition of struct hns3_udp_tunnel and enum hns3_udp_tnl_type. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 60f82ad89957..66cd4395f781 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -469,21 +469,8 @@ struct hns3_enet_tqp_vector { unsigned long last_jiffies; } ____cacheline_internodealigned_in_smp; -enum hns3_udp_tnl_type { - HNS3_UDP_TNL_VXLAN, - HNS3_UDP_TNL_GENEVE, - HNS3_UDP_TNL_MAX, -}; - -struct hns3_udp_tunnel { - u16 dst_port; - int used; -}; - struct hns3_nic_priv { struct hnae3_handle *ae_handle; - u32 enet_ver; - u32 port_id; struct net_device *netdev; struct device *dev; @@ -495,19 +482,10 @@ struct hns3_nic_priv { struct hns3_enet_tqp_vector *tqp_vector; u16 vector_num; - /* The most recently read link state */ - int link; u64 tx_timeout_count; unsigned long state; - struct timer_list service_timer; - - struct work_struct service_task; - - struct notifier_block notifier_block; - /* Vxlan/Geneve information */ - struct hns3_udp_tunnel udp_tnl[HNS3_UDP_TNL_MAX]; struct hns3_enet_coalesce tx_coal; struct hns3_enet_coalesce rx_coal; }; -- cgit v1.2.3-59-g8ed1b From c496299e0677fe8c12af8fd233783df1c8aa9c4e Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:15 +0800 Subject: net: hns3; remove unused HNAE3_RESTORE_CLIENT in enum hnae3_reset_notify_type Remove HNAE3_RESTORE_CLIENT which is not needed now. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 7506cabaa16e..0a4aac44efa5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -145,7 +145,6 @@ enum hnae3_reset_notify_type { HNAE3_DOWN_CLIENT, HNAE3_INIT_CLIENT, HNAE3_UNINIT_CLIENT, - HNAE3_RESTORE_CLIENT, }; enum hnae3_hw_error_type { -- cgit v1.2.3-59-g8ed1b From 4828b5766a69e93ca76b15f820c97f03ebd3a48c Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:16 +0800 Subject: net: hns3: remove unused struct hnae3_unic_private_info Since field .uinfo in struct hnae3_handle never be used, so remove it and its structure definition. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 0a4aac44efa5..d041cac9a487 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -621,16 +621,6 @@ struct hnae3_roce_private_info { unsigned long state; }; -struct hnae3_unic_private_info { - struct net_device *netdev; - u16 rx_buf_len; - u16 num_tx_desc; - u16 num_rx_desc; - - u16 num_tqps; /* total number of tqps in this handle */ - struct hnae3_queue **tqp; /* array base of all TQPs of this instance */ -}; - #define HNAE3_SUPPORT_APP_LOOPBACK BIT(0) #define HNAE3_SUPPORT_PHY_LOOPBACK BIT(1) #define HNAE3_SUPPORT_SERDES_SERIAL_LOOPBACK BIT(2) @@ -656,7 +646,6 @@ struct hnae3_handle { union { struct net_device *netdev; /* first member */ struct hnae3_knic_private_info kinfo; - struct hnae3_unic_private_info uinfo; struct hnae3_roce_private_info rinfo; }; -- cgit v1.2.3-59-g8ed1b From 9cee2e8d303940a413d20c5d275bdaf418b09b17 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:17 +0800 Subject: net: hns3: remove two duplicated register macros in hclgevf_main.h HCLGEVF_CMDQ_INTR_SRC_REG and HCLGEVF_CMDQ_INTR_STS_REG are same as HCLGEVF_VECTOR0_CMDQ_SRC_REG and HCLGEVF_VECTOR0_CMDQ_STAT_REG, replace the former with the latter, and rename macro HCLGEVF_VECTOR0_CMDQ_STAT_REG since 'stat' is not abbreviation of 'state'. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 6 +++--- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index be5789bb5ddd..a8c0e79901f5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -46,7 +46,7 @@ static const u32 cmdq_reg_addr_list[] = {HCLGEVF_CMDQ_TX_ADDR_L_REG, HCLGEVF_CMDQ_RX_TAIL_REG, HCLGEVF_CMDQ_RX_HEAD_REG, HCLGEVF_VECTOR0_CMDQ_SRC_REG, - HCLGEVF_CMDQ_INTR_STS_REG, + HCLGEVF_VECTOR0_CMDQ_STATE_REG, HCLGEVF_CMDQ_INTR_EN_REG, HCLGEVF_CMDQ_INTR_GEN_REG}; @@ -1826,7 +1826,7 @@ static void hclgevf_dump_rst_info(struct hclgevf_dev *hdev) dev_info(&hdev->pdev->dev, "vector0 interrupt enable status: 0x%x\n", hclgevf_read_dev(&hdev->hw, HCLGEVF_MISC_VECTOR_REG_BASE)); dev_info(&hdev->pdev->dev, "vector0 interrupt status: 0x%x\n", - hclgevf_read_dev(&hdev->hw, HCLGEVF_VECTOR0_CMDQ_STAT_REG)); + hclgevf_read_dev(&hdev->hw, HCLGEVF_VECTOR0_CMDQ_STATE_REG)); dev_info(&hdev->pdev->dev, "handshake status: 0x%x\n", hclgevf_read_dev(&hdev->hw, HCLGEVF_CMDQ_TX_DEPTH_REG)); dev_info(&hdev->pdev->dev, "function reset status: 0x%x\n", @@ -2250,7 +2250,7 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev, /* fetch the events from their corresponding regs */ cmdq_stat_reg = hclgevf_read_dev(&hdev->hw, - HCLGEVF_VECTOR0_CMDQ_STAT_REG); + HCLGEVF_VECTOR0_CMDQ_STATE_REG); if (BIT(HCLGEVF_VECTOR0_RST_INT_B) & cmdq_stat_reg) { rst_ing_reg = hclgevf_read_dev(&hdev->hw, HCLGEVF_RST_ING); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index f19583c4bc9b..738de124cfc4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -42,8 +42,6 @@ #define HCLGEVF_CMDQ_RX_DEPTH_REG 0x27020 #define HCLGEVF_CMDQ_RX_TAIL_REG 0x27024 #define HCLGEVF_CMDQ_RX_HEAD_REG 0x27028 -#define HCLGEVF_CMDQ_INTR_SRC_REG 0x27100 -#define HCLGEVF_CMDQ_INTR_STS_REG 0x27104 #define HCLGEVF_CMDQ_INTR_EN_REG 0x27108 #define HCLGEVF_CMDQ_INTR_GEN_REG 0x2710C @@ -88,7 +86,7 @@ /* Vector0 interrupt CMDQ event source register(RW) */ #define HCLGEVF_VECTOR0_CMDQ_SRC_REG 0x27100 /* Vector0 interrupt CMDQ event status register(RO) */ -#define HCLGEVF_VECTOR0_CMDQ_STAT_REG 0x27104 +#define HCLGEVF_VECTOR0_CMDQ_STATE_REG 0x27104 /* CMDQ register bits for RX event(=MBX event) */ #define HCLGEVF_VECTOR0_RX_CMDQ_INT_B 1 /* RST register bits for RESET event */ -- cgit v1.2.3-59-g8ed1b From 7c6643cac0ed78395ec10fe5b3b279e61b0ee51f Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:18 +0800 Subject: net: hns3: remove some unused fields in struct hclge_dev Remove some fields in struct hclge_dev which have not been used. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 913c4f677404..46e6e0fef3ba 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -771,12 +771,6 @@ struct hclge_dev { u16 num_roce_msi; /* Num of roce vectors for this PF */ int roce_base_vector; - u16 pending_udp_bitmap; - - u16 rx_itr_default; - u16 tx_itr_default; - - u16 adminq_work_limit; /* Num of admin receive queue desc to process */ unsigned long service_timer_period; unsigned long service_timer_previous; struct timer_list reset_timer; -- cgit v1.2.3-59-g8ed1b From ead38a8537bf87228917f23c2131c7a020fe0951 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Thu, 28 May 2020 21:48:19 +0800 Subject: net: hns3: print out speed info when parsing speed fails When calling hclge_parse_speed() fails, printing out the speed is helpful for debugging. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 1e4f28518d69..96bfad52630d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1387,7 +1387,8 @@ static int hclge_configure(struct hclge_dev *hdev) ret = hclge_parse_speed(cfg.default_speed, &hdev->hw.mac.speed); if (ret) { - dev_err(&hdev->pdev->dev, "Get wrong speed ret=%d.\n", ret); + dev_err(&hdev->pdev->dev, "failed to parse speed %u, ret = %d\n", + cfg.default_speed, ret); return ret; } -- cgit v1.2.3-59-g8ed1b From c28481a88cb38a3e1b7b533f53eb82e5e34f7597 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 11 Mar 2020 09:37:40 +0100 Subject: i40e: Use scnprintf() for avoiding potential buffer overflow Since snprintf() returns the would-be-output size instead of the actual output size, the succeeding calls may go beyond the given buffer limit. Fix it by replacing with scnprintf(). Signed-off-by: Takashi Iwai Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index ea7395b391e5..5d807c8004f8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -14486,29 +14486,29 @@ static void i40e_print_features(struct i40e_pf *pf) i = snprintf(buf, INFO_STRING_LEN, "Features: PF-id[%d]", hw->pf_id); #ifdef CONFIG_PCI_IOV - i += snprintf(&buf[i], REMAIN(i), " VFs: %d", pf->num_req_vfs); + i += scnprintf(&buf[i], REMAIN(i), " VFs: %d", pf->num_req_vfs); #endif - i += snprintf(&buf[i], REMAIN(i), " VSIs: %d QP: %d", + i += scnprintf(&buf[i], REMAIN(i), " VSIs: %d QP: %d", pf->hw.func_caps.num_vsis, pf->vsi[pf->lan_vsi]->num_queue_pairs); if (pf->flags & I40E_FLAG_RSS_ENABLED) - i += snprintf(&buf[i], REMAIN(i), " RSS"); + i += scnprintf(&buf[i], REMAIN(i), " RSS"); if (pf->flags & I40E_FLAG_FD_ATR_ENABLED) - i += snprintf(&buf[i], REMAIN(i), " FD_ATR"); + i += scnprintf(&buf[i], REMAIN(i), " FD_ATR"); if (pf->flags & I40E_FLAG_FD_SB_ENABLED) { - i += snprintf(&buf[i], REMAIN(i), " FD_SB"); - i += snprintf(&buf[i], REMAIN(i), " NTUPLE"); + i += scnprintf(&buf[i], REMAIN(i), " FD_SB"); + i += scnprintf(&buf[i], REMAIN(i), " NTUPLE"); } if (pf->flags & I40E_FLAG_DCB_CAPABLE) - i += snprintf(&buf[i], REMAIN(i), " DCB"); - i += snprintf(&buf[i], REMAIN(i), " VxLAN"); - i += snprintf(&buf[i], REMAIN(i), " Geneve"); + i += scnprintf(&buf[i], REMAIN(i), " DCB"); + i += scnprintf(&buf[i], REMAIN(i), " VxLAN"); + i += scnprintf(&buf[i], REMAIN(i), " Geneve"); if (pf->flags & I40E_FLAG_PTP) - i += snprintf(&buf[i], REMAIN(i), " PTP"); + i += scnprintf(&buf[i], REMAIN(i), " PTP"); if (pf->flags & I40E_FLAG_VEB_MODE_ENABLED) - i += snprintf(&buf[i], REMAIN(i), " VEB"); + i += scnprintf(&buf[i], REMAIN(i), " VEB"); else - i += snprintf(&buf[i], REMAIN(i), " VEPA"); + i += scnprintf(&buf[i], REMAIN(i), " VEPA"); dev_info(&pf->pdev->dev, "%s\n", buf); kfree(buf); -- cgit v1.2.3-59-g8ed1b From e92c0e0235c204a4787d186b541b33814a393d7f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Mar 2020 11:16:38 +0100 Subject: i40e: trivial fixup of comments in i40e_xsk.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment above i40e_run_xdp_zc() was clearly copy-pasted from function i40e_xsk_umem_setup, which is just above. Signed-off-by: Jesper Dangaard Brouer Acked-by: Björn Töpel Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index f3953744c505..7276580cbe64 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -139,8 +139,6 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, * @rx_ring: Rx ring * @xdp: xdp_buff used as input to the XDP program * - * This function enables or disables a UMEM to a certain ring. - * * Returns any of I40E_XDP_{PASS, CONSUMED, TX, REDIR} **/ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) @@ -224,7 +222,7 @@ no_buffers: } /** - * i40e_construct_skb_zc - Create skbufff from zero-copy Rx buffer + * i40e_construct_skb_zc - Create skbuff from zero-copy Rx buffer * @rx_ring: Rx ring * @xdp: xdp_buff * -- cgit v1.2.3-59-g8ed1b From 3b70683fc4d68f5d915d9dc7e5ba72c732c7315c Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 5 May 2020 10:45:21 +0800 Subject: ixgbe: fix signed-integer-overflow warning ubsan report this warning, fix it by adding a unsigned suffix. UBSAN: signed-integer-overflow in drivers/net/ethernet/intel/ixgbe/ixgbe_common.c:2246:26 65535 * 65537 cannot be represented in type 'int' CPU: 21 PID: 7 Comm: kworker/u256:0 Not tainted 5.7.0-rc3-debug+ #39 Hardware name: Huawei TaiShan 2280 V2/BC82AMDC, BIOS 2280-V2 03/27/2020 Workqueue: ixgbe ixgbe_service_task [ixgbe] Call trace: dump_backtrace+0x0/0x3f0 show_stack+0x28/0x38 dump_stack+0x154/0x1e4 ubsan_epilogue+0x18/0x60 handle_overflow+0xf8/0x148 __ubsan_handle_mul_overflow+0x34/0x48 ixgbe_fc_enable_generic+0x4d0/0x590 [ixgbe] ixgbe_service_task+0xc20/0x1f78 [ixgbe] process_one_work+0x8f0/0xf18 worker_thread+0x430/0x6d0 kthread+0x218/0x238 ret_from_fork+0x10/0x18 Reported-by: Hulk Robot Signed-off-by: Xie XiuQi Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 0bd1294ba517..39c5e6fdb72c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -2243,7 +2243,7 @@ s32 ixgbe_fc_enable_generic(struct ixgbe_hw *hw) } /* Configure pause time (2 TCs per register) */ - reg = hw->fc.pause_time * 0x00010001; + reg = hw->fc.pause_time * 0x00010001U; for (i = 0; i < (MAX_TRAFFIC_CLASS / 2); i++) IXGBE_WRITE_REG(hw, IXGBE_FCTTV(i), reg); -- cgit v1.2.3-59-g8ed1b From 85c41c5b16ee5a4939a22ec833c6a76753e3d428 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Tue, 5 May 2020 15:41:57 +0800 Subject: ixgbe: Remove conversion to bool in ixgbe_device_supports_autoneg_fc() No need to convert '==' expression to bool. This fixes the following coccicheck warning: drivers/net/ethernet/intel/ixgbe/ixgbe_common.c:68:11-16: WARNING: conversion to bool not needed here Signed-off-by: Jason Yan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 39c5e6fdb72c..17357a12cbdc 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -64,8 +64,7 @@ bool ixgbe_device_supports_autoneg_fc(struct ixgbe_hw *hw) hw->mac.ops.check_link(hw, &speed, &link_up, false); /* if link is down, assume supported */ if (link_up) - supported = speed == IXGBE_LINK_SPEED_1GB_FULL ? - true : false; + supported = speed == IXGBE_LINK_SPEED_1GB_FULL; else supported = true; } -- cgit v1.2.3-59-g8ed1b From c2d77e598b1b7170a091b25e32710a0a9f9e4169 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Tue, 5 May 2020 15:43:37 +0800 Subject: ixgbe: Use true, false for bool variable in __ixgbe_enable_sriov() Fix the following coccicheck warning: drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c:105:2-38: WARNING: Assignment of 0/1 to bool variable Signed-off-by: Jason Yan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index 537dfff585e0..d05a5690e66b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -102,7 +102,7 @@ static int __ixgbe_enable_sriov(struct ixgbe_adapter *adapter, * indirection table and RSS hash key with PF therefore * we want to disable the querying by default. */ - adapter->vfinfo[i].rss_query_enabled = 0; + adapter->vfinfo[i].rss_query_enabled = false; /* Untrust all VFs */ adapter->vfinfo[i].trusted = false; -- cgit v1.2.3-59-g8ed1b From f2d9f294120fddec48e38e50d420c1d0a247661d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 5 May 2020 16:35:54 +0800 Subject: ixgbe: Remove unused inline function ixgbe_irq_disable_queues commit b5f69ccf6765 ("ixgbe: avoid bringing rings up/down as macvlans are added/removed") left behind this, remove it. Signed-off-by: YueHaibing Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 29 --------------------------- 1 file changed, 29 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 45fc7ce1a543..a59c166f794f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2973,35 +2973,6 @@ static inline void ixgbe_irq_enable_queues(struct ixgbe_adapter *adapter, /* skip the flush */ } -static inline void ixgbe_irq_disable_queues(struct ixgbe_adapter *adapter, - u64 qmask) -{ - u32 mask; - struct ixgbe_hw *hw = &adapter->hw; - - switch (hw->mac.type) { - case ixgbe_mac_82598EB: - mask = (IXGBE_EIMS_RTX_QUEUE & qmask); - IXGBE_WRITE_REG(hw, IXGBE_EIMC, mask); - break; - case ixgbe_mac_82599EB: - case ixgbe_mac_X540: - case ixgbe_mac_X550: - case ixgbe_mac_X550EM_x: - case ixgbe_mac_x550em_a: - mask = (qmask & 0xFFFFFFFF); - if (mask) - IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(0), mask); - mask = (qmask >> 32); - if (mask) - IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(1), mask); - break; - default: - break; - } - /* skip the flush */ -} - /** * ixgbe_irq_enable - Enable default interrupt generation settings * @adapter: board private structure -- cgit v1.2.3-59-g8ed1b From 49c65e95f331201d431386dcb7c652bf02b306d1 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 7 May 2020 19:09:15 +0800 Subject: igb: make igb_set_fc_watermarks() return void This function always return 0 now, we can make it return void to simplify the code. This fixes the following coccicheck warning: drivers/net/ethernet/intel/igb/e1000_mac.c:728:5-12: Unneeded variable: "ret_val". Return "0" on line 751 Signed-off-by: Jason Yan Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/e1000_mac.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/e1000_mac.c b/drivers/net/ethernet/intel/igb/e1000_mac.c index 79ee0a747260..3254737c07a3 100644 --- a/drivers/net/ethernet/intel/igb/e1000_mac.c +++ b/drivers/net/ethernet/intel/igb/e1000_mac.c @@ -12,7 +12,7 @@ #include "igb.h" static s32 igb_set_default_fc(struct e1000_hw *hw); -static s32 igb_set_fc_watermarks(struct e1000_hw *hw); +static void igb_set_fc_watermarks(struct e1000_hw *hw); /** * igb_get_bus_info_pcie - Get PCIe bus information @@ -687,7 +687,7 @@ s32 igb_setup_link(struct e1000_hw *hw) wr32(E1000_FCTTV, hw->fc.pause_time); - ret_val = igb_set_fc_watermarks(hw); + igb_set_fc_watermarks(hw); out: @@ -723,9 +723,8 @@ void igb_config_collision_dist(struct e1000_hw *hw) * flow control XON frame transmission is enabled, then set XON frame * tansmission as well. **/ -static s32 igb_set_fc_watermarks(struct e1000_hw *hw) +static void igb_set_fc_watermarks(struct e1000_hw *hw) { - s32 ret_val = 0; u32 fcrtl = 0, fcrth = 0; /* Set the flow control receive threshold registers. Normally, @@ -747,8 +746,6 @@ static s32 igb_set_fc_watermarks(struct e1000_hw *hw) } wr32(E1000_FCRTL, fcrtl); wr32(E1000_FCRTH, fcrth); - - return ret_val; } /** -- cgit v1.2.3-59-g8ed1b From 2c3076f5ed3c17e06c2e09a442035906c99311b2 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Sun, 10 May 2020 18:52:00 +0300 Subject: igc: Remove unused flags Transmit underrun, late and excess collision flags not in use. This patch comes to clean up these flags. Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_defines.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 3d8d40d6fa3f..186deb1d9375 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -269,13 +269,9 @@ #define IGC_TXD_CMD_DEXT 0x20000000 /* Desc extension (0 = legacy) */ #define IGC_TXD_CMD_VLE 0x40000000 /* Add VLAN tag */ #define IGC_TXD_STAT_DD 0x00000001 /* Descriptor Done */ -#define IGC_TXD_STAT_EC 0x00000002 /* Excess Collisions */ -#define IGC_TXD_STAT_LC 0x00000004 /* Late Collisions */ -#define IGC_TXD_STAT_TU 0x00000008 /* Transmit underrun */ #define IGC_TXD_CMD_TCP 0x01000000 /* TCP packet */ #define IGC_TXD_CMD_IP 0x02000000 /* IP packet */ #define IGC_TXD_CMD_TSE 0x04000000 /* TCP Seg enable */ -#define IGC_TXD_STAT_TC 0x00000004 /* Tx Underrun */ #define IGC_TXD_EXTCMD_TSTAMP 0x00000010 /* IEEE1588 Timestamp packet */ /* IPSec Encrypt Enable */ -- cgit v1.2.3-59-g8ed1b From 3d3e9b6b6a878d01c04629eae8787de132056533 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Tue, 12 May 2020 10:35:52 -0700 Subject: igc: Reject NFC rules with multiple matches The way Rx queue assignment based on mac address, Ethertype and VLAN priority filtering operates in I225 doesn't allow us to properly support NFC rules with multiple matches. Consider the following example which assigns to queue 2 frames matching the address MACADDR *and* Ethertype ETYPE. $ ethtool -N eth0 flow-type ether dst proto queue 2 When such rule is applied, we have 2 unwanted behaviors: 1) Any frame matching MACADDR will be assigned to queue 2. It doesn't matter the ETYPE value. 2) Any accepted frame that has Ethertype equals to ETYPE, no matter the mac address, will be assigned to queue 2 as well. In current code, multiple-match filters are accepted by the driver, even though it doesn't support them properly. This patch adds a check for multiple-match rules in igc_ethtool_is_nfc_rule_valid() so they are rejected. Signed-off-by: Andre Guedes Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 946e775e34ae..a938ec8db681 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1222,8 +1222,8 @@ static void igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule, * @adapter: Pointer to adapter * @rule: Rule under evaluation * - * Rules with both destination and source MAC addresses are considered invalid - * since the driver doesn't support them. + * The driver doesn't support rules with multiple matches so if more than + * one bit in filter flags is set, @rule is considered invalid. * * Also, if there is already another rule with the same filter in a different * location, @rule is considered invalid. @@ -1244,9 +1244,8 @@ static int igc_ethtool_check_nfc_rule(struct igc_adapter *adapter, return -EINVAL; } - if (flags & IGC_FILTER_FLAG_DST_MAC_ADDR && - flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) { - netdev_dbg(dev, "Filters with both dst and src are not supported\n"); + if (flags & (flags - 1)) { + netdev_dbg(dev, "Rule with multiple matches not supported\n"); return -EOPNOTSUPP; } -- cgit v1.2.3-59-g8ed1b From e087d3bbc4bfb1458b28f77caa0eed092f632b2b Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Tue, 12 May 2020 10:35:53 -0700 Subject: igc: Fix IGC_MAX_RXNFC_RULES IGC supports a total of 32 rules. 16 MAC address based, 8 VLAN priority based, and 8 Ethertype based. This patch fixes IGC_MAX_RXNFC_RULES accordingly. Signed-off-by: Andre Guedes Acked-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 14f9edaaaf83..5dbc5a156626 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -457,7 +457,10 @@ struct igc_nfc_rule { u16 action; }; -#define IGC_MAX_RXNFC_RULES 16 +/* IGC supports a total of 32 NFC rules: 16 MAC address based,, 8 VLAN priority + * based, and 8 ethertype based. + */ +#define IGC_MAX_RXNFC_RULES 32 /* igc_desc_unused - calculate if we have unused descriptors */ static inline u16 igc_desc_unused(const struct igc_ring *ring) -- cgit v1.2.3-59-g8ed1b From d601afcae2febc49665008e9a79e701248d56c50 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Fri, 15 May 2020 13:31:27 +0900 Subject: e1000e: Relax condition to trigger reset for ME workaround It's an error if the value of the RX/TX tail descriptor does not match what was written. The error condition is true regardless the duration of the interference from ME. But the driver only performs the reset if E1000_ICH_FWSM_PCIM2PCI_COUNT (2000) iterations of 50us delay have transpired. The extra condition can lead to inconsistency between the state of hardware as expected by the driver. Fix this by dropping the check for number of delay iterations. While at it, also make __ew32_prepare() static as it's not used anywhere else. CC: stable Signed-off-by: Punit Agrawal Reviewed-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/e1000e/e1000.h | 1 - drivers/net/ethernet/intel/e1000e/netdev.c | 12 +++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 37a2314d3e6b..944abd5eae11 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -576,7 +576,6 @@ static inline u32 __er32(struct e1000_hw *hw, unsigned long reg) #define er32(reg) __er32(hw, E1000_##reg) -s32 __ew32_prepare(struct e1000_hw *hw); void __ew32(struct e1000_hw *hw, unsigned long reg, u32 val); #define ew32(reg, val) __ew32(hw, E1000_##reg, (val)) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 32f23a15ff64..444532292588 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -158,14 +158,12 @@ static bool e1000e_check_me(u16 device_id) * has bit 24 set while ME is accessing MAC CSR registers, wait if it is set * and try again a number of times. **/ -s32 __ew32_prepare(struct e1000_hw *hw) +static void __ew32_prepare(struct e1000_hw *hw) { s32 i = E1000_ICH_FWSM_PCIM2PCI_COUNT; while ((er32(FWSM) & E1000_ICH_FWSM_PCIM2PCI) && --i) udelay(50); - - return i; } void __ew32(struct e1000_hw *hw, unsigned long reg, u32 val) @@ -646,11 +644,11 @@ static void e1000e_update_rdt_wa(struct e1000_ring *rx_ring, unsigned int i) { struct e1000_adapter *adapter = rx_ring->adapter; struct e1000_hw *hw = &adapter->hw; - s32 ret_val = __ew32_prepare(hw); + __ew32_prepare(hw); writel(i, rx_ring->tail); - if (unlikely(!ret_val && (i != readl(rx_ring->tail)))) { + if (unlikely(i != readl(rx_ring->tail))) { u32 rctl = er32(RCTL); ew32(RCTL, rctl & ~E1000_RCTL_EN); @@ -663,11 +661,11 @@ static void e1000e_update_tdt_wa(struct e1000_ring *tx_ring, unsigned int i) { struct e1000_adapter *adapter = tx_ring->adapter; struct e1000_hw *hw = &adapter->hw; - s32 ret_val = __ew32_prepare(hw); + __ew32_prepare(hw); writel(i, tx_ring->tail); - if (unlikely(!ret_val && (i != readl(tx_ring->tail)))) { + if (unlikely(i != readl(tx_ring->tail))) { u32 tctl = er32(TCTL); ew32(TCTL, tctl & ~E1000_TCTL_EN); -- cgit v1.2.3-59-g8ed1b From 3f6023f77ad044a1a3e1b57ccaaff79432910a57 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 6 May 2020 14:18:35 +0800 Subject: i40e: Make i40e_shutdown_adminq() return void Fix the following coccicheck warning: drivers/net/ethernet/intel/i40e/i40e_adminq.c:699:13-21: Unneeded variable: "ret_code". Return "0" on line 710 Signed-off-by: Jason Yan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_adminq.c | 6 +----- drivers/net/ethernet/intel/i40e/i40e_prototype.h | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c index 37514a75f928..6a089848c857 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c @@ -694,10 +694,8 @@ init_adminq_exit: * i40e_shutdown_adminq - shutdown routine for the Admin Queue * @hw: pointer to the hardware structure **/ -i40e_status i40e_shutdown_adminq(struct i40e_hw *hw) +void i40e_shutdown_adminq(struct i40e_hw *hw) { - i40e_status ret_code = 0; - if (i40e_check_asq_alive(hw)) i40e_aq_queue_shutdown(hw, true); @@ -706,8 +704,6 @@ i40e_status i40e_shutdown_adminq(struct i40e_hw *hw) if (hw->nvm_buff.va) i40e_free_virt_mem(hw, &hw->nvm_buff); - - return ret_code; } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index bbb478f09093..5c1378641b3b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -17,7 +17,7 @@ /* adminq functions */ i40e_status i40e_init_adminq(struct i40e_hw *hw); -i40e_status i40e_shutdown_adminq(struct i40e_hw *hw); +void i40e_shutdown_adminq(struct i40e_hw *hw); void i40e_adminq_init_ring_data(struct i40e_hw *hw); i40e_status i40e_clean_arq_element(struct i40e_hw *hw, struct i40e_arq_event_info *e, -- cgit v1.2.3-59-g8ed1b From 758b51e1e71e38257dfcb753edaf07d417611786 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 19 May 2020 17:36:34 +0300 Subject: igc: Remove symbol error counter Accordance to the i225 datasheet symbol error counter does not applicable to the i225 device. This patch comes to clean up this counter. Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_mac.c | 1 - drivers/net/ethernet/intel/igc/igc_main.c | 1 - drivers/net/ethernet/intel/igc/igc_regs.h | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_mac.c b/drivers/net/ethernet/intel/igc/igc_mac.c index 89445ab02a98..9de70a24cb9e 100644 --- a/drivers/net/ethernet/intel/igc/igc_mac.c +++ b/drivers/net/ethernet/intel/igc/igc_mac.c @@ -235,7 +235,6 @@ out: void igc_clear_hw_cntrs_base(struct igc_hw *hw) { rd32(IGC_CRCERRS); - rd32(IGC_SYMERRS); rd32(IGC_MPC); rd32(IGC_SCC); rd32(IGC_ECOL); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 97d26991c87e..662f06a647e6 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3701,7 +3701,6 @@ void igc_update_stats(struct igc_adapter *adapter) adapter->stats.prc511 += rd32(IGC_PRC511); adapter->stats.prc1023 += rd32(IGC_PRC1023); adapter->stats.prc1522 += rd32(IGC_PRC1522); - adapter->stats.symerrs += rd32(IGC_SYMERRS); adapter->stats.sec += rd32(IGC_SEC); mpc = rd32(IGC_MPC); diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index 7f999cfc9b39..a3e4ec922948 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -127,7 +127,6 @@ /* Statistics Register Descriptions */ #define IGC_CRCERRS 0x04000 /* CRC Error Count - R/clr */ #define IGC_ALGNERRC 0x04004 /* Alignment Error Count - R/clr */ -#define IGC_SYMERRS 0x04008 /* Symbol Error Count - R/clr */ #define IGC_RXERRC 0x0400C /* Receive Error Count - R/clr */ #define IGC_MPC 0x04010 /* Missed Packet Count - R/clr */ #define IGC_SCC 0x04014 /* Single Collision Count - R/clr */ -- cgit v1.2.3-59-g8ed1b From 51c657b42f58fcf061dfd6d01df26ff1701ae72c Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 19 May 2020 17:55:42 +0300 Subject: igc: Add Receive Error Counter Receive error counter reflect total number of non-filtered packets received with errors. This includes: CRC error, symbol error, Rx data error and carrier extend error. Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_mac.c | 1 + drivers/net/ethernet/intel/igc/igc_main.c | 1 + drivers/net/ethernet/intel/igc/igc_regs.h | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_mac.c b/drivers/net/ethernet/intel/igc/igc_mac.c index 9de70a24cb9e..a5a087e1ac02 100644 --- a/drivers/net/ethernet/intel/igc/igc_mac.c +++ b/drivers/net/ethernet/intel/igc/igc_mac.c @@ -241,6 +241,7 @@ void igc_clear_hw_cntrs_base(struct igc_hw *hw) rd32(IGC_MCC); rd32(IGC_LATECOL); rd32(IGC_COLC); + rd32(IGC_RERC); rd32(IGC_DC); rd32(IGC_SEC); rd32(IGC_RLEC); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 662f06a647e6..e0c45ffa12c4 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3740,6 +3740,7 @@ void igc_update_stats(struct igc_adapter *adapter) adapter->stats.tpt += rd32(IGC_TPT); adapter->stats.colc += rd32(IGC_COLC); + adapter->stats.colc += rd32(IGC_RERC); adapter->stats.algnerrc += rd32(IGC_ALGNERRC); diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index a3e4ec922948..7ac3b611708c 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -134,6 +134,7 @@ #define IGC_MCC 0x0401C /* Multiple Collision Count - R/clr */ #define IGC_LATECOL 0x04020 /* Late Collision Count - R/clr */ #define IGC_COLC 0x04028 /* Collision Count - R/clr */ +#define IGC_RERC 0x0402C /* Receive Error Count - R/clr */ #define IGC_DC 0x04030 /* Defer Count - R/clr */ #define IGC_TNCRS 0x04034 /* Tx-No CRS - R/clr */ #define IGC_SEC 0x04038 /* Sequence Error Count - R/clr */ -- cgit v1.2.3-59-g8ed1b From e2d0f2031effc8b08a162e6db64d2c97da4cf9f5 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Thu, 28 May 2020 10:11:11 +0300 Subject: igc: Remove Sequence Error Counter Accordance to the i225 datasheet sequence error counter does not applicable to the i225 device. This patch comes to clean up this counter. Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_mac.c | 1 - drivers/net/ethernet/intel/igc/igc_main.c | 1 - drivers/net/ethernet/intel/igc/igc_regs.h | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_mac.c b/drivers/net/ethernet/intel/igc/igc_mac.c index a5a087e1ac02..fb496617e8e1 100644 --- a/drivers/net/ethernet/intel/igc/igc_mac.c +++ b/drivers/net/ethernet/intel/igc/igc_mac.c @@ -243,7 +243,6 @@ void igc_clear_hw_cntrs_base(struct igc_hw *hw) rd32(IGC_COLC); rd32(IGC_RERC); rd32(IGC_DC); - rd32(IGC_SEC); rd32(IGC_RLEC); rd32(IGC_XONRXC); rd32(IGC_XONTXC); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index e0c45ffa12c4..43fcabb5c023 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3701,7 +3701,6 @@ void igc_update_stats(struct igc_adapter *adapter) adapter->stats.prc511 += rd32(IGC_PRC511); adapter->stats.prc1023 += rd32(IGC_PRC1023); adapter->stats.prc1522 += rd32(IGC_PRC1522); - adapter->stats.sec += rd32(IGC_SEC); mpc = rd32(IGC_MPC); adapter->stats.mpc += mpc; diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index 7ac3b611708c..2b7a877dadac 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -137,7 +137,6 @@ #define IGC_RERC 0x0402C /* Receive Error Count - R/clr */ #define IGC_DC 0x04030 /* Defer Count - R/clr */ #define IGC_TNCRS 0x04034 /* Tx-No CRS - R/clr */ -#define IGC_SEC 0x04038 /* Sequence Error Count - R/clr */ #define IGC_CEXTERR 0x0403C /* Carrier Extension Error Count - R/clr */ #define IGC_RLEC 0x04040 /* Receive Length Error Count - R/clr */ #define IGC_XONRXC 0x04048 /* XON Rx Count - R/clr */ -- cgit v1.2.3-59-g8ed1b From 480b7a5a3fdb99afaf9a59681616bc70c1fbfe2f Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Thu, 28 May 2020 10:25:21 +0300 Subject: igc: Fix wrong register name Accordance to the i225 datasheet this register address used by Host Transmit Discarded Packet by MAC counter and not by not applicable Carrier Extension Error counter. This patch comes to fix this wrong definition. Signed-off-by: Sasha Neftin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igc/igc_mac.c | 2 +- drivers/net/ethernet/intel/igc/igc_regs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_mac.c b/drivers/net/ethernet/intel/igc/igc_mac.c index fb496617e8e1..410aeb01de5c 100644 --- a/drivers/net/ethernet/intel/igc/igc_mac.c +++ b/drivers/net/ethernet/intel/igc/igc_mac.c @@ -287,7 +287,7 @@ void igc_clear_hw_cntrs_base(struct igc_hw *hw) rd32(IGC_ALGNERRC); rd32(IGC_RXERRC); rd32(IGC_TNCRS); - rd32(IGC_CEXTERR); + rd32(IGC_HTDPMC); rd32(IGC_TSCTC); rd32(IGC_TSCTFC); diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index 2b7a877dadac..232e82dec62e 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -137,7 +137,7 @@ #define IGC_RERC 0x0402C /* Receive Error Count - R/clr */ #define IGC_DC 0x04030 /* Defer Count - R/clr */ #define IGC_TNCRS 0x04034 /* Tx-No CRS - R/clr */ -#define IGC_CEXTERR 0x0403C /* Carrier Extension Error Count - R/clr */ +#define IGC_HTDPMC 0x0403C /* Host Transmit Discarded by MAC - R/clr */ #define IGC_RLEC 0x04040 /* Receive Length Error Count - R/clr */ #define IGC_XONRXC 0x04048 /* XON Rx Count - R/clr */ #define IGC_XONTXC 0x0404C /* XON Tx Count - R/clr */ -- cgit v1.2.3-59-g8ed1b From 6a3faa4d7e013af13fa0230f7537640dcb3abc38 Mon Sep 17 00:00:00 2001 From: Hari Date: Sat, 23 May 2020 18:43:26 +0530 Subject: e1000: Fix typo in the comment Continuous Double "the" in a comment. Changed it to single "the" Signed-off-by: Hari Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/e1000/e1000_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e1000/e1000_hw.c b/drivers/net/ethernet/intel/e1000/e1000_hw.c index 48428d6a00be..623e516a9630 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_hw.c +++ b/drivers/net/ethernet/intel/e1000/e1000_hw.c @@ -3960,7 +3960,7 @@ static s32 e1000_do_read_eeprom(struct e1000_hw *hw, u16 offset, u16 words, * @hw: Struct containing variables accessed by shared code * * Reads the first 64 16 bit words of the EEPROM and sums the values read. - * If the the sum of the 64 16 bit words is 0xBABA, the EEPROM's checksum is + * If the sum of the 64 16 bit words is 0xBABA, the EEPROM's checksum is * valid. */ s32 e1000_validate_eeprom_checksum(struct e1000_hw *hw) -- cgit v1.2.3-59-g8ed1b From a224883cc97f0c476f1d1460ac8716bf0768070a Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Fri, 29 May 2020 09:39:21 +0300 Subject: iwlwifi: set NO_HE if the regulatory domain forbids it If the firmware's regulatory domain forbids HE operation, set it in the cfg80211 regdomain. Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.c3e50c36c628.I991bfa662c0ef35de5be9eaf5b78ef190b67cb56@changeid --- drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index d91a8e8349e6..ee410417761d 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -240,6 +240,7 @@ enum iwl_nvm_channel_flags { * @REG_CAPA_40MHZ_FORBIDDEN: 11n channel with a width of 40Mhz is forbidden * for this regulatory domain (valid only in 5Ghz). * @REG_CAPA_DC_HIGH_ENABLED: DC HIGH allowed. + * @REG_CAPA_11AX_DISABLED: 11ax is forbidden for this regulatory domain. */ enum iwl_reg_capa_flags { REG_CAPA_BF_CCD_LOW_BAND = BIT(0), @@ -250,6 +251,7 @@ enum iwl_reg_capa_flags { REG_CAPA_MCS_9_ALLOWED = BIT(5), REG_CAPA_40MHZ_FORBIDDEN = BIT(7), REG_CAPA_DC_HIGH_ENABLED = BIT(9), + REG_CAPA_11AX_DISABLED = BIT(10), }; static inline void iwl_nvm_print_channel_flags(struct device *dev, u32 level, @@ -1115,6 +1117,9 @@ static u32 iwl_nvm_get_regdom_bw_flags(const u16 *nvm_chan, flags |= NL80211_RRF_NO_160MHZ; } + if (cap_flags & REG_CAPA_11AX_DISABLED) + flags |= NL80211_RRF_NO_HE; + return flags; } -- cgit v1.2.3-59-g8ed1b From 771db3a10361ef67d59c00098a442be4a8395861 Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Fri, 29 May 2020 09:39:22 +0300 Subject: iwlwifi: pcie: don't count on the FW to set persistence mode Apparently the FW can't set the persistence in all flows. Don't count on the FW setting it in AX210 devices or above either to avoid potential resets on resume. Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.5405db448555.Ie3c110932ebbd5b6aca99938a5e0a1e4dfbaa848@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 8ccfc7cc7348..3bcbc2967c88 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -5,10 +5,9 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2007 - 2015 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2007 - 2015, 2018 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -28,10 +27,9 @@ * * BSD LICENSE * - * Copyright(c) 2005 - 2015 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2007 - 2015, 2018 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -1495,14 +1493,10 @@ static int iwl_trans_pcie_d3_suspend(struct iwl_trans *trans, bool test, int ret; struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - /* - * Family IWL_DEVICE_FAMILY_AX210 and above persist mode is set by FW. - */ - if (!reset && trans->trans_cfg->device_family < IWL_DEVICE_FAMILY_AX210) { + if (!reset) /* Enable persistence mode to avoid reset */ iwl_set_bit(trans, CSR_HW_IF_CONFIG_REG, CSR_HW_IF_CONFIG_REG_PERSIST_MODE); - } if (trans->trans_cfg->device_family >= IWL_DEVICE_FAMILY_AX210) { iwl_write_umac_prph(trans, UREG_DOORBELL_TO_ISR6, -- cgit v1.2.3-59-g8ed1b From fcac70029ccf36c5157b8f37817eff332e0145a7 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Fri, 29 May 2020 09:39:23 +0300 Subject: iwlwifi: pcie: keep trans instead of trans_pcie in iwl_txq We used both the trans and the trans_pcie structures in iwl_txq, so we can keep the trans structure instead. This helps with the refactoring of txq code out of pcie. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.1f826d34339e.I23182a59bfbe089a1f659742d6fee6f64d2ed08c@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/internal.h | 4 ++-- drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index b76c0396335a..3950f5784a15 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -332,7 +332,7 @@ struct iwl_pcie_first_tb_buf { * @entries: transmit entries (driver state) * @lock: queue lock * @stuck_timer: timer that fires if queue gets stuck - * @trans_pcie: pointer back to transport (for timer) + * @trans: pointer back to transport (for timer) * @need_update: indicates need to update read/write index * @ampdu: true if this queue is an ampdu queue for an specific RA/TID * @wd_timeout: queue watchdog timeout (jiffies) - per queue @@ -371,7 +371,7 @@ struct iwl_txq { spinlock_t lock; unsigned long frozen_expiry_remainder; struct timer_list stuck_timer; - struct iwl_trans_pcie *trans_pcie; + struct iwl_trans *trans; bool need_update; bool frozen; bool ampdu; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index 9ff78bca460b..757cf4e9de33 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -183,8 +183,7 @@ void iwl_pcie_free_dma_ptr(struct iwl_trans *trans, struct iwl_dma_ptr *ptr) static void iwl_pcie_txq_stuck_timer(struct timer_list *t) { struct iwl_txq *txq = from_timer(txq, t, stuck_timer); - struct iwl_trans_pcie *trans_pcie = txq->trans_pcie; - struct iwl_trans *trans = iwl_trans_pcie_get_trans(trans_pcie); + struct iwl_trans *trans = txq->trans; spin_lock(&txq->lock); /* check if triggered erroneously */ @@ -535,7 +534,7 @@ int iwl_pcie_txq_alloc(struct iwl_trans *trans, struct iwl_txq *txq, tfd_sz = trans_pcie->tfd_size * slots_num; timer_setup(&txq->stuck_timer, iwl_pcie_txq_stuck_timer, 0); - txq->trans_pcie = trans_pcie; + txq->trans = trans; txq->n_window = slots_num; @@ -2129,7 +2128,8 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, u16 tb1_len) { struct iwl_tx_cmd *tx_cmd = (void *)dev_cmd->payload; - struct iwl_trans_pcie *trans_pcie = txq->trans_pcie; + struct iwl_trans_pcie *trans_pcie = + IWL_TRANS_GET_PCIE_TRANS(txq->trans); struct ieee80211_hdr *hdr = (void *)skb->data; unsigned int snap_ip_tcp_hdrlen, ip_hdrlen, total_len, hdr_room; unsigned int mss = skb_shinfo(skb)->gso_size; -- cgit v1.2.3-59-g8ed1b From 9db93491f29eb4a4a68c72783dd6f078bdd94302 Mon Sep 17 00:00:00 2001 From: Gil Adam Date: Fri, 29 May 2020 09:39:24 +0300 Subject: iwlwifi: acpi: support device specific method (DSM) ACPI Device Specific Method (DSM) allows standardized feature configuration through the ACPI interface without the namespace pollution of the usual mechanism (ACPI method for each feature). Add generic function for evaluating DSM objects and function for evaluating a DSM with no arguments and a single int return value. also implement the required backport for UUID. Signed-off-by: Gil Adam Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.c3242ff3ba5c.Icb48c8d61bede5dda7ef267bff10e4798e9dc77b@changeid --- drivers/net/wireless/intel/iwlwifi/fw/acpi.c | 99 ++++++++++++++++++++++++---- drivers/net/wireless/intel/iwlwifi/fw/acpi.h | 22 +++++++ 2 files changed, 110 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c index e2184ba4d8b5..dc769b580431 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c @@ -58,44 +58,121 @@ * *****************************************************************************/ +#include #include "iwl-drv.h" #include "iwl-debug.h" #include "acpi.h" #include "fw/runtime.h" -void *iwl_acpi_get_object(struct device *dev, acpi_string method) +static const guid_t intel_wifi_guid = GUID_INIT(0xF21202BF, 0x8F78, 0x4DC6, + 0xA5, 0xB3, 0x1F, 0x73, + 0x8E, 0x28, 0x5A, 0xDE); + +static int iwl_acpi_get_handle(struct device *dev, acpi_string method, + acpi_handle *ret_handle) { acpi_handle root_handle; - acpi_handle handle; - struct acpi_buffer buf = {ACPI_ALLOCATE_BUFFER, NULL}; acpi_status status; root_handle = ACPI_HANDLE(dev); if (!root_handle) { IWL_DEBUG_DEV_RADIO(dev, - "Could not retrieve root port ACPI handle\n"); - return ERR_PTR(-ENOENT); + "ACPI: Could not retrieve root port handle\n"); + return -ENOENT; } - /* Get the method's handle */ - status = acpi_get_handle(root_handle, method, &handle); + status = acpi_get_handle(root_handle, method, ret_handle); if (ACPI_FAILURE(status)) { - IWL_DEBUG_DEV_RADIO(dev, "%s method not found\n", method); - return ERR_PTR(-ENOENT); + IWL_DEBUG_DEV_RADIO(dev, + "ACPI: %s method not found\n", method); + return -ENOENT; } + return 0; +} + +void *iwl_acpi_get_object(struct device *dev, acpi_string method) +{ + struct acpi_buffer buf = {ACPI_ALLOCATE_BUFFER, NULL}; + acpi_handle handle; + acpi_status status; + int ret; + + ret = iwl_acpi_get_handle(dev, method, &handle); + if (ret) + return ERR_PTR(-ENOENT); /* Call the method with no arguments */ status = acpi_evaluate_object(handle, NULL, NULL, &buf); if (ACPI_FAILURE(status)) { - IWL_DEBUG_DEV_RADIO(dev, "%s invocation failed (0x%x)\n", + IWL_DEBUG_DEV_RADIO(dev, + "ACPI: %s method invocation failed (status: 0x%x)\n", method, status); return ERR_PTR(-ENOENT); } - return buf.pointer; } IWL_EXPORT_SYMBOL(iwl_acpi_get_object); +/** +* Generic function for evaluating a method defined in the device specific +* method (DSM) interface. The returned acpi object must be freed by calling +* function. +*/ +void *iwl_acpi_get_dsm_object(struct device *dev, int rev, int func, + union acpi_object *args) +{ + union acpi_object *obj; + + obj = acpi_evaluate_dsm(ACPI_HANDLE(dev), &intel_wifi_guid, rev, func, + args); + if (!obj) { + IWL_DEBUG_DEV_RADIO(dev, + "ACPI: DSM method invocation failed (rev: %d, func:%d)\n", + rev, func); + return ERR_PTR(-ENOENT); + } + return obj; +} + +/** + * Evaluate a DSM with no arguments and a single u8 return value (inside a + * buffer object), verify and return that value. + */ +int iwl_acpi_get_dsm_u8(struct device *dev, int rev, int func) +{ + union acpi_object *obj; + int ret; + + obj = iwl_acpi_get_dsm_object(dev, rev, func, NULL); + if (IS_ERR(obj)) + return -ENOENT; + + if (obj->type != ACPI_TYPE_BUFFER) { + IWL_DEBUG_DEV_RADIO(dev, + "ACPI: DSM method did not return a valid object, type=%d\n", + obj->type); + ret = -EINVAL; + goto out; + } + + if (obj->buffer.length != sizeof(u8)) { + IWL_DEBUG_DEV_RADIO(dev, + "ACPI: DSM method returned invalid buffer, length=%d\n", + obj->buffer.length); + ret = -EINVAL; + goto out; + } + + ret = obj->buffer.pointer[0]; + IWL_DEBUG_DEV_RADIO(dev, + "ACPI: DSM method evaluated: func=%d, ret=%d\n", + func, ret); +out: + ACPI_FREE(obj); + return ret; +} +IWL_EXPORT_SYMBOL(iwl_acpi_get_dsm_u8); + union acpi_object *iwl_acpi_get_wifi_pkg(struct device *dev, union acpi_object *data, int data_size, int *tbl_rev) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/acpi.h b/drivers/net/wireless/intel/iwlwifi/fw/acpi.h index 6a646dc524e1..0ada9eddb8b1 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/acpi.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/acpi.h @@ -127,12 +127,23 @@ struct iwl_geo_profile { u8 values[ACPI_GEO_TABLE_SIZE]; }; +enum iwl_dsm_funcs_rev_0 { + DSM_FUNC_QUERY = 0, + DSM_FUNC_DISABLE_SRD = 1, + DSM_FUNC_ENABLE_INDONESIA_5G2 = 2, +}; + #ifdef CONFIG_ACPI struct iwl_fw_runtime; void *iwl_acpi_get_object(struct device *dev, acpi_string method); +void *iwl_acpi_get_dsm_object(struct device *dev, int rev, int func, + union acpi_object *args); + +int iwl_acpi_get_dsm_u8(struct device *dev, int rev, int func); + union acpi_object *iwl_acpi_get_wifi_pkg(struct device *dev, union acpi_object *data, int data_size, int *tbl_rev); @@ -192,6 +203,17 @@ static inline void *iwl_acpi_get_object(struct device *dev, acpi_string method) return ERR_PTR(-ENOENT); } +static inline void *iwl_acpi_get_dsm_object(struct device *dev, int rev, + int func, union acpi_object *args) +{ + return ERR_PTR(-ENOENT); +} + +static inline int iwl_acpi_get_dsm_u8(struct device *dev, int rev, int func) +{ + return -ENOENT; +} + static inline union acpi_object *iwl_acpi_get_wifi_pkg(struct device *dev, union acpi_object *data, int data_size, -- cgit v1.2.3-59-g8ed1b From f5b1cb2e615f57a6bf00d600b1f31e9f8058daa5 Mon Sep 17 00:00:00 2001 From: Gil Adam Date: Fri, 29 May 2020 09:39:25 +0300 Subject: iwlwifi: acpi: evaluate dsm to enable 5.2 bands in Indonesia Evaluate the appropriate DSM from ACPI to enable 5.15,5.35 GHz bands in Indonesia. If enabled send LARI_CONFIG_CHANGE cmd to fw. Signed-off-by: Gil Adam Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.f549b75bfdac.Iac74a6ffe45aff887cea13ee1d31b100ca11e249@changeid --- .../net/wireless/intel/iwlwifi/fw/api/nvm-reg.h | 34 ++++++++++++++++- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 43 ++++++++++++++++++++++ 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h b/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h index 2d230a7893c2..fd719c37428c 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h @@ -8,7 +8,7 @@ * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(C) 2018 - 2019 Intel Corporation + * Copyright(C) 2018 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -31,7 +31,7 @@ * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(C) 2018 - 2019 Intel Corporation + * Copyright(C) 2018 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -74,6 +74,11 @@ enum iwl_regulatory_and_nvm_subcmd_ids { */ NVM_ACCESS_COMPLETE = 0x0, + /** + * @LARI_CONFIG_CHANGE: &struct iwl_lari_config_change_cmd + */ + LARI_CONFIG_CHANGE = 0x1, + /** * @NVM_GET_INFO: * Command is &struct iwl_nvm_get_info, @@ -446,4 +451,29 @@ struct iwl_tas_config_cmd { __le32 black_list_size; __le32 black_list_array[IWL_TAS_BLACK_LIST_MAX]; } __packed; /* TAS_CONFIG_CMD_API_S_VER_2 */ + +/** + * enum iwl_lari_configs - bit masks for the various LARI config operations + * @LARI_CONFIG_DISABLE_11AC_UKRAINE_MSK: disable 11ac in ukraine + * @LARI_CONFIG_CHANGE_ETSI_TO_PASSIVE_MSK: ETSI 5.8GHz SRD passive scan + * @LARI_CONFIG_CHANGE_ETSI_TO_DISABLED_MSK: ETSI 5.8GHz SRD disabled + * @LARI_CONFIG_ENABLE_5G2_IN_INDONESIA_MSK: enable 5.15/5.35GHz bands in + * Indonesia + */ +enum iwl_lari_config_masks { + LARI_CONFIG_DISABLE_11AC_UKRAINE_MSK = BIT(0), + LARI_CONFIG_CHANGE_ETSI_TO_PASSIVE_MSK = BIT(1), + LARI_CONFIG_CHANGE_ETSI_TO_DISABLED_MSK = BIT(2), + LARI_CONFIG_ENABLE_5G2_IN_INDONESIA_MSK = BIT(3), +}; + +/** + * struct iwl_lari_config_change_cmd - change LARI configuration + * @config_bitmap: bit map of the config commands. each bit will trigger a + * different predefined FW config operation + */ +struct iwl_lari_config_change_cmd { + __le32 config_bitmap; +} __packed; /* LARI_CHANGE_CONF_CMD_S_VER_1 */ + #endif /* __iwl_fw_api_nvm_reg_h__ */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index 5e8d3f8c3d86..95a613537047 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -988,6 +988,44 @@ static void iwl_mvm_tas_init(struct iwl_mvm *mvm) if (ret < 0) IWL_DEBUG_RADIO(mvm, "failed to send TAS_CONFIG (%d)\n", ret); } + +static bool iwl_mvm_eval_dsm_indonesia_5g2(struct iwl_mvm *mvm) +{ + int ret = iwl_acpi_get_dsm_u8((&mvm->fwrt)->dev, 0, + DSM_FUNC_ENABLE_INDONESIA_5G2); + + IWL_DEBUG_RADIO(mvm, + "Evaluated DSM function ENABLE_INDONESIA_5G2, ret=%d\n", + ret); + + return ret == 1; +} + +static void iwl_mvm_lari_cfg(struct iwl_mvm *mvm) +{ + int ret; + struct iwl_lari_config_change_cmd cmd = {}; + + if (iwl_mvm_eval_dsm_indonesia_5g2(mvm)) + cmd.config_bitmap |= + cpu_to_le32(LARI_CONFIG_ENABLE_5G2_IN_INDONESIA_MSK); + + /* apply more config masks here */ + + if (cmd.config_bitmap) { + IWL_DEBUG_RADIO(mvm, + "sending LARI_CONFIG_CHANGE, config_bitmap=0x%x\n", + le32_to_cpu(cmd.config_bitmap)); + ret = iwl_mvm_send_cmd_pdu(mvm, + WIDE_ID(REGULATORY_AND_NVM_GROUP, + LARI_CONFIG_CHANGE), + 0, sizeof(cmd), &cmd); + if (ret < 0) + IWL_DEBUG_RADIO(mvm, + "Failed to send LARI_CONFIG_CHANGE (%d)\n", + ret); + } +} #else /* CONFIG_ACPI */ inline int iwl_mvm_sar_select_profile(struct iwl_mvm *mvm, @@ -1019,6 +1057,10 @@ static int iwl_mvm_ppag_init(struct iwl_mvm *mvm) static void iwl_mvm_tas_init(struct iwl_mvm *mvm) { } + +static void iwl_mvm_lari_cfg(struct iwl_mvm *mvm) +{ +} #endif /* CONFIG_ACPI */ void iwl_mvm_send_recovery_cmd(struct iwl_mvm *mvm, u32 flags) @@ -1293,6 +1335,7 @@ int iwl_mvm_up(struct iwl_mvm *mvm) if (ret) goto error; + iwl_mvm_lari_cfg(mvm); /* * RTNL is not taken during Ct-kill, but we don't need to scan/Tx * anyway, so don't init MCC. -- cgit v1.2.3-59-g8ed1b From 61576240558afce5abb8f391a88a6dfcf25ca7a1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 29 May 2020 09:39:26 +0300 Subject: iwlwifi: pcie: gen3: indicate 8k/12k RB size to device Newer firmware versions will parse a few extra bits in the context info to be able to determine whether we are using bigger than 4k RBs, indicate 8k/12k to them if we actually use those (e.g. for sniffer based on the module parameter). Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.f83f994572ca.Ibcfd66c3f9b69e68a53b3b2df8331ffb225db655@changeid --- drivers/net/wireless/intel/iwlwifi/iwl-context-info-gen3.h | 12 ++++++++++-- drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c | 8 ++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-context-info-gen3.h b/drivers/net/wireless/intel/iwlwifi/iwl-context-info-gen3.h index ebea99189ca9..9d7a04833cd0 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-context-info-gen3.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-context-info-gen3.h @@ -5,7 +5,7 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2018 Intel Corporation + * Copyright(c) 2018, 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -18,7 +18,7 @@ * * BSD LICENSE * - * Copyright(c) 2018 Intel Corporation + * Copyright(c) 2018, 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -93,6 +93,11 @@ enum iwl_prph_scratch_mtr_format { * @IWL_PRPH_SCRATCH_MTR_FORMAT: a mask for the size of the tfd. * There are 4 optional values: 0: 16 bit, 1: 32 bit, 2: 64 bit, * 3: 256 bit. + * @IWL_PRPH_SCRATCH_RB_SIZE_EXT_MASK: RB size full information, ignored + * by older firmware versions, so set IWL_PRPH_SCRATCH_RB_SIZE_4K + * appropriately; use the below values for this. + * @IWL_PRPH_SCRATCH_RB_SIZE_EXT_8K: 8kB RB size + * @IWL_PRPH_SCRATCH_RB_SIZE_EXT_12K: 12kB RB size */ enum iwl_prph_scratch_flags { IWL_PRPH_SCRATCH_EARLY_DEBUG_EN = BIT(4), @@ -103,6 +108,9 @@ enum iwl_prph_scratch_flags { IWL_PRPH_SCRATCH_RB_SIZE_4K = BIT(16), IWL_PRPH_SCRATCH_MTR_MODE = BIT(17), IWL_PRPH_SCRATCH_MTR_FORMAT = BIT(18) | BIT(19), + IWL_PRPH_SCRATCH_RB_SIZE_EXT_MASK = 0xf << 20, + IWL_PRPH_SCRATCH_RB_SIZE_EXT_8K = 8 << 20, + IWL_PRPH_SCRATCH_RB_SIZE_EXT_12K = 9 << 20, }; /* diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c index b6a5921a63c3..dcd81ee1f773 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c @@ -138,9 +138,17 @@ int iwl_pcie_ctxt_info_gen3_init(struct iwl_trans *trans, case IWL_AMSDU_2K: break; case IWL_AMSDU_4K: + control_flags |= IWL_PRPH_SCRATCH_RB_SIZE_4K; + break; case IWL_AMSDU_8K: + control_flags |= IWL_PRPH_SCRATCH_RB_SIZE_4K; + /* if firmware supports the ext size, tell it */ + control_flags |= IWL_PRPH_SCRATCH_RB_SIZE_EXT_8K; + break; case IWL_AMSDU_12K: control_flags |= IWL_PRPH_SCRATCH_RB_SIZE_4K; + /* if firmware supports the ext size, tell it */ + control_flags |= IWL_PRPH_SCRATCH_RB_SIZE_EXT_12K; break; } -- cgit v1.2.3-59-g8ed1b From 4807e73685f1d52e9143977ee1763b9d050daef3 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Fri, 29 May 2020 09:39:27 +0300 Subject: iwlwifi: move iwl_txq and substructures to a common trans header The txq code is not directly related to the PCIe transport, so move the structures it uses to the common iwl-trans.h header. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.d9d0082b8369.I8298f6e83804c1ea99217a79d95d23ef68b184d4@changeid --- drivers/net/wireless/intel/iwlwifi/iwl-trans.h | 107 +++++++++++++++++++++ drivers/net/wireless/intel/iwlwifi/pcie/internal.h | 107 --------------------- 2 files changed, 107 insertions(+), 107 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h index bba527b339b5..57361b27351e 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h @@ -795,6 +795,113 @@ struct iwl_trans_debug { u32 domains_bitmap; }; +struct iwl_dma_ptr { + dma_addr_t dma; + void *addr; + size_t size; +}; + +struct iwl_cmd_meta { + /* only for SYNC commands, iff the reply skb is wanted */ + struct iwl_host_cmd *source; + u32 flags; + u32 tbs; +}; + +/* + * The FH will write back to the first TB only, so we need to copy some data + * into the buffer regardless of whether it should be mapped or not. + * This indicates how big the first TB must be to include the scratch buffer + * and the assigned PN. + * Since PN location is 8 bytes at offset 12, it's 20 now. + * If we make it bigger then allocations will be bigger and copy slower, so + * that's probably not useful. + */ +#define IWL_FIRST_TB_SIZE 20 +#define IWL_FIRST_TB_SIZE_ALIGN ALIGN(IWL_FIRST_TB_SIZE, 64) + +struct iwl_pcie_txq_entry { + void *cmd; + struct sk_buff *skb; + /* buffer to free after command completes */ + const void *free_buf; + struct iwl_cmd_meta meta; +}; + +struct iwl_pcie_first_tb_buf { + u8 buf[IWL_FIRST_TB_SIZE_ALIGN]; +}; + +/** + * struct iwl_txq - Tx Queue for DMA + * @q: generic Rx/Tx queue descriptor + * @tfds: transmit frame descriptors (DMA memory) + * @first_tb_bufs: start of command headers, including scratch buffers, for + * the writeback -- this is DMA memory and an array holding one buffer + * for each command on the queue + * @first_tb_dma: DMA address for the first_tb_bufs start + * @entries: transmit entries (driver state) + * @lock: queue lock + * @stuck_timer: timer that fires if queue gets stuck + * @trans: pointer back to transport (for timer) + * @need_update: indicates need to update read/write index + * @ampdu: true if this queue is an ampdu queue for an specific RA/TID + * @wd_timeout: queue watchdog timeout (jiffies) - per queue + * @frozen: tx stuck queue timer is frozen + * @frozen_expiry_remainder: remember how long until the timer fires + * @bc_tbl: byte count table of the queue (relevant only for gen2 transport) + * @write_ptr: 1-st empty entry (index) host_w + * @read_ptr: last used entry (index) host_r + * @dma_addr: physical addr for BD's + * @n_window: safe queue window + * @id: queue id + * @low_mark: low watermark, resume queue if free space more than this + * @high_mark: high watermark, stop queue if free space less than this + * + * A Tx queue consists of circular buffer of BDs (a.k.a. TFDs, transmit frame + * descriptors) and required locking structures. + * + * Note the difference between TFD_QUEUE_SIZE_MAX and n_window: the hardware + * always assumes 256 descriptors, so TFD_QUEUE_SIZE_MAX is always 256 (unless + * there might be HW changes in the future). For the normal TX + * queues, n_window, which is the size of the software queue data + * is also 256; however, for the command queue, n_window is only + * 32 since we don't need so many commands pending. Since the HW + * still uses 256 BDs for DMA though, TFD_QUEUE_SIZE_MAX stays 256. + * This means that we end up with the following: + * HW entries: | 0 | ... | N * 32 | ... | N * 32 + 31 | ... | 255 | + * SW entries: | 0 | ... | 31 | + * where N is a number between 0 and 7. This means that the SW + * data is a window overlayed over the HW queue. + */ +struct iwl_txq { + void *tfds; + struct iwl_pcie_first_tb_buf *first_tb_bufs; + dma_addr_t first_tb_dma; + struct iwl_pcie_txq_entry *entries; + /* lock for syncing changes on the queue */ + spinlock_t lock; + unsigned long frozen_expiry_remainder; + struct timer_list stuck_timer; + struct iwl_trans *trans; + bool need_update; + bool frozen; + bool ampdu; + int block; + unsigned long wd_timeout; + struct sk_buff_head overflow_q; + struct iwl_dma_ptr bc_tbl; + + int write_ptr; + int read_ptr; + dma_addr_t dma_addr; + int n_window; + u32 id; + int low_mark; + int high_mark; + + bool overflow_tx; +}; /** * struct iwl_trans - transport common data * diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index 3950f5784a15..3c6a119aede4 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -246,12 +246,6 @@ struct iwl_rb_allocator { struct work_struct rx_alloc; }; -struct iwl_dma_ptr { - dma_addr_t dma; - void *addr; - size_t size; -}; - /** * iwl_queue_inc_wrap - increment queue index, wrap back to beginning * @index -- current index @@ -290,107 +284,6 @@ static inline int iwl_queue_dec_wrap(struct iwl_trans *trans, int index) (trans->trans_cfg->base_params->max_tfd_queue_size - 1); } -struct iwl_cmd_meta { - /* only for SYNC commands, iff the reply skb is wanted */ - struct iwl_host_cmd *source; - u32 flags; - u32 tbs; -}; - -/* - * The FH will write back to the first TB only, so we need to copy some data - * into the buffer regardless of whether it should be mapped or not. - * This indicates how big the first TB must be to include the scratch buffer - * and the assigned PN. - * Since PN location is 8 bytes at offset 12, it's 20 now. - * If we make it bigger then allocations will be bigger and copy slower, so - * that's probably not useful. - */ -#define IWL_FIRST_TB_SIZE 20 -#define IWL_FIRST_TB_SIZE_ALIGN ALIGN(IWL_FIRST_TB_SIZE, 64) - -struct iwl_pcie_txq_entry { - void *cmd; - struct sk_buff *skb; - /* buffer to free after command completes */ - const void *free_buf; - struct iwl_cmd_meta meta; -}; - -struct iwl_pcie_first_tb_buf { - u8 buf[IWL_FIRST_TB_SIZE_ALIGN]; -}; - -/** - * struct iwl_txq - Tx Queue for DMA - * @q: generic Rx/Tx queue descriptor - * @tfds: transmit frame descriptors (DMA memory) - * @first_tb_bufs: start of command headers, including scratch buffers, for - * the writeback -- this is DMA memory and an array holding one buffer - * for each command on the queue - * @first_tb_dma: DMA address for the first_tb_bufs start - * @entries: transmit entries (driver state) - * @lock: queue lock - * @stuck_timer: timer that fires if queue gets stuck - * @trans: pointer back to transport (for timer) - * @need_update: indicates need to update read/write index - * @ampdu: true if this queue is an ampdu queue for an specific RA/TID - * @wd_timeout: queue watchdog timeout (jiffies) - per queue - * @frozen: tx stuck queue timer is frozen - * @frozen_expiry_remainder: remember how long until the timer fires - * @bc_tbl: byte count table of the queue (relevant only for gen2 transport) - * @write_ptr: 1-st empty entry (index) host_w - * @read_ptr: last used entry (index) host_r - * @dma_addr: physical addr for BD's - * @n_window: safe queue window - * @id: queue id - * @low_mark: low watermark, resume queue if free space more than this - * @high_mark: high watermark, stop queue if free space less than this - * - * A Tx queue consists of circular buffer of BDs (a.k.a. TFDs, transmit frame - * descriptors) and required locking structures. - * - * Note the difference between TFD_QUEUE_SIZE_MAX and n_window: the hardware - * always assumes 256 descriptors, so TFD_QUEUE_SIZE_MAX is always 256 (unless - * there might be HW changes in the future). For the normal TX - * queues, n_window, which is the size of the software queue data - * is also 256; however, for the command queue, n_window is only - * 32 since we don't need so many commands pending. Since the HW - * still uses 256 BDs for DMA though, TFD_QUEUE_SIZE_MAX stays 256. - * This means that we end up with the following: - * HW entries: | 0 | ... | N * 32 | ... | N * 32 + 31 | ... | 255 | - * SW entries: | 0 | ... | 31 | - * where N is a number between 0 and 7. This means that the SW - * data is a window overlayed over the HW queue. - */ -struct iwl_txq { - void *tfds; - struct iwl_pcie_first_tb_buf *first_tb_bufs; - dma_addr_t first_tb_dma; - struct iwl_pcie_txq_entry *entries; - spinlock_t lock; - unsigned long frozen_expiry_remainder; - struct timer_list stuck_timer; - struct iwl_trans *trans; - bool need_update; - bool frozen; - bool ampdu; - int block; - unsigned long wd_timeout; - struct sk_buff_head overflow_q; - struct iwl_dma_ptr bc_tbl; - - int write_ptr; - int read_ptr; - dma_addr_t dma_addr; - int n_window; - u32 id; - int low_mark; - int high_mark; - - bool overflow_tx; -}; - static inline dma_addr_t iwl_pcie_get_first_tb_dma(struct iwl_txq *txq, int idx) { -- cgit v1.2.3-59-g8ed1b From 4f4822b7cd5ab818a62815cc8eeca495cf8ec872 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Fri, 29 May 2020 09:39:28 +0300 Subject: iwlwifi: move txq-specific from trans_pcie to common trans We don't want to have txq code in the PCIe transport code, so move all the relevant elements to a new iwl_txq structure and store it in iwl_trans. spatch @ replace_pcie @ struct iwl_trans_pcie *trans_pcie; @@ ( -trans_pcie->queue_stopped +trans->txqs.queue_stopped | -trans_pcie->queue_used +trans->txqs.queue_used | -trans_pcie->txq +trans->txqs.txq | -trans_pcie->txq +trans->txqs.txq | -trans_pcie->cmd_queue +trans->txqs.cmd.q_id | -trans_pcie->cmd_fifo +trans->txqs.cmd.fifo | -trans_pcie->cmd_q_wdg_timeout +trans->txqs.cmd.wdg_timeout ) // clean all new unused variables @ depends on replace_pcie @ type T; identifier i; expression E; @@ - T i = E; ... when != i Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.a428d3c9d66f.Ie04ae55f33954636a39c98e7ae1e739c0507435b@changeid --- drivers/net/wireless/intel/iwlwifi/iwl-trans.h | 21 ++++ .../wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c | 2 +- .../net/wireless/intel/iwlwifi/pcie/ctxt-info.c | 6 +- drivers/net/wireless/intel/iwlwifi/pcie/internal.h | 14 +-- drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 6 +- .../net/wireless/intel/iwlwifi/pcie/trans-gen2.c | 11 +- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 35 +++---- drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 70 ++++++------- drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 112 ++++++++++----------- 9 files changed, 138 insertions(+), 139 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h index 57361b27351e..a301e2484cdb 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h @@ -902,6 +902,25 @@ struct iwl_txq { bool overflow_tx; }; + +/** + * struct iwl_trans_txqs - transport tx queues data + * + * @queue_used - bit mask of used queues + * @queue_stopped - bit mask of stopped queues + */ +struct iwl_trans_txqs { + unsigned long queue_used[BITS_TO_LONGS(IWL_MAX_TVQM_QUEUES)]; + unsigned long queue_stopped[BITS_TO_LONGS(IWL_MAX_TVQM_QUEUES)]; + struct iwl_txq *txq[IWL_MAX_TVQM_QUEUES]; + struct { + u8 fifo; + u8 q_id; + unsigned int wdg_timeout; + } cmd; + +}; + /** * struct iwl_trans - transport common data * @@ -935,6 +954,7 @@ struct iwl_txq { * @system_pm_mode: the system-wide power management mode in use. * This mode is set dynamically, depending on the WoWLAN values * configured from the userspace at runtime. + * @iwl_trans_txqs: transport tx queues data. */ struct iwl_trans { const struct iwl_trans_ops *ops; @@ -982,6 +1002,7 @@ struct iwl_trans { enum iwl_plat_pm_mode system_pm_mode; const char *name; + struct iwl_trans_txqs txqs; /* pointer to trans specific struct */ /*Ensure that this pointer will always be aligned to sizeof pointer */ diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c index dcd81ee1f773..1ab136600415 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c @@ -221,7 +221,7 @@ int iwl_pcie_ctxt_info_gen3_init(struct iwl_trans *trans, ctxt_info_gen3->tr_idx_arr_size = cpu_to_le16(IWL_NUM_OF_TRANSFER_RINGS); ctxt_info_gen3->mtr_base_addr = - cpu_to_le64(trans_pcie->txq[trans_pcie->cmd_queue]->dma_addr); + cpu_to_le64(trans->txqs.txq[trans->txqs.cmd.q_id]->dma_addr); ctxt_info_gen3->mcr_base_addr = cpu_to_le64(trans_pcie->rxq->used_bd_dma); ctxt_info_gen3->mtr_size = diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c index b65405009d02..23abfbd096b0 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c @@ -6,7 +6,7 @@ * GPL LICENSE SUMMARY * * Copyright(c) 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2018 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -20,7 +20,7 @@ * BSD LICENSE * * Copyright(c) 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2018 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -263,7 +263,7 @@ int iwl_pcie_ctxt_info_init(struct iwl_trans *trans, /* initialize TX command queue */ ctxt_info->hcmd_cfg.cmd_queue_addr = - cpu_to_le64(trans_pcie->txq[trans_pcie->cmd_queue]->dma_addr); + cpu_to_le64(trans->txqs.txq[trans->txqs.cmd.q_id]->dma_addr); ctxt_info->hcmd_cfg.cmd_queue_size = TFD_QUEUE_CB_SIZE(IWL_CMD_QUEUE_SIZE); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index 3c6a119aede4..55808ba10d27 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -454,9 +454,6 @@ struct iwl_trans_pcie { struct dma_pool *bc_pool; struct iwl_txq *txq_memory; - struct iwl_txq *txq[IWL_MAX_TVQM_QUEUES]; - unsigned long queue_used[BITS_TO_LONGS(IWL_MAX_TVQM_QUEUES)]; - unsigned long queue_stopped[BITS_TO_LONGS(IWL_MAX_TVQM_QUEUES)]; /* PCI bus related data */ struct pci_dev *pci_dev; @@ -470,10 +467,7 @@ struct iwl_trans_pcie { u8 page_offs, dev_cmd_offs; - u8 cmd_queue; u8 def_rx_queue; - u8 cmd_fifo; - unsigned int cmd_q_wdg_timeout; u8 n_no_reclaim_cmds; u8 no_reclaim_cmds[MAX_NO_RECLAIM_CMDS]; u8 max_tbs; @@ -876,9 +870,7 @@ void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans); static inline void iwl_wake_queue(struct iwl_trans *trans, struct iwl_txq *txq) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - - if (test_and_clear_bit(txq->id, trans_pcie->queue_stopped)) { + if (test_and_clear_bit(txq->id, trans->txqs.queue_stopped)) { IWL_DEBUG_TX_QUEUES(trans, "Wake hwq %d\n", txq->id); iwl_op_mode_queue_not_full(trans->op_mode, txq->id); } @@ -887,9 +879,7 @@ static inline void iwl_wake_queue(struct iwl_trans *trans, static inline void iwl_stop_queue(struct iwl_trans *trans, struct iwl_txq *txq) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - - if (!test_and_set_bit(txq->id, trans_pcie->queue_stopped)) { + if (!test_and_set_bit(txq->id, trans->txqs.queue_stopped)) { iwl_op_mode_queue_full(trans->op_mode, txq->id); IWL_DEBUG_TX_QUEUES(trans, "Stop hwq %d\n", txq->id); } else diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c index 72d1cf27e6a4..24cb1b1f21f0 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c @@ -1284,7 +1284,7 @@ static void iwl_pcie_rx_handle_rb(struct iwl_trans *trans, int i) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[trans_pcie->cmd_queue]; + struct iwl_txq *txq = trans->txqs.txq[trans->txqs.cmd.q_id]; bool page_stolen = false; int max_len = trans_pcie->rx_buf_bytes; u32 offset = 0; @@ -1671,9 +1671,9 @@ static void iwl_pcie_irq_handle_error(struct iwl_trans *trans) } for (i = 0; i < trans->trans_cfg->base_params->num_of_queues; i++) { - if (!trans_pcie->txq[i]) + if (!trans->txqs.txq[i]) continue; - del_timer(&trans_pcie->txq[i]->stuck_timer); + del_timer(&trans->txqs.txq[i]->stuck_timer); } /* The STATUS_FW_ERROR bit is set in this function. This must happen diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c index 19a2c72081ab..97c9e9c87436 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c @@ -6,7 +6,7 @@ * GPL LICENSE SUMMARY * * Copyright(c) 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2018 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -20,7 +20,7 @@ * BSD LICENSE * * Copyright(c) 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2018 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -245,7 +245,7 @@ static int iwl_pcie_gen2_nic_init(struct iwl_trans *trans) return -ENOMEM; /* Allocate or reset and init all Tx and Command queues */ - if (iwl_pcie_gen2_tx_init(trans, trans_pcie->cmd_queue, queue_size)) + if (iwl_pcie_gen2_tx_init(trans, trans->txqs.cmd.q_id, queue_size)) return -ENOMEM; /* enable shadow regs in HW */ @@ -262,8 +262,9 @@ void iwl_trans_pcie_gen2_fw_alive(struct iwl_trans *trans, u32 scd_addr) iwl_pcie_reset_ict(trans); /* make sure all queue are not stopped/used */ - memset(trans_pcie->queue_stopped, 0, sizeof(trans_pcie->queue_stopped)); - memset(trans_pcie->queue_used, 0, sizeof(trans_pcie->queue_used)); + memset(trans->txqs.queue_stopped, 0, + sizeof(trans->txqs.queue_stopped)); + memset(trans->txqs.queue_used, 0, sizeof(trans->txqs.queue_used)); /* now that we got alive we can free the fw image & the context info. * paging memory cannot be freed included since FW will still use it diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 3bcbc2967c88..e5160d620868 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -1904,9 +1904,9 @@ static void iwl_trans_pcie_configure(struct iwl_trans *trans, { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - trans_pcie->cmd_queue = trans_cfg->cmd_queue; - trans_pcie->cmd_fifo = trans_cfg->cmd_fifo; - trans_pcie->cmd_q_wdg_timeout = trans_cfg->cmd_q_wdg_timeout; + trans->txqs.cmd.q_id = trans_cfg->cmd_queue; + trans->txqs.cmd.fifo = trans_cfg->cmd_fifo; + trans->txqs.cmd.wdg_timeout = trans_cfg->cmd_q_wdg_timeout; if (WARN_ON(trans_cfg->n_no_reclaim_cmds > MAX_NO_RECLAIM_CMDS)) trans_pcie->n_no_reclaim_cmds = 0; else @@ -2199,11 +2199,10 @@ static void iwl_trans_pcie_freeze_txq_timer(struct iwl_trans *trans, unsigned long txqs, bool freeze) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); int queue; for_each_set_bit(queue, &txqs, BITS_PER_LONG) { - struct iwl_txq *txq = trans_pcie->txq[queue]; + struct iwl_txq *txq = trans->txqs.txq[queue]; unsigned long now; spin_lock_bh(&txq->lock); @@ -2251,13 +2250,12 @@ next_queue: static void iwl_trans_pcie_block_txq_ptrs(struct iwl_trans *trans, bool block) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); int i; for (i = 0; i < trans->trans_cfg->base_params->num_of_queues; i++) { - struct iwl_txq *txq = trans_pcie->txq[i]; + struct iwl_txq *txq = trans->txqs.txq[i]; - if (i == trans_pcie->cmd_queue) + if (i == trans->txqs.cmd.q_id) continue; spin_lock_bh(&txq->lock); @@ -2326,7 +2324,6 @@ static int iwl_trans_pcie_rxq_dma_data(struct iwl_trans *trans, int queue, static int iwl_trans_pcie_wait_txq_empty(struct iwl_trans *trans, int txq_idx) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_txq *txq; unsigned long now = jiffies; bool overflow_tx; @@ -2336,11 +2333,11 @@ static int iwl_trans_pcie_wait_txq_empty(struct iwl_trans *trans, int txq_idx) if (test_bit(STATUS_TRANS_DEAD, &trans->status)) return -ENODEV; - if (!test_bit(txq_idx, trans_pcie->queue_used)) + if (!test_bit(txq_idx, trans->txqs.queue_used)) return -EINVAL; IWL_DEBUG_TX_QUEUES(trans, "Emptying queue %d...\n", txq_idx); - txq = trans_pcie->txq[txq_idx]; + txq = trans->txqs.txq[txq_idx]; spin_lock_bh(&txq->lock); overflow_tx = txq->overflow_tx || @@ -2388,7 +2385,6 @@ static int iwl_trans_pcie_wait_txq_empty(struct iwl_trans *trans, int txq_idx) static int iwl_trans_pcie_wait_txqs_empty(struct iwl_trans *trans, u32 txq_bm) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); int cnt; int ret = 0; @@ -2397,9 +2393,9 @@ static int iwl_trans_pcie_wait_txqs_empty(struct iwl_trans *trans, u32 txq_bm) cnt < trans->trans_cfg->base_params->num_of_queues; cnt++) { - if (cnt == trans_pcie->cmd_queue) + if (cnt == trans->txqs.cmd.q_id) continue; - if (!test_bit(cnt, trans_pcie->queue_used)) + if (!test_bit(cnt, trans->txqs.queue_used)) continue; if (!(BIT(cnt) & txq_bm)) continue; @@ -2573,13 +2569,12 @@ static int iwl_dbgfs_tx_queue_seq_show(struct seq_file *seq, void *v) struct iwl_dbgfs_tx_queue_priv *priv = seq->private; struct iwl_dbgfs_tx_queue_state *state = v; struct iwl_trans *trans = priv->trans; - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[state->pos]; + struct iwl_txq *txq = trans->txqs.txq[state->pos]; seq_printf(seq, "hwq %.3u: used=%d stopped=%d ", (unsigned int)state->pos, - !!test_bit(state->pos, trans_pcie->queue_used), - !!test_bit(state->pos, trans_pcie->queue_stopped)); + !!test_bit(state->pos, trans->txqs.queue_used), + !!test_bit(state->pos, trans->txqs.queue_stopped)); if (txq) seq_printf(seq, "read=%u write=%u need_update=%d frozen=%d n_window=%d ampdu=%d", @@ -2589,7 +2584,7 @@ static int iwl_dbgfs_tx_queue_seq_show(struct seq_file *seq, void *v) else seq_puts(seq, "(unallocated)"); - if (state->pos == trans_pcie->cmd_queue) + if (state->pos == trans->txqs.cmd.q_id) seq_puts(seq, " (HCMD)"); seq_puts(seq, "\n"); @@ -3265,7 +3260,7 @@ static struct iwl_trans_dump_data { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_fw_error_dump_data *data; - struct iwl_txq *cmdq = trans_pcie->txq[trans_pcie->cmd_queue]; + struct iwl_txq *cmdq = trans->txqs.txq[trans->txqs.cmd.q_id]; struct iwl_fw_error_dump_txcmd *txcmd; struct iwl_trans_dump_data *dump_data; u32 len, num_rbs = 0, monitor_len = 0; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index bb55563bba68..7fc7542535d8 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -64,7 +64,6 @@ */ void iwl_pcie_gen2_tx_stop(struct iwl_trans *trans) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); int txq_id; /* @@ -72,12 +71,13 @@ void iwl_pcie_gen2_tx_stop(struct iwl_trans *trans) * queues. This happens when we have an rfkill interrupt. * Since we stop Tx altogether - mark the queues as stopped. */ - memset(trans_pcie->queue_stopped, 0, sizeof(trans_pcie->queue_stopped)); - memset(trans_pcie->queue_used, 0, sizeof(trans_pcie->queue_used)); + memset(trans->txqs.queue_stopped, 0, + sizeof(trans->txqs.queue_stopped)); + memset(trans->txqs.queue_used, 0, sizeof(trans->txqs.queue_used)); /* Unmap DMA from host system and free skb's */ - for (txq_id = 0; txq_id < ARRAY_SIZE(trans_pcie->txq); txq_id++) { - if (!trans_pcie->txq[txq_id]) + for (txq_id = 0; txq_id < ARRAY_SIZE(trans->txqs.txq); txq_id++) { + if (!trans->txqs.txq[txq_id]) continue; iwl_pcie_gen2_txq_unmap(trans, txq_id); } @@ -716,7 +716,7 @@ int iwl_trans_pcie_gen2_tx(struct iwl_trans *trans, struct sk_buff *skb, { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_cmd_meta *out_meta; - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; u16 cmd_len; int idx; void *tfd; @@ -725,7 +725,7 @@ int iwl_trans_pcie_gen2_tx(struct iwl_trans *trans, struct sk_buff *skb, "queue %d out of range", txq_id)) return -EINVAL; - if (WARN_ONCE(!test_bit(txq_id, trans_pcie->queue_used), + if (WARN_ONCE(!test_bit(txq_id, trans->txqs.queue_used), "TX on unused queue %d\n", txq_id)) return -EINVAL; @@ -819,7 +819,7 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, struct iwl_host_cmd *cmd) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[trans_pcie->cmd_queue]; + struct iwl_txq *txq = trans->txqs.txq[trans->txqs.cmd.q_id]; struct iwl_device_cmd *out_cmd; struct iwl_cmd_meta *out_meta; unsigned long flags; @@ -931,7 +931,7 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, cpu_to_le16(cmd_size - sizeof(struct iwl_cmd_header_wide)); out_cmd->hdr_wide.reserved = 0; out_cmd->hdr_wide.sequence = - cpu_to_le16(QUEUE_TO_SEQ(trans_pcie->cmd_queue) | + cpu_to_le16(QUEUE_TO_SEQ(trans->txqs.cmd.q_id) | INDEX_TO_SEQ(txq->write_ptr)); cmd_pos = sizeof(struct iwl_cmd_header_wide); @@ -979,7 +979,7 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, "Sending command %s (%.2x.%.2x), seq: 0x%04X, %d bytes at %d[%d]:%d\n", iwl_get_cmd_string(trans, cmd->id), group_id, out_cmd->hdr.cmd, le16_to_cpu(out_cmd->hdr.sequence), - cmd_size, txq->write_ptr, idx, trans_pcie->cmd_queue); + cmd_size, txq->write_ptr, idx, trans->txqs.cmd.q_id); /* start the TFD with the minimum copy bytes */ tb0_size = min_t(int, copy_size, IWL_FIRST_TB_SIZE); @@ -1056,7 +1056,7 @@ static int iwl_pcie_gen2_send_hcmd_sync(struct iwl_trans *trans, { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); const char *cmd_str = iwl_get_cmd_string(trans, cmd->id); - struct iwl_txq *txq = trans_pcie->txq[trans_pcie->cmd_queue]; + struct iwl_txq *txq = trans->txqs.txq[trans->txqs.cmd.q_id]; int cmd_idx; int ret; @@ -1175,14 +1175,14 @@ int iwl_trans_pcie_gen2_send_hcmd(struct iwl_trans *trans, void iwl_pcie_gen2_txq_unmap(struct iwl_trans *trans, int txq_id) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; spin_lock_bh(&txq->lock); while (txq->write_ptr != txq->read_ptr) { IWL_DEBUG_TX_REPLY(trans, "Q %d Free %d\n", txq_id, txq->read_ptr); - if (txq_id != trans_pcie->cmd_queue) { + if (txq_id != trans->txqs.cmd.q_id) { int idx = iwl_pcie_get_cmd_index(txq, txq->read_ptr); struct sk_buff *skb = txq->entries[idx].skb; @@ -1240,7 +1240,6 @@ void iwl_pcie_gen2_txq_free_memory(struct iwl_trans *trans, */ static void iwl_pcie_gen2_txq_free(struct iwl_trans *trans, int txq_id) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_txq *txq; int i; @@ -1248,7 +1247,7 @@ static void iwl_pcie_gen2_txq_free(struct iwl_trans *trans, int txq_id) "queue %d out of range", txq_id)) return; - txq = trans_pcie->txq[txq_id]; + txq = trans->txqs.txq[txq_id]; if (WARN_ON(!txq)) return; @@ -1256,7 +1255,7 @@ static void iwl_pcie_gen2_txq_free(struct iwl_trans *trans, int txq_id) iwl_pcie_gen2_txq_unmap(trans, txq_id); /* De-alloc array of command/tx buffers */ - if (txq_id == trans_pcie->cmd_queue) + if (txq_id == trans->txqs.cmd.q_id) for (i = 0; i < txq->n_window; i++) { kzfree(txq->entries[i].cmd); kzfree(txq->entries[i].free_buf); @@ -1265,9 +1264,9 @@ static void iwl_pcie_gen2_txq_free(struct iwl_trans *trans, int txq_id) iwl_pcie_gen2_txq_free_memory(trans, txq); - trans_pcie->txq[txq_id] = NULL; + trans->txqs.txq[txq_id] = NULL; - clear_bit(txq_id, trans_pcie->queue_used); + clear_bit(txq_id, trans->txqs.queue_used); } int iwl_trans_pcie_dyn_txq_alloc_dma(struct iwl_trans *trans, @@ -1327,7 +1326,6 @@ int iwl_trans_pcie_txq_alloc_response(struct iwl_trans *trans, struct iwl_txq *txq, struct iwl_host_cmd *hcmd) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_tx_queue_cfg_rsp *rsp; int ret, qid; u32 wr_ptr; @@ -1342,20 +1340,20 @@ int iwl_trans_pcie_txq_alloc_response(struct iwl_trans *trans, qid = le16_to_cpu(rsp->queue_number); wr_ptr = le16_to_cpu(rsp->write_pointer); - if (qid >= ARRAY_SIZE(trans_pcie->txq)) { + if (qid >= ARRAY_SIZE(trans->txqs.txq)) { WARN_ONCE(1, "queue index %d unsupported", qid); ret = -EIO; goto error_free_resp; } - if (test_and_set_bit(qid, trans_pcie->queue_used)) { + if (test_and_set_bit(qid, trans->txqs.queue_used)) { WARN_ONCE(1, "queue %d already used", qid); ret = -EIO; goto error_free_resp; } txq->id = qid; - trans_pcie->txq[qid] = txq; + trans->txqs.txq[qid] = txq; wr_ptr &= (trans->trans_cfg->base_params->max_tfd_queue_size - 1); /* Place first TFD at index corresponding to start sequence number */ @@ -1413,8 +1411,6 @@ error: void iwl_trans_pcie_dyn_txq_free(struct iwl_trans *trans, int queue) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - if (WARN(queue >= IWL_MAX_TVQM_QUEUES, "queue %d out of range", queue)) return; @@ -1425,7 +1421,7 @@ void iwl_trans_pcie_dyn_txq_free(struct iwl_trans *trans, int queue) * allow the op_mode to call txq_disable after it already called * stop_device. */ - if (!test_and_clear_bit(queue, trans_pcie->queue_used)) { + if (!test_and_clear_bit(queue, trans->txqs.queue_used)) { WARN_ONCE(test_bit(STATUS_DEVICE_ENABLED, &trans->status), "queue %d not used", queue); return; @@ -1433,22 +1429,21 @@ void iwl_trans_pcie_dyn_txq_free(struct iwl_trans *trans, int queue) iwl_pcie_gen2_txq_unmap(trans, queue); - iwl_pcie_gen2_txq_free_memory(trans, trans_pcie->txq[queue]); - trans_pcie->txq[queue] = NULL; + iwl_pcie_gen2_txq_free_memory(trans, trans->txqs.txq[queue]); + trans->txqs.txq[queue] = NULL; IWL_DEBUG_TX_QUEUES(trans, "Deactivate queue %d\n", queue); } void iwl_pcie_gen2_tx_free(struct iwl_trans *trans) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); int i; - memset(trans_pcie->queue_used, 0, sizeof(trans_pcie->queue_used)); + memset(trans->txqs.queue_used, 0, sizeof(trans->txqs.queue_used)); /* Free all TX queues */ - for (i = 0; i < ARRAY_SIZE(trans_pcie->txq); i++) { - if (!trans_pcie->txq[i]) + for (i = 0; i < ARRAY_SIZE(trans->txqs.txq); i++) { + if (!trans->txqs.txq[i]) continue; iwl_pcie_gen2_txq_free(trans, i); @@ -1457,35 +1452,34 @@ void iwl_pcie_gen2_tx_free(struct iwl_trans *trans) int iwl_pcie_gen2_tx_init(struct iwl_trans *trans, int txq_id, int queue_size) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_txq *queue; int ret; /* alloc and init the tx queue */ - if (!trans_pcie->txq[txq_id]) { + if (!trans->txqs.txq[txq_id]) { queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) { IWL_ERR(trans, "Not enough memory for tx queue\n"); return -ENOMEM; } - trans_pcie->txq[txq_id] = queue; + trans->txqs.txq[txq_id] = queue; ret = iwl_pcie_txq_alloc(trans, queue, queue_size, true); if (ret) { IWL_ERR(trans, "Tx %d queue init failed\n", txq_id); goto error; } } else { - queue = trans_pcie->txq[txq_id]; + queue = trans->txqs.txq[txq_id]; } ret = iwl_pcie_txq_init(trans, queue, queue_size, - (txq_id == trans_pcie->cmd_queue)); + (txq_id == trans->txqs.cmd.q_id)); if (ret) { IWL_ERR(trans, "Tx %d queue alloc failed\n", txq_id); goto error; } - trans_pcie->txq[txq_id]->id = txq_id; - set_bit(txq_id, trans_pcie->queue_used); + trans->txqs.txq[txq_id]->id = txq_id; + set_bit(txq_id, trans->txqs.queue_used); return 0; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index 757cf4e9de33..5c6c3fa0d29f 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -261,7 +261,7 @@ static void iwl_pcie_txq_inval_byte_cnt_tbl(struct iwl_trans *trans, WARN_ON(read_ptr >= TFD_QUEUE_SIZE_MAX); - if (txq_id != trans_pcie->cmd_queue) + if (txq_id != trans->txqs.cmd.q_id) sta_id = tx_cmd->sta_id; bc_ent = cpu_to_le16(1 | (sta_id << 12)); @@ -279,7 +279,6 @@ static void iwl_pcie_txq_inval_byte_cnt_tbl(struct iwl_trans *trans, static void iwl_pcie_txq_inc_wr_ptr(struct iwl_trans *trans, struct iwl_txq *txq) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); u32 reg = 0; int txq_id = txq->id; @@ -292,7 +291,7 @@ static void iwl_pcie_txq_inc_wr_ptr(struct iwl_trans *trans, * 3. there is a chance that the NIC is asleep */ if (!trans->trans_cfg->base_params->shadow_reg_enable && - txq_id != trans_pcie->cmd_queue && + txq_id != trans->txqs.cmd.q_id && test_bit(STATUS_TPOWER_PMI, &trans->status)) { /* * wake up nic if it's powered down ... @@ -323,13 +322,12 @@ static void iwl_pcie_txq_inc_wr_ptr(struct iwl_trans *trans, void iwl_pcie_txq_check_wrptrs(struct iwl_trans *trans) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); int i; for (i = 0; i < trans->trans_cfg->base_params->num_of_queues; i++) { - struct iwl_txq *txq = trans_pcie->txq[i]; + struct iwl_txq *txq = trans->txqs.txq[i]; - if (!test_bit(i, trans_pcie->queue_used)) + if (!test_bit(i, trans->txqs.queue_used)) continue; spin_lock_bh(&txq->lock); @@ -660,14 +658,14 @@ static void iwl_pcie_clear_cmd_in_flight(struct iwl_trans *trans) static void iwl_pcie_txq_unmap(struct iwl_trans *trans, int txq_id) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; spin_lock_bh(&txq->lock); while (txq->write_ptr != txq->read_ptr) { IWL_DEBUG_TX_REPLY(trans, "Q %d Free %d\n", txq_id, txq->read_ptr); - if (txq_id != trans_pcie->cmd_queue) { + if (txq_id != trans->txqs.cmd.q_id) { struct sk_buff *skb = txq->entries[txq->read_ptr].skb; if (WARN_ON_ONCE(!skb)) @@ -682,7 +680,7 @@ static void iwl_pcie_txq_unmap(struct iwl_trans *trans, int txq_id) unsigned long flags; spin_lock_irqsave(&trans_pcie->reg_lock, flags); - if (txq_id == trans_pcie->cmd_queue) + if (txq_id == trans->txqs.cmd.q_id) iwl_pcie_clear_cmd_in_flight(trans); spin_unlock_irqrestore(&trans_pcie->reg_lock, flags); } @@ -711,7 +709,7 @@ static void iwl_pcie_txq_unmap(struct iwl_trans *trans, int txq_id) static void iwl_pcie_txq_free(struct iwl_trans *trans, int txq_id) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; struct device *dev = trans->dev; int i; @@ -721,7 +719,7 @@ static void iwl_pcie_txq_free(struct iwl_trans *trans, int txq_id) iwl_pcie_txq_unmap(trans, txq_id); /* De-alloc array of command/tx buffers */ - if (txq_id == trans_pcie->cmd_queue) + if (txq_id == trans->txqs.cmd.q_id) for (i = 0; i < txq->n_window; i++) { kzfree(txq->entries[i].cmd); kzfree(txq->entries[i].free_buf); @@ -760,8 +758,9 @@ void iwl_pcie_tx_start(struct iwl_trans *trans, u32 scd_base_addr) SCD_CONTEXT_MEM_LOWER_BOUND) / sizeof(u32); /* make sure all queue are not stopped/used */ - memset(trans_pcie->queue_stopped, 0, sizeof(trans_pcie->queue_stopped)); - memset(trans_pcie->queue_used, 0, sizeof(trans_pcie->queue_used)); + memset(trans->txqs.queue_stopped, 0, + sizeof(trans->txqs.queue_stopped)); + memset(trans->txqs.queue_used, 0, sizeof(trans->txqs.queue_used)); trans_pcie->scd_base_addr = iwl_read_prph(trans, SCD_SRAM_BASE_ADDR); @@ -783,9 +782,9 @@ void iwl_pcie_tx_start(struct iwl_trans *trans, u32 scd_base_addr) if (trans->trans_cfg->base_params->scd_chain_ext_wa) iwl_write_prph(trans, SCD_CHAINEXT_EN, 0); - iwl_trans_ac_txq_enable(trans, trans_pcie->cmd_queue, - trans_pcie->cmd_fifo, - trans_pcie->cmd_q_wdg_timeout); + iwl_trans_ac_txq_enable(trans, trans->txqs.cmd.q_id, + trans->txqs.cmd.fifo, + trans->txqs.cmd.wdg_timeout); /* Activate all Tx DMA/FIFO channels */ iwl_scd_activate_fifos(trans); @@ -821,7 +820,7 @@ void iwl_trans_pcie_tx_reset(struct iwl_trans *trans) for (txq_id = 0; txq_id < trans->trans_cfg->base_params->num_of_queues; txq_id++) { - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; if (trans->trans_cfg->use_tfh) iwl_write_direct64(trans, FH_MEM_CBBC_QUEUE(trans, txq_id), @@ -897,8 +896,9 @@ int iwl_pcie_tx_stop(struct iwl_trans *trans) * queues. This happens when we have an rfkill interrupt. * Since we stop Tx altogether - mark the queues as stopped. */ - memset(trans_pcie->queue_stopped, 0, sizeof(trans_pcie->queue_stopped)); - memset(trans_pcie->queue_used, 0, sizeof(trans_pcie->queue_used)); + memset(trans->txqs.queue_stopped, 0, + sizeof(trans->txqs.queue_stopped)); + memset(trans->txqs.queue_used, 0, sizeof(trans->txqs.queue_used)); /* This can happen: start_hw, stop_device */ if (!trans_pcie->txq_memory) @@ -922,7 +922,7 @@ void iwl_pcie_tx_free(struct iwl_trans *trans) int txq_id; struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - memset(trans_pcie->queue_used, 0, sizeof(trans_pcie->queue_used)); + memset(trans->txqs.queue_used, 0, sizeof(trans->txqs.queue_used)); /* Tx queues */ if (trans_pcie->txq_memory) { @@ -930,7 +930,7 @@ void iwl_pcie_tx_free(struct iwl_trans *trans) txq_id < trans->trans_cfg->base_params->num_of_queues; txq_id++) { iwl_pcie_txq_free(trans, txq_id); - trans_pcie->txq[txq_id] = NULL; + trans->txqs.txq[txq_id] = NULL; } } @@ -991,7 +991,7 @@ static int iwl_pcie_tx_alloc(struct iwl_trans *trans) /* Alloc and init all Tx queues, including the command queue (#4/#9) */ for (txq_id = 0; txq_id < trans->trans_cfg->base_params->num_of_queues; txq_id++) { - bool cmd_queue = (txq_id == trans_pcie->cmd_queue); + bool cmd_queue = (txq_id == trans->txqs.cmd.q_id); if (cmd_queue) slots_num = max_t(u32, IWL_CMD_QUEUE_SIZE, @@ -999,14 +999,14 @@ static int iwl_pcie_tx_alloc(struct iwl_trans *trans) else slots_num = max_t(u32, IWL_DEFAULT_QUEUE_SIZE, trans->cfg->min_256_ba_txq_size); - trans_pcie->txq[txq_id] = &trans_pcie->txq_memory[txq_id]; - ret = iwl_pcie_txq_alloc(trans, trans_pcie->txq[txq_id], + trans->txqs.txq[txq_id] = &trans_pcie->txq_memory[txq_id]; + ret = iwl_pcie_txq_alloc(trans, trans->txqs.txq[txq_id], slots_num, cmd_queue); if (ret) { IWL_ERR(trans, "Tx %d queue alloc failed\n", txq_id); goto error; } - trans_pcie->txq[txq_id]->id = txq_id; + trans->txqs.txq[txq_id]->id = txq_id; } return 0; @@ -1045,7 +1045,7 @@ int iwl_pcie_tx_init(struct iwl_trans *trans) /* Alloc and init all Tx queues, including the command queue (#4/#9) */ for (txq_id = 0; txq_id < trans->trans_cfg->base_params->num_of_queues; txq_id++) { - bool cmd_queue = (txq_id == trans_pcie->cmd_queue); + bool cmd_queue = (txq_id == trans->txqs.cmd.q_id); if (cmd_queue) slots_num = max_t(u32, IWL_CMD_QUEUE_SIZE, @@ -1053,7 +1053,7 @@ int iwl_pcie_tx_init(struct iwl_trans *trans) else slots_num = max_t(u32, IWL_DEFAULT_QUEUE_SIZE, trans->cfg->min_256_ba_txq_size); - ret = iwl_pcie_txq_init(trans, trans_pcie->txq[txq_id], + ret = iwl_pcie_txq_init(trans, trans->txqs.txq[txq_id], slots_num, cmd_queue); if (ret) { IWL_ERR(trans, "Tx %d queue init failed\n", txq_id); @@ -1067,7 +1067,7 @@ int iwl_pcie_tx_init(struct iwl_trans *trans) * Circular buffer (TFD queue in DRAM) physical base address */ iwl_write_direct32(trans, FH_MEM_CBBC_QUEUE(trans, txq_id), - trans_pcie->txq[txq_id]->dma_addr >> 8); + trans->txqs.txq[txq_id]->dma_addr >> 8); } iwl_set_bits_prph(trans, SCD_GP_CTRL, SCD_GP_CTRL_AUTO_ACTIVE_MODE); @@ -1112,18 +1112,18 @@ void iwl_trans_pcie_reclaim(struct iwl_trans *trans, int txq_id, int ssn, struct sk_buff_head *skbs) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; int tfd_num = iwl_pcie_get_cmd_index(txq, ssn); int read_ptr = iwl_pcie_get_cmd_index(txq, txq->read_ptr); int last_to_free; /* This function is not meant to release cmd queue*/ - if (WARN_ON(txq_id == trans_pcie->cmd_queue)) + if (WARN_ON(txq_id == trans->txqs.cmd.q_id)) return; spin_lock_bh(&txq->lock); - if (!test_bit(txq_id, trans_pcie->queue_used)) { + if (!test_bit(txq_id, trans->txqs.queue_used)) { IWL_DEBUG_TX_QUEUES(trans, "Q %d inactive - ignoring idx %d\n", txq_id, ssn); goto out; @@ -1175,7 +1175,7 @@ void iwl_trans_pcie_reclaim(struct iwl_trans *trans, int txq_id, int ssn, iwl_pcie_txq_progress(txq); if (iwl_queue_space(trans, txq) > txq->low_mark && - test_bit(txq_id, trans_pcie->queue_stopped)) { + test_bit(txq_id, trans->txqs.queue_stopped)) { struct sk_buff_head overflow_skbs; __skb_queue_head_init(&overflow_skbs); @@ -1228,8 +1228,7 @@ out: /* Set wr_ptr of specific device and txq */ void iwl_trans_pcie_set_q_ptrs(struct iwl_trans *trans, int txq_id, int ptr) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; spin_lock_bh(&txq->lock); @@ -1289,7 +1288,7 @@ static int iwl_pcie_set_cmd_in_flight(struct iwl_trans *trans, static void iwl_pcie_cmdq_reclaim(struct iwl_trans *trans, int txq_id, int idx) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; unsigned long flags; int nfreed = 0; u16 r; @@ -1301,7 +1300,7 @@ static void iwl_pcie_cmdq_reclaim(struct iwl_trans *trans, int txq_id, int idx) if (idx >= trans->trans_cfg->base_params->max_tfd_queue_size || (!iwl_queue_used(txq, idx))) { - WARN_ONCE(test_bit(txq_id, trans_pcie->queue_used), + WARN_ONCE(test_bit(txq_id, trans->txqs.queue_used), "%s: Read index for DMA queue txq id (%d), index %d is out of range [0-%d] %d %d.\n", __func__, txq_id, idx, trans->trans_cfg->base_params->max_tfd_queue_size, @@ -1363,11 +1362,11 @@ bool iwl_trans_pcie_txq_enable(struct iwl_trans *trans, int txq_id, u16 ssn, unsigned int wdg_timeout) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; int fifo = -1; bool scd_bug = false; - if (test_and_set_bit(txq_id, trans_pcie->queue_used)) + if (test_and_set_bit(txq_id, trans->txqs.queue_used)) WARN_ONCE(1, "queue %d already used - expect issues", txq_id); txq->wd_timeout = msecs_to_jiffies(wdg_timeout); @@ -1376,7 +1375,7 @@ bool iwl_trans_pcie_txq_enable(struct iwl_trans *trans, int txq_id, u16 ssn, fifo = cfg->fifo; /* Disable the scheduler prior configuring the cmd queue */ - if (txq_id == trans_pcie->cmd_queue && + if (txq_id == trans->txqs.cmd.q_id && trans_pcie->scd_set_active) iwl_scd_enable_set_active(trans, 0); @@ -1384,7 +1383,7 @@ bool iwl_trans_pcie_txq_enable(struct iwl_trans *trans, int txq_id, u16 ssn, iwl_scd_txq_set_inactive(trans, txq_id); /* Set this queue as a chain-building queue unless it is CMD */ - if (txq_id != trans_pcie->cmd_queue) + if (txq_id != trans->txqs.cmd.q_id) iwl_scd_txq_set_chain(trans, txq_id); if (cfg->aggregate) { @@ -1454,7 +1453,7 @@ bool iwl_trans_pcie_txq_enable(struct iwl_trans *trans, int txq_id, u16 ssn, SCD_QUEUE_STTS_REG_MSK); /* enable the scheduler for this queue (only) */ - if (txq_id == trans_pcie->cmd_queue && + if (txq_id == trans->txqs.cmd.q_id && trans_pcie->scd_set_active) iwl_scd_enable_set_active(trans, BIT(txq_id)); @@ -1473,8 +1472,7 @@ bool iwl_trans_pcie_txq_enable(struct iwl_trans *trans, int txq_id, u16 ssn, void iwl_trans_pcie_txq_set_shared_mode(struct iwl_trans *trans, u32 txq_id, bool shared_mode) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[txq_id]; + struct iwl_txq *txq = trans->txqs.txq[txq_id]; txq->ampdu = !shared_mode; } @@ -1487,8 +1485,8 @@ void iwl_trans_pcie_txq_disable(struct iwl_trans *trans, int txq_id, SCD_TX_STTS_QUEUE_OFFSET(txq_id); static const u32 zero_val[4] = {}; - trans_pcie->txq[txq_id]->frozen_expiry_remainder = 0; - trans_pcie->txq[txq_id]->frozen = false; + trans->txqs.txq[txq_id]->frozen_expiry_remainder = 0; + trans->txqs.txq[txq_id]->frozen = false; /* * Upon HW Rfkill - we stop the device, and then stop the queues @@ -1496,7 +1494,7 @@ void iwl_trans_pcie_txq_disable(struct iwl_trans *trans, int txq_id, * allow the op_mode to call txq_disable after it already called * stop_device. */ - if (!test_and_clear_bit(txq_id, trans_pcie->queue_used)) { + if (!test_and_clear_bit(txq_id, trans->txqs.queue_used)) { WARN_ONCE(test_bit(STATUS_DEVICE_ENABLED, &trans->status), "queue %d not used", txq_id); return; @@ -1510,7 +1508,7 @@ void iwl_trans_pcie_txq_disable(struct iwl_trans *trans, int txq_id, } iwl_pcie_txq_unmap(trans, txq_id); - trans_pcie->txq[txq_id]->ampdu = false; + trans->txqs.txq[txq_id]->ampdu = false; IWL_DEBUG_TX_QUEUES(trans, "Deactivate queue %d\n", txq_id); } @@ -1530,7 +1528,7 @@ static int iwl_pcie_enqueue_hcmd(struct iwl_trans *trans, struct iwl_host_cmd *cmd) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[trans_pcie->cmd_queue]; + struct iwl_txq *txq = trans->txqs.txq[trans->txqs.cmd.q_id]; struct iwl_device_cmd *out_cmd; struct iwl_cmd_meta *out_meta; unsigned long flags; @@ -1656,7 +1654,7 @@ static int iwl_pcie_enqueue_hcmd(struct iwl_trans *trans, sizeof(struct iwl_cmd_header_wide)); out_cmd->hdr_wide.reserved = 0; out_cmd->hdr_wide.sequence = - cpu_to_le16(QUEUE_TO_SEQ(trans_pcie->cmd_queue) | + cpu_to_le16(QUEUE_TO_SEQ(trans->txqs.cmd.q_id) | INDEX_TO_SEQ(txq->write_ptr)); cmd_pos = sizeof(struct iwl_cmd_header_wide); @@ -1664,7 +1662,7 @@ static int iwl_pcie_enqueue_hcmd(struct iwl_trans *trans, } else { out_cmd->hdr.cmd = iwl_cmd_opcode(cmd->id); out_cmd->hdr.sequence = - cpu_to_le16(QUEUE_TO_SEQ(trans_pcie->cmd_queue) | + cpu_to_le16(QUEUE_TO_SEQ(trans->txqs.cmd.q_id) | INDEX_TO_SEQ(txq->write_ptr)); out_cmd->hdr.group_id = 0; @@ -1715,7 +1713,7 @@ static int iwl_pcie_enqueue_hcmd(struct iwl_trans *trans, iwl_get_cmd_string(trans, cmd->id), group_id, out_cmd->hdr.cmd, le16_to_cpu(out_cmd->hdr.sequence), - cmd_size, txq->write_ptr, idx, trans_pcie->cmd_queue); + cmd_size, txq->write_ptr, idx, trans->txqs.cmd.q_id); /* start the TFD with the minimum copy bytes */ tb0_size = min_t(int, copy_size, IWL_FIRST_TB_SIZE); @@ -1815,14 +1813,14 @@ void iwl_pcie_hcmd_complete(struct iwl_trans *trans, struct iwl_device_cmd *cmd; struct iwl_cmd_meta *meta; struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[trans_pcie->cmd_queue]; + struct iwl_txq *txq = trans->txqs.txq[trans->txqs.cmd.q_id]; /* If a Tx command is being handled and it isn't in the actual * command queue then there a command routing bug has been introduced * in the queue management code. */ - if (WARN(txq_id != trans_pcie->cmd_queue, + if (WARN(txq_id != trans->txqs.cmd.q_id, "wrong command queue %d (should be %d), sequence 0x%X readp=%d writep=%d\n", - txq_id, trans_pcie->cmd_queue, sequence, txq->read_ptr, + txq_id, trans->txqs.cmd.q_id, sequence, txq->read_ptr, txq->write_ptr)) { iwl_print_hex_error(trans, pkt, 32); return; @@ -1894,7 +1892,7 @@ static int iwl_pcie_send_hcmd_sync(struct iwl_trans *trans, struct iwl_host_cmd *cmd) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - struct iwl_txq *txq = trans_pcie->txq[trans_pcie->cmd_queue]; + struct iwl_txq *txq = trans->txqs.txq[trans->txqs.cmd.q_id]; int cmd_idx; int ret; @@ -2332,9 +2330,9 @@ int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb, u16 wifi_seq; bool amsdu; - txq = trans_pcie->txq[txq_id]; + txq = trans->txqs.txq[txq_id]; - if (WARN_ONCE(!test_bit(txq_id, trans_pcie->queue_used), + if (WARN_ONCE(!test_bit(txq_id, trans->txqs.queue_used), "TX on unused queue %d\n", txq_id)) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From f327236df2afc8c3c711e7e070f122c26974f4da Mon Sep 17 00:00:00 2001 From: Sharon Date: Fri, 29 May 2020 09:39:29 +0300 Subject: iwlwifi: mvm: fix aux station leak When mvm is initialized we alloc aux station with aux queue. We later free the station memory when driver is stopped, but we never free the queue's memory, which casues a leak. Add a proper de-initialization of the station. Signed-off-by: Sharon Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.0121c5be55e9.Id7516fbb3482131d0c9dfb51ff20b226617ddb49@changeid --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 5 ++--- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 18 +++++++++++++----- drivers/net/wireless/intel/iwlwifi/mvm/sta.h | 6 +++--- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 10df77ab1a77..77916231ff7d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1208,14 +1208,13 @@ void __iwl_mvm_mac_stop(struct iwl_mvm *mvm) */ flush_work(&mvm->roc_done_wk); + iwl_mvm_rm_aux_sta(mvm); + iwl_mvm_stop_device(mvm); iwl_mvm_async_handlers_purge(mvm); /* async_handlers_list is empty and will stay empty: HW is stopped */ - /* the fw is stopped, the aux sta is dead: clean up driver state */ - iwl_mvm_del_aux_sta(mvm); - /* * Clear IN_HW_RESTART and HW_RESTART_REQUESTED flag when stopping the * hw (as restart_complete() won't be called in this case) and mac80211 diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 44d4720b7629..fee01cbbd3ac 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -2093,16 +2093,24 @@ int iwl_mvm_rm_snif_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif) return ret; } -void iwl_mvm_dealloc_snif_sta(struct iwl_mvm *mvm) +int iwl_mvm_rm_aux_sta(struct iwl_mvm *mvm) { - iwl_mvm_dealloc_int_sta(mvm, &mvm->snif_sta); -} + int ret; -void iwl_mvm_del_aux_sta(struct iwl_mvm *mvm) -{ lockdep_assert_held(&mvm->mutex); + iwl_mvm_disable_txq(mvm, NULL, mvm->aux_queue, IWL_MAX_TID_COUNT, 0); + ret = iwl_mvm_rm_sta_common(mvm, mvm->aux_sta.sta_id); + if (ret) + IWL_WARN(mvm, "Failed sending remove station\n"); iwl_mvm_dealloc_int_sta(mvm, &mvm->aux_sta); + + return ret; +} + +void iwl_mvm_dealloc_snif_sta(struct iwl_mvm *mvm) +{ + iwl_mvm_dealloc_int_sta(mvm, &mvm->snif_sta); } /* diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h index 8d70093847cb..da2d1ac01229 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h @@ -8,7 +8,7 @@ * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2018 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -31,7 +31,7 @@ * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2018 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -541,7 +541,7 @@ int iwl_mvm_sta_tx_agg(struct iwl_mvm *mvm, struct ieee80211_sta *sta, int tid, u8 queue, bool start); int iwl_mvm_add_aux_sta(struct iwl_mvm *mvm); -void iwl_mvm_del_aux_sta(struct iwl_mvm *mvm); +int iwl_mvm_rm_aux_sta(struct iwl_mvm *mvm); int iwl_mvm_alloc_bcast_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif); int iwl_mvm_send_add_bcast_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif); -- cgit v1.2.3-59-g8ed1b From 018971b11ab407c8d48c075ad38d2917587e97ab Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 29 May 2020 09:39:30 +0300 Subject: iwlwifi: mvm: add support for range request version 10 Range request version 10 keeps the same command size as version 9 but uses 2 reserved fields for the responder beacon interval and station id (if exists). For now, since the beacon interval of unassoc APs is unknown, use a value of 100 TUs which is a common value for many APs. While at it, remove the definition for CCMP_256 cipher, since this is not supported. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.b7ccdad0805f.I59ea7f773caed85a66c61401066ae169008442e6@changeid --- .../net/wireless/intel/iwlwifi/fw/api/location.h | 14 ++++---- .../net/wireless/intel/iwlwifi/mvm/ftm-initiator.c | 42 +++++++++++++++++++--- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/location.h b/drivers/net/wireless/intel/iwlwifi/fw/api/location.h index 7ffad19d80fd..1df2e497fabf 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/api/location.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/api/location.h @@ -550,13 +550,11 @@ struct iwl_tof_range_req_ap_entry_v4 { /** * enum iwl_location_cipher - location cipher selection * @IWL_LOCATION_CIPHER_CCMP_128: CCMP 128 - * @IWL_LOCATION_CIPHER_CCMP_256: CCMP 256 * @IWL_LOCATION_CIPHER_GCMP_128: GCMP 128 * @IWL_LOCATION_CIPHER_GCMP_256: GCMP 256 */ enum iwl_location_cipher { IWL_LOCATION_CIPHER_CCMP_128, - IWL_LOCATION_CIPHER_CCMP_256, IWL_LOCATION_CIPHER_GCMP_128, IWL_LOCATION_CIPHER_GCMP_256, }; @@ -577,7 +575,8 @@ enum iwl_location_cipher { * @samples_per_burst: the number of FTMs pairs in single Burst (1-31); * @num_of_bursts: Recommended value to be sent to the AP. 2s Exponent of * the number of measurement iterations (min 2^0 = 1, max 2^14) - * @reserved: For alignment and future use + * @sta_id: the station id of the AP. Only relevant when associated to the AP, + * otherwise should be set to &IWL_MVM_INVALID_STA. * @cipher: pairwise cipher suite for secured measurement. * &enum iwl_location_cipher. * @hltk: HLTK to be used for secured 11az measurement @@ -586,7 +585,8 @@ enum iwl_location_cipher { * If &IWL_INITIATOR_AP_FLAGS_USE_CALIB is set, the fw will use the * calibration value that corresponds to the rx bandwidth of the FTM * frame. - * @reserved2: For alignment and future use. + * @beacon_interval: beacon interval of the AP in TUs. Only required if + * &IWL_INITIATOR_AP_FLAGS_TB is set. */ struct iwl_tof_range_req_ap_entry { __le32 initiator_ap_flags; @@ -598,13 +598,13 @@ struct iwl_tof_range_req_ap_entry { __le16 burst_period; u8 samples_per_burst; u8 num_of_bursts; - u8 reserved; + u8 sta_id; u8 cipher; u8 hltk[HLTK_11AZ_LEN]; u8 tk[TK_11AZ_LEN]; __le16 calib[IWL_TOF_BW_NUM]; - __le16 reserved2; -} __packed; /* LOCATION_RANGE_REQ_AP_ENTRY_CMD_API_S_VER_5 */ + __le16 beacon_interval; +} __packed; /* LOCATION_RANGE_REQ_AP_ENTRY_CMD_API_S_VER_6 */ /** * enum iwl_tof_response_mode diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c index aaa7dd1788b1..5ca45915cf7c 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c @@ -391,9 +391,27 @@ iwl_mvm_ftm_put_target_v3(struct iwl_mvm *mvm, } static int -iwl_mvm_ftm_put_target(struct iwl_mvm *mvm, +iwl_mvm_ftm_put_target_v4(struct iwl_mvm *mvm, + struct cfg80211_pmsr_request_peer *peer, + struct iwl_tof_range_req_ap_entry_v4 *target) +{ + int ret; + + ret = iwl_mvm_ftm_target_chandef_v2(mvm, peer, &target->channel_num, + &target->format_bw, + &target->ctrl_ch_position); + if (ret) + return ret; + + iwl_mvm_ftm_put_target_common(mvm, peer, (void *)target); + + return 0; +} + +static int +iwl_mvm_ftm_put_target(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct cfg80211_pmsr_request_peer *peer, - struct iwl_tof_range_req_ap_entry_v4 *target) + struct iwl_tof_range_req_ap_entry *target) { int ret; @@ -405,6 +423,20 @@ iwl_mvm_ftm_put_target(struct iwl_mvm *mvm, iwl_mvm_ftm_put_target_common(mvm, peer, (void *)target); + if (vif->bss_conf.assoc && + !memcmp(peer->addr, vif->bss_conf.bssid, ETH_ALEN)) { + struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif); + + target->sta_id = mvmvif->ap_sta_id; + } else { + target->sta_id = IWL_MVM_INVALID_STA; + } + + /* + * TODO: Beacon interval is currently unknown, so use the common value + * of 100 TUs. + */ + target->beacon_interval = cpu_to_le16(100); return 0; } @@ -496,7 +528,7 @@ static int iwl_mvm_ftm_start_v8(struct iwl_mvm *mvm, struct ieee80211_vif *vif, for (i = 0; i < cmd.num_of_ap; i++) { struct cfg80211_pmsr_request_peer *peer = &req->peers[i]; - err = iwl_mvm_ftm_put_target(mvm, peer, &cmd.ap[i]); + err = iwl_mvm_ftm_put_target_v4(mvm, peer, &cmd.ap[i]); if (err) return err; } @@ -521,8 +553,9 @@ static int iwl_mvm_ftm_start_v9(struct iwl_mvm *mvm, struct ieee80211_vif *vif, for (i = 0; i < cmd.num_of_ap; i++) { struct cfg80211_pmsr_request_peer *peer = &req->peers[i]; + struct iwl_tof_range_req_ap_entry *target = &cmd.ap[i]; - err = iwl_mvm_ftm_put_target(mvm, peer, (void *)&cmd.ap[i]); + err = iwl_mvm_ftm_put_target(mvm, vif, peer, target); if (err) return err; } @@ -548,6 +581,7 @@ int iwl_mvm_ftm_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif, switch (cmd_ver) { case 9: + case 10: err = iwl_mvm_ftm_start_v9(mvm, vif, req); break; case 8: -- cgit v1.2.3-59-g8ed1b From e6d4318c049574dcfa040725903add2790cfbd7b Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 29 May 2020 09:39:31 +0300 Subject: iwlwifi: bump FW API to 56 for AX devices Start supporting API version 56 for AX devices. Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200529092401.aabbc5b472ee.I88cb2c3d2d07e62eac3671335ff1fb80b73c5839@changeid --- drivers/net/wireless/intel/iwlwifi/cfg/22000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c index 1daa653bcb99..efe427049a6e 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c @@ -57,7 +57,7 @@ #include "iwl-prph.h" /* Highest firmware API version supported */ -#define IWL_22000_UCODE_API_MAX 55 +#define IWL_22000_UCODE_API_MAX 56 /* Lowest firmware API version supported */ #define IWL_22000_UCODE_API_MIN 39 -- cgit v1.2.3-59-g8ed1b From feac90d756c03b03b83fabe83571bd88ecc96b78 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 29 May 2020 04:31:07 +0800 Subject: Bluetooth: hci_qca: Fix suspend/resume functionality failure @dev parameter of qca_suspend()/qca_resume() represents serdev_device, but it is mistook for hci_dev and causes succedent unexpected memory access. Fix by taking @dev as serdev_device. Fixes: 41d5b25fed0 ("Bluetooth: hci_qca: add PM support") Signed-off-by: Zijun Hu Reviewed-by: Matthias Kaehlcke Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_qca.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index e4a68238fcb9..adcbe00a2275 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1977,8 +1977,9 @@ static void qca_serdev_remove(struct serdev_device *serdev) static int __maybe_unused qca_suspend(struct device *dev) { - struct hci_dev *hdev = container_of(dev, struct hci_dev, dev); - struct hci_uart *hu = hci_get_drvdata(hdev); + struct serdev_device *serdev = to_serdev_device(dev); + struct qca_serdev *qcadev = serdev_device_get_drvdata(serdev); + struct hci_uart *hu = &qcadev->serdev_hu; struct qca_data *qca = hu->priv; unsigned long flags; int ret = 0; @@ -2057,8 +2058,9 @@ error: static int __maybe_unused qca_resume(struct device *dev) { - struct hci_dev *hdev = container_of(dev, struct hci_dev, dev); - struct hci_uart *hu = hci_get_drvdata(hdev); + struct serdev_device *serdev = to_serdev_device(dev); + struct qca_serdev *qcadev = serdev_device_get_drvdata(serdev); + struct hci_uart *hu = &qcadev->serdev_hu; struct qca_data *qca = hu->priv; clear_bit(QCA_SUSPENDING, &qca->flags); -- cgit v1.2.3-59-g8ed1b From 4803c54ca24923a30664bea2a7772db6e7303c51 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Fri, 29 May 2020 10:27:26 +0800 Subject: Bluetooth: btmtkuart: Improve exception handling in btmtuart_probe() Calls of the functions clk_disable_unprepare() and hci_free_dev() were missing for the exception handling. Thus add the missed function calls together with corresponding jump targets. Fixes: 055825614c6b ("Bluetooth: btmtkuart: add an implementation for clock osc property") Signed-off-by: Chuhong Yuan Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btmtkuart.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/bluetooth/btmtkuart.c b/drivers/bluetooth/btmtkuart.c index e11169ad8247..8a81fbca5c9d 100644 --- a/drivers/bluetooth/btmtkuart.c +++ b/drivers/bluetooth/btmtkuart.c @@ -1015,7 +1015,7 @@ static int btmtkuart_probe(struct serdev_device *serdev) if (btmtkuart_is_standalone(bdev)) { err = clk_prepare_enable(bdev->osc); if (err < 0) - return err; + goto err_hci_free_dev; if (bdev->boot) { gpiod_set_value_cansleep(bdev->boot, 1); @@ -1028,10 +1028,8 @@ static int btmtkuart_probe(struct serdev_device *serdev) /* Power on */ err = regulator_enable(bdev->vcc); - if (err < 0) { - clk_disable_unprepare(bdev->osc); - return err; - } + if (err < 0) + goto err_clk_disable_unprepare; /* Reset if the reset-gpios is available otherwise the board * -level design should be guaranteed. @@ -1063,7 +1061,6 @@ static int btmtkuart_probe(struct serdev_device *serdev) err = hci_register_dev(hdev); if (err < 0) { dev_err(&serdev->dev, "Can't register HCI device\n"); - hci_free_dev(hdev); goto err_regulator_disable; } @@ -1072,6 +1069,11 @@ static int btmtkuart_probe(struct serdev_device *serdev) err_regulator_disable: if (btmtkuart_is_standalone(bdev)) regulator_disable(bdev->vcc); +err_clk_disable_unprepare: + if (btmtkuart_is_standalone(bdev)) + clk_disable_unprepare(bdev->osc); +err_hci_free_dev: + hci_free_dev(hdev); return err; } -- cgit v1.2.3-59-g8ed1b From e6da0edc24eecef2f6964d92fa9044e1821deace Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 28 May 2020 14:35:12 +0200 Subject: Bluetooth: Acquire sk_lock.slock without disabling interrupts There was a lockdep which led to commit fad003b6c8e3d ("Bluetooth: Fix inconsistent lock state with RFCOMM") Lockdep noticed that `sk->sk_lock.slock' was acquired without disabling the softirq while the lock was also used in softirq context. Unfortunately the solution back then was to disable interrupts before acquiring the lock which however made lockdep happy. It would have been enough to simply disable the softirq. Disabling interrupts before acquiring a spinlock_t is not allowed on PREEMPT_RT because these locks are converted to 'sleeping' spinlocks. Use spin_lock_bh() in order to acquire the `sk_lock.slock'. Reported-by: Luis Claudio R. Goncalves Reported-by: kbuild test robot [missing unlock] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Marcel Holtmann --- net/bluetooth/rfcomm/sock.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b4eaf21360ef..df14eebe80da 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -64,15 +64,13 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) { struct sock *sk = d->owner, *parent; - unsigned long flags; if (!sk) return; BT_DBG("dlc %p state %ld err %d", d, d->state, err); - local_irq_save(flags); - bh_lock_sock(sk); + spin_lock_bh(&sk->sk_lock.slock); if (err) sk->sk_err = err; @@ -93,8 +91,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) sk->sk_state_change(sk); } - bh_unlock_sock(sk); - local_irq_restore(flags); + spin_unlock_bh(&sk->sk_lock.slock); if (parent && sock_flag(sk, SOCK_ZAPPED)) { /* We have to drop DLC lock here, otherwise -- cgit v1.2.3-59-g8ed1b From 7e7bbddd029b644f00f0ffbfbc485ed71977d0d5 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 29 May 2020 21:56:57 +0800 Subject: Bluetooth: hci_qca: Fix qca6390 enable failure after warm reboot Warm reboot can not reset controller qca6390 due to lack of controllable power supply, so causes firmware download failure during enable. Fixed by sending VSC EDL_SOC_RESET to reset qca6390 within added device shutdown implementation. Signed-off-by: Zijun Hu Tested-by: Zijun Hu Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_qca.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index adcbe00a2275..aa957d749d6f 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1975,6 +1975,38 @@ static void qca_serdev_remove(struct serdev_device *serdev) hci_uart_unregister_device(&qcadev->serdev_hu); } +static void qca_serdev_shutdown(struct device *dev) +{ + int ret; + int timeout = msecs_to_jiffies(CMD_TRANS_TIMEOUT_MS); + struct serdev_device *serdev = to_serdev_device(dev); + struct qca_serdev *qcadev = serdev_device_get_drvdata(serdev); + const u8 ibs_wake_cmd[] = { 0xFD }; + const u8 edl_reset_soc_cmd[] = { 0x01, 0x00, 0xFC, 0x01, 0x05 }; + + if (qcadev->btsoc_type == QCA_QCA6390) { + serdev_device_write_flush(serdev); + ret = serdev_device_write_buf(serdev, ibs_wake_cmd, + sizeof(ibs_wake_cmd)); + if (ret < 0) { + BT_ERR("QCA send IBS_WAKE_IND error: %d", ret); + return; + } + serdev_device_wait_until_sent(serdev, timeout); + usleep_range(8000, 10000); + + serdev_device_write_flush(serdev); + ret = serdev_device_write_buf(serdev, edl_reset_soc_cmd, + sizeof(edl_reset_soc_cmd)); + if (ret < 0) { + BT_ERR("QCA send EDL_RESET_REQ error: %d", ret); + return; + } + serdev_device_wait_until_sent(serdev, timeout); + usleep_range(8000, 10000); + } +} + static int __maybe_unused qca_suspend(struct device *dev) { struct serdev_device *serdev = to_serdev_device(dev); @@ -2102,6 +2134,7 @@ static struct serdev_device_driver qca_serdev_driver = { .name = "hci_uart_qca", .of_match_table = of_match_ptr(qca_bluetooth_of_match), .acpi_match_table = ACPI_PTR(qca_bluetooth_acpi_match), + .shutdown = qca_serdev_shutdown, .pm = &qca_pm_ops, }, }; -- cgit v1.2.3-59-g8ed1b From b6b15e20421fefae9f78274f9fef80bc97bf5d5c Mon Sep 17 00:00:00 2001 From: Rui Salvaterra Date: Mon, 25 May 2020 14:49:07 +0100 Subject: rt2800: enable MFP support unconditionally This gives us WPA3 support out of the box without having to manually disable hardware crypto. The driver will fall back to software crypto if the connection requires management frame protection. Suggested-by: Stanislaw Gruszka Signed-off-by: Rui Salvaterra Acked-by: Stanislaw Gruszka Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200525134906.1672-1-rsalvaterra@gmail.com --- drivers/net/wireless/ralink/rt2x00/rt2800lib.c | 4 +--- drivers/net/wireless/ralink/rt2x00/rt2x00mac.c | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c index 6beac1f74e7c..a779fe771a55 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c @@ -9971,9 +9971,7 @@ static int rt2800_probe_hw_mode(struct rt2x00_dev *rt2x00dev) if (!rt2x00_is_usb(rt2x00dev)) ieee80211_hw_set(rt2x00dev->hw, HOST_BROADCAST_PS_BUFFERING); - /* Set MFP if HW crypto is disabled. */ - if (rt2800_hwcrypt_disabled(rt2x00dev)) - ieee80211_hw_set(rt2x00dev->hw, MFP_CAPABLE); + ieee80211_hw_set(rt2x00dev->hw, MFP_CAPABLE); SET_IEEE80211_DEV(rt2x00dev->hw, rt2x00dev->dev); SET_IEEE80211_PERM_ADDR(rt2x00dev->hw, diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c index 32efbc8e9f92..2f68a31072ae 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c @@ -468,7 +468,8 @@ int rt2x00mac_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, if (!test_bit(DEVICE_STATE_PRESENT, &rt2x00dev->flags)) return 0; - if (!rt2x00_has_cap_hw_crypto(rt2x00dev)) + /* The hardware can't do MFP */ + if (!rt2x00_has_cap_hw_crypto(rt2x00dev) || (sta && sta->mfp)) return -EOPNOTSUPP; /* -- cgit v1.2.3-59-g8ed1b From 3a8855d8cfcb944032ce0eecba05b2d0e93f4fb1 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Wed, 20 May 2020 16:08:00 +0300 Subject: MAINTAINERS: update qtnfmac maintainers I am leaving Quantenna, so I will no longer have access to firmware and hardware. Meanwhile I plan to participate in reviewing qtnfmac patches for a while until my firmware knowledge becomes completely obsolete. Adding myself as a reviewer using my personal email address. Signed-off-by: Sergey Matyukevich Signed-off-by: Igor Mitsyanko Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200520130800.1902-1-sergey.matyukevich.os@quantenna.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 9f338ed0d9ab..5d81c002232a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14066,7 +14066,7 @@ F: drivers/net/wireless/ath/wcn36xx/ QUANTENNA QTNFMAC WIRELESS DRIVER M: Igor Mitsyanko -M: Sergey Matyukevich +R: Sergey Matyukevich L: linux-wireless@vger.kernel.org S: Maintained F: drivers/net/wireless/quantenna -- cgit v1.2.3-59-g8ed1b From dba5a189bf6169dc3e7462df17fb2aee5ca37e90 Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Wed, 20 May 2020 13:53:50 +0800 Subject: Revert "rtw88: no need to set registers for SDIO" This reverts commit 07d0f5534935e2daf63a4e1012af13d68e089fed. For rtw88 driver, the SDIO is going to be supported, so there is no need to remove the SDIO related power sequence settings. And while the power sequence parser will pass in the mask of the HCI, the SDIO part will not be used to set registers accordingly. Moreover, the power sequence table is released as a whole package, so the next time if we are going to update, the SDIO settings will be overwritten. So, revert this now. Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200520055350.23328-1-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/rtw8723d.c | 40 +++++++++++ drivers/net/wireless/realtek/rtw88/rtw8822b.c | 95 +++++++++++++++++++++++++++ drivers/net/wireless/realtek/rtw88/rtw8822c.c | 20 ++++++ 3 files changed, 155 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 8641ea645c4b..7422baf2d41b 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -2093,6 +2093,16 @@ static const struct rtw_pwr_seq_cmd trans_carddis_to_cardemu_8723d[] = { RTW_PWR_INTF_ALL_MSK, RTW_PWR_ADDR_MAC, RTW_PWR_CMD_WRITE, BIT(3) | BIT(7), 0}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_POLLING, BIT(1), BIT(1)}, {0x004A, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_USB_MSK, @@ -2103,6 +2113,11 @@ static const struct rtw_pwr_seq_cmd trans_carddis_to_cardemu_8723d[] = { RTW_PWR_INTF_ALL_MSK, RTW_PWR_ADDR_MAC, RTW_PWR_CMD_WRITE, BIT(3) | BIT(4), 0}, + {0x0023, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(4), 0}, {0x0301, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_PCI_MSK, @@ -2310,6 +2325,11 @@ static const struct rtw_pwr_seq_cmd trans_act_to_lps_8723d[] = { RTW_PWR_INTF_ALL_MSK, RTW_PWR_ADDR_MAC, RTW_PWR_CMD_WRITE, BIT(1), 0}, + {0x0093, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x00}, {0x0553, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_ALL_MSK, @@ -2389,6 +2409,11 @@ static const struct rtw_pwr_seq_cmd trans_act_to_cardemu_8723d[] = { }; static const struct rtw_pwr_seq_cmd trans_cardemu_to_carddis_8723d[] = { + {0x0007, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, 0xFF, 0x20}, {0x0005, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, @@ -2409,6 +2434,21 @@ static const struct rtw_pwr_seq_cmd trans_cardemu_to_carddis_8723d[] = { RTW_PWR_INTF_USB_MSK, RTW_PWR_ADDR_MAC, RTW_PWR_CMD_WRITE, BIT(0), 1}, + {0x0023, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(4), BIT(4)}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_POLLING, BIT(1), 0}, {0xFFFF, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_ALL_MSK, diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822b.c b/drivers/net/wireless/realtek/rtw88/rtw8822b.c index 6abcdf4070a2..e49bdd76ab9a 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822b.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822b.c @@ -1551,6 +1551,16 @@ static void rtw8822b_bf_config_bfee(struct rtw_dev *rtwdev, struct rtw_vif *vif, } static const struct rtw_pwr_seq_cmd trans_carddis_to_cardemu_8822b[] = { + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_POLLING, BIT(1), BIT(1)}, {0x004A, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_USB_MSK, @@ -1679,6 +1689,11 @@ static const struct rtw_pwr_seq_cmd trans_cardemu_to_act_8822b[] = { RTW_PWR_INTF_ALL_MSK, RTW_PWR_ADDR_MAC, RTW_PWR_CMD_WRITE, 0xFF, 0x0c}, + {0x0068, + RTW_PWR_CUT_C_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(4), BIT(4)}, {0x0029, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_ALL_MSK, @@ -1707,6 +1722,11 @@ static const struct rtw_pwr_seq_cmd trans_cardemu_to_act_8822b[] = { }; static const struct rtw_pwr_seq_cmd trans_act_to_cardemu_8822b[] = { + {0x0003, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(2), 0}, {0x0093, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_ALL_MSK, @@ -1775,6 +1795,11 @@ static const struct rtw_pwr_seq_cmd trans_act_to_cardemu_8822b[] = { }; static const struct rtw_pwr_seq_cmd trans_cardemu_to_carddis_8822b[] = { + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(7), BIT(7)}, {0x0007, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, @@ -1795,6 +1820,46 @@ static const struct rtw_pwr_seq_cmd trans_cardemu_to_carddis_8822b[] = { RTW_PWR_INTF_USB_MSK, RTW_PWR_ADDR_MAC, RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0067, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(5), 0}, + {0x0067, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(4), 0}, + {0x004F, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0067, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(1), 0}, + {0x0046, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(6), BIT(6)}, + {0x0067, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(2), 0}, + {0x0046, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(7), BIT(7)}, + {0x0062, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(4), BIT(4)}, {0x0081, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_ALL_MSK, @@ -1805,11 +1870,41 @@ static const struct rtw_pwr_seq_cmd trans_cardemu_to_carddis_8822b[] = { RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, RTW_PWR_ADDR_MAC, RTW_PWR_CMD_WRITE, BIT(3) | BIT(4), BIT(3)}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_POLLING, BIT(1), 0}, {0x0090, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_PCI_MSK, RTW_PWR_ADDR_MAC, RTW_PWR_CMD_WRITE, BIT(1), 0}, + {0x0044, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, 0xFF, 0}, + {0x0040, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, 0xFF, 0x90}, + {0x0041, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, 0xFF, 0x00}, + {0x0042, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, 0xFF, 0x04}, {0xFFFF, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_ALL_MSK, diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.c b/drivers/net/wireless/realtek/rtw88/rtw8822c.c index fe995bb4e43e..5e4cc57dbd7c 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822c.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.c @@ -3563,6 +3563,16 @@ static void rtw8822c_pwr_track(struct rtw_dev *rtwdev) } static const struct rtw_pwr_seq_cmd trans_carddis_to_cardemu_8822c[] = { + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, BIT(0), 0}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_POLLING, BIT(1), BIT(1)}, {0x002E, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_ALL_MSK, @@ -3773,6 +3783,11 @@ static const struct rtw_pwr_seq_cmd trans_act_to_cardemu_8822c[] = { }; static const struct rtw_pwr_seq_cmd trans_cardemu_to_carddis_8822c[] = { + {0x0005, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_MAC, + RTW_PWR_CMD_WRITE, BIT(7), BIT(7)}, {0x0007, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_USB_MSK | RTW_PWR_INTF_SDIO_MSK, @@ -3818,6 +3833,11 @@ static const struct rtw_pwr_seq_cmd trans_cardemu_to_carddis_8822c[] = { RTW_PWR_INTF_PCI_MSK, RTW_PWR_ADDR_MAC, RTW_PWR_CMD_WRITE, BIT(2), BIT(2)}, + {0x0086, + RTW_PWR_CUT_ALL_MSK, + RTW_PWR_INTF_SDIO_MSK, + RTW_PWR_ADDR_SDIO, + RTW_PWR_CMD_WRITE, BIT(0), BIT(0)}, {0xFFFF, RTW_PWR_CUT_ALL_MSK, RTW_PWR_INTF_ALL_MSK, -- cgit v1.2.3-59-g8ed1b From 75d057bda1fbca6ade21378aa45db712e5f7d962 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Tue, 26 May 2020 10:59:08 -0500 Subject: b43: Fix connection problem with WPA3 Since the driver was first introduced into the kernel, it has only handled the ciphers associated with WEP, WPA, and WPA2. It fails with WPA3 even though mac80211 can handle those additional ciphers in software, b43 did not report that it could handle them. By setting MFP_CAPABLE using ieee80211_set_hw(), the problem is fixed. With this change, b43 will handle the ciphers it knows in hardware, and let mac80211 handle the others in software. It is not necessary to use the module parameter NOHWCRYPT to turn hardware encryption off. Although this change essentially eliminates that module parameter, I am choosing to keep it for cases where the hardware is broken, and software encryption is required for all ciphers. Reported-and-tested-by: Rui Salvaterra Signed-off-by: Larry Finger Cc: Stable Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200526155909.5807-2-Larry.Finger@lwfinger.net --- drivers/net/wireless/broadcom/b43/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c index 39da1a4c30ac..3ad94dad2d89 100644 --- a/drivers/net/wireless/broadcom/b43/main.c +++ b/drivers/net/wireless/broadcom/b43/main.c @@ -5569,7 +5569,7 @@ static struct b43_wl *b43_wireless_init(struct b43_bus_dev *dev) /* fill hw info */ ieee80211_hw_set(hw, RX_INCLUDES_FCS); ieee80211_hw_set(hw, SIGNAL_DBM); - + ieee80211_hw_set(hw, MFP_CAPABLE); hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_MESH_POINT) | -- cgit v1.2.3-59-g8ed1b From 6a29d134c04a8acebb7a95251acea7ad7abba106 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Tue, 26 May 2020 10:59:09 -0500 Subject: b43_legacy: Fix connection problem with WPA3 Since the driver was first introduced into the kernel, it has only handled the ciphers associated with WEP, WPA, and WPA2. It fails with WPA3 even though mac80211 can handle those additional ciphers in software, b43legacy did not report that it could handle them. By setting MFP_CAPABLE using ieee80211_set_hw(), the problem is fixed. With this change, b43legacy will handle the ciphers it knows in hardware, and let mac80211 handle the others in software. It is not necessary to use the module parameter NOHWCRYPT to turn hardware encryption off. Although this change essentially eliminates that module parameter, I am choosing to keep it for cases where the hardware is broken, and software encryption is required for all ciphers. Signed-off-by: Larry Finger Cc: Stable Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200526155909.5807-3-Larry.Finger@lwfinger.net --- drivers/net/wireless/broadcom/b43legacy/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c index 8b6b657c4b85..5208a39fd6f7 100644 --- a/drivers/net/wireless/broadcom/b43legacy/main.c +++ b/drivers/net/wireless/broadcom/b43legacy/main.c @@ -3801,6 +3801,7 @@ static int b43legacy_wireless_init(struct ssb_device *dev) /* fill hw info */ ieee80211_hw_set(hw, RX_INCLUDES_FCS); ieee80211_hw_set(hw, SIGNAL_DBM); + ieee80211_hw_set(hw, MFP_CAPABLE); /* Allow WPA3 in software */ hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_AP) | -- cgit v1.2.3-59-g8ed1b From 83cee4e625f8344a55b197e2bf9713088e571375 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Wed, 20 May 2020 14:54:10 +0200 Subject: cw1200: Remove local sdio VENDOR and DEVICE id definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They are already present in linux/mmc/sdio_ids.h. Signed-off-by: Pali Rohár Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200520125410.31757-1-pali@kernel.org --- drivers/net/wireless/st/cw1200/cw1200_sdio.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/net/wireless/st/cw1200/cw1200_sdio.c b/drivers/net/wireless/st/cw1200/cw1200_sdio.c index 43e012073dbf..b65ec14136c7 100644 --- a/drivers/net/wireless/st/cw1200/cw1200_sdio.c +++ b/drivers/net/wireless/st/cw1200/cw1200_sdio.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "cw1200.h" @@ -48,14 +49,6 @@ struct hwbus_priv { const struct cw1200_platform_data_sdio *pdata; }; -#ifndef SDIO_VENDOR_ID_STE -#define SDIO_VENDOR_ID_STE 0x0020 -#endif - -#ifndef SDIO_DEVICE_ID_STE_CW1200 -#define SDIO_DEVICE_ID_STE_CW1200 0x2280 -#endif - static const struct sdio_device_id cw1200_sdio_ids[] = { { SDIO_DEVICE(SDIO_VENDOR_ID_STE, SDIO_DEVICE_ID_STE_CW1200) }, { /* end: all zeroes */ }, -- cgit v1.2.3-59-g8ed1b From 729ef6b614a14043355c3e35569c62e08e1b9629 Mon Sep 17 00:00:00 2001 From: Pascal Terjan Date: Sat, 23 May 2020 22:26:28 +0100 Subject: libertas: Use shared constant for rfc1042 header This is one of the 9 drivers redefining rfc1042_header. Signed-off-by: Pascal Terjan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200523212628.31526-1-pterjan@google.com --- drivers/net/wireless/marvell/libertas/rx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/wireless/marvell/libertas/rx.c b/drivers/net/wireless/marvell/libertas/rx.c index 58a1fc433b73..f28aa09d1f9e 100644 --- a/drivers/net/wireless/marvell/libertas/rx.c +++ b/drivers/net/wireless/marvell/libertas/rx.c @@ -62,9 +62,6 @@ int lbs_process_rxed_packet(struct lbs_private *priv, struct sk_buff *skb) struct rxpd *p_rx_pd; int hdrchop; struct ethhdr *p_ethhdr; - static const u8 rfc1042_eth_hdr[] = { - 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 - }; BUG_ON(!skb); @@ -102,7 +99,7 @@ int lbs_process_rxed_packet(struct lbs_private *priv, struct sk_buff *skb) sizeof(p_rx_pkt->eth803_hdr.src_addr)); if (memcmp(&p_rx_pkt->rfc1042_hdr, - rfc1042_eth_hdr, sizeof(rfc1042_eth_hdr)) == 0) { + rfc1042_header, sizeof(rfc1042_header)) == 0) { /* * Replace the 803 header and rfc1042 header (llc/snap) with an * EthernetII header, keep the src/dst and snap_type (ethertype) -- cgit v1.2.3-59-g8ed1b From e78e5d18c65362a0e6ca383fad88ef950293fc2c Mon Sep 17 00:00:00 2001 From: Pascal Terjan Date: Sat, 23 May 2020 22:27:35 +0100 Subject: atmel: Use shared constant for rfc1042 header This is one of the 9 drivers redefining rfc1042_header. Signed-off-by: Pascal Terjan Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200523212735.32364-1-pterjan@google.com --- drivers/net/wireless/atmel/atmel.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/atmel/atmel.c b/drivers/net/wireless/atmel/atmel.c index 74538085cfb7..d5875836068c 100644 --- a/drivers/net/wireless/atmel/atmel.c +++ b/drivers/net/wireless/atmel/atmel.c @@ -798,7 +798,6 @@ static void tx_update_descriptor(struct atmel_private *priv, int is_bcast, static netdev_tx_t start_tx(struct sk_buff *skb, struct net_device *dev) { - static const u8 SNAP_RFC1024[6] = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; struct atmel_private *priv = netdev_priv(dev); struct ieee80211_hdr header; unsigned long flags; @@ -853,7 +852,7 @@ static netdev_tx_t start_tx(struct sk_buff *skb, struct net_device *dev) } if (priv->use_wpa) - memcpy(&header.addr4, SNAP_RFC1024, ETH_ALEN); + memcpy(&header.addr4, rfc1042_header, ETH_ALEN); header.frame_control = cpu_to_le16(frame_ctl); /* Copy the wireless header into the card */ -- cgit v1.2.3-59-g8ed1b From 9604617e998b49f7695fea1479ed82421ef8c9f0 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 20 May 2020 20:42:38 +0800 Subject: wlcore: fix runtime pm imbalance in wl1271_tx_work There are two error handling paths in this functon. When wlcore_tx_work_locked() returns an error code, we should decrease the runtime PM usage counter the same way as the error handling path beginning from pm_runtime_get_sync(). Signed-off-by: Dinghao Liu Acked-by: Tony Lindgren Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200520124241.9931-1-dinghao.liu@zju.edu.cn --- drivers/net/wireless/ti/wlcore/tx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ti/wlcore/tx.c b/drivers/net/wireless/ti/wlcore/tx.c index 90e56d4c3df3..e20e18cd04ae 100644 --- a/drivers/net/wireless/ti/wlcore/tx.c +++ b/drivers/net/wireless/ti/wlcore/tx.c @@ -863,6 +863,7 @@ void wl1271_tx_work(struct work_struct *work) ret = wlcore_tx_work_locked(wl); if (ret < 0) { + pm_runtime_put_noidle(wl->dev); wl12xx_queue_recovery_work(wl); goto out; } -- cgit v1.2.3-59-g8ed1b From 282a04bf1d8029eb98585cb5db3fd70fe8bc91f7 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 20 May 2020 20:46:47 +0800 Subject: wlcore: fix runtime pm imbalance in wlcore_regdomain_config pm_runtime_get_sync() increments the runtime PM usage counter even the call returns an error code. Thus a pairing decrement is needed on the error handling path to keep the counter balanced. Signed-off-by: Dinghao Liu Acked-by: Tony Lindgren Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200520124649.10848-1-dinghao.liu@zju.edu.cn --- drivers/net/wireless/ti/wlcore/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 4421fc656b1c..fa4ced9864c0 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -3665,8 +3665,10 @@ void wlcore_regdomain_config(struct wl1271 *wl) goto out; ret = pm_runtime_get_sync(wl->dev); - if (ret < 0) + if (ret < 0) { + pm_runtime_put_autosuspend(wl->dev); goto out; + } ret = wlcore_cmd_regdomain_config_locked(wl); if (ret < 0) { -- cgit v1.2.3-59-g8ed1b From 3e69ed2b52fd0eeb1e812e20a667316d913e6a97 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 20 May 2020 20:57:22 +0800 Subject: wlcore: fix runtime pm imbalance in wl1271_op_suspend When wlcore_hw_interrupt_notify() returns an error code, a pairing runtime PM usage counter decrement is needed to keep the counter balanced. Signed-off-by: Dinghao Liu Acked-by: Tony Lindgren Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200520125724.12832-1-dinghao.liu@zju.edu.cn --- drivers/net/wireless/ti/wlcore/main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index fa4ced9864c0..bf6698fc1389 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -1746,9 +1746,7 @@ static int __maybe_unused wl1271_op_suspend(struct ieee80211_hw *hw, ret = wl1271_configure_suspend(wl, wlvif, wow); if (ret < 0) { - mutex_unlock(&wl->mutex); - wl1271_warning("couldn't prepare device to suspend"); - return ret; + goto out_sleep; } } -- cgit v1.2.3-59-g8ed1b From 53df5271f2397706be85c3892246e3e726113902 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 20 May 2020 21:08:04 +0800 Subject: wlcore: fix runtime pm imbalance in __wl1271_op_remove_interface When wl12xx_cmd_role_disable() returns an error code, a pairing runtime PM usage counter decrement is needed to keep the counter balanced. Signed-off-by: Dinghao Liu Acked-by: Tony Lindgren Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200520130806.14789-1-dinghao.liu@zju.edu.cn --- drivers/net/wireless/ti/wlcore/main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index bf6698fc1389..0dcad4949889 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -2696,12 +2696,16 @@ static void __wl1271_op_remove_interface(struct wl1271 *wl, if (!wlcore_is_p2p_mgmt(wlvif)) { ret = wl12xx_cmd_role_disable(wl, &wlvif->role_id); - if (ret < 0) + if (ret < 0) { + pm_runtime_put_noidle(wl->dev); goto deinit; + } } else { ret = wl12xx_cmd_role_disable(wl, &wlvif->dev_role_id); - if (ret < 0) + if (ret < 0) { + pm_runtime_put_noidle(wl->dev); goto deinit; + } } pm_runtime_mark_last_busy(wl->dev); -- cgit v1.2.3-59-g8ed1b From efad661168c7f4c309f17b773ba2017b85348e9b Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Fri, 29 May 2020 10:50:07 +0800 Subject: rtw88: coex: 8723d: set antanna control owner Without setting antenna control owner, the WiFi could be disconnected if the BT has traffic. Because the antenna is switched to BT side for its traffic, and the WiFi will have no chance to transfer data. Set control owner to prevent WiFi disconnect issue. Fixes: f5df1a8b4376 ("rtw88: 8723d: Add 8723DE to Kconfig and Makefile") Tested-by: You-Sheng Yang Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200529025009.2468-2-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/coex.c | 12 ++++++++++-- drivers/net/wireless/realtek/rtw88/main.h | 1 + drivers/net/wireless/realtek/rtw88/rtw8723d.c | 5 +++++ drivers/net/wireless/realtek/rtw88/rtw8723d.h | 1 + 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/coex.c b/drivers/net/wireless/realtek/rtw88/coex.c index 924dccd5d146..aa1f726d0966 100644 --- a/drivers/net/wireless/realtek/rtw88/coex.c +++ b/drivers/net/wireless/realtek/rtw88/coex.c @@ -751,10 +751,18 @@ EXPORT_SYMBOL(rtw_coex_write_indirect_reg); static void rtw_coex_coex_ctrl_owner(struct rtw_dev *rtwdev, bool wifi_control) { - if (wifi_control) + struct rtw_chip_info *chip = rtwdev->chip; + const struct rtw_hw_reg *btg_reg = chip->btg_reg; + + if (wifi_control) { rtw_write32_set(rtwdev, REG_SYS_SDIO_CTRL, BIT_LTE_MUX_CTRL_PATH); - else + if (btg_reg) + rtw_write8_set(rtwdev, btg_reg->addr, btg_reg->mask); + } else { rtw_write32_clr(rtwdev, REG_SYS_SDIO_CTRL, BIT_LTE_MUX_CTRL_PATH); + if (btg_reg) + rtw_write8_clr(rtwdev, btg_reg->addr, btg_reg->mask); + } } static void rtw_coex_set_gnt_bt(struct rtw_dev *rtwdev, u8 state) diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index 7ee09c008cd4..2ae424869f8b 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -1174,6 +1174,7 @@ struct rtw_chip_info { const struct coex_rf_para *wl_rf_para_tx; const struct coex_rf_para *wl_rf_para_rx; const struct coex_5g_afh_map *afh_5g; + const struct rtw_hw_reg *btg_reg; const struct rtw_reg_domain *coex_info_hw_regs; }; diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 7422baf2d41b..4d88ba8406c7 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -2068,6 +2068,10 @@ static const u8 wl_rssi_step_8723d[] = {60, 50, 44, 30}; static const u8 bt_rssi_step_8723d[] = {30, 30, 30, 30}; static const struct coex_5g_afh_map afh_5g_8723d[] = { {0, 0, 0} }; +static const struct rtw_hw_reg btg_reg_8723d = { + .addr = REG_BTG_SEL, .mask = BIT_MASK_BTG_WL, +}; + /* wl_tx_dec_power, bt_tx_dec_power, wl_rx_gain, bt_rx_lna_constrain */ static const struct coex_rf_para rf_para_tx_8723d[] = { {0, 0, false, 7}, /* for normal */ @@ -2734,6 +2738,7 @@ struct rtw_chip_info rtw8723d_hw_spec = { .bt_afh_span_bw40 = 0x30, .afh_5g_num = ARRAY_SIZE(afh_5g_8723d), .afh_5g = afh_5g_8723d, + .btg_reg = &btg_reg_8723d, .coex_info_hw_regs_num = ARRAY_SIZE(coex_info_hw_regs_8723d), .coex_info_hw_regs = coex_info_hw_regs_8723d, diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.h b/drivers/net/wireless/realtek/rtw88/rtw8723d.h index 31b8ed9ee652..7894d321cd7e 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.h +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.h @@ -145,6 +145,7 @@ static inline s32 iqk_mult(s32 x, s32 y, s32 *ext) #define REG_GPIO_INTM 0x0048 #define REG_BTG_SEL 0x0067 +#define BIT_MASK_BTG_WL BIT(7) #define REG_LTECOEX_PATH_CONTROL 0x0070 #define REG_LTECOEX_CTRL 0x07c0 #define REG_LTECOEX_WRITE_DATA 0x07c4 -- cgit v1.2.3-59-g8ed1b From 2647d2827f2af6e054742f687cb21efaccab44f0 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Fri, 29 May 2020 10:50:08 +0800 Subject: rtw88: coex: 8723d: handle BT inquiry cases Coex mechanism used to make BT have higher priority and more time to transfer data when BT inquiry-page, which leads to poor WiFi performance. Should take WiFi traffic into consideration. If the WiFi is having heavy traffic, use another parameter to make sure WiFi has more chance to TX/RX, while guarantee the priority of BT for inquiry. If the WiFi isn't busy (connected or not), set proper parameter to fix originals. Fixes: f5df1a8b4376 ("rtw88: 8723d: Add 8723DE to Kconfig and Makefile") Tested-by: You-Sheng Yang Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200529025009.2468-3-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/coex.c | 9 ++++++--- drivers/net/wireless/realtek/rtw88/rtw8723d.c | 5 +++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/coex.c b/drivers/net/wireless/realtek/rtw88/coex.c index aa1f726d0966..cbf3d503df1c 100644 --- a/drivers/net/wireless/realtek/rtw88/coex.c +++ b/drivers/net/wireless/realtek/rtw88/coex.c @@ -1354,12 +1354,15 @@ static void rtw_coex_action_bt_inquiry(struct rtw_dev *rtwdev) tdma_case = 108; else tdma_case = 109; + } else if (coex_stat->wl_gl_busy) { + table_case = 114; + tdma_case = 121; } else if (coex_stat->wl_connected) { - table_case = 101; - tdma_case = 110; - } else { table_case = 100; tdma_case = 100; + } else { + table_case = 101; + tdma_case = 100; } } diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723d.c b/drivers/net/wireless/realtek/rtw88/rtw8723d.c index 4d88ba8406c7..4700195c8eef 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8723d.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8723d.c @@ -2040,7 +2040,7 @@ static const struct coex_tdma_para tdma_sant_8723d[] = { /* Non-Shared-Antenna TDMA */ static const struct coex_tdma_para tdma_nsant_8723d[] = { - { {0x00, 0x00, 0x00, 0x40, 0x00} }, /* case-100 */ + { {0x00, 0x00, 0x00, 0x40, 0x01} }, /* case-100 */ { {0x61, 0x45, 0x03, 0x11, 0x11} }, /* case-101 */ { {0x61, 0x3a, 0x03, 0x11, 0x11} }, { {0x61, 0x30, 0x03, 0x11, 0x11} }, @@ -2060,7 +2060,8 @@ static const struct coex_tdma_para tdma_nsant_8723d[] = { { {0x51, 0x3a, 0x03, 0x10, 0x50} }, { {0x51, 0x30, 0x03, 0x10, 0x50} }, { {0x51, 0x20, 0x03, 0x10, 0x50} }, - { {0x51, 0x10, 0x03, 0x10, 0x50} } + { {0x51, 0x10, 0x03, 0x10, 0x50} }, /* case-120 */ + { {0x51, 0x08, 0x03, 0x10, 0x50} }, }; /* rssi in percentage % (dbm = % - 100) */ -- cgit v1.2.3-59-g8ed1b From 7a242fb69821ea428b89e381de63624abea68568 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Fri, 29 May 2020 10:50:09 +0800 Subject: rtw88: fix EAPOL 4-way failure by finish IQK earlier Connecting to an AP with WPA2 security may fail. The IQK and the EAPOL 4-way handshake may overlap because the driver does IQK right after assoc success. For 802.11n devices, the IQK is done in the driver and it could require more than 100ms to complete. During IQK, any TX/RX events are paused. So if the EAPOL 4-way handshake started before IQK finished, then the 1/4 and 2/4 part of the handshake could be dropped. The AP will then issue deauth with reason IEEE8021X_FAILED (23). To resolve this, move IQK routine into managed TX prepare (ieee80211_ops::mgd_prepare_tx()). The callback is called before the managed frames (auth/assoc) are sent. This will make sure that the IQK is completed before the handshake starts. But don't do IQK during scanning because doing it on each channel will take too long. For 802.11ac devices, the IQK is done in firmware and it takes less time to complete. Therefore we don't see a failure during the EAPOL 4-way handshake. But it is still worth moving the IQK into ieee80211_ops::mgd_prepare_tx(). Fixes: f5df1a8b4376 ("rtw88: 8723d: Add 8723DE to Kconfig and Makefile") Tested-by: You-Sheng Yang Signed-off-by: Ping-Ke Shih Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200529025009.2468-4-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/mac80211.c | 3 +-- drivers/net/wireless/realtek/rtw88/main.c | 17 +++++++++++++++++ drivers/net/wireless/realtek/rtw88/main.h | 3 +++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c index 98d2ac22f6f6..c412bc54efde 100644 --- a/drivers/net/wireless/realtek/rtw88/mac80211.c +++ b/drivers/net/wireless/realtek/rtw88/mac80211.c @@ -341,13 +341,11 @@ static void rtw_ops_bss_info_changed(struct ieee80211_hw *hw, rtw_leave_lps_deep(rtwdev); if (changed & BSS_CHANGED_ASSOC) { - struct rtw_chip_info *chip = rtwdev->chip; enum rtw_net_type net_type; if (conf->assoc) { rtw_coex_connect_notify(rtwdev, COEX_ASSOCIATE_FINISH); net_type = RTW_NET_MGD_LINKED; - chip->ops->phy_calibration(rtwdev); rtwvif->aid = conf->aid; rtw_fw_download_rsvd_page(rtwdev); @@ -663,6 +661,7 @@ static void rtw_ops_mgd_prepare_tx(struct ieee80211_hw *hw, mutex_lock(&rtwdev->mutex); rtw_leave_lps_deep(rtwdev); rtw_coex_connect_notify(rtwdev, COEX_ASSOCIATE_START); + rtw_chip_prepare_tx(rtwdev); mutex_unlock(&rtwdev->mutex); } diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index f88a7d2370aa..0eefafc51c62 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -408,6 +408,23 @@ void rtw_set_channel(struct rtw_dev *rtwdev) } rtw_phy_set_tx_power_level(rtwdev, center_chan); + + /* if the channel isn't set for scanning, we will do RF calibration + * in ieee80211_ops::mgd_prepare_tx(). Performing the calibration + * during scanning on each channel takes too long. + */ + if (!test_bit(RTW_FLAG_SCANNING, rtwdev->flags)) + rtwdev->need_rfk = true; +} + +void rtw_chip_prepare_tx(struct rtw_dev *rtwdev) +{ + struct rtw_chip_info *chip = rtwdev->chip; + + if (rtwdev->need_rfk) { + rtwdev->need_rfk = false; + chip->ops->phy_calibration(rtwdev); + } } static void rtw_vif_write_addr(struct rtw_dev *rtwdev, u32 start, u8 *addr) diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index 2ae424869f8b..0841f5fa4bf2 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -1720,6 +1720,8 @@ struct rtw_dev { struct rtw_fw_state wow_fw; struct rtw_wow_param wow; + bool need_rfk; + /* hci related data, must be last */ u8 priv[] __aligned(sizeof(void *)); }; @@ -1793,6 +1795,7 @@ void rtw_restore_reg(struct rtw_dev *rtwdev, struct rtw_backup_info *bckp, u32 num); void rtw_desc_to_mcsrate(u16 rate, u8 *mcs, u8 *nss); void rtw_set_channel(struct rtw_dev *rtwdev); +void rtw_chip_prepare_tx(struct rtw_dev *rtwdev); void rtw_vif_port_config(struct rtw_dev *rtwdev, struct rtw_vif *rtwvif, u32 config); void rtw_tx_report_purge_timer(struct timer_list *t); -- cgit v1.2.3-59-g8ed1b From 4e1a341580f2e51f7d1c992b50c28a6c4a242f7f Mon Sep 17 00:00:00 2001 From: Yan-Hsuan Chuang Date: Fri, 22 May 2020 11:55:21 +0800 Subject: rtw88: 8822c: fix missing brace warning for old compilers For older versions of gcc, the array = {0}; will cause warnings: drivers/net/wireless/realtek/rtw88/rtw8822c.c: In function 'rtw8822c_power_trim': >> drivers/net/wireless/realtek/rtw88/rtw8822c.c:1039:2: warning: >> missing braces around initializer [-Wmissing-braces] s8 bb_gain[2][8] = {0}; ^ drivers/net/wireless/realtek/rtw88/rtw8822c.c:1039:2: warning: (near initialization for 'bb_gain[0]') [-Wmissing-braces] Fixes: 5ad4d8957b69 ("rtw88: set power trim according to efuse PG values") Reported-by: kbuild test robot Signed-off-by: Yan-Hsuan Chuang Acked-by: Sebastian Andrzej Siewior Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200522035521.12295-1-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/rtw8822c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.c b/drivers/net/wireless/realtek/rtw88/rtw8822c.c index 5e4cc57dbd7c..8d65a9684af3 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822c.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.c @@ -1037,7 +1037,7 @@ static void rtw8822c_set_power_trim(struct rtw_dev *rtwdev, s8 bb_gain[2][8]) static void rtw8822c_power_trim(struct rtw_dev *rtwdev) { u8 pg_pwr = 0xff, i, path, idx; - s8 bb_gain[2][8] = {0}; + s8 bb_gain[2][8] = {}; u16 rf_efuse_2g[3] = {PPG_2GL_TXAB, PPG_2GM_TXAB, PPG_2GH_TXAB}; u16 rf_efuse_5g[2][5] = {{PPG_5GL1_TXA, PPG_5GL2_TXA, PPG_5GM1_TXA, PPG_5GM2_TXA, PPG_5GH1_TXA}, -- cgit v1.2.3-59-g8ed1b From 7967af8de39d4572da2d15dfa12b1b08810d60a9 Mon Sep 17 00:00:00 2001 From: Chien-Hsun Liao Date: Fri, 22 May 2020 17:12:34 +0800 Subject: rtw88: 8822c: remove CCK TX setting when switch channel The CCK TX setting when switch channel will fix the CCK to path A only, so if the antenna is configured to path B (e.g. iw phy set antenna 0x2 0x3 "TX B/RX AB"), then the CCK packets can never be delivered to the air if only path B is connected with an antenna (it can possibly be transmitted through path A, but as path B is configured, the expected behavior is incorrect). This can also solve the racing issue of CCK TX setting between driver and firmware. The CCK TX setting in driver should be removed. Otherwise, the CCK TX setting would be wrong when the racing occurs. Fixes: 297bcf8222f2 ("rtw88: add support for set/get antennas") Signed-off-by: Chien-Hsun Liao Signed-off-by: Yan-Hsuan Chuang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200522091234.24495-1-yhchuang@realtek.com --- drivers/net/wireless/realtek/rtw88/rtw8822c.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.c b/drivers/net/wireless/realtek/rtw88/rtw8822c.c index 8d65a9684af3..c3d72ef611c6 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822c.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.c @@ -1496,7 +1496,6 @@ static void rtw8822c_set_channel_bb(struct rtw_dev *rtwdev, u8 channel, u8 bw, { if (IS_CH_2G_BAND(channel)) { rtw_write32_clr(rtwdev, REG_BGCTRL, BITS_RX_IQ_WEIGHT); - rtw_write32_mask(rtwdev, REG_RXCCKSEL, 0xf0000000, 0x8); rtw_write32_set(rtwdev, REG_TXF4, BIT(20)); rtw_write32_clr(rtwdev, REG_CCK_CHECK, BIT_CHECK_CCK_EN); rtw_write32_clr(rtwdev, REG_CCKTXONLY, BIT_BB_CCK_CHECK_EN); @@ -1564,7 +1563,6 @@ static void rtw8822c_set_channel_bb(struct rtw_dev *rtwdev, u8 channel, u8 bw, rtw_write32_set(rtwdev, REG_CCK_CHECK, BIT_CHECK_CCK_EN); rtw_write32_set(rtwdev, REG_BGCTRL, BITS_RX_IQ_WEIGHT); rtw_write32_clr(rtwdev, REG_TXF4, BIT(20)); - rtw_write32_mask(rtwdev, REG_RXCCKSEL, 0xf0000000, 0x0); rtw_write32_mask(rtwdev, REG_CCAMSK, 0x3F000000, 0x22); rtw_write32_mask(rtwdev, REG_TXDFIR0, 0x70, 0x3); if (IS_CH_5G_BAND_1(channel) || IS_CH_5G_BAND_2(channel)) { -- cgit v1.2.3-59-g8ed1b From da74b6933b3ba27c88fa0b7ccbd019e4f41ebfd4 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Fri, 22 May 2020 12:49:04 +0800 Subject: wlcore: fix runtime pm imbalance in wlcore_irq_locked When wlcore_fw_status() returns an error code, a pairing runtime PM usage counter decrement is needed to keep the counter balanced. It's the same for all error paths after wlcore_fw_status(). Signed-off-by: Dinghao Liu Acked-by: Tony Lindgren Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200522044906.29564-1-dinghao.liu@zju.edu.cn --- drivers/net/wireless/ti/wlcore/main.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 0dcad4949889..de6c8a7589ca 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -548,7 +548,7 @@ static int wlcore_irq_locked(struct wl1271 *wl) ret = wlcore_fw_status(wl, wl->fw_status); if (ret < 0) - goto out; + goto err_ret; wlcore_hw_tx_immediate_compl(wl); @@ -565,7 +565,7 @@ static int wlcore_irq_locked(struct wl1271 *wl) ret = -EIO; /* restarting the chip. ignore any other interrupt. */ - goto out; + goto err_ret; } if (unlikely(intr & WL1271_ACX_SW_INTR_WATCHDOG)) { @@ -575,7 +575,7 @@ static int wlcore_irq_locked(struct wl1271 *wl) ret = -EIO; /* restarting the chip. ignore any other interrupt. */ - goto out; + goto err_ret; } if (likely(intr & WL1271_ACX_INTR_DATA)) { @@ -583,7 +583,7 @@ static int wlcore_irq_locked(struct wl1271 *wl) ret = wlcore_rx(wl, wl->fw_status); if (ret < 0) - goto out; + goto err_ret; /* Check if any tx blocks were freed */ spin_lock_irqsave(&wl->wl_lock, flags); @@ -596,7 +596,7 @@ static int wlcore_irq_locked(struct wl1271 *wl) */ ret = wlcore_tx_work_locked(wl); if (ret < 0) - goto out; + goto err_ret; } else { spin_unlock_irqrestore(&wl->wl_lock, flags); } @@ -604,7 +604,7 @@ static int wlcore_irq_locked(struct wl1271 *wl) /* check for tx results */ ret = wlcore_hw_tx_delayed_compl(wl); if (ret < 0) - goto out; + goto err_ret; /* Make sure the deferred queues don't get too long */ defer_count = skb_queue_len(&wl->deferred_tx_queue) + @@ -617,14 +617,14 @@ static int wlcore_irq_locked(struct wl1271 *wl) wl1271_debug(DEBUG_IRQ, "WL1271_ACX_INTR_EVENT_A"); ret = wl1271_event_handle(wl, 0); if (ret < 0) - goto out; + goto err_ret; } if (intr & WL1271_ACX_INTR_EVENT_B) { wl1271_debug(DEBUG_IRQ, "WL1271_ACX_INTR_EVENT_B"); ret = wl1271_event_handle(wl, 1); if (ret < 0) - goto out; + goto err_ret; } if (intr & WL1271_ACX_INTR_INIT_COMPLETE) @@ -635,6 +635,7 @@ static int wlcore_irq_locked(struct wl1271 *wl) wl1271_debug(DEBUG_IRQ, "WL1271_ACX_INTR_HW_AVAILABLE"); } +err_ret: pm_runtime_mark_last_busy(wl->dev); pm_runtime_put_autosuspend(wl->dev); -- cgit v1.2.3-59-g8ed1b From 2a7621ded321dfd70b5349bbfcd1af9e9df1f197 Mon Sep 17 00:00:00 2001 From: Wright Feng Date: Thu, 28 May 2020 22:49:34 -0500 Subject: brcmfmac: set F2 blocksize for 4373 Set F2 blocksize to 256 bytes for 4373. It fixes DMA error while having UDP bi-directional traffic. Also use a defined F1 MesBusyCtrl value. Signed-off-by: Wright Feng Signed-off-by: Chi-hsien Lin Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200529034938.124533-2-chi-hsien.lin@cypress.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 14 +++++++++++++- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 4 ++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index 22a17ae09e94..bb3196cba683 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -43,6 +43,7 @@ #define SDIO_FUNC1_BLOCKSIZE 64 #define SDIO_FUNC2_BLOCKSIZE 512 +#define SDIO_4373_FUNC2_BLOCKSIZE 256 #define SDIO_4359_FUNC2_BLOCKSIZE 256 /* Maximum milliseconds to wait for F2 to come up */ #define SDIO_WAIT_F2RDY 3000 @@ -910,13 +911,24 @@ static int brcmf_sdiod_probe(struct brcmf_sdio_dev *sdiodev) sdio_release_host(sdiodev->func1); goto out; } - if (sdiodev->func2->device == SDIO_DEVICE_ID_BROADCOM_4359) + switch (sdiodev->func2->device) { + case SDIO_DEVICE_ID_CYPRESS_4373: + f2_blksz = SDIO_4373_FUNC2_BLOCKSIZE; + break; + case SDIO_DEVICE_ID_BROADCOM_4359: f2_blksz = SDIO_4359_FUNC2_BLOCKSIZE; + break; + default: + break; + } + ret = sdio_set_block_size(sdiodev->func2, f2_blksz); if (ret) { brcmf_err("Failed to set F2 blocksize\n"); sdio_release_host(sdiodev->func1); goto out; + } else { + brcmf_dbg(SDIO, "set F2 blocksize to %d\n", f2_blksz); } /* increase F2 timeout */ diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index ce6f15284277..dce22cd2279d 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -41,6 +41,7 @@ /* watermark expressed in number of words */ #define DEFAULT_F2_WATERMARK 0x8 #define CY_4373_F2_WATERMARK 0x40 +#define CY_4373_F1_MESBUSYCTRL (CY_4373_F2_WATERMARK | SBSDIO_MESBUSYCTRL_ENAB) #define CY_43012_F2_WATERMARK 0x60 #define CY_4359_F2_WATERMARK 0x40 #define CY_4359_F1_MESBUSYCTRL (CY_4359_F2_WATERMARK | SBSDIO_MESBUSYCTRL_ENAB) @@ -4195,8 +4196,7 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err, brcmf_sdiod_writeb(sdiod, SBSDIO_DEVICE_CTL, devctl, &err); brcmf_sdiod_writeb(sdiod, SBSDIO_FUNC1_MESBUSYCTRL, - CY_4373_F2_WATERMARK | - SBSDIO_MESBUSYCTRL_ENAB, &err); + CY_4373_F1_MESBUSYCTRL, &err); break; case SDIO_DEVICE_ID_CYPRESS_43012: brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes\n", -- cgit v1.2.3-59-g8ed1b From 528158a8d4522bed5a35bb1a942638a79bca6acd Mon Sep 17 00:00:00 2001 From: Double Lo Date: Thu, 28 May 2020 22:49:35 -0500 Subject: brcmfmac: fix 4339 CRC error under SDIO 3.0 SDR104 mode This patch fixes 4339 CRC error while running Tput test with suspend/resume test script. The continuous failure messages before system crash: brcmfmac: brcmf_sdiod_sglist_rw: CMD53 sg block read failed -84 brcmfmac: brcmf_sdio_rxglom: glom read of 25600 bytes failed: -5 brcmfmac: brcmf_sdio_rxfail: abort command, terminate frame brcmfmac: brcmf_sdiod_sglist_rw: CMD53 sg block read failed -84 brcmfmac: brcmf_sdio_rxglom: glom read of 24576 bytes failed: -5 brcmfmac: brcmf_sdio_rxfail: abort command, terminate frame Signed-off-by: Double Lo Signed-off-by: Chi-hsien Lin Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200529034938.124533-3-chi-hsien.lin@cypress.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index dce22cd2279d..491b635e72b1 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -43,6 +43,10 @@ #define CY_4373_F2_WATERMARK 0x40 #define CY_4373_F1_MESBUSYCTRL (CY_4373_F2_WATERMARK | SBSDIO_MESBUSYCTRL_ENAB) #define CY_43012_F2_WATERMARK 0x60 +#define CY_4339_F2_WATERMARK 48 +#define CY_4339_MES_WATERMARK 80 +#define CY_4339_MESBUSYCTRL (CY_4339_MES_WATERMARK | \ + SBSDIO_MESBUSYCTRL_ENAB) #define CY_4359_F2_WATERMARK 0x40 #define CY_4359_F1_MESBUSYCTRL (CY_4359_F2_WATERMARK | SBSDIO_MESBUSYCTRL_ENAB) @@ -4209,6 +4213,19 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err, brcmf_sdiod_writeb(sdiod, SBSDIO_DEVICE_CTL, devctl, &err); break; + case SDIO_DEVICE_ID_BROADCOM_4339: + brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes for 4339\n", + CY_4339_F2_WATERMARK); + brcmf_sdiod_writeb(sdiod, SBSDIO_WATERMARK, + CY_4339_F2_WATERMARK, &err); + devctl = brcmf_sdiod_readb(sdiod, SBSDIO_DEVICE_CTL, + &err); + devctl |= SBSDIO_DEVCTL_F2WM_ENAB; + brcmf_sdiod_writeb(sdiod, SBSDIO_DEVICE_CTL, devctl, + &err); + brcmf_sdiod_writeb(sdiod, SBSDIO_FUNC1_MESBUSYCTRL, + CY_4339_MESBUSYCTRL, &err); + break; case SDIO_DEVICE_ID_BROADCOM_4359: brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes\n", CY_4359_F2_WATERMARK); -- cgit v1.2.3-59-g8ed1b From 2bee41270f3bafe475dd2cfe28fcc3f1ed6ab6ee Mon Sep 17 00:00:00 2001 From: Frank Kao Date: Thu, 28 May 2020 22:49:36 -0500 Subject: brcmfmac: set F2 blocksize and watermark for 4354/4356 SDIO Set F2 blocksize to 256 bytes and watermark to 0x40 for 4354/4356 SDIO. Also enable and configure F1 MesBusyCtrl. It would resolve random driver crash issue. Signed-off-by: Frank Kao Signed-off-by: Chi-Hsien Lin Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200529034938.124533-4-chi-hsien.lin@cypress.com --- .../net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 8 ++++++-- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 16 ++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index bb3196cba683..b1a66320ba54 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -44,7 +44,7 @@ #define SDIO_FUNC1_BLOCKSIZE 64 #define SDIO_FUNC2_BLOCKSIZE 512 #define SDIO_4373_FUNC2_BLOCKSIZE 256 -#define SDIO_4359_FUNC2_BLOCKSIZE 256 +#define SDIO_435X_FUNC2_BLOCKSIZE 256 /* Maximum milliseconds to wait for F2 to come up */ #define SDIO_WAIT_F2RDY 3000 @@ -916,7 +916,11 @@ static int brcmf_sdiod_probe(struct brcmf_sdio_dev *sdiodev) f2_blksz = SDIO_4373_FUNC2_BLOCKSIZE; break; case SDIO_DEVICE_ID_BROADCOM_4359: - f2_blksz = SDIO_4359_FUNC2_BLOCKSIZE; + /* fallthrough */ + case SDIO_DEVICE_ID_BROADCOM_4354: + /* fallthrough */ + case SDIO_DEVICE_ID_BROADCOM_4356: + f2_blksz = SDIO_435X_FUNC2_BLOCKSIZE; break; default: break; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 491b635e72b1..037a4efef924 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -47,9 +47,9 @@ #define CY_4339_MES_WATERMARK 80 #define CY_4339_MESBUSYCTRL (CY_4339_MES_WATERMARK | \ SBSDIO_MESBUSYCTRL_ENAB) -#define CY_4359_F2_WATERMARK 0x40 -#define CY_4359_F1_MESBUSYCTRL (CY_4359_F2_WATERMARK | SBSDIO_MESBUSYCTRL_ENAB) - +#define CY_435X_F2_WATERMARK 0x40 +#define CY_435X_F1_MESBUSYCTRL (CY_435X_F2_WATERMARK | \ + SBSDIO_MESBUSYCTRL_ENAB) #ifdef DEBUG #define BRCMF_TRAP_INFO_SIZE 80 @@ -4227,17 +4227,21 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err, CY_4339_MESBUSYCTRL, &err); break; case SDIO_DEVICE_ID_BROADCOM_4359: + /* fallthrough */ + case SDIO_DEVICE_ID_BROADCOM_4354: + /* fallthrough */ + case SDIO_DEVICE_ID_BROADCOM_4356: brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes\n", - CY_4359_F2_WATERMARK); + CY_435X_F2_WATERMARK); brcmf_sdiod_writeb(sdiod, SBSDIO_WATERMARK, - CY_4359_F2_WATERMARK, &err); + CY_435X_F2_WATERMARK, &err); devctl = brcmf_sdiod_readb(sdiod, SBSDIO_DEVICE_CTL, &err); devctl |= SBSDIO_DEVCTL_F2WM_ENAB; brcmf_sdiod_writeb(sdiod, SBSDIO_DEVICE_CTL, devctl, &err); brcmf_sdiod_writeb(sdiod, SBSDIO_FUNC1_MESBUSYCTRL, - CY_4359_F1_MESBUSYCTRL, &err); + CY_435X_F1_MESBUSYCTRL, &err); break; default: brcmf_sdiod_writeb(sdiod, SBSDIO_WATERMARK, -- cgit v1.2.3-59-g8ed1b From df18c257bd6a3fe1906a9c87bcb0bb30cf87ef64 Mon Sep 17 00:00:00 2001 From: Wright Feng Date: Thu, 28 May 2020 22:49:37 -0500 Subject: brcmfmac: fix 43455 CRC error under SDIO 3.0 SDR104 mode This patch fixes 43455 CRC error while running throughput test with suspend/resume stress test. The continuous failure messages before system crash: brcmfmac: brcmf_sdiod_sglist_rw: CMD53 sg block read failed -84 brcmfmac: brcmf_sdio_rxglom: glom read of 25600 bytes failed: -5 brcmfmac: brcmf_sdio_rxfail: abort command, terminate frame brcmfmac: brcmf_sdiod_sglist_rw: CMD53 sg block read failed -84 brcmfmac: brcmf_sdio_rxglom: glom read of 24576 bytes failed: -5 brcmfmac: brcmf_sdio_rxfail: abort command, terminate frame Signed-off-by: Wright Feng Signed-off-by: Chi-hsien Lin Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200529034938.124533-5-chi-hsien.lin@cypress.com --- .../net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 037a4efef924..58d9f0b90ad3 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -47,9 +47,14 @@ #define CY_4339_MES_WATERMARK 80 #define CY_4339_MESBUSYCTRL (CY_4339_MES_WATERMARK | \ SBSDIO_MESBUSYCTRL_ENAB) +#define CY_43455_F2_WATERMARK 0x60 +#define CY_43455_MES_WATERMARK 0x50 +#define CY_43455_MESBUSYCTRL (CY_43455_MES_WATERMARK | \ + SBSDIO_MESBUSYCTRL_ENAB) #define CY_435X_F2_WATERMARK 0x40 #define CY_435X_F1_MESBUSYCTRL (CY_435X_F2_WATERMARK | \ SBSDIO_MESBUSYCTRL_ENAB) + #ifdef DEBUG #define BRCMF_TRAP_INFO_SIZE 80 @@ -4226,6 +4231,19 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err, brcmf_sdiod_writeb(sdiod, SBSDIO_FUNC1_MESBUSYCTRL, CY_4339_MESBUSYCTRL, &err); break; + case SDIO_DEVICE_ID_BROADCOM_43455: + brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes for 43455\n", + CY_43455_F2_WATERMARK); + brcmf_sdiod_writeb(sdiod, SBSDIO_WATERMARK, + CY_43455_F2_WATERMARK, &err); + devctl = brcmf_sdiod_readb(sdiod, SBSDIO_DEVICE_CTL, + &err); + devctl |= SBSDIO_DEVCTL_F2WM_ENAB; + brcmf_sdiod_writeb(sdiod, SBSDIO_DEVICE_CTL, devctl, + &err); + brcmf_sdiod_writeb(sdiod, SBSDIO_FUNC1_MESBUSYCTRL, + CY_43455_MESBUSYCTRL, &err); + break; case SDIO_DEVICE_ID_BROADCOM_4359: /* fallthrough */ case SDIO_DEVICE_ID_BROADCOM_4354: -- cgit v1.2.3-59-g8ed1b From 113a57a400a2d4974448760464f6443d52a1e498 Mon Sep 17 00:00:00 2001 From: Double Lo Date: Thu, 28 May 2020 22:49:38 -0500 Subject: brcmfmac: 43012 Update MES Watermark Set MES watermark size to 0x50 for 43012. It fixes SDIO bus hang issue when running at high throughput. Signed-off-by: Double Lo Signed-off-by: Chi-hsien Lin Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200529034938.124533-6-chi-hsien.lin@cypress.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 58d9f0b90ad3..760b7737e745 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -43,6 +43,9 @@ #define CY_4373_F2_WATERMARK 0x40 #define CY_4373_F1_MESBUSYCTRL (CY_4373_F2_WATERMARK | SBSDIO_MESBUSYCTRL_ENAB) #define CY_43012_F2_WATERMARK 0x60 +#define CY_43012_MES_WATERMARK 0x50 +#define CY_43012_MESBUSYCTRL (CY_43012_MES_WATERMARK | \ + SBSDIO_MESBUSYCTRL_ENAB) #define CY_4339_F2_WATERMARK 48 #define CY_4339_MES_WATERMARK 80 #define CY_4339_MESBUSYCTRL (CY_4339_MES_WATERMARK | \ @@ -4217,6 +4220,8 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err, devctl |= SBSDIO_DEVCTL_F2WM_ENAB; brcmf_sdiod_writeb(sdiod, SBSDIO_DEVICE_CTL, devctl, &err); + brcmf_sdiod_writeb(sdiod, SBSDIO_FUNC1_MESBUSYCTRL, + CY_43012_MESBUSYCTRL, &err); break; case SDIO_DEVICE_ID_BROADCOM_4339: brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes for 4339\n", -- cgit v1.2.3-59-g8ed1b From 11e7a91994c29da96d847f676be023da6a2c1359 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 27 May 2020 21:48:30 +0300 Subject: airo: Fix read overflows sending packets The problem is that we always copy a minimum of ETH_ZLEN (60) bytes from skb->data even when skb->len is less than ETH_ZLEN so it leads to a read overflow. The fix is to pad skb->data to at least ETH_ZLEN bytes. Cc: Reported-by: Hu Jiahui Signed-off-by: Dan Carpenter Reviewed-by: Eric Dumazet Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200527184830.GA1164846@mwanda --- drivers/net/wireless/cisco/airo.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index 8363f91df7ea..827bb6d74815 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -1925,6 +1925,10 @@ static netdev_tx_t mpi_start_xmit(struct sk_buff *skb, airo_print_err(dev->name, "%s: skb == NULL!",__func__); return NETDEV_TX_OK; } + if (skb_padto(skb, ETH_ZLEN)) { + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } npacks = skb_queue_len (&ai->txq); if (npacks >= MAXTXQ - 1) { @@ -2127,6 +2131,10 @@ static netdev_tx_t airo_start_xmit(struct sk_buff *skb, airo_print_err(dev->name, "%s: skb == NULL!", __func__); return NETDEV_TX_OK; } + if (skb_padto(skb, ETH_ZLEN)) { + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } /* Find a vacant FID */ for( i = 0; i < MAX_FIDS / 2 && (fids[i] & 0xffff0000); i++ ); @@ -2201,6 +2209,10 @@ static netdev_tx_t airo_start_xmit11(struct sk_buff *skb, airo_print_err(dev->name, "%s: skb == NULL!", __func__); return NETDEV_TX_OK; } + if (skb_padto(skb, ETH_ZLEN)) { + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } /* Find a vacant FID */ for( i = MAX_FIDS / 2; i < MAX_FIDS && (fids[i] & 0xffff0000); i++ ); -- cgit v1.2.3-59-g8ed1b From 86cffb2c0a59d72f66060253e123154416618fa2 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 21 May 2020 14:34:44 +0200 Subject: mwifiex: Parse all API_VER_ID properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During initialization of SD8997 wifi chip kernel prints warnings: mwifiex_sdio mmc0:0001:1: Unknown api_id: 3 mwifiex_sdio mmc0:0001:1: Unknown api_id: 4 This patch adds support for parsing all api ids provided by SD8997 firmware. Signed-off-by: Pali Rohár Acked-by: Ganapathi Bhat Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200521123444.28957-1-pali@kernel.org --- drivers/net/wireless/marvell/mwifiex/cmdevt.c | 17 +++++++++++++++-- drivers/net/wireless/marvell/mwifiex/fw.h | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/cmdevt.c b/drivers/net/wireless/marvell/mwifiex/cmdevt.c index 7e4b8cd52605..589cc5eb12a2 100644 --- a/drivers/net/wireless/marvell/mwifiex/cmdevt.c +++ b/drivers/net/wireless/marvell/mwifiex/cmdevt.c @@ -1581,8 +1581,21 @@ int mwifiex_ret_get_hw_spec(struct mwifiex_private *priv, adapter->fw_api_ver = api_rev->major_ver; mwifiex_dbg(adapter, INFO, - "Firmware api version %d\n", - adapter->fw_api_ver); + "Firmware api version %d.%d\n", + adapter->fw_api_ver, + api_rev->minor_ver); + break; + case UAP_FW_API_VER_ID: + mwifiex_dbg(adapter, INFO, + "uAP api version %d.%d\n", + api_rev->major_ver, + api_rev->minor_ver); + break; + case CHANRPT_API_VER_ID: + mwifiex_dbg(adapter, INFO, + "channel report api version %d.%d\n", + api_rev->major_ver, + api_rev->minor_ver); break; default: mwifiex_dbg(adapter, FATAL, diff --git a/drivers/net/wireless/marvell/mwifiex/fw.h b/drivers/net/wireless/marvell/mwifiex/fw.h index a415d73a73e6..6f86f5b96fc9 100644 --- a/drivers/net/wireless/marvell/mwifiex/fw.h +++ b/drivers/net/wireless/marvell/mwifiex/fw.h @@ -1052,6 +1052,8 @@ struct host_cmd_ds_802_11_ps_mode_enh { enum API_VER_ID { KEY_API_VER_ID = 1, FW_API_VER_ID = 2, + UAP_FW_API_VER_ID = 3, + CHANRPT_API_VER_ID = 4, }; struct hw_spec_api_rev { -- cgit v1.2.3-59-g8ed1b From 982d7287f8dad2d5e1c57dc84aca83128e787666 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 21 May 2020 14:35:59 +0200 Subject: mwifiex: Add support for NL80211_ATTR_MAX_AP_ASSOC_STA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SD8997 firmware sends TLV_TYPE_MAX_CONN with struct hw_spec_max_conn to inform kernel about maximum number of p2p connections and stations in AP mode. During initialization of SD8997 wifi chip kernel prints warning: mwifiex_sdio mmc0:0001:1: Unknown GET_HW_SPEC TLV type: 0x217 This patch adds support for parsing TLV_TYPE_MAX_CONN (0x217) and sets appropriate cfg80211 member 'max_ap_assoc_sta' from retrieved structure. It allows userspace to retrieve NL80211_ATTR_MAX_AP_ASSOC_STA attribute. Signed-off-by: Pali Rohár Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200521123559.29028-1-pali@kernel.org --- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 5 +++++ drivers/net/wireless/marvell/mwifiex/cmdevt.c | 12 ++++++++++++ drivers/net/wireless/marvell/mwifiex/fw.h | 8 ++++++++ drivers/net/wireless/marvell/mwifiex/main.h | 1 + 4 files changed, 26 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 97813ac291ae..4e4f59c17ded 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -4335,6 +4335,11 @@ int mwifiex_register_cfg80211(struct mwifiex_adapter *adapter) wiphy->iface_combinations = &mwifiex_iface_comb_ap_sta; wiphy->n_iface_combinations = 1; + if (adapter->max_sta_conn > adapter->max_p2p_conn) + wiphy->max_ap_assoc_sta = adapter->max_sta_conn; + else + wiphy->max_ap_assoc_sta = adapter->max_p2p_conn; + /* Initialize cipher suits */ wiphy->cipher_suites = mwifiex_cipher_suites; wiphy->n_cipher_suites = ARRAY_SIZE(mwifiex_cipher_suites); diff --git a/drivers/net/wireless/marvell/mwifiex/cmdevt.c b/drivers/net/wireless/marvell/mwifiex/cmdevt.c index 589cc5eb12a2..d068b9075c32 100644 --- a/drivers/net/wireless/marvell/mwifiex/cmdevt.c +++ b/drivers/net/wireless/marvell/mwifiex/cmdevt.c @@ -1495,6 +1495,7 @@ int mwifiex_ret_get_hw_spec(struct mwifiex_private *priv, struct mwifiex_adapter *adapter = priv->adapter; struct mwifiex_ie_types_header *tlv; struct hw_spec_api_rev *api_rev; + struct hw_spec_max_conn *max_conn; u16 resp_size, api_id; int i, left_len, parsed_len = 0; @@ -1604,6 +1605,17 @@ int mwifiex_ret_get_hw_spec(struct mwifiex_private *priv, break; } break; + case TLV_TYPE_MAX_CONN: + max_conn = (struct hw_spec_max_conn *)tlv; + adapter->max_p2p_conn = max_conn->max_p2p_conn; + adapter->max_sta_conn = max_conn->max_sta_conn; + mwifiex_dbg(adapter, INFO, + "max p2p connections: %u\n", + adapter->max_p2p_conn); + mwifiex_dbg(adapter, INFO, + "max sta connections: %u\n", + adapter->max_sta_conn); + break; default: mwifiex_dbg(adapter, FATAL, "Unknown GET_HW_SPEC TLV type: %#x\n", diff --git a/drivers/net/wireless/marvell/mwifiex/fw.h b/drivers/net/wireless/marvell/mwifiex/fw.h index 6f86f5b96fc9..8047e307892e 100644 --- a/drivers/net/wireless/marvell/mwifiex/fw.h +++ b/drivers/net/wireless/marvell/mwifiex/fw.h @@ -220,6 +220,7 @@ enum MWIFIEX_802_11_PRIVACY_FILTER { #define TLV_TYPE_BSS_MODE (PROPRIETARY_TLV_BASE_ID + 206) #define TLV_TYPE_RANDOM_MAC (PROPRIETARY_TLV_BASE_ID + 236) #define TLV_TYPE_CHAN_ATTR_CFG (PROPRIETARY_TLV_BASE_ID + 237) +#define TLV_TYPE_MAX_CONN (PROPRIETARY_TLV_BASE_ID + 279) #define MWIFIEX_TX_DATA_BUF_SIZE_2K 2048 @@ -2388,4 +2389,11 @@ struct mwifiex_opt_sleep_confirm { __le16 action; __le16 resp_ctrl; } __packed; + +struct hw_spec_max_conn { + struct mwifiex_ie_types_header header; + u8 max_p2p_conn; + u8 max_sta_conn; +} __packed; + #endif /* !_MWIFIEX_FW_H_ */ diff --git a/drivers/net/wireless/marvell/mwifiex/main.h b/drivers/net/wireless/marvell/mwifiex/main.h index afaffc325452..5923c5c14c8d 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.h +++ b/drivers/net/wireless/marvell/mwifiex/main.h @@ -1022,6 +1022,7 @@ struct mwifiex_adapter { bool ext_scan; u8 fw_api_ver; u8 key_api_major_ver, key_api_minor_ver; + u8 max_p2p_conn, max_sta_conn; struct memory_type_mapping *mem_type_mapping_tbl; u8 num_mem_types; bool scan_chan_gap_enabled; -- cgit v1.2.3-59-g8ed1b From 36432797641ff0013be9252eecf7ad1ba73171a2 Mon Sep 17 00:00:00 2001 From: Ronak Doshi Date: Thu, 28 May 2020 19:53:52 -0700 Subject: vmxnet3: use correct hdr reference when packet is encapsulated 'Commit dacce2be3312 ("vmxnet3: add geneve and vxlan tunnel offload support")' added support for encapsulation offload. However, while preparing inner tso packet, it uses reference to outer ip headers. This patch fixes this issue by using correct reference for inner headers. Fixes: dacce2be3312 ("vmxnet3: add geneve and vxlan tunnel offload support") Signed-off-by: Ronak Doshi Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_drv.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 3d07ce6cb706..ca395f9679d0 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -859,14 +859,29 @@ vmxnet3_parse_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, */ ctx->l4_offset = skb_checksum_start_offset(skb); - if (ctx->ipv4) { - const struct iphdr *iph = ip_hdr(skb); + if (VMXNET3_VERSION_GE_4(adapter) && + skb->encapsulation) { + struct iphdr *iph = inner_ip_hdr(skb); + + if (iph->version == 4) { + protocol = iph->protocol; + } else { + const struct ipv6hdr *ipv6h; - protocol = iph->protocol; - } else if (ctx->ipv6) { - const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + ipv6h = inner_ipv6_hdr(skb); + protocol = ipv6h->nexthdr; + } + } else { + if (ctx->ipv4) { + const struct iphdr *iph = ip_hdr(skb); - protocol = ipv6h->nexthdr; + protocol = iph->protocol; + } else if (ctx->ipv6) { + const struct ipv6hdr *ipv6h; + + ipv6h = ipv6_hdr(skb); + protocol = ipv6h->nexthdr; + } } switch (protocol) { @@ -946,11 +961,11 @@ vmxnet3_prepare_inner_tso(struct sk_buff *skb, struct tcphdr *tcph = inner_tcp_hdr(skb); struct iphdr *iph = inner_ip_hdr(skb); - if (ctx->ipv4) { + if (iph->version == 4) { iph->check = 0; tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, 0, IPPROTO_TCP, 0); - } else if (ctx->ipv6) { + } else { struct ipv6hdr *iph = inner_ipv6_hdr(skb); tcph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, 0, -- cgit v1.2.3-59-g8ed1b From 09b547a7996e77d4141de27c1657d7c3adcb478e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 29 May 2020 10:26:48 +0200 Subject: net: ethernet: mtk-star-emac: remove unused variable The desc pointer is set but not used. Remove it. Reported-by: kbuild test robot Fixes: 8c7bd5a454ff ("net: ethernet: mtk-star-emac: new driver") Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_star_emac.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index 8596ca0e60eb..7df35872c107 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -746,15 +746,12 @@ mtk_star_ring_free_skbs(struct mtk_star_priv *priv, struct mtk_star_ring *ring, struct mtk_star_ring_desc_data *)) { struct mtk_star_ring_desc_data desc_data; - struct mtk_star_ring_desc *desc; int i; for (i = 0; i < MTK_STAR_RING_NUM_DESCS; i++) { if (!ring->dma_addrs[i]) continue; - desc = &ring->descs[i]; - desc_data.dma_addr = ring->dma_addrs[i]; desc_data.skb = ring->skbs[i]; -- cgit v1.2.3-59-g8ed1b From 2684bda34786b6ae0944a4bacb7f59b5955979a8 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 29 May 2020 11:49:09 +0200 Subject: net: phy: mscc: fix PHYs using the vsc8574_probe PHYs using the vsc8574_probe fail to be initialized and their config_init return -EIO leading to errors like: "could not attach PHY: -5". This is because when the conversion of the MSCC PHY driver to use the shared PHY package helpers was done, the base address retrieval and the base PHY read and write helpers in the driver were modified. In particular, the base address retrieval logic was moved from the config_init to the probe. But the vsc8574_probe was forgotten. This patch fixes it. Fixes: deb04e9c0ff2 ("net: phy: mscc: use phy_package_shared") Signed-off-by: Antoine Tenart Reviewed-by: Michael Walle Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/mscc/mscc_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index 550acf547ced..7ed0285206d0 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -1977,6 +1977,10 @@ static int vsc8574_probe(struct phy_device *phydev) phydev->priv = vsc8531; + vsc8584_get_base_addr(phydev); + devm_phy_package_join(&phydev->mdio.dev, phydev, + vsc8531->base_addr, 0); + vsc8531->nleds = 4; vsc8531->supp_led_modes = VSC8584_SUPP_LED_MODES; vsc8531->hw_stats = vsc8584_hw_stats; -- cgit v1.2.3-59-g8ed1b From 830f5ce266ce79e18f0026c36c99319b1bc08e1b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 29 May 2020 13:02:07 +0300 Subject: net: phy: marvell: unlock after phy_select_page() failure We need to call phy_restore_page() even if phy_select_page() fails. Otherwise we are holding the phy_lock_mdio_bus() lock. This requirement is documented at the start of the phy_select_page() function. Fixes: a618e86da91d ("net : phy: marvell: Speedup TDR data retrieval by only changing page once") Signed-off-by: Dan Carpenter Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 2c04e3b2b285..4ea226566cec 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1867,7 +1867,7 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) */ page = phy_select_page(phydev, MII_MARVELL_VCT5_PAGE); if (page < 0) - return page; + goto restore_page; for (distance = priv->first; distance <= priv->last; -- cgit v1.2.3-59-g8ed1b From 40ef92c6ec09bd8aaffccfa41a715d1df5625f95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:40 +0200 Subject: sctp: add sctp_sock_set_nodelay Add a helper to directly set the SCTP_NODELAY sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 10 ++-------- include/net/sctp/sctp.h | 7 +++++++ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 69333728d871..9f1c3cdc9d65 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -914,7 +914,6 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port) static void sctp_connect_to_sock(struct connection *con) { struct sockaddr_storage daddr; - int one = 1; int result; int addr_len; struct socket *sock; @@ -961,8 +960,7 @@ static void sctp_connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, - sizeof(one)); + sctp_sock_set_nodelay(sock->sk); /* * Make sock->ops->connect() function return in specified time, @@ -1176,7 +1174,6 @@ static int sctp_listen_for_all(void) struct socket *sock = NULL; int result = -EINVAL; struct connection *con = nodeid2con(0, GFP_NOFS); - int one = 1; if (!con) return -ENOMEM; @@ -1191,10 +1188,7 @@ static int sctp_listen_for_all(void) } sock_set_rcvbuf(sock->sk, NEEDED_RMEM); - result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, - sizeof(one)); - if (result < 0) - log_print("Could not set SCTP NODELAY error %d\n", result); + sctp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); /* Init con struct */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 3ab5c6bbb90b..f8bcb75bb044 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -615,4 +615,11 @@ static inline bool sctp_newsk_ready(const struct sock *sk) return sock_flag(sk, SOCK_DEAD) || sk->sk_socket; } +static inline void sctp_sock_set_nodelay(struct sock *sk) +{ + lock_sock(sk); + sctp_sk(sk)->nodelay = true; + release_sock(sk); +} + #endif /* __net_sctp_h__ */ -- cgit v1.2.3-59-g8ed1b From 05bfd3661448a46db3a258b316160d34cf0a1317 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:41 +0200 Subject: sctp: refactor sctp_setsockopt_bindx Split out a sctp_setsockopt_bindx_kernel that takes a kernel pointer to the sockaddr and make sctp_setsockopt_bindx a small wrapper around it. This prepares for adding a new bind_add proto op. Signed-off-by: Christoph Hellwig Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 61 +++++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 827a9903ee28..6e745ac3c4a5 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -972,23 +972,22 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw) * it. * * sk The sk of the socket - * addrs The pointer to the addresses in user land + * addrs The pointer to the addresses * addrssize Size of the addrs buffer * op Operation to perform (add or remove, see the flags of * sctp_bindx) * * Returns 0 if ok, <0 errno code on error. */ -static int sctp_setsockopt_bindx(struct sock *sk, - struct sockaddr __user *addrs, - int addrs_size, int op) +static int sctp_setsockopt_bindx_kernel(struct sock *sk, + struct sockaddr *addrs, int addrs_size, + int op) { - struct sockaddr *kaddrs; int err; int addrcnt = 0; int walk_size = 0; struct sockaddr *sa_addr; - void *addr_buf; + void *addr_buf = addrs; struct sctp_af *af; pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n", @@ -997,17 +996,10 @@ static int sctp_setsockopt_bindx(struct sock *sk, if (unlikely(addrs_size <= 0)) return -EINVAL; - kaddrs = memdup_user(addrs, addrs_size); - if (IS_ERR(kaddrs)) - return PTR_ERR(kaddrs); - /* Walk through the addrs buffer and count the number of addresses. */ - addr_buf = kaddrs; while (walk_size < addrs_size) { - if (walk_size + sizeof(sa_family_t) > addrs_size) { - kfree(kaddrs); + if (walk_size + sizeof(sa_family_t) > addrs_size) return -EINVAL; - } sa_addr = addr_buf; af = sctp_get_af_specific(sa_addr->sa_family); @@ -1015,10 +1007,8 @@ static int sctp_setsockopt_bindx(struct sock *sk, /* If the address family is not supported or if this address * causes the address buffer to overflow return EINVAL. */ - if (!af || (walk_size + af->sockaddr_len) > addrs_size) { - kfree(kaddrs); + if (!af || (walk_size + af->sockaddr_len) > addrs_size) return -EINVAL; - } addrcnt++; addr_buf += af->sockaddr_len; walk_size += af->sockaddr_len; @@ -1029,31 +1019,36 @@ static int sctp_setsockopt_bindx(struct sock *sk, case SCTP_BINDX_ADD_ADDR: /* Allow security module to validate bindx addresses. */ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_BINDX_ADD, - (struct sockaddr *)kaddrs, - addrs_size); + addrs, addrs_size); if (err) - goto out; - err = sctp_bindx_add(sk, kaddrs, addrcnt); + return err; + err = sctp_bindx_add(sk, addrs, addrcnt); if (err) - goto out; - err = sctp_send_asconf_add_ip(sk, kaddrs, addrcnt); - break; - + return err; + return sctp_send_asconf_add_ip(sk, addrs, addrcnt); case SCTP_BINDX_REM_ADDR: - err = sctp_bindx_rem(sk, kaddrs, addrcnt); + err = sctp_bindx_rem(sk, addrs, addrcnt); if (err) - goto out; - err = sctp_send_asconf_del_ip(sk, kaddrs, addrcnt); - break; + return err; + return sctp_send_asconf_del_ip(sk, addrs, addrcnt); default: - err = -EINVAL; - break; + return -EINVAL; } +} -out: - kfree(kaddrs); +static int sctp_setsockopt_bindx(struct sock *sk, + struct sockaddr __user *addrs, + int addrs_size, int op) +{ + struct sockaddr *kaddrs; + int err; + kaddrs = memdup_user(addrs, addrs_size); + if (IS_ERR(kaddrs)) + return PTR_ERR(kaddrs); + err = sctp_setsockopt_bindx_kernel(sk, kaddrs, addrs_size, op); + kfree(kaddrs); return err; } -- cgit v1.2.3-59-g8ed1b From c0425a4249e9d313eec5f81c0bde8a286ebf9a63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:42 +0200 Subject: net: add a new bind_add method The SCTP protocol allows to bind multiple address to a socket. That feature is currently only exposed as a socket option. Add a bind_add method struct proto that allows to bind additional addresses, and switch the dlm code to use the method instead of going through the socket option from kernel space. Signed-off-by: Christoph Hellwig Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 9 +++------ include/net/sock.h | 6 +++++- net/core/sock.c | 8 ++++++++ net/sctp/socket.c | 14 ++++++++++++++ 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9f1c3cdc9d65..3543a8fec907 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -882,6 +882,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) static int sctp_bind_addrs(struct connection *con, uint16_t port) { struct sockaddr_storage localaddr; + struct sockaddr *addr = (struct sockaddr *)&localaddr; int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { @@ -889,13 +890,9 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port) make_sockaddr(&localaddr, port, &addr_len); if (!i) - result = kernel_bind(con->sock, - (struct sockaddr *)&localaddr, - addr_len); + result = kernel_bind(con->sock, addr, addr_len); else - result = kernel_setsockopt(con->sock, SOL_SCTP, - SCTP_SOCKOPT_BINDX_ADD, - (char *)&localaddr, addr_len); + result = sock_bind_add(con->sock->sk, addr, addr_len); if (result < 0) { log_print("Can't bind to %d addr number %d, %d.\n", diff --git a/include/net/sock.h b/include/net/sock.h index d994daa418ec..6e9f713a7860 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1156,7 +1156,9 @@ struct proto { int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*bind)(struct sock *sk, - struct sockaddr *uaddr, int addr_len); + struct sockaddr *addr, int addr_len); + int (*bind_add)(struct sock *sk, + struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); @@ -2698,4 +2700,6 @@ void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 2ca3425b519c..61ec573221a6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3712,3 +3712,11 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ + +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +{ + if (!sk->sk_prot->bind_add) + return -EOPNOTSUPP; + return sk->sk_prot->bind_add(sk, addr, addr_len); +} +EXPORT_SYMBOL(sock_bind_add); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6e745ac3c4a5..d57e1a002ffc 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1052,6 +1052,18 @@ static int sctp_setsockopt_bindx(struct sock *sk, return err; } +static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, + int addrlen) +{ + int err; + + lock_sock(sk); + err = sctp_setsockopt_bindx_kernel(sk, addrs, addrlen, + SCTP_BINDX_ADD_ADDR); + release_sock(sk); + return err; +} + static int sctp_connect_new_asoc(struct sctp_endpoint *ep, const union sctp_addr *daddr, const struct sctp_initmsg *init, @@ -9620,6 +9632,7 @@ struct proto sctp_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, @@ -9662,6 +9675,7 @@ struct proto sctpv6_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, -- cgit v1.2.3-59-g8ed1b From 5a892ff2facb4548c17c05931ed899038a0da63e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:43 +0200 Subject: net: remove kernel_setsockopt No users left. Signed-off-by: Christoph Hellwig Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/net.h | 2 -- net/socket.c | 31 ------------------------------- 2 files changed, 33 deletions(-) diff --git a/include/linux/net.h b/include/linux/net.h index 74ef5d7315f7..e10f378194a5 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, - unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, diff --git a/net/socket.c b/net/socket.c index 81a98b6cbd08..976426d03f09 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3624,37 +3624,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_setsockopt - set a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Returns 0 or an error. - */ - -int kernel_setsockopt(struct socket *sock, int level, int optname, - char *optval, unsigned int optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int err; - - uoptval = (char __user __force *) optval; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, uoptval, optlen); - else - err = sock->ops->setsockopt(sock, level, optname, uoptval, - optlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_setsockopt); - /** * kernel_sendpage - send a &page through a socket (kernel space) * @sock: socket -- cgit v1.2.3-59-g8ed1b From 04198499b23f9d73a127a17b8576c9266c8f6f9b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 27 May 2020 19:41:34 +0300 Subject: net: dsa: tag_8021q: stop restoring VLANs from bridge Right now, our only tag_8021q user, sja1105, has the ability to restore bridge VLANs on its own, so this logic is unnecessary. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/tag_8021q.c | 61 +---------------------------------------------------- 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 3052da668156..780b2a15ac9b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -140,34 +140,6 @@ bool vid_is_dsa_8021q(u16 vid) } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); -static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) -{ - struct bridge_vlan_info vinfo; - struct net_device *slave; - u16 pvid; - int err; - - if (!dsa_is_user_port(ds, port)) - return 0; - - slave = dsa_to_port(ds, port)->slave; - - err = br_vlan_get_pvid(slave, &pvid); - if (!pvid || err < 0) - /* There is no pvid on the bridge for this port, which is - * perfectly valid. Nothing to restore, bye-bye! - */ - return 0; - - err = br_vlan_get_info(slave, pvid, &vinfo); - if (err < 0) { - dev_err(ds->dev, "Couldn't determine PVID attributes\n"); - return err; - } - - return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags); -} - /* If @enabled is true, installs @vid with @flags into the switch port's HW * filter. * If @enabled is false, deletes @vid (ignores @flags) from the port. Had the @@ -178,39 +150,11 @@ static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, u16 flags, bool enabled) { struct dsa_port *dp = dsa_to_port(ds, port); - struct bridge_vlan_info vinfo; - int err; if (enabled) return dsa_port_vid_add(dp, vid, flags); - err = dsa_port_vid_del(dp, vid); - if (err < 0) - return err; - - /* Nothing to restore from the bridge for a non-user port. - * The CPU port VLANs are restored implicitly with the user ports, - * similar to how the bridge does in dsa_slave_vlan_add and - * dsa_slave_vlan_del. - */ - if (!dsa_is_user_port(ds, port)) - return 0; - - err = br_vlan_get_info(dp->slave, vid, &vinfo); - /* Couldn't determine bridge attributes for this vid, - * it means the bridge had not configured it. - */ - if (err < 0) - return 0; - - /* Restore the VID from the bridge */ - err = dsa_port_vid_add(dp, vid, vinfo.flags); - if (err < 0) - return err; - - vinfo.flags &= ~BRIDGE_VLAN_INFO_PVID; - - return dsa_port_vid_add(dp->cpu_dp, vid, vinfo.flags); + return dsa_port_vid_del(dp, vid); } /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single @@ -329,9 +273,6 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) return err; } - if (!enabled) - err = dsa_8021q_restore_pvid(ds, port); - return err; } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); -- cgit v1.2.3-59-g8ed1b From aef31718a923338aff610abef41114a9c0fd37ea Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 27 May 2020 20:20:38 +0300 Subject: net: dsa: sja1105: avoid invalid state in sja1105_vlan_filtering Be there 2 switches spi/spi2.0 and spi/spi2.1 in a cross-chip setup, both under the same VLAN-filtering bridge, both in the SJA1105_VLAN_BEST_EFFORT state. If we try to change the VLAN state of one of the switches (to SJA1105_VLAN_FILTERING_FULL) we get the following error: devlink dev param set spi/spi2.1 name best_effort_vlan_filtering value false cmode runtime [ 38.325683] sja1105 spi2.1: Not allowed to overcommit frame memory. L2 memory partitions and VL memory partitions share the same space. The sum of all 16 memory partitions is not allowed to be larger than 929 128-byte blocks (or 910 with retagging). Please adjust l2-forwarding-parameters-table.part_spc and/or vl-forwarding-parameters-table.partspc. [ 38.356803] sja1105 spi2.1: Invalid config, cannot upload This is because the spi/spi2.1 switch doesn't support tagging anymore in the SJA1105_VLAN_FILTERING_FULL state, so it doesn't need to have any retagging rules defined. Great, so it can use more frame memory (retagging consumes extra memory). But the built-in low-level static config checker from the sja1105 driver says "not so fast, you've increased the frame memory to non-retagging values, but you still kept the retagging rules in the static config". So we need to rebuild the VLAN table immediately before re-uploading the static config, operation which will take care, based on the new VLAN state, of removing the retagging rules. Fixes: 3f01c91aab92 ("net: dsa: sja1105: implement VLAN retagging for dsa_8021q sub-VLANs") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 36ab527449e6..789b288cc78b 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2746,6 +2746,10 @@ static int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) sja1105_frame_memory_partitioning(priv); + rc = sja1105_build_vlan_table(priv, false); + if (rc) + return rc; + rc = sja1105_static_config_reload(priv, SJA1105_VLAN_FILTERING); if (rc) dev_err(ds->dev, "Failed to change VLAN Ethertype\n"); -- cgit v1.2.3-59-g8ed1b From 8298a419a0064c6cd6c84a4a45de294a3eff0832 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 May 2020 07:43:59 +0000 Subject: tipc: remove set but not used variable 'prev' Fixes gcc '-Wunused-but-set-variable' warning: net/tipc/msg.c: In function 'tipc_msg_append': net/tipc/msg.c:215:24: warning: variable 'prev' set but not used [-Wunused-but-set-variable] commit 0a3e060f340d ("tipc: add test for Nagle algorithm effectiveness") left behind this, remove it. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/tipc/msg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 23809039dda1..c0afcd627c5e 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -212,7 +212,7 @@ err: int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq) { - struct sk_buff *skb, *prev; + struct sk_buff *skb; int accounted, total, curr; int mlen, cpy, rem = dlen; struct tipc_msg *hdr; @@ -223,7 +223,6 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, while (rem) { if (!skb || skb->len >= mss) { - prev = skb; skb = tipc_buf_acquire(mss, GFP_KERNEL); if (unlikely(!skb)) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 139df98bdfef62868df3d626d27f1db9edd9830f Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Thu, 28 May 2020 16:26:23 +0800 Subject: stmmac: platform: add "snps, dwmac-5.10a" IP compatible string Add "snps,dwmac-5.10a" compatible string for 5.10a version that can avoid to define some plat data in glue layer. Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index bcda49dcf619..f32317fa75c8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -507,7 +507,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) if (of_device_is_compatible(np, "snps,dwmac-4.00") || of_device_is_compatible(np, "snps,dwmac-4.10a") || - of_device_is_compatible(np, "snps,dwmac-4.20a")) { + of_device_is_compatible(np, "snps,dwmac-4.20a") || + of_device_is_compatible(np, "snps,dwmac-5.10a")) { plat->has_gmac4 = 1; plat->has_gmac = 0; plat->pmt = 1; -- cgit v1.2.3-59-g8ed1b From 94abdad6974a5e108d921df2c38e35cae6179bb2 Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Thu, 28 May 2020 16:26:24 +0800 Subject: net: ethernet: dwmac: add ethernet glue logic for NXP imx8 chip NXP imx8 family like imx8mp/imx8dxl chips support Synopsys MAC 5.10a IP. This patch adds settings for NXP imx8 glue layer: - clocks - dwmac address width - phy interface mode selection - adjust rgmii txclk rate v2: - adjust code sequences in order to have reverse christmas tree local variable ordering. Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 13 + drivers/net/ethernet/stmicro/stmmac/Makefile | 1 + drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 315 ++++++++++++++++++++++++ 3 files changed, 329 insertions(+) create mode 100644 drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index b46f8d2ae6d7..36bd2e18f23b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -196,6 +196,19 @@ config DWMAC_SUN8I This selects Allwinner SoC glue layer support for the stmmac device driver. This driver is used for H3/A83T/A64 EMAC ethernet controller. + +config DWMAC_IMX8 + tristate "NXP IMX8 DWMAC support" + default ARCH_MXC + depends on OF && (ARCH_MXC || COMPILE_TEST) + select MFD_SYSCON + ---help--- + Support for ethernet controller on NXP i.MX8 SOCs. + + This selects NXP SoC glue layer support for the stmmac + device driver. This driver is used for i.MX8 series like + iMX8MP/iMX8DXL GMAC ethernet controller. + endif config DWMAC_INTEL diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile index f9d024d6b69b..295615ab36a7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Makefile +++ b/drivers/net/ethernet/stmicro/stmmac/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_DWMAC_SUNXI) += dwmac-sunxi.o obj-$(CONFIG_DWMAC_SUN8I) += dwmac-sun8i.o obj-$(CONFIG_DWMAC_DWC_QOS_ETH) += dwmac-dwc-qos-eth.o obj-$(CONFIG_DWMAC_GENERIC) += dwmac-generic.o +obj-$(CONFIG_DWMAC_IMX8) += dwmac-imx.o stmmac-platform-objs:= stmmac_platform.o dwmac-altr-socfpga-objs := altr_tse_pcs.o dwmac-socfpga.o diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c new file mode 100644 index 000000000000..5010af7dab4a --- /dev/null +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * dwmac-imx.c - DWMAC Specific Glue layer for NXP imx8 + * + * Copyright 2020 NXP + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "stmmac_platform.h" + +#define GPR_ENET_QOS_INTF_MODE_MASK GENMASK(21, 16) +#define GPR_ENET_QOS_INTF_SEL_MII (0x0 << 16) +#define GPR_ENET_QOS_INTF_SEL_RMII (0x4 << 16) +#define GPR_ENET_QOS_INTF_SEL_RGMII (0x1 << 16) +#define GPR_ENET_QOS_CLK_GEN_EN (0x1 << 19) +#define GPR_ENET_QOS_CLK_TX_CLK_SEL (0x1 << 20) +#define GPR_ENET_QOS_RGMII_EN (0x1 << 21) + +struct imx_dwmac_ops { + u32 addr_width; + bool mac_rgmii_txclk_auto_adj; + + int (*set_intf_mode)(struct plat_stmmacenet_data *plat_dat); +}; + +struct imx_priv_data { + struct device *dev; + struct clk *clk_tx; + struct clk *clk_mem; + struct regmap *intf_regmap; + u32 intf_reg_off; + bool rmii_refclk_ext; + + const struct imx_dwmac_ops *ops; + struct plat_stmmacenet_data *plat_dat; +}; + +static int imx8mp_set_intf_mode(struct plat_stmmacenet_data *plat_dat) +{ + struct imx_priv_data *dwmac = plat_dat->bsp_priv; + int val; + + switch (plat_dat->interface) { + case PHY_INTERFACE_MODE_MII: + val = GPR_ENET_QOS_INTF_SEL_MII; + break; + case PHY_INTERFACE_MODE_RMII: + val = GPR_ENET_QOS_INTF_SEL_RMII; + val |= (dwmac->rmii_refclk_ext ? 0 : GPR_ENET_QOS_CLK_TX_CLK_SEL); + break; + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: + val = GPR_ENET_QOS_INTF_SEL_RGMII | + GPR_ENET_QOS_RGMII_EN; + break; + default: + pr_debug("imx dwmac doesn't support %d interface\n", + plat_dat->interface); + return -EINVAL; + } + + val |= GPR_ENET_QOS_CLK_GEN_EN; + return regmap_update_bits(dwmac->intf_regmap, dwmac->intf_reg_off, + GPR_ENET_QOS_INTF_MODE_MASK, val); +}; + +static int +imx8dxl_set_intf_mode(struct plat_stmmacenet_data *plat_dat) +{ + int ret = 0; + + /* TBD: depends on imx8dxl scu interfaces to be upstreamed */ + return ret; +} + +static int imx_dwmac_init(struct platform_device *pdev, void *priv) +{ + struct plat_stmmacenet_data *plat_dat; + struct imx_priv_data *dwmac = priv; + int ret; + + plat_dat = dwmac->plat_dat; + + ret = clk_prepare_enable(dwmac->clk_mem); + if (ret) { + dev_err(&pdev->dev, "mem clock enable failed\n"); + return ret; + } + + ret = clk_prepare_enable(dwmac->clk_tx); + if (ret) { + dev_err(&pdev->dev, "tx clock enable failed\n"); + goto clk_tx_en_failed; + } + + if (dwmac->ops->set_intf_mode) { + ret = dwmac->ops->set_intf_mode(plat_dat); + if (ret) + goto intf_mode_failed; + } + + return 0; + +intf_mode_failed: + clk_disable_unprepare(dwmac->clk_tx); +clk_tx_en_failed: + clk_disable_unprepare(dwmac->clk_mem); + return ret; +} + +static void imx_dwmac_exit(struct platform_device *pdev, void *priv) +{ + struct imx_priv_data *dwmac = priv; + + if (dwmac->clk_tx) + clk_disable_unprepare(dwmac->clk_tx); + clk_disable_unprepare(dwmac->clk_mem); +} + +static void imx_dwmac_fix_speed(void *priv, unsigned int speed) +{ + struct plat_stmmacenet_data *plat_dat; + struct imx_priv_data *dwmac = priv; + unsigned long rate; + int err; + + plat_dat = dwmac->plat_dat; + + if (dwmac->ops->mac_rgmii_txclk_auto_adj || + (plat_dat->interface == PHY_INTERFACE_MODE_RMII) || + (plat_dat->interface == PHY_INTERFACE_MODE_MII)) + return; + + switch (speed) { + case SPEED_1000: + rate = 125000000; + break; + case SPEED_100: + rate = 25000000; + break; + case SPEED_10: + rate = 2500000; + break; + default: + dev_err(dwmac->dev, "invalid speed %u\n", speed); + return; + } + + err = clk_set_rate(dwmac->clk_tx, rate); + if (err < 0) + dev_err(dwmac->dev, "failed to set tx rate %lu\n", rate); +} + +static int +imx_dwmac_parse_dt(struct imx_priv_data *dwmac, struct device *dev) +{ + struct device_node *np = dev->of_node; + int err = 0; + + if (of_get_property(np, "snps,rmii_refclk_ext", NULL)) + dwmac->rmii_refclk_ext = true; + + dwmac->clk_tx = devm_clk_get(dev, "tx"); + if (IS_ERR(dwmac->clk_tx)) { + dev_err(dev, "failed to get tx clock\n"); + return PTR_ERR(dwmac->clk_tx); + } + + dwmac->clk_mem = NULL; + if (of_machine_is_compatible("fsl,imx8dxl")) { + dwmac->clk_mem = devm_clk_get(dev, "mem"); + if (IS_ERR(dwmac->clk_mem)) { + dev_err(dev, "failed to get mem clock\n"); + return PTR_ERR(dwmac->clk_mem); + } + } + + if (of_machine_is_compatible("fsl,imx8mp")) { + /* Binding doc describes the propety: + is required by i.MX8MP. + is optinoal for i.MX8DXL. + */ + dwmac->intf_regmap = syscon_regmap_lookup_by_phandle(np, "intf_mode"); + if (IS_ERR(dwmac->intf_regmap)) + return PTR_ERR(dwmac->intf_regmap); + + err = of_property_read_u32_index(np, "intf_mode", 1, &dwmac->intf_reg_off); + if (err) { + dev_err(dev, "Can't get intf mode reg offset (%d)\n", err); + return err; + } + } + + return err; +} + +static int imx_dwmac_probe(struct platform_device *pdev) +{ + struct plat_stmmacenet_data *plat_dat; + struct stmmac_resources stmmac_res; + struct imx_priv_data *dwmac; + const struct imx_dwmac_ops *data; + int ret; + + ret = stmmac_get_platform_resources(pdev, &stmmac_res); + if (ret) + return ret; + + dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL); + if (!dwmac) + return PTR_ERR(dwmac); + + plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + if (IS_ERR(plat_dat)) + return PTR_ERR(plat_dat); + + data = of_device_get_match_data(&pdev->dev); + if (!data) { + dev_err(&pdev->dev, "failed to get match data\n"); + ret = -EINVAL; + goto err_match_data; + } + + dwmac->ops = data; + dwmac->dev = &pdev->dev; + + ret = imx_dwmac_parse_dt(dwmac, &pdev->dev); + if (ret) { + dev_err(&pdev->dev, "failed to parse OF data\n"); + goto err_parse_dt; + } + + ret = dma_set_mask_and_coherent(&pdev->dev, + DMA_BIT_MASK(dwmac->ops->addr_width)); + if (ret) { + dev_err(&pdev->dev, "DMA mask set failed\n"); + goto err_dma_mask; + } + + plat_dat->init = imx_dwmac_init; + plat_dat->exit = imx_dwmac_exit; + plat_dat->fix_mac_speed = imx_dwmac_fix_speed; + plat_dat->bsp_priv = dwmac; + dwmac->plat_dat = plat_dat; + + ret = imx_dwmac_init(pdev, dwmac); + if (ret) + goto err_dwmac_init; + + ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); + if (ret) + goto err_drv_probe; + + return 0; + +err_dwmac_init: +err_drv_probe: + imx_dwmac_exit(pdev, plat_dat->bsp_priv); +err_dma_mask: +err_parse_dt: +err_match_data: + stmmac_remove_config_dt(pdev, plat_dat); + return ret; +} + +static struct imx_dwmac_ops imx8mp_dwmac_data = { + .addr_width = 34, + .mac_rgmii_txclk_auto_adj = false, + .set_intf_mode = imx8mp_set_intf_mode, +}; + +static struct imx_dwmac_ops imx8dxl_dwmac_data = { + .addr_width = 32, + .mac_rgmii_txclk_auto_adj = true, + .set_intf_mode = imx8dxl_set_intf_mode, +}; + +static const struct of_device_id imx_dwmac_match[] = { + { .compatible = "nxp,imx8mp-dwmac-eqos", .data = &imx8mp_dwmac_data }, + { .compatible = "nxp,imx8dxl-dwmac-eqos", .data = &imx8dxl_dwmac_data }, + { } +}; +MODULE_DEVICE_TABLE(of, imx_dwmac_match); + +static struct platform_driver imx_dwmac_driver = { + .probe = imx_dwmac_probe, + .remove = stmmac_pltfr_remove, + .driver = { + .name = "imx-dwmac", + .pm = &stmmac_pltfr_pm_ops, + .of_match_table = imx_dwmac_match, + }, +}; +module_platform_driver(imx_dwmac_driver); + +MODULE_AUTHOR("NXP"); +MODULE_DESCRIPTION("NXP imx8 DWMAC Specific Glue layer"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3-59-g8ed1b From 29884aa6806ce27460281d0ec1819683261e89cd Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Thu, 28 May 2020 16:26:25 +0800 Subject: dt-bindings: net: imx-dwmac: Add NXP imx8 DWMAC glue layer Add description for NXP imx8 families like imx8mp/imx8dxl that integrate the Synopsys gmac IP version 5.10a. Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- .../devicetree/bindings/net/imx-dwmac.txt | 56 ++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/imx-dwmac.txt diff --git a/Documentation/devicetree/bindings/net/imx-dwmac.txt b/Documentation/devicetree/bindings/net/imx-dwmac.txt new file mode 100644 index 000000000000..921d522fe8d7 --- /dev/null +++ b/Documentation/devicetree/bindings/net/imx-dwmac.txt @@ -0,0 +1,56 @@ +IMX8 glue layer controller, NXP imx8 families support Synopsys MAC 5.10a IP. + +This file documents platform glue layer for IMX. +Please see stmmac.txt for the other unchanged properties. + +The device node has following properties. + +Required properties: +- compatible: Should be "nxp,imx8mp-dwmac-eqos" to select glue layer + and "snps,dwmac-5.10a" to select IP version. +- clocks: Must contain a phandle for each entry in clock-names. +- clock-names: Should be "stmmaceth" for the host clock. + Should be "pclk" for the MAC apb clock. + Should be "ptp_ref" for the MAC timer clock. + Should be "tx" for the MAC RGMII TX clock: + Should be "mem" for EQOS MEM clock. + - "mem" clock is required for imx8dxl platform. + - "mem" clock is not required for imx8mp platform. +- interrupt-names: Should contain a list of interrupt names corresponding to + the interrupts in the interrupts property, if available. + Should be "macirq" for the main MAC IRQ + Should be "eth_wake_irq" for the IT which wake up system +- intf_mode: Should be phandle/offset pair. The phandle to the syscon node which + encompases the GPR register, and the offset of the GPR register. + - required for imx8mp platform. + - is optional for imx8dxl platform. + +Optional properties: +- intf_mode: is optional for imx8dxl platform. +- snps,rmii_refclk_ext: to select RMII reference clock from external. + +Example: + eqos: ethernet@30bf0000 { + compatible = "nxp,imx8mp-dwmac-eqos", "snps,dwmac-5.10a"; + reg = <0x30bf0000 0x10000>; + interrupts = , + ; + interrupt-names = "eth_wake_irq", "macirq"; + clocks = <&clk IMX8MP_CLK_ENET_QOS_ROOT>, + <&clk IMX8MP_CLK_QOS_ENET_ROOT>, + <&clk IMX8MP_CLK_ENET_QOS_TIMER>, + <&clk IMX8MP_CLK_ENET_QOS>; + clock-names = "stmmaceth", "pclk", "ptp_ref", "tx"; + assigned-clocks = <&clk IMX8MP_CLK_ENET_AXI>, + <&clk IMX8MP_CLK_ENET_QOS_TIMER>, + <&clk IMX8MP_CLK_ENET_QOS>; + assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_266M>, + <&clk IMX8MP_SYS_PLL2_100M>, + <&clk IMX8MP_SYS_PLL2_125M>; + assigned-clock-rates = <0>, <100000000>, <125000000>; + nvmem-cells = <ð_mac0>; + nvmem-cell-names = "mac-address"; + nvmem_macaddr_swap; + intf_mode = <&gpr 0x4>; + status = "disabled"; + }; -- cgit v1.2.3-59-g8ed1b From 91f3fd1124e002f970ef21c2459bf2ba4ac080ee Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 28 May 2020 15:59:02 +0200 Subject: dt-bindings: net: rename the bindings document for MediaTek STAR EMAC The driver itself was renamed before getting merged into mainline, but the binding document kept the old name. This makes both names consistent. Signed-off-by: Bartosz Golaszewski Acked-by: Rob Herring Signed-off-by: David S. Miller --- .../devicetree/bindings/net/mediatek,eth-mac.yaml | 89 ---------------------- .../bindings/net/mediatek,star-emac.yaml | 89 ++++++++++++++++++++++ 2 files changed, 89 insertions(+), 89 deletions(-) delete mode 100644 Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml create mode 100644 Documentation/devicetree/bindings/net/mediatek,star-emac.yaml diff --git a/Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml b/Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml deleted file mode 100644 index f85d91a9d6e5..000000000000 --- a/Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -%YAML 1.2 ---- -$id: http://devicetree.org/schemas/net/mediatek,eth-mac.yaml# -$schema: http://devicetree.org/meta-schemas/core.yaml# - -title: MediaTek STAR Ethernet MAC Controller - -maintainers: - - Bartosz Golaszewski - -description: - This Ethernet MAC is used on the MT8* family of SoCs from MediaTek. - It's compliant with 802.3 standards and supports half- and full-duplex - modes with flow-control as well as CRC offloading and VLAN tags. - -allOf: - - $ref: "ethernet-controller.yaml#" - -properties: - compatible: - enum: - - mediatek,mt8516-eth - - mediatek,mt8518-eth - - mediatek,mt8175-eth - - reg: - maxItems: 1 - - interrupts: - maxItems: 1 - - clocks: - minItems: 3 - maxItems: 3 - - clock-names: - additionalItems: false - items: - - const: core - - const: reg - - const: trans - - mediatek,pericfg: - $ref: /schemas/types.yaml#definitions/phandle - description: - Phandle to the device containing the PERICFG register range. This is used - to control the MII mode. - - mdio: - type: object - description: - Creates and registers an MDIO bus. - -required: - - compatible - - reg - - interrupts - - clocks - - clock-names - - mediatek,pericfg - - phy-handle - -examples: - - | - #include - #include - - ethernet: ethernet@11180000 { - compatible = "mediatek,mt8516-eth"; - reg = <0x11180000 0x1000>; - mediatek,pericfg = <&pericfg>; - interrupts = ; - clocks = <&topckgen CLK_TOP_RG_ETH>, - <&topckgen CLK_TOP_66M_ETH>, - <&topckgen CLK_TOP_133M_ETH>; - clock-names = "core", "reg", "trans"; - phy-handle = <ð_phy>; - phy-mode = "rmii"; - - mdio { - #address-cells = <1>; - #size-cells = <0>; - - eth_phy: ethernet-phy@0 { - reg = <0>; - }; - }; - }; diff --git a/Documentation/devicetree/bindings/net/mediatek,star-emac.yaml b/Documentation/devicetree/bindings/net/mediatek,star-emac.yaml new file mode 100644 index 000000000000..aea88e621792 --- /dev/null +++ b/Documentation/devicetree/bindings/net/mediatek,star-emac.yaml @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/mediatek,star-emac.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek STAR Ethernet MAC Controller + +maintainers: + - Bartosz Golaszewski + +description: + This Ethernet MAC is used on the MT8* family of SoCs from MediaTek. + It's compliant with 802.3 standards and supports half- and full-duplex + modes with flow-control as well as CRC offloading and VLAN tags. + +allOf: + - $ref: "ethernet-controller.yaml#" + +properties: + compatible: + enum: + - mediatek,mt8516-eth + - mediatek,mt8518-eth + - mediatek,mt8175-eth + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + minItems: 3 + maxItems: 3 + + clock-names: + additionalItems: false + items: + - const: core + - const: reg + - const: trans + + mediatek,pericfg: + $ref: /schemas/types.yaml#definitions/phandle + description: + Phandle to the device containing the PERICFG register range. This is used + to control the MII mode. + + mdio: + type: object + description: + Creates and registers an MDIO bus. + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + - mediatek,pericfg + - phy-handle + +examples: + - | + #include + #include + + ethernet: ethernet@11180000 { + compatible = "mediatek,mt8516-eth"; + reg = <0x11180000 0x1000>; + mediatek,pericfg = <&pericfg>; + interrupts = ; + clocks = <&topckgen CLK_TOP_RG_ETH>, + <&topckgen CLK_TOP_66M_ETH>, + <&topckgen CLK_TOP_133M_ETH>; + clock-names = "core", "reg", "trans"; + phy-handle = <ð_phy>; + phy-mode = "rmii"; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + eth_phy: ethernet-phy@0 { + reg = <0>; + }; + }; + }; -- cgit v1.2.3-59-g8ed1b From f0b37fa613989dacbaba57010681218ed91e989b Mon Sep 17 00:00:00 2001 From: Louis Peens Date: Thu, 28 May 2020 16:18:46 +0200 Subject: nfp: flower: fix incorrect flag assignment A previous refactoring missed some locations the flags were renamed but not moved from the previous flower_ext_feats to the new flower_en_feats variable. This lead to the FLOW_MERGE and LAG features not being enabled. Fixes: e09303d3c4d9 ("nfp: flower: renaming of feature bits") Signed-off-by: Louis Peens Signed-off-by: Simon Horman Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index d054553c75e0..ca7032d22196 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -708,7 +708,7 @@ static int nfp_flower_sync_feature_bits(struct nfp_app *app) err = nfp_rtsym_write_le(app->pf->rtbl, "_abi_flower_balance_sync_enable", 1); if (!err) { - app_priv->flower_ext_feats |= NFP_FL_ENABLE_LAG; + app_priv->flower_en_feats |= NFP_FL_ENABLE_LAG; nfp_flower_lag_init(&app_priv->nfp_lag); } else if (err == -ENOENT) { nfp_warn(app->cpp, "LAG not supported by FW.\n"); @@ -721,7 +721,7 @@ static int nfp_flower_sync_feature_bits(struct nfp_app *app) err = nfp_rtsym_write_le(app->pf->rtbl, "_abi_flower_merge_hint_enable", 1); if (!err) { - app_priv->flower_ext_feats |= NFP_FL_ENABLE_FLOW_MERGE; + app_priv->flower_en_feats |= NFP_FL_ENABLE_FLOW_MERGE; nfp_flower_internal_port_init(app_priv); } else if (err == -ENOENT) { nfp_warn(app->cpp, @@ -840,7 +840,7 @@ static int nfp_flower_init(struct nfp_app *app) return 0; err_cleanup: - if (app_priv->flower_ext_feats & NFP_FL_ENABLE_LAG) + if (app_priv->flower_en_feats & NFP_FL_ENABLE_LAG) nfp_flower_lag_cleanup(&app_priv->nfp_lag); nfp_flower_metadata_cleanup(app); err_free_app_priv: -- cgit v1.2.3-59-g8ed1b From fd55199d3b762f9555c66a1bc4bf6eac3901fc2f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 28 May 2020 23:43:24 +0200 Subject: net: ethtool: cabletest: Make ethnl_act_cable_test_tdr_cfg static kbuild test robot is reporting: net/ethtool/cabletest.c:230:5: warning: no previous prototype for Mark the function as static. Reported-by: kbuild test robot Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ethtool/cabletest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 9991688d7d1d..7b7a0456c15c 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -227,9 +227,9 @@ cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { }; /* CABLE_TEST_TDR_ACT */ -int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, - struct genl_info *info, - struct phy_tdr_config *cfg) +static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, + struct genl_info *info, + struct phy_tdr_config *cfg) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; int ret; -- cgit v1.2.3-59-g8ed1b From bc183dec08f9cb177cf5206a010b7a9e7b22e567 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 May 2020 00:01:52 +0200 Subject: tcp: tcp_init_buffer_space can be static As of commit 98fa6271cfcb ("tcp: refactor setting the initial congestion window") this is called only from tcp_input.c, so it can be static. Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 66e4b8331850..bca761ffa25f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -662,7 +662,6 @@ void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -void tcp_init_buffer_space(struct sock *sk); static inline void tcp_bound_rto(const struct sock *sk) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ad90102f5dfb..83330a6cb242 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -437,7 +437,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* 3. Try to fixup all. It is made immediately after connection enters * established state. */ -void tcp_init_buffer_space(struct sock *sk) +static void tcp_init_buffer_space(struct sock *sk) { int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3-59-g8ed1b From 56e2287b4110fbb81456a346b1d4c12bee7cf044 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 May 2020 15:44:09 +0200 Subject: mlx5: fix xdp data_meta setup in mlx5e_fill_xdp_buff The helper function xdp_set_data_meta_invalid() must be called after setting xdp->data as it depends on it. The bug was introduced in the cited patch below, and cause the kernel to crash when using BPF helper bpf_xdp_adjust_head() on mlx5 driver. Fixes: 39d6443c8daf ("mlx5, xsk: Migrate to new MEM_TYPE_XSK_BUFF_POOL") Reported-by: David Ahern Signed-off-by: Jesper Dangaard Brouer Tested-by: David Ahern Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 6b3c82da199c..dbb1c6323967 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1056,8 +1056,8 @@ static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, u32 len, struct xdp_buff *xdp) { xdp->data_hard_start = va; - xdp_set_data_meta_invalid(xdp); xdp->data = va + headroom; + xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; xdp->rxq = &rq->xdp_rxq; xdp->frame_sz = rq->buff.frame0_sz; -- cgit v1.2.3-59-g8ed1b From 2950d1d64fd035726b4b060313f931ed52e3615f Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 26 May 2020 22:09:09 -0700 Subject: net/mlx5: Kconfig: Fix spelling typo "mdoe"->"mode" Fixes: d956873f908c ("net/mlx5e: Introduce kconfig var for TC support") Signed-off-by: Saeed Mahameed Reported-by: Marcelo Ricardo Leitner --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 4256d59eca2b..b6ffd1622cfd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -84,7 +84,7 @@ config MLX5_CLS_ACT default y help mlx5 ConnectX offloads support for TC classifier action (NET_CLS_ACT), - works in both native NIC mdoe and Switchdev SRIOV mode. + works in both native NIC mode and Switchdev SRIOV mode. Actions get attached to a Hardware offloaded classifiers and are invoked after a successful classification. Actions are used to overwrite the classification result, instantly drop or redirect and/or -- cgit v1.2.3-59-g8ed1b From 2861904697de587c5b92c65a18a44ca3bbfd00ed Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 27 May 2020 00:50:22 -0700 Subject: net/mlx5e: Don't use err uninitialized in mlx5e_attach_decap Clang warns: drivers/net/ethernet/mellanox/mlx5/core/en_tc.c:3712:6: warning: variable 'err' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized] if (IS_ERR(d->pkt_reformat)) { ^~~~~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c:3718:6: note: uninitialized use occurs here if (err) ^~~ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c:3712:2: note: remove the 'if' if its condition is always true if (IS_ERR(d->pkt_reformat)) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c:3670:9: note: initialize the variable 'err' to silence this warning int err; ^ = 0 1 warning generated. It is not wrong, err is only ever initialized in if statements but this one is not in one. Initialize err to 0 to fix this. Fixes: 14e6b038afa0 ("net/mlx5e: Add support for hw decapsulation of MPLS over UDP") Link: https://github.com/ClangBuiltLinux/linux/issues/1037 Signed-off-by: Nathan Chancellor Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 0f119c08b835..ac19a61c5cbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3667,7 +3667,7 @@ static int mlx5e_attach_decap(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d; struct mlx5e_decap_key key; uintptr_t hash_key; - int err; + int err = 0; parse_attr = attr->parse_attr; if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) { -- cgit v1.2.3-59-g8ed1b From e1167e16114f78f948078749aa1608a785f65807 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 Apr 2020 23:23:47 +0200 Subject: net/mlx5: reduce stack usage in qp_read_field Moving the mlx5_ifc_query_qp_out_bits structure on the stack was a bit excessive and now causes the compiler to complain on 32-bit architectures: drivers/net/ethernet/mellanox/mlx5/core/debugfs.c: In function 'qp_read_field': drivers/net/ethernet/mellanox/mlx5/core/debugfs.c:274:1: error: the frame size of 1104 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] Revert the previous patch partially to use dynamically allocation as the code did before. Unfortunately there is no good error handling in case the allocation fails. Fixes: 57a6c5e992f5 ("net/mlx5: Replace hand written QP context struct with automatic getters") Signed-off-by: Arnd Bergmann Acked-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 6409090b3ec5..d2d57213511b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -202,18 +202,23 @@ void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev) static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, int index, int *is_str) { - u32 out[MLX5_ST_SZ_BYTES(query_qp_out)] = {}; + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {}; u64 param = 0; + u32 *out; int state; u32 *qpc; int err; + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return 0; + MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); MLX5_SET(query_qp_in, in, qpn, qp->qpn); err = mlx5_cmd_exec_inout(dev, query_qp, in, out); if (err) - return 0; + goto out; *is_str = 0; @@ -269,7 +274,8 @@ static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, param = MLX5_GET(qpc, qpc, remote_qpn); break; } - +out: + kfree(out); return param; } -- cgit v1.2.3-59-g8ed1b From 86ae579cefffe18cb08928505d90fbc87367e8f5 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 27 May 2020 08:35:03 +0300 Subject: net: Make mpls_entry_encode() available for generic users Move mpls_entry_encode() from net/mpls/internal.h to include/net/mpls.h and make it available for other users. Specifically, hardware driver that offload MPLS can benefit from that. Suggested-by: Jakub Kicinski Suggested-by: David Ahern Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed --- include/net/mpls.h | 17 +++++++++++++++++ net/mpls/internal.h | 11 ----------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/net/mpls.h b/include/net/mpls.h index ccaf238e8ea7..0bb7944e7b08 100644 --- a/include/net/mpls.h +++ b/include/net/mpls.h @@ -8,6 +8,7 @@ #include #include +#include #define MPLS_HLEN 4 @@ -25,4 +26,20 @@ static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) { return (struct mpls_shim_hdr *)skb_network_header(skb); } + +static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, + unsigned int ttl, + unsigned int tc, + bool bos) +{ + struct mpls_shim_hdr result; + + result.label_stack_entry = + cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | + (tc << MPLS_LS_TC_SHIFT) | + (bos ? (1 << MPLS_LS_S_SHIFT) : 0) | + (ttl << MPLS_LS_TTL_SHIFT)); + return result; +} + #endif diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 0e9aa94adc07..838cdfc10e47 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -172,17 +172,6 @@ struct mpls_route { /* next hop label forwarding entry */ #define endfor_nexthops(rt) } -static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) -{ - struct mpls_shim_hdr result; - result.label_stack_entry = - cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | - (tc << MPLS_LS_TC_SHIFT) | - (bos ? (1 << MPLS_LS_S_SHIFT) : 0) | - (ttl << MPLS_LS_TTL_SHIFT)); - return result; -} - static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) { struct mpls_entry_decoded result; -- cgit v1.2.3-59-g8ed1b From f7e3ac424a2b3fc6fb3b490106cd874adb62ae4a Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 27 May 2020 08:38:03 +0300 Subject: net/mlx5e: Use generic API to build MPLS label Make use of generic API mpls_entry_encode() to build mpls label and get rid of local function. Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c index b4a3c96d34fd..1f9526244222 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c @@ -25,35 +25,21 @@ static int init_encap_attr(struct net_device *tunnel_dev, return 0; } -static inline __be32 mpls_label_id_field(__be32 label, u8 tos, u8 ttl) -{ - u32 res; - - /* mpls label is 32 bits long and construction as follows: - * 20 bits label - * 3 bits tos - * 1 bit bottom of stack. Since we support only one label, this bit is - * always set. - * 8 bits TTL - */ - res = be32_to_cpu(label) << 12 | 1 << 8 | (tos & 7) << 9 | ttl; - return cpu_to_be32(res); -} - static int generate_ip_tun_hdr(char buf[], __u8 *ip_proto, struct mlx5e_encap_entry *r) { const struct ip_tunnel_key *tun_key = &r->tun_info->key; - __be32 tun_id = tunnel_id_to_key32(tun_key->tun_id); struct udphdr *udp = (struct udphdr *)(buf); struct mpls_shim_hdr *mpls; + u32 tun_id; + tun_id = be32_to_cpu(tunnel_id_to_key32(tun_key->tun_id)); mpls = (struct mpls_shim_hdr *)(udp + 1); *ip_proto = IPPROTO_UDP; udp->dest = tun_key->tp_dst; - mpls->label_stack_entry = mpls_label_id_field(tun_id, tun_key->tos, tun_key->ttl); + *mpls = mpls_entry_encode(tun_id, tun_key->ttl, tun_key->tos, true); return 0; } -- cgit v1.2.3-59-g8ed1b From 618f88c4c40a9621a3105f3ff957a91a148e7d94 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2020 01:02:08 -0700 Subject: net/mlx5: DR: Fix incorrect type in argument HW spec objects should receive a void ptr to work on, the MLX5_SET/GET macro will know how to handle it. No need to provide explicit or wrong pointer type in this case. warning: incorrect type in argument 1 (different base types) expected unsigned long long const [usertype] *sw_action got restricted __be64 [usertype] *[assigned] sw_action Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 554811de4c9d..df1363a34a42 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -1662,7 +1662,7 @@ dr_action_modify_check_field_limitation(struct mlx5dr_action *action, } static bool -dr_action_modify_check_is_ttl_modify(const u64 *sw_action) +dr_action_modify_check_is_ttl_modify(const void *sw_action) { u16 sw_field = MLX5_GET(set_action_in, sw_action, field); -- cgit v1.2.3-59-g8ed1b From c2ba2c2287698bac36bf71e5c4f3be423371bee0 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2020 01:11:37 -0700 Subject: net/mlx5: DR: Fix cast to restricted __be32 raw_ip actual type is __be32 and not u32. Fix that and get rid of the warning. drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c:906:31: warning: cast to restricted __be32 Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index 78c884911ceb..470895016693 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -869,7 +869,7 @@ static void dr_ste_copy_mask_misc(char *mask, struct mlx5dr_match_misc *spec) static void dr_ste_copy_mask_spec(char *mask, struct mlx5dr_match_spec *spec) { - u32 raw_ip[4]; + __be32 raw_ip[4]; spec->smac_47_16 = MLX5_GET(fte_match_set_lyr_2_4, mask, smac_47_16); -- cgit v1.2.3-59-g8ed1b From 9ff2e92c466dc1aa4d970e5027dfd66b1f32b7bc Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2020 01:14:31 -0700 Subject: net/mlx5: DR: Fix incorrect type in return expression dr_ste_crc32_calc() calculates crc32 and should return it in HW format. It is being used to calculate a u32 index, hence we force the return value of u32 to avoid the sparse warning: drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c:115:16: warning: incorrect type in return expression (different base types) expected unsigned int got restricted __be32 [usertype] Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index 470895016693..00c2f598f034 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -112,7 +112,7 @@ static u32 dr_ste_crc32_calc(const void *input_data, size_t length) { u32 crc = crc32(0, input_data, length); - return htonl(crc); + return (__force u32)htonl(crc); } u32 mlx5dr_ste_calc_hash_index(u8 *hw_ste_p, struct mlx5dr_ste_htbl *htbl) -- cgit v1.2.3-59-g8ed1b From 2553f421f44f4db7579f202b79b69046b579c7b5 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 27 May 2020 23:16:02 -0700 Subject: net/mlx5: cmd: Fix memset with byte count warning Fix sparse warning: drivers/net/ethernet/mellanox/mlx5/core/cmd.c:1949:15: warning: memset with byte count of 271720 mlx5_cmd_stats array is too big to be held inline in mlx5_cmd. Allocate it separately. Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 20 ++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 2 +- include/linux/mlx5/driver.h | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index eca159e8e123..1d91a0d0ab1d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1072,7 +1072,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, ds = ent->ts2 - ent->ts1; op = MLX5_GET(mbox_in, in->first.data, opcode); - if (op < ARRAY_SIZE(cmd->stats)) { + if (op < MLX5_CMD_OP_MAX) { stats = &cmd->stats[op]; spin_lock_irq(&stats->lock); stats->sum += ds; @@ -1551,7 +1551,7 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force if (ent->callback) { ds = ent->ts2 - ent->ts1; - if (ent->op < ARRAY_SIZE(cmd->stats)) { + if (ent->op < MLX5_CMD_OP_MAX) { stats = &cmd->stats[ent->op]; spin_lock_irqsave(&stats->lock, flags); stats->sum += ds; @@ -1960,10 +1960,16 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) return -EINVAL; } - cmd->pool = dma_pool_create("mlx5_cmd", dev->device, size, align, 0); - if (!cmd->pool) + cmd->stats = kvzalloc(MLX5_CMD_OP_MAX * sizeof(*cmd->stats), GFP_KERNEL); + if (!cmd->stats) return -ENOMEM; + cmd->pool = dma_pool_create("mlx5_cmd", dev->device, size, align, 0); + if (!cmd->pool) { + err = -ENOMEM; + goto dma_pool_err; + } + err = alloc_cmd_page(dev, cmd); if (err) goto err_free_pool; @@ -1999,7 +2005,7 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) spin_lock_init(&cmd->alloc_lock); spin_lock_init(&cmd->token_lock); - for (i = 0; i < ARRAY_SIZE(cmd->stats); i++) + for (i = 0; i < MLX5_CMD_OP_MAX; i++) spin_lock_init(&cmd->stats[i].lock); sema_init(&cmd->sem, cmd->max_reg_cmds); @@ -2046,7 +2052,8 @@ err_free_page: err_free_pool: dma_pool_destroy(cmd->pool); - +dma_pool_err: + kvfree(cmd->stats); return err; } EXPORT_SYMBOL(mlx5_cmd_init); @@ -2060,6 +2067,7 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) destroy_msg_cache(dev); free_cmd_page(dev, cmd); dma_pool_destroy(cmd->pool); + kvfree(cmd->stats); } EXPORT_SYMBOL(mlx5_cmd_cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index d2d57213511b..07c8d9811bc8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -171,7 +171,7 @@ void mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev) cmd = &dev->priv.cmdif_debugfs; *cmd = debugfs_create_dir("commands", dev->priv.dbg_root); - for (i = 0; i < ARRAY_SIZE(dev->cmd.stats); i++) { + for (i = 0; i < MLX5_CMD_OP_MAX; i++) { stats = &dev->cmd.stats[i]; namep = mlx5_command_str(i); if (strcmp(namep, "unknown command opcode")) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6aa6bbd60559..13c0e4556eda 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -298,7 +298,7 @@ struct mlx5_cmd { struct mlx5_cmd_debug dbg; struct cmd_msg_cache cache[MLX5_NUM_COMMAND_CACHES]; int checksum_disabled; - struct mlx5_cmd_stats stats[MLX5_CMD_OP_MAX]; + struct mlx5_cmd_stats *stats; }; struct mlx5_port_caps { -- cgit v1.2.3-59-g8ed1b From aee3e9c457f172870bdb87e675faf6c4528190b1 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2020 18:42:40 -0700 Subject: net/mlx5: Accel: fpga tls fix cast to __be64 and incorrect argument types tls handle and rcd_sn are actually big endian and not in host format. Fix that. Fix the following sparse warnings: drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c:177:21: warning: cast to restricted __be64 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c:178:52: warning: incorrect type in argument 2 (different base types) expected unsigned int [usertype] handle got restricted __be32 [usertype] handle Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c index cab708af3422..cbf3d76c05a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c @@ -56,8 +56,8 @@ void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, mlx5_fpga_tls_del_flow(mdev, swid, GFP_KERNEL, direction_sx); } -int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq, - u64 rcd_sn) +int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn) { return mlx5_fpga_tls_resync_rx(mdev, handle, seq, rcd_sn); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h index e09bc3858d57..aefea467f7b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h @@ -109,8 +109,8 @@ int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, bool direction_sx); void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, bool direction_sx); -int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq, - u64 rcd_sn); +int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn); bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev); u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev); int mlx5_accel_tls_init(struct mlx5_core_dev *mdev); @@ -125,8 +125,8 @@ mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, bool direction_sx) { return -ENOTSUPP; } static inline void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, bool direction_sx) { } -static inline int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, - u32 seq, u64 rcd_sn) { return 0; } +static inline int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn) { return 0; } static inline bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) { return mlx5_accel_is_ktls_device(mdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c index c27e9a609d51..1fbb5a90cb38 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c @@ -167,7 +167,7 @@ static int mlx5e_tls_resync(struct net_device *netdev, struct sock *sk, struct tls_context *tls_ctx = tls_get_ctx(sk); struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5e_tls_offload_context_rx *rx_ctx; - u64 rcd_sn = *(u64 *)rcd_sn_data; + __be64 rcd_sn = *(__be64 *)rcd_sn_data; if (WARN_ON_ONCE(direction != TLS_OFFLOAD_CTX_DIR_RX)) return -EINVAL; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c index 22a2ef111514..29b7339ebfa3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c @@ -194,8 +194,8 @@ static void mlx5_fpga_tls_flow_to_cmd(void *flow, void *cmd) MLX5_GET(tls_flow, flow, direction_sx)); } -int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq, - u64 rcd_sn) +int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn) { struct mlx5_fpga_dma_buf *buf; int size = sizeof(*buf) + MLX5_TLS_COMMAND_SIZE; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h index 3b2e37bf76fe..5714cf391d1b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h @@ -68,7 +68,7 @@ static inline u32 mlx5_fpga_tls_device_caps(struct mlx5_core_dev *mdev) return mdev->fpga->tls->caps; } -int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq, - u64 rcd_sn); +int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn); #endif /* __MLX5_FPGA_TLS_H__ */ -- cgit v1.2.3-59-g8ed1b From 44345c4c130ee3df9b9fbc366d59ab3ac707d7f8 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 29 May 2020 00:47:12 -0700 Subject: net/mlx5: IPSec: Fix incorrect type for spi spi is __be32, fix that. Fixes sparse warning: drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c:74:64 warning: incorrect type Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index b919d143a9a6..96ebaa94a92e 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -76,7 +76,7 @@ struct aes_gcm_keymat { struct mlx5_accel_esp_xfrm_attrs { enum mlx5_accel_esp_action action; u32 esn; - u32 spi; + __be32 spi; u32 seq; u32 tfc_pad; u32 flags; -- cgit v1.2.3-59-g8ed1b From c51323ee7ab4132c80db198b7d0956fef957e6ab Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 27 May 2020 23:41:03 -0700 Subject: net/mlx5e: en_tc: Fix incorrect type in initializer warnings Fix some trivial warnings of the type: warning: incorrect type in initializer (different base types) Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index ac19a61c5cbc..e866f209f252 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1873,7 +1873,7 @@ enc_opts_is_dont_care_or_full_match(struct mlx5e_priv *priv, memchr_inv(opt->opt_data, 0, opt->length * 4)) { *dont_care = false; - if (opt->opt_class != U16_MAX || + if (opt->opt_class != htons(U16_MAX) || opt->type != U8_MAX) { NL_SET_ERR_MSG(extack, "Partial match of tunnel options in chain > 0 isn't supported"); -- cgit v1.2.3-59-g8ed1b From 58ff18e12c9b3bb860b32e9cac4dc8e12aec2695 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 28 May 2020 00:22:12 -0700 Subject: net/mlx5e: en_tc: Fix cast to restricted __be32 warning Fixes sparse warnings: warning: cast to restricted __be32 warning: restricted __be32 degrades to integer Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index e866f209f252..3ce177c24d52 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -210,8 +210,8 @@ mlx5e_tc_match_to_reg_match(struct mlx5_flow_spec *spec, fmask = headers_c + soffset; fval = headers_v + soffset; - mask = cpu_to_be32(mask) >> (32 - (match_len * 8)); - data = cpu_to_be32(data) >> (32 - (match_len * 8)); + mask = (__force u32)(cpu_to_be32(mask)) >> (32 - (match_len * 8)); + data = (__force u32)(cpu_to_be32(data)) >> (32 - (match_len * 8)); memcpy(fmask, &mask, match_len); memcpy(fval, &data, match_len); @@ -2815,10 +2815,10 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, continue; if (f->field_bsize == 32) { - mask_be32 = (__be32)mask; + mask_be32 = (__force __be32)(mask); mask = (__force unsigned long)cpu_to_le32(be32_to_cpu(mask_be32)); } else if (f->field_bsize == 16) { - mask_be32 = (__be32)mask; + mask_be32 = (__force __be32)(mask); mask_be16 = *(__be16 *)&mask_be32; mask = (__force unsigned long)cpu_to_le16(be16_to_cpu(mask_be16)); } -- cgit v1.2.3-59-g8ed1b From eb24387183d37f2f4f456654ef92679b1556f8df Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 29 May 2020 00:36:10 -0700 Subject: net/mlx5e: Make mlx5e_dcbnl_ops static Fix sparse warning: drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c:988:29: error: symbol 'mlx5e_dcbnl_ops' was not declared. Should it be static? Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index ec7b332d74c2..bc102d094bbd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -985,7 +985,7 @@ static int mlx5e_dcbnl_setbuffer(struct net_device *dev, return err; } -const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = { +static const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = { .ieee_getets = mlx5e_dcbnl_ieee_getets, .ieee_setets = mlx5e_dcbnl_ieee_setets, .ieee_getmaxrate = mlx5e_dcbnl_ieee_getmaxrate, -- cgit v1.2.3-59-g8ed1b From c01c320d24ac42802ee5e6db5342477d64a23e8f Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sun, 26 Apr 2020 11:47:32 +0300 Subject: ath9k: Set RX filter based to allow broadcast Action frame RX Advertise support for multicast frame registration and update the RX filter based on the recently added FIF_MCAST_ACTION to allow broadcast Action frames to be received. This is needed for Device Provisioning Protocol (DPP) use cases that use broadcast Public Action frames. Signed-off-by: Jouni Malinen Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200426084733.7889-1-jouni@codeaurora.org --- drivers/net/wireless/ath/ath9k/init.c | 2 ++ drivers/net/wireless/ath/ath9k/main.c | 1 + drivers/net/wireless/ath/ath9k/recv.c | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index 17c318902cb8..289a2444d534 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -1012,6 +1012,8 @@ static void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw) wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS); + wiphy_ext_feature_set(hw->wiphy, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS); } int ath9k_init_device(u16 devid, struct ath_softc *sc, diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index 457e9b0d21ca..a47f6e978095 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -1476,6 +1476,7 @@ static int ath9k_config(struct ieee80211_hw *hw, u32 changed) FIF_OTHER_BSS | \ FIF_BCN_PRBRESP_PROMISC | \ FIF_PROBE_REQ | \ + FIF_MCAST_ACTION | \ FIF_FCSFAIL) /* FIXME: sc->sc_full_reset ? */ diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c index 06e660858766..0c0624a3b40d 100644 --- a/drivers/net/wireless/ath/ath9k/recv.c +++ b/drivers/net/wireless/ath/ath9k/recv.c @@ -413,7 +413,8 @@ u32 ath_calcrxfilter(struct ath_softc *sc) if (sc->cur_chandef.width != NL80211_CHAN_WIDTH_20_NOHT) rfilt |= ATH9K_RX_FILTER_COMP_BAR; - if (sc->cur_chan->nvifs > 1 || (sc->cur_chan->rxfilter & FIF_OTHER_BSS)) { + if (sc->cur_chan->nvifs > 1 || + (sc->cur_chan->rxfilter & (FIF_OTHER_BSS | FIF_MCAST_ACTION))) { /* This is needed for older chips */ if (sc->sc_ah->hw_version.macVersion <= AR_SREV_VERSION_9160) rfilt |= ATH9K_RX_FILTER_PROM; -- cgit v1.2.3-59-g8ed1b From 2a9311b5d39ab7e60e612d75fd08c78e358caf4d Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sun, 26 Apr 2020 11:47:33 +0300 Subject: ath9k_htc: Set RX filter based to allow broadcast Action frame RX Advertise support for multicast frame registration and update the RX filter based on the recently added FIF_MCAST_ACTION to allow broadcast Action frames to be received. This is needed for Device Provisioning Protocol (DPP) use cases that use broadcast Public Action frames. Signed-off-by: Jouni Malinen Reported-by: kbuild test robot Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200426084733.7889-2-jouni@codeaurora.org --- drivers/net/wireless/ath/ath9k/htc_drv_init.c | 2 ++ drivers/net/wireless/ath/ath9k/htc_drv_main.c | 1 + drivers/net/wireless/ath/ath9k/htc_drv_txrx.c | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_init.c b/drivers/net/wireless/ath/ath9k/htc_drv_init.c index 40a065028ebe..1d6ad8d46607 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_init.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_init.c @@ -780,6 +780,8 @@ static void ath9k_set_hw_capab(struct ath9k_htc_priv *priv, SET_IEEE80211_PERM_ADDR(hw, common->macaddr); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); + wiphy_ext_feature_set(hw->wiphy, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS); } static int ath9k_init_firmware_version(struct ath9k_htc_priv *priv) diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c index 791f6633667c..2b7832b1c800 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c @@ -1251,6 +1251,7 @@ out: FIF_OTHER_BSS | \ FIF_BCN_PRBRESP_PROMISC | \ FIF_PROBE_REQ | \ + FIF_MCAST_ACTION | \ FIF_FCSFAIL) static void ath9k_htc_configure_filter(struct ieee80211_hw *hw, diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c b/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c index 118e5550b10c..b353995bdd45 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c @@ -893,7 +893,8 @@ u32 ath9k_htc_calcrxfilter(struct ath9k_htc_priv *priv) if (priv->rxfilter & FIF_PSPOLL) rfilt |= ATH9K_RX_FILTER_PSPOLL; - if (priv->nvifs > 1 || priv->rxfilter & FIF_OTHER_BSS) + if (priv->nvifs > 1 || + priv->rxfilter & (FIF_OTHER_BSS | FIF_MCAST_ACTION)) rfilt |= ATH9K_RX_FILTER_MCAST_BCAST_ALL; return rfilt; -- cgit v1.2.3-59-g8ed1b From 23cc6bb5a2e6a9220c075824c8f68a91a633b547 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 28 May 2020 12:21:09 -0700 Subject: ath10k: Remove ath10k_qmi_register_service_notifier() declaration The ath10k/qmi.h header file contains a declaration for the function ath10k_qmi_register_service_notifier(). This function doesn't exist. Remove the declaration. This patch is a no-op and was just found by code inspection. Fixes: ba94c753ccb4 ("ath10k: add QMI message handshake for wcn3990 client") Signed-off-by: Douglas Anderson Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20200528122105.1.I31937dce728b441fd72cbe23447bc4710fd56ddb@changeid --- drivers/net/wireless/ath/ath10k/qmi.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/qmi.h b/drivers/net/wireless/ath/ath10k/qmi.h index 16190511318d..89464239fe96 100644 --- a/drivers/net/wireless/ath/ath10k/qmi.h +++ b/drivers/net/wireless/ath/ath10k/qmi.h @@ -115,7 +115,6 @@ int ath10k_qmi_wlan_enable(struct ath10k *ar, enum wlfw_driver_mode_enum_v01 mode, const char *version); int ath10k_qmi_wlan_disable(struct ath10k *ar); -int ath10k_qmi_register_service_notifier(struct notifier_block *nb); int ath10k_qmi_init(struct ath10k *ar, u32 msa_size); int ath10k_qmi_deinit(struct ath10k *ar); int ath10k_qmi_set_fw_log_mode(struct ath10k *ar, u8 fw_log_mode); -- cgit v1.2.3-59-g8ed1b From 9529cba988b74091b3a67916b1c119f5b189b8b6 Mon Sep 17 00:00:00 2001 From: Muna Sinada Date: Thu, 28 May 2020 15:54:43 -0700 Subject: ath11k: reset trigger frame MAC padding duration The value was 3 and it's reserved value. Corrected to maintain fw defaults. Signed-off-by: Muna Sinada Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1590706483-27609-1-git-send-email-msinada@codeaurora.org --- drivers/net/wireless/ath/ath11k/mac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 1a7e5817e5c8..00e5aac3deea 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -3562,7 +3562,7 @@ static int ath11k_mac_copy_he_cap(struct ath11k *ar, memcpy(he_cap_elem->phy_cap_info, band_cap->he_cap_phy_info, sizeof(he_cap_elem->phy_cap_info)); - he_cap_elem->mac_cap_info[1] |= + he_cap_elem->mac_cap_info[1] &= IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK; he_cap_elem->phy_cap_info[4] &= ~IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK; -- cgit v1.2.3-59-g8ed1b From 37b76986ebd72aae2c613b94b805e67eaea558b6 Mon Sep 17 00:00:00 2001 From: Muna Sinada Date: Thu, 28 May 2020 16:10:17 -0700 Subject: ath11k: clear DCM max constellation tx value According to 11ax spec. draft 4.0. DCM Max Constellation Tx data field should be set to "Reserved" for an AP, therefore bit is cleared. Signed-off-by: Muna Sinada Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1590707417-29672-1-git-send-email-msinada@codeaurora.org --- drivers/net/wireless/ath/ath11k/mac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 00e5aac3deea..2836a0f197ab 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -3578,6 +3578,8 @@ static int ath11k_mac_copy_he_cap(struct ath11k *ar, switch (i) { case NL80211_IFTYPE_AP: + he_cap_elem->phy_cap_info[3] &= + ~IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK; he_cap_elem->phy_cap_info[9] |= IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU; break; -- cgit v1.2.3-59-g8ed1b From 1f4982ef56f794101cae7ec0fa3b7605f78bd25f Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Sat, 30 May 2020 09:08:27 +0800 Subject: net: hns3: fix a print format issue in hclge_mac_mdio_config() Use %d to print int variable 'ret' in hclge_mac_mdio_config(). Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c index 696c5ae922e3..e89820702540 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c @@ -155,7 +155,7 @@ int hclge_mac_mdio_config(struct hclge_dev *hdev) ret = mdiobus_register(mdio_bus); if (ret) { dev_err(mdio_bus->parent, - "Failed to register MDIO bus ret = %#x\n", ret); + "failed to register MDIO bus, ret = %d\n", ret); return ret; } -- cgit v1.2.3-59-g8ed1b From d62805087e7fbbd7582403b972dd75581256e585 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Sat, 30 May 2020 09:08:28 +0800 Subject: net: hns3: remove an unused macro hclge_is_csq Macro hclge_is_csq defined in hcgle_cmd.c has not been used, so remove it. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index 64a1d0bdd7d1..1d6c328bd9fb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -11,8 +11,6 @@ #include "hnae3.h" #include "hclge_main.h" -#define hclge_is_csq(ring) ((ring)->flag & HCLGE_TYPE_CSQ) - #define cmq_ring_to_dev(ring) (&(ring)->dev->pdev->dev) static int hclge_ring_space(struct hclge_cmq_ring *ring) -- cgit v1.2.3-59-g8ed1b From fc68aed15648c70db0377a6abd2b34ec328dd12a Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Sat, 30 May 2020 09:08:29 +0800 Subject: net: hns3: remove two unused macros in hclgevf_cmd.c Macro hclgevf_ring_to_dma_dir and hclgevf_is_csq defined in hclgevf_cmd.c, but not used, so remove them. Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c index f38d236ebf4f..fec65239a3c8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c @@ -11,9 +11,6 @@ #include "hclgevf_main.h" #include "hnae3.h" -#define hclgevf_is_csq(ring) ((ring)->flag & HCLGEVF_TYPE_CSQ) -#define hclgevf_ring_to_dma_dir(ring) (hclgevf_is_csq(ring) ? \ - DMA_TO_DEVICE : DMA_FROM_DEVICE) #define cmq_ring_to_dev(ring) (&(ring)->dev->pdev->dev) static int hclgevf_ring_space(struct hclgevf_cmq_ring *ring) -- cgit v1.2.3-59-g8ed1b From ec4d9392207aad5db32cc518c4a5c8b7f1057fa1 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Sat, 30 May 2020 09:08:30 +0800 Subject: net: hns3: fix an incorrect comment for num_tqps in struct hclgevf_dev struct hclgevf_dev stands for VF device, its field num_tqps indicates the number of VF's task queue pairs, so the comment is incorrect, replace 'PF' with 'VF'. Reported-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 738de124cfc4..c1fac8920ae3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -278,7 +278,7 @@ struct hclgevf_dev { struct semaphore reset_sem; /* protect reset process */ u32 fw_version; - u16 num_tqps; /* num task queue pairs of this PF */ + u16 num_tqps; /* num task queue pairs of this VF */ u16 alloc_rss_size; /* allocated RSS task queue */ u16 rss_size_max; /* HW defined max RSS task queue */ -- cgit v1.2.3-59-g8ed1b From 2adb8187e5439e5066c9893586e5079e89f9060a Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Sat, 30 May 2020 09:08:31 +0800 Subject: net: hns3: fix two coding style issues in hclgevf_main.c Remove a redundant blank line in hclgevf_cmd_set_promisc_mode(), and fix a reverse xmas tree coding style issue in hclgevf_set_rss_tc_mode(). Reported-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index a8c0e79901f5..1b9578d0bd80 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -669,8 +669,8 @@ static int hclgevf_set_rss_tc_mode(struct hclgevf_dev *hdev, u16 rss_size) u16 tc_size[HCLGEVF_MAX_TC_NUM]; struct hclgevf_desc desc; u16 roundup_size; - int status; unsigned int i; + int status; req = (struct hclgevf_rss_tc_mode_cmd *)desc.data; @@ -1143,7 +1143,6 @@ static int hclgevf_cmd_set_promisc_mode(struct hclgevf_dev *hdev, send_msg.en_mc = en_mc_pmc ? 1 : 0; ret = hclgevf_send_mbx_msg(hdev, &send_msg, false, NULL, 0); - if (ret) dev_err(&hdev->pdev->dev, "Set promisc mode fail, status is %d.\n", ret); -- cgit v1.2.3-59-g8ed1b From 996aade998ac0e9f6f0bf09531c32f1106d9d559 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Sat, 30 May 2020 09:08:32 +0800 Subject: net: hns3: remove some unused codes in hns3_nic_set_features() NETIF_F_HW_VLAN_CTAG_FILTER is not set in netdev->hw_feature for the HNS3 driver, so the handler of NETIF_F_HW_VLAN_CTAG_FILTER in hns3_nic_set_features() won't be called, remove it. Reported-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 9fe40c7773b4..b14f2abc2425 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1544,12 +1544,6 @@ static int hns3_nic_set_features(struct net_device *netdev, return ret; } - if ((changed & NETIF_F_HW_VLAN_CTAG_FILTER) && - h->ae_algo->ops->enable_vlan_filter) { - enable = !!(features & NETIF_F_HW_VLAN_CTAG_FILTER); - h->ae_algo->ops->enable_vlan_filter(h, enable); - } - if ((changed & NETIF_F_HW_VLAN_CTAG_RX) && h->ae_algo->ops->enable_hw_strip_rxvtag) { enable = !!(features & NETIF_F_HW_VLAN_CTAG_RX); -- cgit v1.2.3-59-g8ed1b From 53bd63afbd659017d20dfb7ac3a53ceb8cbb338a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 30 May 2020 13:29:52 +0300 Subject: net: dsa: sja1105: suppress -Wmissing-prototypes in sja1105_static_config.c Newer compilers complain with W=1 builds that there are non-static functions defined in sja1105_static_config.c that don't have a prototype, because their prototype is defined in sja1105.h which this translation unit does not include. I don't entirely understand what is the point of these warnings, since in principle there's nothing wrong with that. But let's move the prototypes to a header file that _is_ included by sja1105_static_config.c, since that will make these warnings go away. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105.h | 18 ------------------ drivers/net/dsa/sja1105/sja1105_static_config.h | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index cb3c81a49fbc..29ed21687295 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -323,24 +323,6 @@ int sja1105pqrs_fdb_add(struct dsa_switch *ds, int port, int sja1105pqrs_fdb_del(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); -/* Common implementations for the static and dynamic configs */ -size_t sja1105_l2_forwarding_entry_packing(void *buf, void *entry_ptr, - enum packing_op op); -size_t sja1105pqrs_l2_lookup_entry_packing(void *buf, void *entry_ptr, - enum packing_op op); -size_t sja1105et_l2_lookup_entry_packing(void *buf, void *entry_ptr, - enum packing_op op); -size_t sja1105_vlan_lookup_entry_packing(void *buf, void *entry_ptr, - enum packing_op op); -size_t sja1105_retagging_entry_packing(void *buf, void *entry_ptr, - enum packing_op op); -size_t sja1105pqrs_mac_config_entry_packing(void *buf, void *entry_ptr, - enum packing_op op); -size_t sja1105pqrs_avb_params_entry_packing(void *buf, void *entry_ptr, - enum packing_op op); -size_t sja1105_vl_lookup_entry_packing(void *buf, void *entry_ptr, - enum packing_op op); - /* From sja1105_flower.c */ int sja1105_cls_flower_del(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.h b/drivers/net/dsa/sja1105/sja1105_static_config.h index 9b62b9b5549d..8279f4f31eff 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.h +++ b/drivers/net/dsa/sja1105/sja1105_static_config.h @@ -430,4 +430,22 @@ void sja1105_unpack(const void *buf, u64 *val, int start, int end, size_t len); void sja1105_packing(void *buf, u64 *val, int start, int end, size_t len, enum packing_op op); +/* Common implementations for the static and dynamic configs */ +size_t sja1105_l2_forwarding_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); +size_t sja1105pqrs_l2_lookup_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); +size_t sja1105et_l2_lookup_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); +size_t sja1105_vlan_lookup_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); +size_t sja1105_retagging_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); +size_t sja1105pqrs_mac_config_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); +size_t sja1105pqrs_avb_params_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); +size_t sja1105_vl_lookup_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); + #endif -- cgit v1.2.3-59-g8ed1b From 99b981f431323e31c279ee7aee20a4c501a1e89d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 30 May 2020 13:29:53 +0300 Subject: net: dsa: sja1105: fix port mirroring for P/Q/R/S The dynamic configuration interface for the General Params and the L2 Lookup Params tables was copy-pasted between E/T devices and P/Q/R/S devices. Nonetheless, these interfaces are bitwise different. The driver is using dynamic reconfiguration of the General Parameters table for the port mirroring feature, which was therefore broken on P/Q/R/S. Note that this patch can't be backported easily very far to stable trees (since it conflicts with some other development done since the introduction of the driver). So the Fixes: tag is purely informational. Fixes: 8aa9ebccae87 ("net: dsa: Introduce driver for NXP SJA1105 5-port L2 switch") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_dynamic_config.c | 50 +++++++++++++++++++----- drivers/net/dsa/sja1105/sja1105_static_config.c | 10 ++--- drivers/net/dsa/sja1105/sja1105_static_config.h | 4 ++ 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c index 7516f2ffdd4e..4471eeccc293 100644 --- a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c +++ b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c @@ -127,9 +127,15 @@ #define SJA1105ET_SIZE_L2_LOOKUP_PARAMS_DYN_CMD \ SJA1105_SIZE_DYN_CMD +#define SJA1105PQRS_SIZE_L2_LOOKUP_PARAMS_DYN_CMD \ + (SJA1105_SIZE_DYN_CMD + SJA1105PQRS_SIZE_L2_LOOKUP_PARAMS_ENTRY) + #define SJA1105ET_SIZE_GENERAL_PARAMS_DYN_CMD \ SJA1105_SIZE_DYN_CMD +#define SJA1105PQRS_SIZE_GENERAL_PARAMS_DYN_CMD \ + (SJA1105_SIZE_DYN_CMD + SJA1105PQRS_SIZE_GENERAL_PARAMS_ENTRY) + #define SJA1105PQRS_SIZE_AVB_PARAMS_DYN_CMD \ (SJA1105_SIZE_DYN_CMD + SJA1105PQRS_SIZE_AVB_PARAMS_ENTRY) @@ -143,7 +149,7 @@ (SJA1105_SIZE_DYN_CMD + SJA1105PQRS_SIZE_CBS_ENTRY) #define SJA1105_MAX_DYN_CMD_SIZE \ - SJA1105PQRS_SIZE_MAC_CONFIG_DYN_CMD + SJA1105PQRS_SIZE_GENERAL_PARAMS_DYN_CMD struct sja1105_dyn_cmd { bool search; @@ -500,6 +506,18 @@ sja1105et_l2_lookup_params_entry_packing(void *buf, void *entry_ptr, return 0; } +static void +sja1105pqrs_l2_lookup_params_cmd_packing(void *buf, + struct sja1105_dyn_cmd *cmd, + enum packing_op op) +{ + u8 *p = buf + SJA1105PQRS_SIZE_L2_LOOKUP_PARAMS_ENTRY; + const int size = SJA1105_SIZE_DYN_CMD; + + sja1105_packing(p, &cmd->valid, 31, 31, size, op); + sja1105_packing(p, &cmd->rdwrset, 30, 30, size, op); +} + static void sja1105et_general_params_cmd_packing(void *buf, struct sja1105_dyn_cmd *cmd, enum packing_op op) @@ -522,6 +540,18 @@ sja1105et_general_params_entry_packing(void *buf, void *entry_ptr, return 0; } +static void +sja1105pqrs_general_params_cmd_packing(void *buf, struct sja1105_dyn_cmd *cmd, + enum packing_op op) +{ + u8 *p = buf + SJA1105PQRS_SIZE_GENERAL_PARAMS_ENTRY; + const int size = SJA1105_SIZE_DYN_CMD; + + sja1105_packing(p, &cmd->valid, 31, 31, size, op); + sja1105_packing(p, &cmd->errors, 30, 30, size, op); + sja1105_packing(p, &cmd->rdwrset, 28, 28, size, op); +} + static void sja1105pqrs_avb_params_cmd_packing(void *buf, struct sja1105_dyn_cmd *cmd, enum packing_op op) @@ -761,12 +791,12 @@ struct sja1105_dynamic_table_ops sja1105pqrs_dyn_ops[BLK_IDX_MAX_DYN] = { [BLK_IDX_SCHEDULE_ENTRY_POINTS_PARAMS] = {0}, [BLK_IDX_VL_FORWARDING_PARAMS] = {0}, [BLK_IDX_L2_LOOKUP_PARAMS] = { - .entry_packing = sja1105et_l2_lookup_params_entry_packing, - .cmd_packing = sja1105et_l2_lookup_params_cmd_packing, + .entry_packing = sja1105pqrs_l2_lookup_params_entry_packing, + .cmd_packing = sja1105pqrs_l2_lookup_params_cmd_packing, .max_entry_count = SJA1105_MAX_L2_LOOKUP_PARAMS_COUNT, .access = (OP_READ | OP_WRITE), - .packed_size = SJA1105ET_SIZE_L2_LOOKUP_PARAMS_DYN_CMD, - .addr = 0x38, + .packed_size = SJA1105PQRS_SIZE_L2_LOOKUP_PARAMS_DYN_CMD, + .addr = 0x54, }, [BLK_IDX_L2_FORWARDING_PARAMS] = {0}, [BLK_IDX_AVB_PARAMS] = { @@ -778,12 +808,12 @@ struct sja1105_dynamic_table_ops sja1105pqrs_dyn_ops[BLK_IDX_MAX_DYN] = { .addr = 0x8003, }, [BLK_IDX_GENERAL_PARAMS] = { - .entry_packing = sja1105et_general_params_entry_packing, - .cmd_packing = sja1105et_general_params_cmd_packing, + .entry_packing = sja1105pqrs_general_params_entry_packing, + .cmd_packing = sja1105pqrs_general_params_cmd_packing, .max_entry_count = SJA1105_MAX_GENERAL_PARAMS_COUNT, - .access = OP_WRITE, - .packed_size = SJA1105ET_SIZE_GENERAL_PARAMS_DYN_CMD, - .addr = 0x34, + .access = (OP_READ | OP_WRITE), + .packed_size = SJA1105PQRS_SIZE_GENERAL_PARAMS_DYN_CMD, + .addr = 0x3B, }, [BLK_IDX_RETAGGING] = { .entry_packing = sja1105_retagging_entry_packing, diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c index 780aca034cdc..ff3fe471efc2 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.c +++ b/drivers/net/dsa/sja1105/sja1105_static_config.c @@ -146,9 +146,8 @@ static size_t sja1105et_general_params_entry_packing(void *buf, void *entry_ptr, /* TPID and TPID2 are intentionally reversed so that semantic * compatibility with E/T is kept. */ -static size_t -sja1105pqrs_general_params_entry_packing(void *buf, void *entry_ptr, - enum packing_op op) +size_t sja1105pqrs_general_params_entry_packing(void *buf, void *entry_ptr, + enum packing_op op) { const size_t size = SJA1105PQRS_SIZE_GENERAL_PARAMS_ENTRY; struct sja1105_general_params_entry *entry = entry_ptr; @@ -228,9 +227,8 @@ sja1105et_l2_lookup_params_entry_packing(void *buf, void *entry_ptr, return size; } -static size_t -sja1105pqrs_l2_lookup_params_entry_packing(void *buf, void *entry_ptr, - enum packing_op op) +size_t sja1105pqrs_l2_lookup_params_entry_packing(void *buf, void *entry_ptr, + enum packing_op op) { const size_t size = SJA1105PQRS_SIZE_L2_LOOKUP_PARAMS_ENTRY; struct sja1105_l2_lookup_params_entry *entry = entry_ptr; diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.h b/drivers/net/dsa/sja1105/sja1105_static_config.h index 8279f4f31eff..ee0f10062763 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.h +++ b/drivers/net/dsa/sja1105/sja1105_static_config.h @@ -431,6 +431,10 @@ void sja1105_packing(void *buf, u64 *val, int start, int end, size_t len, enum packing_op op); /* Common implementations for the static and dynamic configs */ +size_t sja1105pqrs_general_params_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); +size_t sja1105pqrs_l2_lookup_params_entry_packing(void *buf, void *entry_ptr, + enum packing_op op); size_t sja1105_l2_forwarding_entry_packing(void *buf, void *entry_ptr, enum packing_op op); size_t sja1105pqrs_l2_lookup_entry_packing(void *buf, void *entry_ptr, -- cgit v1.2.3-59-g8ed1b From b8ded9de8db34dd209a3dece94cf54fc414e78f7 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sat, 30 May 2020 16:42:37 +0200 Subject: net/smc: pre-fetch send buffer outside of send_lock Pre-fetch send buffer for the CDC validation message before entering the send_lock. Without that the send call might fail with -EBUSY because there are no free buffers and waiting for buffers is not possible under send_lock. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 10 +++------- net/smc/smc_cdc.h | 4 +++- net/smc/smc_core.c | 18 +++++++++++++++--- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b2b85e1be72c..a47e8855e045 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -116,19 +116,15 @@ int smc_cdc_msg_send(struct smc_connection *conn, } /* send a validation msg indicating the move of a conn to an other QP link */ -int smcr_cdc_msg_send_validation(struct smc_connection *conn) +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) { struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; struct smc_link *link = conn->lnk; - struct smc_cdc_tx_pend *pend; - struct smc_wr_buf *wr_buf; struct smc_cdc_msg *peer; int rc; - rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); - if (rc) - return rc; - peer = (struct smc_cdc_msg *)wr_buf; peer->common.type = local->common.type; peer->len = local->len; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 2ddcc5fb5ceb..0a0a89abd38b 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -296,7 +296,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); int smcd_cdc_msg_send(struct smc_connection *conn); -int smcr_cdc_msg_send_validation(struct smc_connection *conn); +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf); int smc_cdc_init(void) __init; void smcd_cdc_rx_init(struct smc_connection *conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 65de700e1f17..7964a21e5e6f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -483,7 +483,8 @@ static int smc_write_space(struct smc_connection *conn) return space; } -static int smc_switch_cursor(struct smc_sock *smc) +static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) { struct smc_connection *conn = &smc->conn; union smc_host_cursor cons, fin; @@ -520,11 +521,14 @@ static int smc_switch_cursor(struct smc_sock *smc) if (smc->sk.sk_state != SMC_INIT && smc->sk.sk_state != SMC_CLOSED) { - rc = smcr_cdc_msg_send_validation(conn); + rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); if (!rc) { schedule_delayed_work(&conn->tx_work, 0); smc->sk.sk_data_ready(&smc->sk); } + } else { + smc_wr_tx_put_slot(conn->lnk, + (struct smc_wr_tx_pend_priv *)pend); } return rc; } @@ -533,7 +537,9 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err) { struct smc_link *to_lnk = NULL; + struct smc_cdc_tx_pend *pend; struct smc_connection *conn; + struct smc_wr_buf *wr_buf; struct smc_sock *smc; struct rb_node *node; int i, rc = 0; @@ -582,10 +588,16 @@ again: } sock_hold(&smc->sk); read_unlock_bh(&lgr->conns_lock); + /* pre-fetch buffer outside of send_lock, might sleep */ + rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); + if (rc) { + smcr_link_down_cond_sched(to_lnk); + return NULL; + } /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); conn->lnk = to_lnk; - rc = smc_switch_cursor(smc); + rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); if (rc) { -- cgit v1.2.3-59-g8ed1b From 27dc36aefc73ce50a485c9d32c33b18832289203 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 30 May 2020 23:54:36 +0200 Subject: r8169: change driver data type Change driver private data type to struct rtl8169_private * to avoid some overhead. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 60 +++++++++++++------------------ 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index d672ae77c644..810398ef7186 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4816,15 +4816,13 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) pm_runtime_put_noidle(&pdev->dev); } -static void rtl8169_net_suspend(struct net_device *dev) +static void rtl8169_net_suspend(struct rtl8169_private *tp) { - struct rtl8169_private *tp = netdev_priv(dev); - - if (!netif_running(dev)) + if (!netif_running(tp->dev)) return; phy_stop(tp->phydev); - netif_device_detach(dev); + netif_device_detach(tp->dev); rtl_lock_work(tp); napi_disable(&tp->napi); @@ -4840,20 +4838,17 @@ static void rtl8169_net_suspend(struct net_device *dev) static int rtl8169_suspend(struct device *device) { - struct net_device *dev = dev_get_drvdata(device); - struct rtl8169_private *tp = netdev_priv(dev); + struct rtl8169_private *tp = dev_get_drvdata(device); - rtl8169_net_suspend(dev); + rtl8169_net_suspend(tp); clk_disable_unprepare(tp->clk); return 0; } -static void __rtl8169_resume(struct net_device *dev) +static void __rtl8169_resume(struct rtl8169_private *tp) { - struct rtl8169_private *tp = netdev_priv(dev); - - netif_device_attach(dev); + netif_device_attach(tp->dev); rtl_pll_power_up(tp); rtl8169_init_phy(tp); @@ -4869,23 +4864,21 @@ static void __rtl8169_resume(struct net_device *dev) static int rtl8169_resume(struct device *device) { - struct net_device *dev = dev_get_drvdata(device); - struct rtl8169_private *tp = netdev_priv(dev); + struct rtl8169_private *tp = dev_get_drvdata(device); - rtl_rar_set(tp, dev->dev_addr); + rtl_rar_set(tp, tp->dev->dev_addr); clk_prepare_enable(tp->clk); - if (netif_running(dev)) - __rtl8169_resume(dev); + if (netif_running(tp->dev)) + __rtl8169_resume(tp); return 0; } static int rtl8169_runtime_suspend(struct device *device) { - struct net_device *dev = dev_get_drvdata(device); - struct rtl8169_private *tp = netdev_priv(dev); + struct rtl8169_private *tp = dev_get_drvdata(device); if (!tp->TxDescArray) return 0; @@ -4894,7 +4887,7 @@ static int rtl8169_runtime_suspend(struct device *device) __rtl8169_set_wol(tp, WAKE_ANY); rtl_unlock_work(tp); - rtl8169_net_suspend(dev); + rtl8169_net_suspend(tp); /* Update counters before going runtime suspend */ rtl8169_update_counters(tp); @@ -4904,10 +4897,9 @@ static int rtl8169_runtime_suspend(struct device *device) static int rtl8169_runtime_resume(struct device *device) { - struct net_device *dev = dev_get_drvdata(device); - struct rtl8169_private *tp = netdev_priv(dev); + struct rtl8169_private *tp = dev_get_drvdata(device); - rtl_rar_set(tp, dev->dev_addr); + rtl_rar_set(tp, tp->dev->dev_addr); if (!tp->TxDescArray) return 0; @@ -4916,16 +4908,16 @@ static int rtl8169_runtime_resume(struct device *device) __rtl8169_set_wol(tp, tp->saved_wolopts); rtl_unlock_work(tp); - __rtl8169_resume(dev); + __rtl8169_resume(tp); return 0; } static int rtl8169_runtime_idle(struct device *device) { - struct net_device *dev = dev_get_drvdata(device); + struct rtl8169_private *tp = dev_get_drvdata(device); - if (!netif_running(dev) || !netif_carrier_ok(dev)) + if (!netif_running(tp->dev) || !netif_carrier_ok(tp->dev)) pm_schedule_suspend(device, 10000); return -EBUSY; @@ -4970,13 +4962,12 @@ static void rtl_wol_shutdown_quirk(struct rtl8169_private *tp) static void rtl_shutdown(struct pci_dev *pdev) { - struct net_device *dev = pci_get_drvdata(pdev); - struct rtl8169_private *tp = netdev_priv(dev); + struct rtl8169_private *tp = pci_get_drvdata(pdev); - rtl8169_net_suspend(dev); + rtl8169_net_suspend(tp); /* Restore original MAC address */ - rtl_rar_set(tp, dev->perm_addr); + rtl_rar_set(tp, tp->dev->perm_addr); rtl8169_hw_reset(tp); @@ -4993,13 +4984,12 @@ static void rtl_shutdown(struct pci_dev *pdev) static void rtl_remove_one(struct pci_dev *pdev) { - struct net_device *dev = pci_get_drvdata(pdev); - struct rtl8169_private *tp = netdev_priv(dev); + struct rtl8169_private *tp = pci_get_drvdata(pdev); if (pci_dev_run_wake(pdev)) pm_runtime_get_noresume(&pdev->dev); - unregister_netdev(dev); + unregister_netdev(tp->dev); if (r8168_check_dash(tp)) rtl8168_driver_stop(tp); @@ -5007,7 +4997,7 @@ static void rtl_remove_one(struct pci_dev *pdev) rtl_release_firmware(tp); /* restore original MAC address */ - rtl_rar_set(tp, dev->perm_addr); + rtl_rar_set(tp, tp->dev->perm_addr); } static const struct net_device_ops rtl_netdev_ops = { @@ -5446,7 +5436,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (!tp->counters) return -ENOMEM; - pci_set_drvdata(pdev, dev); + pci_set_drvdata(pdev, tp); rc = r8169_mdio_register(tp); if (rc) -- cgit v1.2.3-59-g8ed1b From 01bd753d039553cf63830823d1dcc8a864174afc Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 30 May 2020 23:55:30 +0200 Subject: r8169: enable WAKE_PHY as only WoL source when runtime-suspending We go to runtime-suspend few secs after cable removal. As cable is removed "physical link up" is the only meaningful WoL source. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 810398ef7186..6fcd35ac8110 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4884,7 +4884,7 @@ static int rtl8169_runtime_suspend(struct device *device) return 0; rtl_lock_work(tp); - __rtl8169_set_wol(tp, WAKE_ANY); + __rtl8169_set_wol(tp, WAKE_PHY); rtl_unlock_work(tp); rtl8169_net_suspend(tp); -- cgit v1.2.3-59-g8ed1b From 9fdd50c579802adad09028eba86e5a7ef7c9b738 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 30 May 2020 23:56:14 +0200 Subject: r8169: don't reset tx ring indexes in rtl8169_tx_clear In places where the indexes have to be reset, we call rtl8169_init_ring_indexes() anyway after rtl8169_tx_clear(). Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 6fcd35ac8110..43652c450892 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -3955,7 +3955,6 @@ static void rtl8169_tx_clear_range(struct rtl8169_private *tp, u32 start, static void rtl8169_tx_clear(struct rtl8169_private *tp) { rtl8169_tx_clear_range(tp, tp->dirty_tx, NUM_TX_DESC); - tp->cur_tx = tp->dirty_tx = 0; netdev_reset_queue(tp->dev); } -- cgit v1.2.3-59-g8ed1b From bac75d8565e800f7a4494d8c873e38dce33d6079 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 30 May 2020 23:57:10 +0200 Subject: r8169: move some calls to rtl8169_hw_reset Move calls that are needed before and after calling rtl8169_hw_reset() into this function. This requires to move the function in the code. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 75 +++++++++++++++---------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 43652c450892..5f3c50fb0647 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2530,36 +2530,6 @@ static void rtl_enable_rxdvgate(struct rtl8169_private *tp) rtl_wait_txrx_fifo_empty(tp); } -static void rtl8169_hw_reset(struct rtl8169_private *tp) -{ - /* Disable interrupts */ - rtl8169_irq_mask_and_ack(tp); - - rtl_rx_close(tp); - - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_27: - case RTL_GIGA_MAC_VER_28: - case RTL_GIGA_MAC_VER_31: - rtl_loop_wait_low(tp, &rtl_npq_cond, 20, 2000); - break; - case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38: - RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); - rtl_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 666); - break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_61: - rtl_enable_rxdvgate(tp); - fsleep(2000); - break; - default: - RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); - udelay(100); - break; - } - - rtl_hw_reset(tp); -} - static void rtl_set_tx_config_registers(struct rtl8169_private *tp) { u32 val = TX_DMA_BURST << TxDMAShift | @@ -3958,6 +3928,42 @@ static void rtl8169_tx_clear(struct rtl8169_private *tp) netdev_reset_queue(tp->dev); } +static void rtl8169_hw_reset(struct rtl8169_private *tp) +{ + /* Give a racing hard_start_xmit a few cycles to complete. */ + synchronize_rcu(); + + /* Disable interrupts */ + rtl8169_irq_mask_and_ack(tp); + + rtl_rx_close(tp); + + switch (tp->mac_version) { + case RTL_GIGA_MAC_VER_27: + case RTL_GIGA_MAC_VER_28: + case RTL_GIGA_MAC_VER_31: + rtl_loop_wait_low(tp, &rtl_npq_cond, 20, 2000); + break; + case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38: + RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); + rtl_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 666); + break; + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_61: + rtl_enable_rxdvgate(tp); + fsleep(2000); + break; + default: + RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); + fsleep(100); + break; + } + + rtl_hw_reset(tp); + + rtl8169_tx_clear(tp); + rtl8169_init_ring_indexes(tp); +} + static void rtl_reset_work(struct rtl8169_private *tp) { struct net_device *dev = tp->dev; @@ -3965,16 +3971,12 @@ static void rtl_reset_work(struct rtl8169_private *tp) napi_disable(&tp->napi); netif_stop_queue(dev); - synchronize_rcu(); rtl8169_hw_reset(tp); for (i = 0; i < NUM_RX_DESC; i++) rtl8169_mark_to_asic(tp->RxDescArray + i); - rtl8169_tx_clear(tp); - rtl8169_init_ring_indexes(tp); - napi_enable(&tp->napi); rtl_hw_start(tp); netif_wake_queue(dev); @@ -4636,11 +4638,6 @@ static void rtl8169_down(struct net_device *dev) rtl8169_hw_reset(tp); - /* Give a racing hard_start_xmit a few cycles to complete. */ - synchronize_rcu(); - - rtl8169_tx_clear(tp); - rtl8169_rx_clear(tp); rtl_pll_power_down(tp); -- cgit v1.2.3-59-g8ed1b From 8ac8e8c64b539a548a5f22d6b21f999eea38b0ee Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 30 May 2020 23:58:35 +0200 Subject: r8169: make rtl8169_down central chip quiesce function Functionality for quiescing the chip is spread across different functions currently. Move it to rtl8169_down(). Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 33 +++++++++++-------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 5f3c50fb0647..fd93377f961f 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4627,20 +4627,21 @@ static int r8169_phy_connect(struct rtl8169_private *tp) return 0; } -static void rtl8169_down(struct net_device *dev) +static void rtl8169_down(struct rtl8169_private *tp) { - struct rtl8169_private *tp = netdev_priv(dev); + rtl_lock_work(tp); - phy_stop(tp->phydev); + /* Clear all task flags */ + bitmap_zero(tp->wk.flags, RTL_FLAG_MAX); + phy_stop(tp->phydev); napi_disable(&tp->napi); - netif_stop_queue(dev); rtl8169_hw_reset(tp); - rtl8169_rx_clear(tp); - rtl_pll_power_down(tp); + + rtl_unlock_work(tp); } static int rtl8169_close(struct net_device *dev) @@ -4653,12 +4654,9 @@ static int rtl8169_close(struct net_device *dev) /* Update counters before going down */ rtl8169_update_counters(tp); - rtl_lock_work(tp); - /* Clear all task flags */ - bitmap_zero(tp->wk.flags, RTL_FLAG_MAX); - - rtl8169_down(dev); - rtl_unlock_work(tp); + netif_stop_queue(dev); + rtl8169_down(tp); + rtl8169_rx_clear(tp); cancel_work_sync(&tp->wk.work); @@ -4817,17 +4815,8 @@ static void rtl8169_net_suspend(struct rtl8169_private *tp) if (!netif_running(tp->dev)) return; - phy_stop(tp->phydev); netif_device_detach(tp->dev); - - rtl_lock_work(tp); - napi_disable(&tp->napi); - /* Clear all task flags */ - bitmap_zero(tp->wk.flags, RTL_FLAG_MAX); - - rtl_unlock_work(tp); - - rtl_pll_power_down(tp); + rtl8169_down(tp); } #ifdef CONFIG_PM -- cgit v1.2.3-59-g8ed1b From 67ee63ef2b15fcb0cb692010083592672d18f0a8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 30 May 2020 23:59:58 +0200 Subject: r8169: improve handling power management ops Simplify handling the power management callbacks. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index fd93377f961f..4d2ec9742cee 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4821,7 +4821,7 @@ static void rtl8169_net_suspend(struct rtl8169_private *tp) #ifdef CONFIG_PM -static int rtl8169_suspend(struct device *device) +static int __maybe_unused rtl8169_suspend(struct device *device) { struct rtl8169_private *tp = dev_get_drvdata(device); @@ -4847,7 +4847,7 @@ static void __rtl8169_resume(struct rtl8169_private *tp) rtl_unlock_work(tp); } -static int rtl8169_resume(struct device *device) +static int __maybe_unused rtl8169_resume(struct device *device) { struct rtl8169_private *tp = dev_get_drvdata(device); @@ -4909,24 +4909,12 @@ static int rtl8169_runtime_idle(struct device *device) } static const struct dev_pm_ops rtl8169_pm_ops = { - .suspend = rtl8169_suspend, - .resume = rtl8169_resume, - .freeze = rtl8169_suspend, - .thaw = rtl8169_resume, - .poweroff = rtl8169_suspend, - .restore = rtl8169_resume, - .runtime_suspend = rtl8169_runtime_suspend, - .runtime_resume = rtl8169_runtime_resume, - .runtime_idle = rtl8169_runtime_idle, + SET_SYSTEM_SLEEP_PM_OPS(rtl8169_suspend, rtl8169_resume) + SET_RUNTIME_PM_OPS(rtl8169_runtime_suspend, rtl8169_runtime_resume, + rtl8169_runtime_idle) }; -#define RTL8169_PM_OPS (&rtl8169_pm_ops) - -#else /* !CONFIG_PM */ - -#define RTL8169_PM_OPS NULL - -#endif /* !CONFIG_PM */ +#endif /* CONFIG_PM */ static void rtl_wol_shutdown_quirk(struct rtl8169_private *tp) { @@ -5458,7 +5446,9 @@ static struct pci_driver rtl8169_pci_driver = { .probe = rtl_init_one, .remove = rtl_remove_one, .shutdown = rtl_shutdown, - .driver.pm = RTL8169_PM_OPS, +#ifdef CONFIG_PM + .driver.pm = &rtl8169_pm_ops, +#endif }; module_pci_driver(rtl8169_pci_driver); -- cgit v1.2.3-59-g8ed1b From 72b4868211a85d040c42444620f2197bb0094ac8 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 28 May 2020 22:12:35 -0700 Subject: vxlan: add check to prevent use of remote ip attributes with NDA_NH_ID NDA_NH_ID represents a remote ip or a group of remote ips. It allows use of nexthop groups in lieu of a remote ip or a list of remote ips supported by the fdb api. Current code ignores the other remote ip attrs when NDA_NH_ID is specified. In the spirit of strict checking, This commit adds a check to explicitly return an error on incorrect usage. Fixes: 1274e1cc4226 ("vxlan: ecmp support for mac fdb entries") Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a0015cdedfaf..fe606c688855 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1196,6 +1196,10 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, struct net *net = dev_net(vxlan->dev); int err; + if (tb[NDA_NH_ID] && (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] || + tb[NDA_PORT])) + return -EINVAL; + if (tb[NDA_DST]) { err = vxlan_nla_get_addr(ip, tb[NDA_DST]); if (err) -- cgit v1.2.3-59-g8ed1b From 79472fe873dd307101f9b1dbfe0fedebae42219a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 28 May 2020 22:12:36 -0700 Subject: vxlan: few locking fixes in nexthop event handler - remove fdb from nh_list before the rcu grace period - protect fdb->vdev with rcu - hold spin lock before destroying fdb Fixes: c7cdbe2efc40 ("vxlan: support for nexthop notifiers") Signed-off-by: Roopa Prabhu Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index fe606c688855..39bc10a7fd2e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -81,7 +81,7 @@ struct vxlan_fdb { u16 flags; /* see ndm_flags and below */ struct list_head nh_list; struct nexthop __rcu *nh; - struct vxlan_dev *vdev; + struct vxlan_dev __rcu *vdev; }; #define NTF_VXLAN_ADDED_BY_USER 0x100 @@ -837,7 +837,7 @@ static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, f->updated = f->used = jiffies; f->vni = src_vni; f->nh = NULL; - f->vdev = vxlan; + RCU_INIT_POINTER(f->vdev, vxlan); INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); @@ -963,7 +963,7 @@ static void __vxlan_fdb_free(struct vxlan_fdb *f) nh = rcu_dereference_raw(f->nh); if (nh) { rcu_assign_pointer(f->nh, NULL); - list_del_rcu(&f->nh_list); + rcu_assign_pointer(f->vdev, NULL); nexthop_put(nh); } @@ -1000,7 +1000,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, } hlist_del_rcu(&f->hlist); - f->vdev = NULL; + list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } @@ -4615,17 +4615,35 @@ static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = { .notifier_call = vxlan_switchdev_event, }; +static void vxlan_fdb_nh_flush(struct nexthop *nh) +{ + struct vxlan_fdb *fdb; + struct vxlan_dev *vxlan; + u32 hash_index; + + rcu_read_lock(); + list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) { + vxlan = rcu_dereference(fdb->vdev); + WARN_ON(!vxlan); + hash_index = fdb_head_index(vxlan, fdb->eth_addr, + vxlan->default_dst.remote_vni); + spin_lock_bh(&vxlan->hash_lock[hash_index]); + if (!hlist_unhashed(&fdb->hlist)) + vxlan_fdb_destroy(vxlan, fdb, false, false); + spin_unlock_bh(&vxlan->hash_lock[hash_index]); + } + rcu_read_unlock(); +} + static int vxlan_nexthop_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct nexthop *nh = ptr; - struct vxlan_fdb *fdb, *tmp; if (!nh || event != NEXTHOP_EVENT_DEL) return NOTIFY_DONE; - list_for_each_entry_safe(fdb, tmp, &nh->fdb_list, nh_list) - vxlan_fdb_destroy(fdb->vdev, fdb, false, false); + vxlan_fdb_nh_flush(nh); return NOTIFY_DONE; } -- cgit v1.2.3-59-g8ed1b From 1c0522b4a2e143fa6e55e4bd2308415c81184ec7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 29 May 2020 14:16:53 +0300 Subject: selftests: forwarding: mirror_lib: Use mausezahn Using ping in tests is error-prone, because ping is too smart. On a flaky system (notably in a simulator), when packets don't come quickly enough, more pings are sent, and that throws off counters. Instead use mausezahn to generate ICMP echo request packets. That allows us to send them in quicker succession as well, because the reason the ping was made slow in the first place was to make the tests work on simulated systems. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/mirror_lib.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh index 00797597fcf5..c33bfd7ba214 100644 --- a/tools/testing/selftests/net/forwarding/mirror_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh @@ -29,11 +29,9 @@ mirror_test() local pref=$1; shift local expect=$1; shift - local ping_timeout=$((PING_TIMEOUT * 5)) local t0=$(tc_rule_stats_get $dev $pref) - ip vrf exec $vrf_name \ - ${PING} ${sip:+-I $sip} $dip -c 10 -i 0.5 -w $ping_timeout \ - &> /dev/null + $MZ $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \ + -c 10 -d 100ms -t icmp type=8 sleep 0.5 local t1=$(tc_rule_stats_get $dev $pref) local delta=$((t1 - t0)) -- cgit v1.2.3-59-g8ed1b From 3ed97037f063b9130b56991f55f346597d27440d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 29 May 2020 14:16:54 +0300 Subject: selftests: forwarding: pedit_dsfield: Check counter value A missing stats_update callback was recently added to act_pedit. Now that iproute2 supports JSON dumping for pedit, extend the pedit_dsfield selftest with a check that would have caught the fact that the callback was missing. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/pedit_dsfield.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh index 1181d647f6a7..55eeacf59241 100755 --- a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -132,7 +132,12 @@ do_test_pedit_dsfield_common() local pkts pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \ tc_rule_handle_stats_get "dev $h2 ingress" 101) - check_err $? "Expected to get 10 packets, but got $pkts." + check_err $? "Expected to get 10 packets on test probe, but got $pkts." + + pkts=$(tc_rule_handle_stats_get "$pedit_locus" 101) + ((pkts >= 10)) + check_err $? "Expected to get 10 packets on pedit rule, but got $pkts." + log_test "$pedit_locus pedit $pedit_action" } -- cgit v1.2.3-59-g8ed1b From 9b23203c32ee02cd316e39ba3ec243e0f7bf56de Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 29 May 2020 14:25:40 +0200 Subject: ravb: Mask PHY mode to avoid inserting delays twice Until recently, the Micrel KSZ9031 PHY driver ignored any PHY mode ("RGMII-*ID") settings, but used the hardware defaults, augmented by explicit configuration of individual skew values using the "*-skew-ps" DT properties. The lack of PHY mode support was compensated by the EtherAVB MAC driver, which configures TX and/or RX internal delay itself, based on the PHY mode. However, now the KSZ9031 driver has gained PHY mode support, delays may be configured twice, causing regressions. E.g. on the Renesas Salvator-X board with R-Car M3-W ES1.0, TX performance dropped from ca. 400 Mbps to 0.1-0.3 Mbps, as measured by nuttcp. As internal delay configuration supported by the KSZ9031 PHY is too limited for some use cases, the ability to configure MAC internal delay is deemed useful and necessary. Hence a proper fix would involve splitting internal delay configuration in two parts, one for the PHY, and one for the MAC. However, this would require adding new DT properties, thus breaking DTB backwards-compatibility. Hence fix the regression in a backwards-compatibility way, by letting the EtherAVB driver mask the PHY mode when it has inserted a delay, to avoid the PHY driver adding a second delay. This also fixes messages like: Micrel KSZ9031 Gigabit PHY e6800000.ethernet-ffffffff:00: *-skew-ps values should be used only with phy-mode = "rgmii" as the PHY no longer sees the original RGMII-*ID mode. Solving the issue by splitting configuration in two parts can be handled in future patches, and would require retaining a backwards-compatibility mode anyway. Fixes: bcf3440c6dd78bfe ("net: phy: micrel: add phy-mode support for the KSZ9031 PHY") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/ravb_main.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 067ad25553b9..a442bcf64b9c 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1014,6 +1014,7 @@ static int ravb_phy_init(struct net_device *ndev) struct ravb_private *priv = netdev_priv(ndev); struct phy_device *phydev; struct device_node *pn; + phy_interface_t iface; int err; priv->link = 0; @@ -1032,8 +1033,13 @@ static int ravb_phy_init(struct net_device *ndev) } pn = of_node_get(np); } - phydev = of_phy_connect(ndev, pn, ravb_adjust_link, 0, - priv->phy_interface); + + iface = priv->phy_interface; + if (priv->chip_id != RCAR_GEN2 && phy_interface_mode_is_rgmii(iface)) { + /* ravb_set_delay_mode() takes care of internal delay mode */ + iface = PHY_INTERFACE_MODE_RGMII; + } + phydev = of_phy_connect(ndev, pn, ravb_adjust_link, 0, iface); of_node_put(pn); if (!phydev) { netdev_err(ndev, "failed to connect PHY\n"); -- cgit v1.2.3-59-g8ed1b From b0c19ed6088ab41dd2a727b60594b7297c15d6ce Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 29 May 2020 14:43:44 +0200 Subject: sch_cake: Take advantage of skb->hash where appropriate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While the other fq-based qdiscs take advantage of skb->hash and doesn't recompute it if it is already set, sch_cake does not. This was a deliberate choice because sch_cake hashes various parts of the packet header to support its advanced flow isolation modes. However, foregoing the use of skb->hash entirely loses a few important benefits: - When skb->hash is set by hardware, a few CPU cycles can be saved by not hashing again in software. - Tunnel encapsulations will generally preserve the value of skb->hash from before the encapsulation, which allows flow-based qdiscs to distinguish between flows even though the outer packet header no longer has flow information. It turns out that we can preserve these desirable properties in many cases, while still supporting the advanced flow isolation properties of sch_cake. This patch does so by reusing the skb->hash value as the flow_hash part of the hashing procedure in cake_hash() only in the following conditions: - If the skb->hash is marked as covering the flow headers (skb->l4_hash is set) AND - NAT header rewriting is either disabled, or did not change any values used for hashing. The latter is important to match local-origin packets such as those of a tunnel endpoint. The immediate motivation for fixing this was the recent patch to WireGuard to preserve the skb->hash on encapsulation. As such, this is also what I tested against; with this patch, added latency under load for competing flows drops from ~8 ms to sub-1ms on an RRUL test over a WireGuard tunnel going through a virtual link shaped to 1Gbps using sch_cake. This matches the results we saw with a similar setup using sch_fq_codel when testing the WireGuard patch. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 65 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 1496e87cd07b..60f8ae578819 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -584,26 +584,48 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, return drop; } -static void cake_update_flowkeys(struct flow_keys *keys, +static bool cake_update_flowkeys(struct flow_keys *keys, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conntrack_tuple tuple = {}; - bool rev = !skb->_nfct; + bool rev = !skb->_nfct, upd = false; + __be32 ip; if (tc_skb_protocol(skb) != htons(ETH_P_IP)) - return; + return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) - return; + return false; - keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; - keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + if (ip != keys->addrs.v4addrs.src) { + keys->addrs.v4addrs.src = ip; + upd = true; + } + ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + if (ip != keys->addrs.v4addrs.dst) { + keys->addrs.v4addrs.dst = ip; + upd = true; + } if (keys->ports.ports) { - keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; - keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + __be16 port; + + port = rev ? tuple.dst.u.all : tuple.src.u.all; + if (port != keys->ports.src) { + keys->ports.src = port; + upd = true; + } + port = rev ? tuple.src.u.all : tuple.dst.u.all; + if (port != keys->ports.dst) { + port = keys->ports.dst; + upd = true; + } } + return upd; +#else + return false; #endif } @@ -624,23 +646,36 @@ static bool cake_ddst(int flow_mode) static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { + bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); + bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); + bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; u16 reduced_hash, srchost_idx, dsthost_idx; struct flow_keys keys, host_keys; + bool use_skbhash = skb->l4_hash; if (unlikely(flow_mode == CAKE_FLOW_NONE)) return 0; - /* If both overrides are set we can skip packet dissection entirely */ - if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) && - (host_override || !(flow_mode & CAKE_FLOW_HOSTS))) + /* If both overrides are set, or we can use the SKB hash and nat mode is + * disabled, we can skip packet dissection entirely. If nat mode is + * enabled there's another check below after doing the conntrack lookup. + */ + if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) goto skip_hash; skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); - if (flow_mode & CAKE_FLOW_NAT_FLAG) - cake_update_flowkeys(&keys, skb); + /* Don't use the SKB hash if we change the lookup keys from conntrack */ + if (nat_enabled && cake_update_flowkeys(&keys, skb)) + use_skbhash = false; + + /* If we can still use the SKB hash and don't need the host hash, we can + * skip the rest of the hashing procedure + */ + if (use_skbhash && !hash_hosts) + goto skip_hash; /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat @@ -679,12 +714,14 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, /* This *must* be after the above switch, since as a * side-effect it sorts the src and dst addresses. */ - if (flow_mode & CAKE_FLOW_FLOWS) + if (hash_flows && !use_skbhash) flow_hash = flow_hash_from_keys(&keys); skip_hash: if (flow_override) flow_hash = flow_override - 1; + else if (use_skbhash) + flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; srchost_hash = host_override - 1; -- cgit v1.2.3-59-g8ed1b From 39884604b11692158ce0c559fc603510b96f8c2e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 May 2020 17:49:18 +0200 Subject: mptcp: fix NULL ptr dereference in MP_JOIN error path When token lookup on MP_JOIN 3rd ack fails, the server socket closes with a reset the incoming child. Such socket has the 'is_mptcp' flag set, but no msk socket associated - due to the failed lookup. While crafting the reset packet mptcp_established_options_mp() will try to dereference the child's master socket, causing a NULL ptr dereference. This change addresses the issue with explicit fallback to TCP in such error path. Fixes: 729cd6436f35 ("mptcp: cope better with MP_JOIN failure") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index f3c06b8af92d..493b98a0825c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -413,6 +413,20 @@ static void subflow_ulp_fallback(struct sock *sk, tcp_sk(sk)->is_mptcp = 0; } +static void subflow_drop_ctx(struct sock *ssk) +{ + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); + + if (!ctx) + return; + + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + + kfree_rcu(ctx, rcu); +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -485,10 +499,7 @@ create_child: if (fallback_is_fatal) goto dispose_child; - if (ctx) { - subflow_ulp_fallback(child, ctx); - kfree_rcu(ctx, rcu); - } + subflow_drop_ctx(child); goto out; } @@ -537,6 +548,7 @@ out: return child; dispose_child: + subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; tcp_send_active_reset(child, GFP_ATOMIC); inet_csk_prepare_for_destroy_sock(child); -- cgit v1.2.3-59-g8ed1b From 5e9cf0f0a3e98992184442de24253fe1b9c40f2e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 29 May 2020 14:04:27 +0200 Subject: cfg80211: fix 6 GHz frequencies to kHz The updates to change to kHz frequencies and the 6 GHz additions evidently overlapped (or rather, I didn't see it when applying the latter), so the 6 GHz is broken. Fix this. Fixes: 934f4c7dd3a5 ("cfg80211: express channels with a KHz component") Link: https://lore.kernel.org/r/20200529140425.1bf824f6911b.I4a1174916b8f5965af4366999eb9ffc7a0347470@changeid Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/util.c b/net/wireless/util.c index df75e58eca5d..5b3b0d1222a2 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -94,7 +94,7 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) case NL80211_BAND_6GHZ: /* see 802.11ax D4.1 27.3.22.2 */ if (chan <= 253) - return 5940 + chan * 5; + return MHZ_TO_KHZ(5940 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) -- cgit v1.2.3-59-g8ed1b From d1a1646c0de7b0083d42a1ada72a3ec243bfcc6d Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 29 May 2020 11:41:43 +0200 Subject: cfg80211: adapt to new channelization of the 6GHz band The 6GHz band does not have regulatory approval yet, but things are moving forward. However, that has led to a change in the channelization of the 6GHz band which has been accepted in the 11ax specification. It also fixes a missing MHZ_TO_KHZ() macro for 6GHz channels while at it. This change is primarily thrown in to discuss how to deal with it. I noticed ath11k adding 6G support with old channelization and ditto for iw. It probably involves changes in hostapd as well. Cc: Pradeep Kumar Chitrapu Cc: Jouni Malinen Signed-off-by: Arend van Spriel Link: https://lore.kernel.org/r/edf07cdd-ad15-4012-3afd-d8b961a80b69@broadcom.com Signed-off-by: Johannes Berg --- net/wireless/util.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/wireless/util.c b/net/wireless/util.c index 5b3b0d1222a2..a27d4f45fb5f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -92,9 +92,11 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: - /* see 802.11ax D4.1 27.3.22.2 */ + /* see 802.11ax D6.1 27.3.23.2 */ + if (chan == 2) + return MHZ_TO_KHZ(5935); if (chan <= 253) - return MHZ_TO_KHZ(5940 + chan * 5); + return MHZ_TO_KHZ(5950 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) -- cgit v1.2.3-59-g8ed1b From 0e47901d78f0b91901f845c2fc575ae48d8ed395 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:24 +0200 Subject: nl80211: really allow client-only BIGTK support My previous commit here was wrong, it didn't check the new flag in two necessary places, so things didn't work. Fix that. Fixes: 155d7c733807 ("nl80211: allow client-only BIGTK support") Link: https://lore.kernel.org/r/20200528213443.993f108e96ca.I0086ae42d672379380d04ac5effb2f3d5135731b@changeid Signed-off-by: Johannes Berg --- net/wireless/sme.c | 7 +++++-- net/wireless/util.c | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 3554c0d951f4..15595cf401de 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg - * Copyright (C) 2009 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020 Intel Corporation. All rights reserved. * Copyright 2017 Intel Deutschland GmbH */ @@ -1118,7 +1118,10 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, if (wiphy_ext_feature_isset( wdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + NL80211_EXT_FEATURE_BEACON_PROTECTION) || + wiphy_ext_feature_isset( + wdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) rdev_del_key(rdev, dev, i, false, NULL); diff --git a/net/wireless/util.c b/net/wireless/util.c index a27d4f45fb5f..4d3b76f94f55 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -242,7 +242,9 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, int max_key_idx = 5; if (wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + NL80211_EXT_FEATURE_BEACON_PROTECTION) || + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; if (key_idx < 0 || key_idx > max_key_idx) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From afbc9c9e8bfe71e8bd12a8c01bedd969fbab8f0e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:25 +0200 Subject: cfg80211: add a helper to identify 6 GHz PSCs This allows identifying whether or not a channel is a PSC (preferred scanning channel). Link: https://lore.kernel.org/r/20200528213443.414363ecf62c.Ic15e681a0e249eab7350a06ceb582cca8bb9a080@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e2dbc9c02ef3..a38653358885 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5277,6 +5277,21 @@ ieee80211_get_channel(struct wiphy *wiphy, int freq) return ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(freq)); } +/** + * cfg80211_channel_is_psc - Check if the channel is a 6 GHz PSC + * @chan: control channel to check + * + * The Preferred Scanning Channels (PSC) are defined in + * Draft IEEE P802.11ax/D5.0, 26.17.2.3.3 + */ +static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) +{ + if (chan->band != NL80211_BAND_6GHZ) + return false; + + return ieee80211_frequency_to_channel(chan->center_freq) % 16 == 5; +} + /** * ieee80211_get_response_rate - get basic rate for a given rate * -- cgit v1.2.3-59-g8ed1b From 372b38ea5911fc2500f0291b00140e80a26c0e36 Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Thu, 28 May 2020 21:34:26 +0200 Subject: ieee80211: definitions for reduced neighbor reports Add the necessary definitions to parse reduced neighbor report elements. Signed-off-by: Tova Mussai [change struct name, remove IEEE80211_MIN_AP_NEIGHBOR_INFO_SIZE] Link: https://lore.kernel.org/r/20200528213443.4f9154461c06.I518d9898ad982f838112ea9ca14a20d6bbb16394@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0320ca4c7d28..c29184bf9416 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2754,6 +2754,8 @@ enum ieee80211_eid { WLAN_EID_QUIET_CHANNEL = 198, WLAN_EID_OPMODE_NOTIF = 199, + WLAN_EID_REDUCED_NEIGHBOR_REPORT = 201, + WLAN_EID_S1G_BCN_COMPAT = 213, WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214, WLAN_EID_S1G_CAPABILITIES = 217, @@ -3675,4 +3677,30 @@ static inline bool for_each_element_completed(const struct element *element, #define WLAN_RSNX_CAPA_PROTECTED_TWT BIT(4) #define WLAN_RSNX_CAPA_SAE_H2E BIT(5) +/* + * reduced neighbor report, based on Draft P802.11ax_D5.0, + * section 9.4.2.170 + */ +#define IEEE80211_AP_INFO_TBTT_HDR_TYPE 0x03 +#define IEEE80211_AP_INFO_TBTT_HDR_FILTERED 0x04 +#define IEEE80211_AP_INFO_TBTT_HDR_COLOC 0x08 +#define IEEE80211_AP_INFO_TBTT_HDR_COUNT 0xF0 +#define IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM 8 +#define IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM 12 + +#define IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED 0x01 +#define IEEE80211_RNR_TBTT_PARAMS_SAME_SSID 0x02 +#define IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID 0x04 +#define IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID 0x08 +#define IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS 0x10 +#define IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE 0x20 +#define IEEE80211_RNR_TBTT_PARAMS_COLOC_AP 0x40 + +struct ieee80211_neighbor_ap_info { + u8 tbtt_info_hdr; + u8 tbtt_info_len; + u8 op_class; + u8 channel; +} __packed; + #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3-59-g8ed1b From 821273a5a502eebaae005557907d122d1e9b7b98 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:27 +0200 Subject: ieee80211: add code to obtain and parse 6 GHz operation field Add some code to obtain and parse the 6 GHz operation field inside the HE operation element. While at it, fix the required length using sizeof() the new struct, which is 5 instead of 4 now. Link: https://lore.kernel.org/r/20200528213443.42ca72c45ca9.Id74bc1b03da9ea6574f9bc70deeb60dfc1634359@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c29184bf9416..2bd9e757167d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2209,6 +2209,28 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) #define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x40000000 #define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x80000000 +/** + * ieee80211_he_6ghz_oper - HE 6 GHz operation Information field + * @primary: primary channel + * @control: control flags + * @ccfs0: channel center frequency segment 0 + * @ccfs1: channel center frequency segment 1 + * @minrate: minimum rate (in 1 Mbps units) + */ +struct ieee80211_he_6ghz_oper { + u8 primary; +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH 0x3 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ 0 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ 1 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ 2 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ 3 +#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON 0x4 + u8 control; + u8 ccfs0; + u8 ccfs1; + u8 minrate; +} __packed; + /* * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size * @he_oper_ie: byte data of the He Operations IE, stating from the byte @@ -2235,7 +2257,7 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) oper_len++; if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO) - oper_len += 4; + oper_len += sizeof(struct ieee80211_he_6ghz_oper); /* Add the first byte (extension ID) to the total length */ oper_len++; @@ -2243,6 +2265,34 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) return oper_len; } +/** + * ieee80211_he_6ghz_oper - obtain 6 GHz operation field + * @he_oper: HE operation element (must be pre-validated for size) + * but may be %NULL + * + * Return: a pointer to the 6 GHz operation field, or %NULL + */ +static inline const struct ieee80211_he_6ghz_oper * +ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) +{ + const u8 *ret = (void *)&he_oper->optional; + u32 he_oper_params; + + if (!he_oper) + return NULL; + + he_oper_params = le32_to_cpu(he_oper->he_oper_params); + + if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)) + return NULL; + if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) + ret += 3; + if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) + ret++; + + return (void *)ret; +} + /* HE Spatial Reuse defines */ #define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT 0x4 #define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT 0x8 -- cgit v1.2.3-59-g8ed1b From 8b30808d9be4183fab17f0b0e68eea88c94ff15a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:28 +0200 Subject: ieee80211: add HE ext EIDs and 6 GHz capability defines Add the HE extended element IDs and the definitions for the HE 6 GHz band capabilities element, from Draft 5.0. Link: https://lore.kernel.org/r/20200528213443.1a6689fe093f.Ifdc5400fb01779351354daf38663ebeea03c9ad9@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2bd9e757167d..9580dfd9e2d1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2839,9 +2839,19 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_UORA = 37, WLAN_EID_EXT_HE_MU_EDCA = 38, WLAN_EID_EXT_HE_SPR = 39, + WLAN_EID_EXT_NDP_FEEDBACK_REPORT_PARAMSET = 41, + WLAN_EID_EXT_BSS_COLOR_CHG_ANN = 42, + WLAN_EID_EXT_QUIET_TIME_PERIOD_SETUP = 43, + WLAN_EID_EXT_ESS_REPORT = 45, + WLAN_EID_EXT_OPS = 46, + WLAN_EID_EXT_HE_BSS_LOAD = 47, WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME = 52, WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION = 55, WLAN_EID_EXT_NON_INHERITANCE = 56, + WLAN_EID_EXT_KNOWN_BSSID = 57, + WLAN_EID_EXT_SHORT_SSID_LIST = 58, + WLAN_EID_EXT_HE_6GHZ_CAPA = 59, + WLAN_EID_EXT_UL_MU_POWER_CAPA = 60, }; /* Action category code */ @@ -3384,6 +3394,24 @@ struct ieee80211_tspec_ie { __le16 medium_time; } __packed; +struct ieee80211_he_6ghz_capa { + /* uses IEEE80211_HE_6GHZ_CAP_* below */ + __le16 capa; +} __packed; + +/* HE 6 GHz band capabilities */ +/* uses enum ieee80211_min_mpdu_spacing values */ +#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START 0x0007 +/* uses enum ieee80211_vht_max_ampdu_length_exp values */ +#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP 0x0038 +/* uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */ +#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN 0x00c0 +/* WLAN_HT_CAP_SM_PS_* values */ +#define IEEE80211_HE_6GHZ_CAP_SM_PS 0x0600 +#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER 0x0800 +#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS 0x1000 +#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS 0x2000 + /** * ieee80211_get_qos_ctl - get pointer to qos control bytes * @hdr: the frame -- cgit v1.2.3-59-g8ed1b From 43e64bf301fd8c54f0082d91c6ffd4de861baf96 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:29 +0200 Subject: cfg80211: handle 6 GHz capability of new station Handle 6 GHz HE capability while adding new station. It will be used later in mac80211 station processing. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-2-git-send-email-rmanohar@codeaurora.org [handle nl80211_set_station, require WME, remove NL80211_HE_6GHZ_CAPABILITY_LEN] Link: https://lore.kernel.org/r/20200528213443.b6b711fd4312.Ic9b97d57b6c4f2b28d4b2d23d2849d8bc20bd8cc@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 5 +++++ net/wireless/nl80211.c | 18 +++++++++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a38653358885..da734ea71b5a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1238,6 +1238,7 @@ struct sta_txpwr { * @he_capa_len: the length of the HE capabilities * @airtime_weight: airtime scheduler weight for this station * @txpwr: transmit power for an associated station + * @he_6ghz_capa: HE 6 GHz Band capabilities of station */ struct station_parameters { const u8 *supported_rates; @@ -1270,6 +1271,7 @@ struct station_parameters { u8 he_capa_len; u16 airtime_weight; struct sta_txpwr txpwr; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c14666b75e57..e42ae429383e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2502,6 +2502,9 @@ enum nl80211_commands { * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. * @NL80211_ATTR_SCAN_FREQ_KHZ: nested attribute with KHz frequencies * + * @NL80211_ATTR_HE_6GHZ_CAPABILITY: HE 6 GHz Band Capability element (from + * association request when used with NL80211_CMD_NEW_STATION). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2982,6 +2985,8 @@ enum nl80211_attrs { NL80211_ATTR_CENTER_FREQ1_OFFSET, NL80211_ATTR_SCAN_FREQ_KHZ, + NL80211_ATTR_HE_6GHZ_CAPABILITY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 22c4d13e28cb..bf8bd8268cb7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -654,6 +654,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, + [NL80211_ATTR_HE_6GHZ_CAPABILITY] = { + .type = NLA_EXACT_LEN, + .len = sizeof(struct ieee80211_he_6ghz_capa), + }, }; /* policy for the key attributes */ @@ -5989,6 +5993,10 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) + params.he_6ghz_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]) params.airtime_weight = nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]); @@ -6123,6 +6131,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) + params.he_6ghz_capa = + nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.opmode_notif_used = true; params.opmode_notif = @@ -6167,10 +6179,14 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.vht_capa = NULL; /* HE requires WME */ - if (params.he_capa_len) + if (params.he_capa_len || params.he_6ghz_capa) return -EINVAL; } + /* Ensure that HT/VHT capabilities are not set for 6 GHz HE STA */ + if (params.he_6ghz_capa && (params.ht_capa || params.vht_capa)) + return -EINVAL; + /* When you run into this, adjust the code below for the new flag */ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); -- cgit v1.2.3-59-g8ed1b From a6cf28e05f0b3bda6ff0c58100324ac91aec6027 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:30 +0200 Subject: mac80211: add HE 6 GHz Band Capabilities into parse extension Handle 6 GHz band capability element parsing for association. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-4-git-send-email-rmanohar@codeaurora.org [some renaming to be in line with previous patches] Link: https://lore.kernel.org/r/20200528213443.a13d7a0b85b0.Ia07584da4fc77aa77c4cc563248d2ce4234ffe5d@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/util.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b7935f3d000d..dac016636d12 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1494,6 +1494,7 @@ struct ieee802_11_elems { const struct ieee80211_he_operation *he_operation; const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 20436c86b9bf..5d2c5ae8aadb 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -936,6 +936,10 @@ static void ieee80211_parse_extension_element(u32 *crc, len >= ieee80211_he_spr_size(data)) elems->he_spr = data; break; + case WLAN_EID_EXT_HE_6GHZ_CAPA: + if (len == sizeof(*elems->he_6ghz_capa)) + elems->he_6ghz_capa = data; + break; } } -- cgit v1.2.3-59-g8ed1b From 223952177296c34d9c8de9cde33204caffe55725 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:31 +0200 Subject: cfg80211: add and expose HE 6 GHz band capabilities These capabilities cover what would otherwise be transported in HT/VHT capabilities, but only a subset thereof that is actually needed on 6 GHz with HE already present. Expose the capabilities to userspace, drivers are expected to set them as using the 6 GHz band (currently) requires HE capability. Link: https://lore.kernel.org/r/20200528213443.244cd5cb9db8.Icd8c773277a88c837e7e3af1d4d1013cc3b66543@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 9 ++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index da734ea71b5a..9b76be3d561a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -354,10 +354,13 @@ struct ieee80211_sta_he_cap { * * @types_mask: interface types mask * @he_cap: holds the HE capabilities + * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a + * 6 GHz band channel (and 0 may be valid value). */ struct ieee80211_sband_iftype_data { u16 types_mask; struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e42ae429383e..5b350d032fa3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3565,6 +3565,8 @@ enum nl80211_mpath_info { * defined in HE capabilities IE * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band HE capability attribute currently * defined + * @NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA: HE 6GHz band capabilities (__le16), + * given for all 6 GHz band channels * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use */ enum nl80211_band_iftype_attr { @@ -3575,6 +3577,7 @@ enum nl80211_band_iftype_attr { NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, /* keep last */ __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bf8bd8268cb7..3a24e6add13e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1562,6 +1562,7 @@ static int nl80211_send_coalesce(struct sk_buff *msg, static int nl80211_send_iftype_data(struct sk_buff *msg, + const struct ieee80211_supported_band *sband, const struct ieee80211_sband_iftype_data *iftdata) { const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; @@ -1585,6 +1586,12 @@ nl80211_send_iftype_data(struct sk_buff *msg, return -ENOBUFS; } + if (sband->band == NL80211_BAND_6GHZ && + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, + sizeof(iftdata->he_6ghz_capa), + &iftdata->he_6ghz_capa)) + return -ENOBUFS; + return 0; } @@ -1633,7 +1640,7 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, if (!iftdata) return -ENOBUFS; - err = nl80211_send_iftype_data(msg, + err = nl80211_send_iftype_data(msg, sband, &sband->iftype_data[i]); if (err) return err; -- cgit v1.2.3-59-g8ed1b From 24a2042cb22fdfc7feef0df9622f0d9d71b8ced1 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:32 +0200 Subject: mac80211: add HE 6 GHz Band Capability element Construct HE 6 GHz band capability element (IEEE 802.11ax/D6.0, 9.4.2.261) for association request and mesh beacon. The 6 GHz capability information is passed by driver through iftypes caps. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-7-git-send-email-rmanohar@codeaurora.org [handle SMPS, adjust for previous patches, reserve SKB space properly, change to handle SKB directly] Link: https://lore.kernel.org/r/20200528213443.643aa8101111.I3f9747c1147480f65445f13eda5c4a5ed4e86757@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/mesh.c | 9 +++++++++ net/mac80211/mesh.h | 2 ++ net/mac80211/mesh_plink.c | 4 +++- net/mac80211/mlme.c | 3 +++ net/mac80211/util.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 65 insertions(+), 1 deletion(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index dac016636d12..344ea828e806 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2177,6 +2177,8 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype); u8 *ieee80211_ie_build_he_cap(u8 *pos, const struct ieee80211_sta_he_cap *he_cap, u8 *end); +void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); u8 *ieee80211_ie_build_he_oper(u8 *pos); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 5930d07b1e43..5e8d72bdbb98 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -587,6 +587,13 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, return 0; } +int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + ieee80211_ie_build_he_6ghz_cap(sdata, skb); + return 0; +} + static void ieee80211_mesh_path_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = @@ -766,6 +773,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + ifmsh->ie_len; bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL); @@ -885,6 +893,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) mesh_add_vht_oper_ie(sdata, skb) || mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || mesh_add_he_oper_ie(sdata, skb) || + mesh_add_he_6ghz_cap_ie(sdata, skb) || mesh_add_vendor_ies(sdata, skb)) goto out_free; diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 953f720754e8..40492d1bd8fd 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -222,6 +222,8 @@ int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u8 ie_len); int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 737c5f4dbf52..3aca89c97f36 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -238,6 +238,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); if (!skb) @@ -328,7 +329,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, mesh_add_vht_cap_ie(sdata, skb) || mesh_add_vht_oper_ie(sdata, skb) || mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || - mesh_add_he_oper_ie(sdata, skb)) + mesh_add_he_oper_ie(sdata, skb) || + mesh_add_he_6ghz_cap_ie(sdata, skb)) goto free; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a259b4487b60..f6ddce646f18 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -658,6 +658,8 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, he_cap->he_cap_elem.phy_cap_info); pos = skb_put(skb, he_cap_size); ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size); + + ieee80211_ie_build_he_6ghz_cap(sdata, skb); } static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) @@ -731,6 +733,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */ sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9, /* WMM */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5d2c5ae8aadb..048b38546a56 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2839,6 +2839,52 @@ end: return pos; } +void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_supported_band *sband; + const struct ieee80211_sband_iftype_data *iftd; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + u8 *pos; + u16 cap; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + + iftd = ieee80211_get_sband_iftype_data(sband, iftype); + if (WARN_ON(!iftd)) + return; + + cap = le16_to_cpu(iftd->he_6ghz_capa.capa); + cap &= ~IEEE80211_HE_6GHZ_CAP_SM_PS; + + switch (sdata->smps_mode) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + /* fall through */ + case IEEE80211_SMPS_OFF: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + case IEEE80211_SMPS_STATIC: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + case IEEE80211_SMPS_DYNAMIC: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + } + + pos = skb_put(skb, 2 + 1 + sizeof(cap)); + *pos++ = WLAN_EID_EXTENSION; + *pos++ = 1 + sizeof(cap); + *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; + put_unaligned_le16(cap, pos); +} + u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) -- cgit v1.2.3-59-g8ed1b From d1b7524b3ea140e552658485d4e9dce5ee2953e1 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:33 +0200 Subject: mac80211: build HE operation with 6 GHz oper information Add 6 GHz operation information (IEEE 802.11ax/D6.0, Figure 9-787k) while building HE operation element for non-HE AP. This field is used to determine channel information in the absence of HT/VHT IEs. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-8-git-send-email-rmanohar@codeaurora.org [fix skb allocation size] Link: https://lore.kernel.org/r/20200528193455.76796-1-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mesh.c | 12 ++++++--- net/mac80211/mesh_plink.c | 1 + net/mac80211/util.c | 65 +++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 72 insertions(+), 8 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 344ea828e806..9f874ce500f6 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2179,7 +2179,7 @@ u8 *ieee80211_ie_build_he_cap(u8 *pos, u8 *end); void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); -u8 *ieee80211_ie_build_he_oper(u8 *pos); +u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 5e8d72bdbb98..5f3d45474db6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -565,6 +565,7 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; + u32 len; u8 *pos; sband = ieee80211_get_sband(sdata); @@ -578,11 +579,15 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) return 0; - if (skb_tailroom(skb) < 2 + 1 + sizeof(struct ieee80211_he_operation)) + len = 2 + 1 + sizeof(struct ieee80211_he_operation); + if (sdata->vif.bss_conf.chandef.chan->band == NL80211_BAND_6GHZ) + len += sizeof(struct ieee80211_he_6ghz_oper); + + if (skb_tailroom(skb) < len) return -ENOMEM; - pos = skb_put(skb, 2 + 1 + sizeof(struct ieee80211_he_operation)); - ieee80211_ie_build_he_oper(pos); + pos = skb_put(skb, len); + ieee80211_ie_build_he_oper(pos, &sdata->vif.bss_conf.chandef); return 0; } @@ -773,6 +778,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + sizeof(struct ieee80211_he_6ghz_oper) + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + ifmsh->ie_len; diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 3aca89c97f36..fbbfc5d4a51c 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -238,6 +238,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + sizeof(struct ieee80211_he_6ghz_oper) + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 048b38546a56..87dd003dbdf2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3008,13 +3008,18 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos + sizeof(struct ieee80211_vht_operation); } -u8 *ieee80211_ie_build_he_oper(u8 *pos) +u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; + struct ieee80211_he_6ghz_oper *he_6ghz_op; u32 he_oper_params; + u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); + + if (chandef->chan->band == NL80211_BAND_6GHZ) + ie_len += sizeof(struct ieee80211_he_6ghz_oper); *pos++ = WLAN_EID_EXTENSION; - *pos++ = 1 + sizeof(struct ieee80211_he_operation); + *pos++ = ie_len; *pos++ = WLAN_EID_EXT_HE_OPERATION; he_oper_params = 0; @@ -3024,16 +3029,68 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos) IEEE80211_HE_OPERATION_ER_SU_DISABLE); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); + if (chandef->chan->band == NL80211_BAND_6GHZ) + he_oper_params |= u32_encode_bits(1, + IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he_oper = (struct ieee80211_he_operation *)pos; he_oper->he_oper_params = cpu_to_le32(he_oper_params); /* don't require special HE peer rates */ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); + pos += sizeof(struct ieee80211_he_operation); - /* TODO add VHT operational and 6GHz operational subelement? */ + if (chandef->chan->band != NL80211_BAND_6GHZ) + goto out; - return pos + sizeof(struct ieee80211_vht_operation); + /* TODO add VHT operational */ + he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; + he_6ghz_op->minrate = 6; /* 6 Mbps */ + he_6ghz_op->primary = + ieee80211_frequency_to_channel(chandef->chan->center_freq); + he_6ghz_op->ccfs0 = + ieee80211_frequency_to_channel(chandef->center_freq1); + if (chandef->center_freq2) + he_6ghz_op->ccfs1 = + ieee80211_frequency_to_channel(chandef->center_freq2); + else + he_6ghz_op->ccfs1 = 0; + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_160: + /* Convert 160 MHz channel width to new style as interop + * workaround. + */ + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; + he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; + if (chandef->chan->center_freq < chandef->center_freq1) + he_6ghz_op->ccfs0 -= 8; + else + he_6ghz_op->ccfs0 += 8; + fallthrough; + case NL80211_CHAN_WIDTH_80P80: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; + break; + case NL80211_CHAN_WIDTH_80: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; + break; + case NL80211_CHAN_WIDTH_40: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; + break; + default: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; + break; + } + + pos += sizeof(struct ieee80211_he_6ghz_oper); + +out: + return pos; } bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, -- cgit v1.2.3-59-g8ed1b From 607ca9ea3462719e256b60b24286f984e0d48c9b Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:34 +0200 Subject: mac80211: do not allow HT/VHT IEs in 6 GHz mesh mode As HT/VHT elements are not allowed in 6 GHz band, do not include them in mesh beacon template formation. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-9-git-send-email-rmanohar@codeaurora.org Link: https://lore.kernel.org/r/20200528193455.76796-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- net/mac80211/mesh.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 5f3d45474db6..79e0a90982dd 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -415,6 +415,10 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, if (!sband) return -EINVAL; + /* HT not allowed in 6 GHz */ + if (sband->band == NL80211_BAND_6GHZ) + return 0; + if (!sband->ht_cap.ht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || @@ -452,6 +456,10 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[channel->band]; ht_cap = &sband->ht_cap; + /* HT not allowed in 6 GHz */ + if (sband->band == NL80211_BAND_6GHZ) + return 0; + if (!ht_cap->ht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || @@ -479,6 +487,10 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, if (!sband) return -EINVAL; + /* VHT not allowed in 6 GHz */ + if (sband->band == NL80211_BAND_6GHZ) + return 0; + if (!sband->vht_cap.vht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || @@ -516,6 +528,10 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[channel->band]; vht_cap = &sband->vht_cap; + /* VHT not allowed in 6 GHz */ + if (sband->band == NL80211_BAND_6GHZ) + return 0; + if (!vht_cap->vht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || -- cgit v1.2.3-59-g8ed1b From 2a333a0db24e37daa2e4eb9a542c07deda44ca5a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:35 +0200 Subject: mac80211: avoid using ext NSS high BW if not supported If the AP advertises inconsistent data, namely it has CCFS1 or CCFS2, but doesn't advertise support for 160/80+80 bandwidth or "Extended NSS BW Support", then we cannot use any MCSes in the the higher bandwidth. Thus, avoid connecting with higher bandwidth since it's less efficient that way. Link: https://lore.kernel.org/r/20200528213443.0e55d40c3ccc.I6fd0b4708ebd087e5e46466c3e91f6efbcbef668@changeid Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 11 +++++++++-- net/mac80211/ieee80211_i.h | 6 +++++- net/mac80211/mesh.c | 16 ++++++++++++--- net/mac80211/mlme.c | 25 +++++++++++++++++------ net/mac80211/scan.c | 6 ++++++ net/mac80211/spectmgmt.c | 4 +++- net/mac80211/util.c | 49 ++++++++++++++++++++++++++++++++++++++++++---- 7 files changed, 100 insertions(+), 17 deletions(-) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 2479cd48fed0..81d26fef41e9 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2019 Intel Corporation + * Copyright(c) 2018-2020 Intel Corporation */ #include @@ -781,6 +781,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, enum nl80211_channel_type ch_type; int err; u32 sta_flags; + u32 vht_cap_info = 0; sdata_assert_lock(sdata); @@ -798,9 +799,13 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, break; } + if (elems->vht_cap_elem) + vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, ifibss->chandef.chan->band, + vht_cap_info, sta_flags, ifibss->bssid, &csa_ie); /* can't switch to destination channel, fail */ if (err < 0) @@ -1060,8 +1065,10 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, /* we both use VHT */ struct ieee80211_vht_cap cap_ie; struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; + u32 vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); - ieee80211_chandef_vht_oper(&local->hw, + ieee80211_chandef_vht_oper(&local->hw, vht_cap_info, elems->vht_operation, elems->ht_operation, &chandef); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9f874ce500f6..0cc584574976 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -111,6 +111,8 @@ struct ieee80211_bss { size_t supp_rates_len; struct ieee80211_rate *beacon_rate; + u32 vht_cap_info; + /* * During association, we save an ERP value from a probe response so * that we can feed ERP info to the driver when handling the @@ -1915,6 +1917,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, * @sdata: the sdata of the interface which has received the frame * @elems: parsed 802.11 elements received with the frame * @current_band: indicates the current band + * @vht_cap_info: VHT capabilities of the transmitter * @sta_flags: contains information about own capabilities and restrictions * to decide which channel switch announcements can be accepted. Only the * following subset of &enum ieee80211_sta_flags are evaluated: @@ -1929,6 +1932,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, + u32 vht_cap_info, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie); @@ -2194,7 +2198,7 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 79e0a90982dd..696d6fb322e6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation * Authors: Luis Carlos Cobo * Javier Cardona */ @@ -63,6 +63,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, u32 basic_rates = 0; struct cfg80211_chan_def sta_chan_def; struct ieee80211_supported_band *sband; + u32 vht_cap_info = 0; /* * As support for each feature is added, check for matching @@ -96,7 +97,11 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan, NL80211_CHAN_NO_HT); ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def); - ieee80211_chandef_vht_oper(&sdata->local->hw, + + if (ie->vht_cap_elem) + vht_cap_info = le32_to_cpu(ie->vht_cap_elem->vht_cap_info); + + ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, ie->vht_operation, ie->ht_operation, &sta_chan_def); @@ -1076,7 +1081,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_supported_band *sband; int err; - u32 sta_flags; + u32 sta_flags, vht_cap_info = 0; sdata_assert_lock(sdata); @@ -1099,8 +1104,13 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, break; } + if (elems->vht_cap_elem) + vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, sband->band, + vht_cap_info, sta_flags, sdata->vif.addr, &csa_ie); if (err < 0) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f6ddce646f18..1f9c48414c85 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -145,6 +145,7 @@ static u32 ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, + u32 vht_cap_info, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const struct ieee80211_he_operation *he_oper, @@ -223,7 +224,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, memcpy(&he_oper_vht_cap, he_oper->optional, 3); he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); - if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &he_oper_vht_cap, ht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) @@ -232,8 +233,10 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, ret = IEEE80211_STA_DISABLE_HE; goto out; } - } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_oper, - ht_oper, &vht_chandef)) { + } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + vht_cap_info, + vht_oper, ht_oper, + &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); @@ -329,6 +332,7 @@ out: static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, const struct ieee80211_ht_cap *ht_cap, + const struct ieee80211_vht_cap *vht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const struct ieee80211_he_operation *he_oper, @@ -343,6 +347,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, u16 ht_opmode; u32 flags; enum ieee80211_sta_rx_bandwidth new_sta_bw; + u32 vht_cap_info = 0; int ret; /* if HT was/is disabled, don't track any bandwidth changes */ @@ -371,8 +376,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.ht_operation_mode = ht_opmode; } + if (vht_cap) + vht_cap_info = le32_to_cpu(vht_cap->vht_cap_info); + /* calculate new channel (type) based on HT/VHT/HE operation IEs */ - flags = ieee80211_determine_chantype(sdata, sband, chan, + flags = ieee80211_determine_chantype(sdata, sband, chan, vht_cap_info, ht_oper, vht_oper, he_oper, &chandef, true); @@ -1327,6 +1335,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, enum nl80211_band current_band; struct ieee80211_csa_ie csa_ie; struct ieee80211_channel_switch ch_switch; + struct ieee80211_bss *bss; int res; sdata_assert_lock(sdata); @@ -1338,7 +1347,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; current_band = cbss->channel->band; + bss = (void *)cbss->priv; res = ieee80211_parse_ch_switch_ie(sdata, elems, current_band, + bss->vht_cap_info, ifmgd->flags, ifmgd->associated->bssid, &csa_ie); @@ -4097,8 +4108,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); - if (ieee80211_config_bw(sdata, sta, - elems.ht_cap_elem, elems.ht_operation, + if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, + elems.vht_cap_elem, elems.ht_operation, elems.vht_operation, elems.he_operation, bssid, &changed)) { mutex_unlock(&local->sta_mtx); @@ -4815,6 +4826,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; + struct ieee80211_bss *bss = (void *)cbss->priv; int ret; u32 i; bool have_80mhz; @@ -4913,6 +4925,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, + bss->vht_cap_info, ht_oper, vht_oper, he_oper, &chandef, false); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 5db15996524f..d0c2e8012118 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -132,6 +132,12 @@ ieee80211_update_bss_from_elems(struct ieee80211_local *local, bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } + + if (elems->vht_cap_elem) + bss->vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + else + bss->vht_cap_info = 0; } struct ieee80211_bss * diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 5fe2b645912f..ae1cb2c68722 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018, 2020 Intel Corporation */ #include @@ -22,6 +22,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, + u32 vht_cap_info, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie) { @@ -150,6 +151,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, /* ignore if parsing fails */ if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + vht_cap_info, &vht_oper, &ht_oper, &new_vht_chandef)) new_vht_chandef.chan = NULL; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 87dd003dbdf2..e6104829fa1c 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3120,7 +3120,7 @@ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, return true; } -bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) @@ -3132,6 +3132,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap; bool support_80_80 = false; bool support_160 = false; + u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info, + IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); + u8 supp_chwidth = u32_get_bits(vht_cap_info, + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); if (!oper || !htop) return false; @@ -3151,11 +3155,48 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, IEEE80211_HT_OP_MODE_CCFS2_MASK) >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; - /* when parsing (and we know how to) CCFS1 and CCFS2 are equivalent */ ccf0 = ccfs0; - ccf1 = ccfs1; - if (!ccfs1 && ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) + + /* if not supported, parse as though we didn't understand it */ + if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) + ext_nss_bw_supp = 0; + + /* + * Cf. IEEE 802.11 Table 9-250 + * + * We really just consider that because it's inefficient to connect + * at a higher bandwidth than we'll actually be able to use. + */ + switch ((supp_chwidth << 4) | ext_nss_bw_supp) { + default: + case 0x00: + ccf1 = 0; + support_160 = false; + support_80_80 = false; + break; + case 0x01: + support_80_80 = false; + /* fall through */ + case 0x02: + case 0x03: ccf1 = ccfs2; + break; + case 0x10: + ccf1 = ccfs1; + break; + case 0x11: + case 0x12: + if (!ccfs1) + ccf1 = ccfs2; + else + ccf1 = ccfs1; + break; + case 0x13: + case 0x20: + case 0x23: + ccf1 = ccfs1; + break; + } cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); -- cgit v1.2.3-59-g8ed1b From 57fa5e85d53ce51e0cb06a7f320b79377d0fbe5f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:36 +0200 Subject: mac80211: determine chandef from HE 6 GHz operation Support connecting to HE 6 GHz APs and mesh networks on 6 GHz, where the HT/VHT information is missing but instead the HE 6 GHz band capability is present, and the 6 GHz Operation information field is used to encode the channel configuration instead of the HT/VHT operation elements. Also add some other bits needed to connect to 6 GHz networks. Link: https://lore.kernel.org/r/1589399105-25472-10-git-send-email-rmanohar@codeaurora.org Co-developed-by: Rajkumar Manoharan Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/20200528213443.25687d2695bc.I3f9747c1147480f65445f13eda5c4a5ed4e86757@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 ++ net/mac80211/mesh.c | 1 + net/mac80211/mlme.c | 69 +++++++++++++++++++++-------- net/mac80211/util.c | 106 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 19 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 0cc584574976..6cac5bf7cba3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2202,6 +2202,9 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_he_operation *he_oper, + struct cfg80211_chan_def *chandef); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); int __must_check diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 696d6fb322e6..5f1ca25b6c97 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -104,6 +104,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, ie->vht_operation, ie->ht_operation, &sta_chan_def); + ieee80211_chandef_he_6ghz_oper(sdata, ie->he_operation, &sta_chan_def); if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1f9c48414c85..bc558d1d20fc 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -156,15 +156,24 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap sta_ht_cap; u32 ht_cfreq, ret; - memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); - ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); - memset(chandef, 0, sizeof(struct cfg80211_chan_def)); chandef->chan = channel; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = channel->center_freq; chandef->freq1_offset = channel->freq_offset; + if (channel->band == NL80211_BAND_6GHZ) { + if (!ieee80211_chandef_he_6ghz_oper(sdata, he_oper, chandef)) + ret = IEEE80211_STA_DISABLE_HT | + IEEE80211_STA_DISABLE_VHT | + IEEE80211_STA_DISABLE_HE; + vht_chandef = *chandef; + goto out; + } + + memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); + ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT | @@ -914,7 +923,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))) ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + if (sband->band != NL80211_BAND_6GHZ && + !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_add_ht_ie(sdata, skb, assoc_data->ap_ht_param, sband, chan, sdata->smps_mode); @@ -968,7 +978,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) offset = noffset; } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) + if (sband->band != NL80211_BAND_6GHZ && + !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); @@ -3248,6 +3259,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; + bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; u32 changed = 0; int err; bool ret; @@ -3289,11 +3301,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ - if ((assoc_data->wmm && !elems->wmm_param) || - (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems->ht_cap_elem || !elems->ht_operation)) || - (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems->vht_cap_elem || !elems->vht_operation))) { + if (!is_6ghz && + ((assoc_data->wmm && !elems->wmm_param) || + (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && + (!elems->ht_cap_elem || !elems->ht_operation)) || + (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems bss_elems; @@ -3351,7 +3364,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * We previously checked these in the beacon/probe response, so * they should be present here. This is just a safety net. */ - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && + if (!is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); @@ -3359,7 +3372,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + if (!is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); @@ -3367,6 +3380,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } + if (is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + !elems->he_6ghz_capa) { + sdata_info(sdata, + "HE 6 GHz AP is missing HE 6 GHz band capability\n"); + ret = false; + goto out; + } + mutex_lock(&sdata->local->sta_mtx); /* * station info was already allocated and inserted before @@ -4826,6 +4847,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; + bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_bss *bss = (void *)cbss->priv; int ret; u32 i; @@ -4838,21 +4860,23 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, IEEE80211_STA_DISABLE_160MHZ); /* disable HT/VHT/HE if we don't support them */ - if (!sband->ht_cap.ht_supported) { + if (!sband->ht_cap.ht_supported && !is_6ghz) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } - if (!sband->vht_cap.vht_supported) + if (!sband->vht_cap.vht_supported && !is_6ghz) { ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + } if (!ieee80211_get_he_sta_cap(sband)) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; rcu_read_lock(); - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && !is_6ghz) { const u8 *ht_oper_ie, *ht_cap_ie; ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION); @@ -4869,7 +4893,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && !is_6ghz) { const u8 *vht_oper_ie, *vht_cap; vht_oper_ie = ieee80211_bss_get_ie(cbss, @@ -4934,6 +4958,11 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); + if (ifmgd->flags & IEEE80211_STA_DISABLE_HE && is_6ghz) { + sdata_info(sdata, "Rejecting non-HE 6/7 GHz connection"); + return -EINVAL; + } + /* will change later if needed */ sdata->smps_mode = IEEE80211_SMPS_OFF; @@ -5315,6 +5344,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req) { + bool is_6ghz = req->bss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)req->bss->priv; @@ -5457,14 +5487,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (ht_ie && ht_ie[1] >= sizeof(struct ieee80211_ht_operation)) assoc_data->ap_ht_param = ((struct ieee80211_ht_operation *)(ht_ie + 2))->ht_param; - else + else if (!is_6ghz) ifmgd->flags |= IEEE80211_STA_DISABLE_HT; vht_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_VHT_CAPABILITY); if (vht_ie && vht_ie[1] >= sizeof(struct ieee80211_vht_cap)) memcpy(&assoc_data->ap_vht_cap, vht_ie + 2, sizeof(struct ieee80211_vht_cap)); - else - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + else if (!is_6ghz) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT | + IEEE80211_STA_DISABLE_HE; rcu_read_unlock(); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && diff --git a/net/mac80211/util.c b/net/mac80211/util.c index e6104829fa1c..cbe24d303f0d 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3244,6 +3244,112 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, return true; } +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_he_operation *he_oper, + struct cfg80211_chan_def *chandef) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + const struct ieee80211_sta_he_cap *he_cap; + struct cfg80211_chan_def he_chandef = *chandef; + const struct ieee80211_he_6ghz_oper *he_6ghz_oper; + bool support_80_80, support_160; + u8 he_phy_cap; + u32 freq; + + if (chandef->chan->band != NL80211_BAND_6GHZ) + return true; + + sband = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; + + he_cap = ieee80211_get_he_iftype_cap(sband, iftype); + if (!he_cap) { + sdata_info(sdata, "Missing iftype sband data/HE cap"); + return false; + } + + he_phy_cap = he_cap->he_cap_elem.phy_cap_info[0]; + support_160 = + he_phy_cap & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; + support_80_80 = + he_phy_cap & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; + + if (!he_oper) { + sdata_info(sdata, + "HE is not advertised on (on %d MHz), expect issues\n", + chandef->chan->center_freq); + return false; + } + + he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); + + if (!he_6ghz_oper) { + sdata_info(sdata, + "HE 6GHz operation missing (on %d MHz), expect issues\n", + chandef->chan->center_freq); + return false; + } + + freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, + NL80211_BAND_6GHZ); + he_chandef.chan = ieee80211_get_channel(sdata->local->hw.wiphy, freq); + + switch (u8_get_bits(he_6ghz_oper->control, + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_20; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_40; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_80; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_80; + if (!he_6ghz_oper->ccfs1) + break; + if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) { + if (support_160) + he_chandef.width = NL80211_CHAN_WIDTH_160; + } else { + if (support_80_80) + he_chandef.width = NL80211_CHAN_WIDTH_80P80; + } + break; + } + + if (he_chandef.width == NL80211_CHAN_WIDTH_160) { + he_chandef.center_freq1 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, + NL80211_BAND_6GHZ); + } else { + he_chandef.center_freq1 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, + NL80211_BAND_6GHZ); + he_chandef.center_freq2 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, + NL80211_BAND_6GHZ); + } + + if (!cfg80211_chandef_valid(&he_chandef)) { + sdata_info(sdata, + "HE 6GHz operation resulted in invalid chandef: %d MHz/%d/%d MHz/%d MHz\n", + he_chandef.chan ? he_chandef.chan->center_freq : 0, + he_chandef.width, + he_chandef.center_freq1, + he_chandef.center_freq2); + return false; + } + + *chandef = he_chandef; + + return true; +} + int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) -- cgit v1.2.3-59-g8ed1b From 3b3ec3d52e8f72ec8c40477b96f23440a89000be Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Thu, 28 May 2020 21:34:37 +0200 Subject: mac80211: check the correct bit for EMA AP An AP supporting EMA (Enhanced Multi-BSSID advertisement) should set bit 83 in the extended capabilities IE (9.4.2.26 in the 802.11ax D5 spec). So the *3rd* bit of the 10th byte should be checked. Also, in one place, the wrong byte was checked. (cfg80211_find_ie returns a pointer to the beginning of the IE, so the data really starts at ie[2], so the 10th byte should be ie[12]. To avoid this confusion, use cfg80211_find_elem instead). Signed-off-by: Shaul Triebitz Link: https://lore.kernel.org/r/20200528213443.4316121fa2a3.I9745582f8d41ad8e689dac0fefcd70b276d7c1ea@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- net/mac80211/mlme.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 9580dfd9e2d1..1ecfd19f836d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3082,7 +3082,7 @@ enum ieee80211_tdls_actioncode { #define WLAN_EXT_CAPA10_OBSS_NARROW_BW_RU_TOLERANCE_SUPPORT BIT(7) /* Defines support for enhanced multi-bssid advertisement*/ -#define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(1) +#define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(3) /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index bc558d1d20fc..c534cd1bb9cd 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5596,7 +5596,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->timeout_started = true; assoc_data->need_beacon = true; } else if (beacon_ies) { - const u8 *ie; + const struct element *elem; u8 dtim_count = 0; ieee80211_get_dtim(beacon_ies, &dtim_count, @@ -5613,15 +5613,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.sync_dtim_count = dtim_count; } - ie = cfg80211_find_ext_ie(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 3) - sdata->vif.bss_conf.profile_periodicity = ie[4]; + elem = cfg80211_find_ext_elem(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 3) + sdata->vif.bss_conf.profile_periodicity = elem->data[2]; - ie = cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 11 && - (ie[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) + elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 11 && + (elem->data[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) sdata->vif.bss_conf.ema_ap = true; } else { assoc_data->timeout = jiffies; -- cgit v1.2.3-59-g8ed1b From 1bb9a8a4c81d0305c511a0919cd30ebfa91915ae Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:38 +0200 Subject: mac80211: use HE 6 GHz band capability and pass it to the driver In order to handle 6 GHz AP side, take the HE 6 GHz band capability data and pass it to the driver (which needs it for A-MPDU spacing and A-MPDU length). Link: https://lore.kernel.org/r/1589399105-25472-6-git-send-email-rmanohar@codeaurora.org Co-developed-by: Rajkumar Manoharan Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/20200528213443.784e4890d82f.I5f1230d5ab27e84e7bbe88e3645b24ea15a0c146@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 +++- net/mac80211/cfg.c | 4 +++- net/mac80211/he.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 1 + net/mac80211/mesh_plink.c | 4 +++- net/mac80211/mlme.c | 1 + 6 files changed, 59 insertions(+), 3 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7cb712427df1..11d5610d2ad5 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #ifndef MAC80211_H @@ -1977,6 +1977,7 @@ struct ieee80211_sta_txpwr { * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities * @he_cap: HE capabilities of this STA + * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -2016,6 +2017,7 @@ struct ieee80211_sta { struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 06a2b7640a9d..90a07d075fdb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1520,7 +1520,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, (void *)params->he_capa, - params->he_capa_len, sta); + params->he_capa_len, + (void *)params->he_6ghz_capa, + sta); if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the diff --git a/net/mac80211/he.c b/net/mac80211/he.c index f520552b22be..cc26f239838b 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -8,10 +8,55 @@ #include "ieee80211_i.h" +static void +ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_capa, + struct sta_info *sta) +{ + enum ieee80211_smps_mode smps_mode; + + if (sta->sdata->vif.type == NL80211_IFTYPE_AP || + sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + switch (le16_get_bits(he_6ghz_capa->capa, + IEEE80211_HE_6GHZ_CAP_SM_PS)) { + case WLAN_HT_CAP_SM_PS_INVALID: + case WLAN_HT_CAP_SM_PS_STATIC: + smps_mode = IEEE80211_SMPS_STATIC; + break; + case WLAN_HT_CAP_SM_PS_DYNAMIC: + smps_mode = IEEE80211_SMPS_DYNAMIC; + break; + case WLAN_HT_CAP_SM_PS_DISABLED: + smps_mode = IEEE80211_SMPS_OFF; + break; + } + + sta->sta.smps_mode = smps_mode; + } else { + sta->sta.smps_mode = IEEE80211_SMPS_OFF; + } + + switch (le16_get_bits(he_6ghz_capa->capa, + IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN)) { + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: + default: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; + break; + } + + sta->sta.he_6ghz_capa = *he_6ghz_capa; +} + void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, + const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta) { struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; @@ -53,6 +98,9 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta); sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); + + if (sband->band == NL80211_BAND_6GHZ && he_6ghz_capa) + ieee80211_update_from_he_6ghz_capa(he_6ghz_capa, sta); } void diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6cac5bf7cba3..24dc1fd57000 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1899,6 +1899,7 @@ void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, + const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta); void ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index fbbfc5d4a51c..798e4b6b383f 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -444,7 +444,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, elems->vht_cap_elem, sta); ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, - elems->he_cap_len, sta); + elems->he_cap_len, + elems->he_6ghz_capa, + sta); if (bw != sta->sta.bandwidth) changed |= IEEE80211_RC_BW_CHANGED; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c534cd1bb9cd..8a37089e86bb 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3430,6 +3430,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, elems->he_cap_len, + elems->he_6ghz_capa, sta); bss_conf->he_support = sta->sta.he_cap.has_he; -- cgit v1.2.3-59-g8ed1b From 2ad2274c58ee2dcaf9ccde5c63ff30f59b138f77 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 28 May 2020 21:34:39 +0200 Subject: mac80211: Add HE 6GHz capabilities element to probe request On 6 GHz, the 6 GHz capabilities element should be added, do that. Signed-off-by: Ilan Peer [add commit message] Link: https://lore.kernel.org/r/20200528213443.8ee764f0cde0.I2b0c66b60e11818c97c9803e04a6a197c6376243@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 20 ++++++++++++++++++++ net/mac80211/ieee80211_i.h | 2 +- net/mac80211/scan.c | 17 +++++++++-------- net/mac80211/util.c | 36 ++++++++++++++++++++++++++++-------- 4 files changed, 58 insertions(+), 17 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9b76be3d561a..95b55eea2afb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -512,6 +512,26 @@ ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband) return ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_STATION); } +/** + * ieee80211_get_he_6ghz_capa - return HE 6 GHz capabilities + * @sband: the sband to search for the STA on + * @iftype: the iftype to search for + * + * Return: the 6GHz capabilities + */ +static inline __le16 +ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, + enum nl80211_iftype iftype) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, iftype); + + if (WARN_ON(!data || !data->he_cap.has_he)) + return 0; + + return data->he_6ghz_capa.capa; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 24dc1fd57000..ec1a71ac65f2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2144,7 +2144,7 @@ enum { IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), }; -int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, +int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index d0c2e8012118..ad90bbe57457 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -313,8 +313,9 @@ ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef, } /* return false if no more work */ -static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) +static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; @@ -361,7 +362,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; - ielen = ieee80211_build_preq_ies(local, + ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, @@ -401,9 +402,12 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (WARN_ON(!local->scan_req)) return; + scan_sdata = rcu_dereference_protected(local->scan_sdata, + lockdep_is_held(&local->mtx)); + if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && - ieee80211_prep_hw_scan(local)) { + ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, @@ -432,9 +436,6 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) cfg80211_scan_done(scan_req, &local->scan_info); } RCU_INIT_POINTER(local->scan_req, NULL); - - scan_sdata = rcu_dereference_protected(local->scan_sdata, - lockdep_is_held(&local->mtx)); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; @@ -776,7 +777,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_idle(local); if (hw_scan) { - WARN_ON(!ieee80211_prep_hw_scan(local)); + WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); @@ -1274,7 +1275,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_prepare_scan_chandef(&chandef, req->scan_width); - ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, + ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index cbe24d303f0d..21c94094a699 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1663,7 +1663,20 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, } } -static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, +static u8 *ieee80211_write_he_6ghz_cap(u8 *pos, __le16 cap, u8 *end) +{ + if ((end - pos) < 5) + return pos; + + *pos++ = WLAN_EID_EXTENSION; + *pos++ = 1 + sizeof(cap); + *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; + memcpy(pos, &cap, sizeof(cap)); + + return pos + 2; +} + +static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, const u8 *ie, size_t ie_len, enum nl80211_band band, @@ -1671,6 +1684,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, struct cfg80211_chan_def *chandef, size_t *offset, u32 flags) { + struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; const struct ieee80211_sta_he_cap *he_cap; u8 *pos = buffer, *end = buffer + buffer_len; @@ -1848,6 +1862,14 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, pos = ieee80211_ie_build_he_cap(pos, he_cap, end); if (!pos) goto out_err; + + if (sband->band == NL80211_BAND_6GHZ) { + enum nl80211_iftype iftype = + ieee80211_vif_type_p2p(&sdata->vif); + __le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype); + + pos = ieee80211_write_he_6ghz_cap(pos, cap, end); + } } /* @@ -1862,7 +1884,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, return pos - buffer; } -int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, +int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, @@ -1877,7 +1899,7 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { - pos += ieee80211_build_preq_ies_band(local, + pos += ieee80211_build_preq_ies_band(sdata, buffer + pos, buffer_len - pos, ie, ie_len, i, @@ -1939,7 +1961,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, return NULL; rate_masks[chan->band] = ratemask; - ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb), + ies_len = ieee80211_build_preq_ies(sdata, skb_tail_pointer(skb), skb_tailroom(skb), &dummy_ie_desc, ie, ie_len, BIT(chan->band), rate_masks, &chandef, flags); @@ -2879,10 +2901,8 @@ void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, } pos = skb_put(skb, 2 + 1 + sizeof(cap)); - *pos++ = WLAN_EID_EXTENSION; - *pos++ = 1 + sizeof(cap); - *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; - put_unaligned_le16(cap, pos); + ieee80211_write_he_6ghz_cap(pos, cpu_to_le16(cap), + pos + 2 + 1 + sizeof(cap)); } u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, -- cgit v1.2.3-59-g8ed1b From ba8f6a037f790147438173029799f54c9d3065f2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:40 +0200 Subject: cfg80211: treat 6 GHz channels as valid regardless of capability If a 6 GHz channel exists, then we can probably safely assume that the device actually supports it, and then it should support most bandwidths. This will probably need to be extended to check the interface type and then dig into the HE capabilities for that though, to have the correct bandwidth check. Link: https://lore.kernel.org/r/20200528213443.d4864ef52e92.I82f09b2b14a56413ce20376d09967fe954a033eb@changeid Signed-off-by: Johannes Berg --- net/wireless/chan.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/net/wireless/chan.c b/net/wireless/chan.c index e111c08daa0e..cddf92c5d09e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,7 +6,7 @@ * * Copyright 2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2018 Intel Corporation + * Copyright 2018-2020 Intel Corporation */ #include @@ -919,7 +919,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, width = 10; break; case NL80211_CHAN_WIDTH_20: - if (!ht_cap->ht_supported) + if (!ht_cap->ht_supported && + chandef->chan->band != NL80211_BAND_6GHZ) return false; /* fall through */ case NL80211_CHAN_WIDTH_20_NOHT: @@ -928,6 +929,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_40: width = 40; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; if (!ht_cap->ht_supported) return false; if (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) || @@ -942,24 +945,29 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_80P80: cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; - if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) + if (chandef->chan->band != NL80211_BAND_6GHZ && + cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return false; /* fall through */ case NL80211_CHAN_WIDTH_80: - if (!vht_cap->vht_supported) - return false; prohibited_flags |= IEEE80211_CHAN_NO_80MHZ; width = 80; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; + if (!vht_cap->vht_supported) + return false; break; case NL80211_CHAN_WIDTH_160: + prohibited_flags |= IEEE80211_CHAN_NO_160MHZ; + width = 160; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; if (!vht_cap->vht_supported) return false; cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return false; - prohibited_flags |= IEEE80211_CHAN_NO_160MHZ; - width = 160; break; default: WARN_ON_ONCE(1); -- cgit v1.2.3-59-g8ed1b From 461ce35d5535c1479384f67fcf4bfc3f3610edca Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:41 +0200 Subject: cfg80211: reject HT/VHT capabilities on 6 GHz band On the 6 GHz band, HE should be used, but without any direct HT/VHT capabilities, instead the HE 6 GHz band capabilities will capture the relevant information. Reject HT/VHT capabilities here. Link: https://lore.kernel.org/r/20200528213443.bfe89c35459a.Ibba5e066fa0087fd49d13cfee89d196ea0c68ae2@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/wireless/core.c b/net/wireless/core.c index b795f363d004..1651f86db6ca 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -807,6 +807,11 @@ int wiphy_register(struct wiphy *wiphy) !sband->n_bitrates)) return -EINVAL; + if (WARN_ON(band == NL80211_BAND_6GHZ && + (sband->ht_cap.ht_supported || + sband->vht_cap.vht_supported))) + return -EINVAL; + /* * Since cfg80211_disable_40mhz_24ghz is global, we can * modify the sband's ht data even if the driver uses a -- cgit v1.2.3-59-g8ed1b From f438136528482f98535889c9a6f99bbacdd92870 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:42 +0200 Subject: cfg80211: require HE capabilities for 6 GHz band On 6 GHz band, HE capabilities must be available for all of the interface types, otherwise we shouldn't use 6 GHz. Check this. Link: https://lore.kernel.org/r/20200528213443.5881cb3c8c4a.I583b54172f91f98d44af64a16c5826fe458cbb27@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/wireless/core.c b/net/wireless/core.c index 1651f86db6ca..5b6714460490 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -791,6 +791,7 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { u16 types = 0; + bool have_he = false; sband = wiphy->bands[band]; if (!sband) @@ -859,8 +860,17 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; types |= iftd->types_mask; + + if (i == 0) + have_he = iftd->he_cap.has_he; + else + have_he = have_he && + iftd->he_cap.has_he; } + if (WARN_ON(!have_he && band == NL80211_BAND_6GHZ)) + return -EINVAL; + have_band = true; } -- cgit v1.2.3-59-g8ed1b From 93382a0d119b3ab95e3ebca51ea15aa87187b493 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:44 +0200 Subject: mac80211: accept aggregation sessions on 6 GHz On 6 GHz, stations don't have ht_supported set, but they can still do aggregation since they must have HE, allow that. Link: https://lore.kernel.org/r/20200528213443.776d3c891b64.Ifa099d450617b50c691832b3c4aa08959fab520a@changeid Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 5 +++-- net/mac80211/agg-tx.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 4d1c335e06e5..7f245e9f114c 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ /** @@ -292,7 +292,8 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } - if (!sta->sta.ht_cap.ht_supported) { + if (!sta->sta.ht_cap.ht_supported && + sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", sta->sta.addr, tid); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index c2d5f512526d..b37c8a983d88 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -593,7 +593,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, "Requested to start BA session on reserved tid=%d", tid)) return -EINVAL; - if (!pubsta->ht_cap.ht_supported) + if (!pubsta->ht_cap.ht_supported && + sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) return -EINVAL; if (WARN_ON_ONCE(!local->ops->ampdu_action)) -- cgit v1.2.3-59-g8ed1b From 6fcb56ce0f9036b08d1c3e35ff93b100d499771b Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 28 May 2020 21:34:45 +0200 Subject: mac80211: Consider 6 GHz band when handling power constraint Treat it like the 5 GHz band. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20200528213443.889e5c9dd006.Id8ed3bb8000ba8738be5df05639415eb2e23c61a@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 8a37089e86bb..d7bffd640ed3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1533,6 +1533,7 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, chan_increment = 1; break; case NL80211_BAND_5GHZ: + case NL80211_BAND_6GHZ: chan_increment = 4; break; } -- cgit v1.2.3-59-g8ed1b From 07c12d618f06c8876fe12a53217d92b32d7baf07 Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Thu, 28 May 2020 21:34:46 +0200 Subject: mac80211: set short_slot for 6 GHz band Set short slot also for 6 GHz band, just like 5 GHz. Signed-off-by: Tova Mussai Link: https://lore.kernel.org/r/20200528213443.75f38e6f5efd.I272fbae402b03123f04e9ae69204eeab960c70cd@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 ++- net/mac80211/mlme.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 90a07d075fdb..9b360544ad6f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2198,7 +2198,8 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (!sdata->vif.bss_conf.use_short_slot && - sband->band == NL80211_BAND_5GHZ) { + (sband->band == NL80211_BAND_5GHZ || + sband->band == NL80211_BAND_6GHZ)) { sdata->vif.bss_conf.use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d7bffd640ed3..5820ef02a587 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2171,7 +2171,8 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); - if (sband->band == NL80211_BAND_5GHZ) + if (sband->band == NL80211_BAND_5GHZ || + sband->band == NL80211_BAND_6GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { -- cgit v1.2.3-59-g8ed1b From 093a48d2aa4b74db3134b61d7b7a061dbe79177b Mon Sep 17 00:00:00 2001 From: Nathan Errera Date: Thu, 28 May 2020 21:22:38 +0200 Subject: cfg80211: support bigger kek/kck key length With some newer AKMs, the KCK and KEK are bigger, so allow that if the driver advertises support for it. In addition, add a new attribute for the AKM so we can use it for offloaded rekeying. Signed-off-by: Nathan Errera [reword commit message] Link: https://lore.kernel.org/r/20200528212237.5eb58b00a5d1.I61b09d77c4f382e8d58a05dcca78096e99a6bc15@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 +++++++++--- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 23 +++++++++++++++++++---- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 95b55eea2afb..b58ad1a3f695 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2936,12 +2936,17 @@ struct cfg80211_wowlan_wakeup { /** * struct cfg80211_gtk_rekey_data - rekey data - * @kek: key encryption key (NL80211_KEK_LEN bytes) - * @kck: key confirmation key (NL80211_KCK_LEN bytes) + * @kek: key encryption key (@kek_len bytes) + * @kck: key confirmation key (@kck_len bytes) * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes) + * @kek_len: length of kek + * @kck_len length of kck + * @akm: akm (oui, id) */ struct cfg80211_gtk_rekey_data { const u8 *kek, *kck, *replay_ctr; + u32 akm; + u8 kek_len, kck_len; }; /** @@ -4166,9 +4171,10 @@ struct cfg80211_ops { * beaconing mode (AP, IBSS, Mesh, ...). * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation * before connection. + * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys */ enum wiphy_flags { - /* use hole at 0 */ + WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), /* use hole at 1 */ /* use hole at 2 */ WIPHY_FLAG_NETNS_OK = BIT(3), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5b350d032fa3..dad8c8f8581f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5396,6 +5396,8 @@ enum plink_actions { #define NL80211_KCK_LEN 16 #define NL80211_KEK_LEN 16 +#define NL80211_KCK_EXT_LEN 24 +#define NL80211_KEK_EXT_LEN 32 #define NL80211_REPLAY_CTR_LEN 8 /** @@ -5404,6 +5406,7 @@ enum plink_actions { * @NL80211_REKEY_DATA_KEK: key encryption key (binary) * @NL80211_REKEY_DATA_KCK: key confirmation key (binary) * @NL80211_REKEY_DATA_REPLAY_CTR: replay counter (binary) + * @NL80211_REKEY_DATA_AKM: AKM data (OUI, suite type) * @NUM_NL80211_REKEY_DATA: number of rekey attributes (internal) * @MAX_NL80211_REKEY_DATA: highest rekey attribute (internal) */ @@ -5412,6 +5415,7 @@ enum nl80211_rekey_data { NL80211_REKEY_DATA_KEK, NL80211_REKEY_DATA_KCK, NL80211_REKEY_DATA_REPLAY_CTR, + NL80211_REKEY_DATA_AKM, /* keep last */ NUM_NL80211_REKEY_DATA, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3a24e6add13e..263ae395ad44 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -730,9 +730,16 @@ nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { /* policy for GTK rekey offload attributes */ static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { - [NL80211_REKEY_DATA_KEK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KEK_LEN), - [NL80211_REKEY_DATA_KCK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KCK_LEN), + [NL80211_REKEY_DATA_KEK] = { + .type = NLA_BINARY, + .len = NL80211_KEK_EXT_LEN + }, + [NL80211_REKEY_DATA_KCK] = { + .type = NLA_BINARY, + .len = NL80211_KCK_EXT_LEN + }, [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN), + [NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 }, }; static const struct nla_policy @@ -12347,14 +12354,22 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KEK_EXT_LEN)) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KCK_EXT_LEN)) return -ERANGE; rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]); rekey_data.replay_ctr = nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]); + rekey_data.kek_len = nla_len(tb[NL80211_REKEY_DATA_KEK]); + rekey_data.kck_len = nla_len(tb[NL80211_REKEY_DATA_KCK]); + if (tb[NL80211_REKEY_DATA_AKM]) + rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]); wdev_lock(wdev); if (!wdev->current_bss) { -- cgit v1.2.3-59-g8ed1b From f109603a4be0578a8145c8ae7d6e10b0b5ab6df4 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:17 -0700 Subject: ice: allow host to clear administratively set VF MAC Currently a user is not allowed to clear a VF's administratively set MAC on the PF. Fix this by allowing an all zero MAC address via "ip link set ${pf_eth} vf ${vf_id} mac 00:00:00:00:00:00". An example use case for this would be issuing a "virsh shutdown" command on a VM. The call to iproute mentioned above is part of this flow. Without this change the driver incorrectly rejects clearing the VF's administratively set MAC and prints unhelpful log messages. Also, improve the comments surrounding this change. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index a126e7c7663d..9550501f9279 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -3904,7 +3904,7 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; - if (is_zero_ether_addr(mac) || is_multicast_ether_addr(mac)) { + if (is_multicast_ether_addr(mac)) { netdev_err(netdev, "%pM not a valid unicast address\n", mac); return -EINVAL; } @@ -3924,15 +3924,21 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) return -EINVAL; } - /* copy MAC into dflt_lan_addr and trigger a VF reset. The reset - * flow will use the updated dflt_lan_addr and add a MAC filter - * using ice_add_mac. Also set pf_set_mac to indicate that the PF has - * set the MAC address for this VF. + /* VF is notified of its new MAC via the PF's response to the + * VIRTCHNL_OP_GET_VF_RESOURCES message after the VF has been reset */ ether_addr_copy(vf->dflt_lan_addr.addr, mac); - vf->pf_set_mac = true; - netdev_info(netdev, "MAC on VF %d set to %pM. VF driver will be reinitialized\n", - vf_id, mac); + if (is_zero_ether_addr(mac)) { + /* VF will send VIRTCHNL_OP_ADD_ETH_ADDR message with its MAC */ + vf->pf_set_mac = false; + netdev_info(netdev, "Removing MAC on VF %d. VF driver will be reinitialized\n", + vf->vf_id); + } else { + /* PF will add MAC rule for the VF */ + vf->pf_set_mac = true; + netdev_info(netdev, "Setting MAC %pM on VF %d. VF driver will be reinitialized\n", + mac, vf_id); + } ice_vc_reset_vf(vf); return 0; -- cgit v1.2.3-59-g8ed1b From c1636a6e8a5e10190bd31ae085f9e8f8c5bc50a0 Mon Sep 17 00:00:00 2001 From: Paul Greenwalt Date: Fri, 15 May 2020 17:51:18 -0700 Subject: ice: support adding 16 unicast/multicast filter on untrusted VF Allow untrusted VF to add 16 unicast/multicast filters. VF uses 1 filter for the default/perm_addr/LAA MAC, 1 for broadcast, and 16 additional unicast/multicast filters. Signed-off-by: Paul Greenwalt Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index 0adff89a6749..67aa9110fdd1 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -7,7 +7,10 @@ /* Restrict number of MAC Addr and VLAN that non-trusted VF can programmed */ #define ICE_MAX_VLAN_PER_VF 8 -#define ICE_MAX_MACADDR_PER_VF 12 +/* MAC filters: 1 is reserved for the VF's default/perm_addr/LAA MAC, 1 for + * broadcast, and 16 for additional unicast/multicast filters + */ +#define ICE_MAX_MACADDR_PER_VF 18 /* Malicious Driver Detection */ #define ICE_DFLT_NUM_INVAL_MSGS_ALLOWED 10 -- cgit v1.2.3-59-g8ed1b From 2bb19d6e077190bafbfd50f3793333af3f07a7b1 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:51:19 -0700 Subject: ice: Fix transmit for all software offloaded VLANs Currently the driver does not recognize when there is an 802.1AD VLAN tag right after the dmac/smac (outermost VLAN tag). If any DCB map is applied and/or DCB is enabled this is causing the hardware to insert a VLAN 0 tag after the 802.1AD VLAN tag that is already in the packet. Fix this by preventing VLAN tag 0 from being added when any VLAN is already present after dmac/smac (software offloaded) or skb (hardware offloaded). Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 28 +++++++---------- drivers/net/ethernet/intel/ice/ice_dcb_lib.h | 2 +- drivers/net/ethernet/intel/ice/ice_txrx.c | 45 +++++++--------------------- 3 files changed, 21 insertions(+), 54 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index 3c7f604c0c49..979af197f8a3 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -791,39 +791,31 @@ void ice_update_dcb_stats(struct ice_pf *pf) * ice_tx_prepare_vlan_flags_dcb - prepare VLAN tagging for DCB * @tx_ring: ring to send buffer on * @first: pointer to struct ice_tx_buf + * + * This should not be called if the outer VLAN is software offloaded as the VLAN + * tag will already be configured with the correct ID and priority bits */ -int +void ice_tx_prepare_vlan_flags_dcb(struct ice_ring *tx_ring, struct ice_tx_buf *first) { struct sk_buff *skb = first->skb; if (!test_bit(ICE_FLAG_DCB_ENA, tx_ring->vsi->back->flags)) - return 0; + return; /* Insert 802.1p priority into VLAN header */ - if ((first->tx_flags & (ICE_TX_FLAGS_HW_VLAN | ICE_TX_FLAGS_SW_VLAN)) || + if ((first->tx_flags & ICE_TX_FLAGS_HW_VLAN) || skb->priority != TC_PRIO_CONTROL) { first->tx_flags &= ~ICE_TX_FLAGS_VLAN_PR_M; /* Mask the lower 3 bits to set the 802.1p priority */ first->tx_flags |= (skb->priority & 0x7) << ICE_TX_FLAGS_VLAN_PR_S; - if (first->tx_flags & ICE_TX_FLAGS_SW_VLAN) { - struct vlan_ethhdr *vhdr; - int rc; - - rc = skb_cow_head(skb, 0); - if (rc < 0) - return rc; - vhdr = (struct vlan_ethhdr *)skb->data; - vhdr->h_vlan_TCI = htons(first->tx_flags >> - ICE_TX_FLAGS_VLAN_S); - } else { - first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; - } + /* if this is not already set it means a VLAN 0 + priority needs + * to be offloaded + */ + first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; } - - return 0; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h index 7c42324494d2..323238669572 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h @@ -27,7 +27,7 @@ void ice_pf_dcb_recfg(struct ice_pf *pf); void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi); int ice_init_pf_dcb(struct ice_pf *pf, bool locked); void ice_update_dcb_stats(struct ice_pf *pf); -int +void ice_tx_prepare_vlan_flags_dcb(struct ice_ring *tx_ring, struct ice_tx_buf *first); void diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index cda7e05bd8ae..abdb137c8bb7 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -2053,49 +2053,25 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) * * Checks the skb and set up correspondingly several generic transmit flags * related to VLAN tagging for the HW, such as VLAN, DCB, etc. - * - * Returns error code indicate the frame should be dropped upon error and the - * otherwise returns 0 to indicate the flags has been set properly. */ -static int +static void ice_tx_prepare_vlan_flags(struct ice_ring *tx_ring, struct ice_tx_buf *first) { struct sk_buff *skb = first->skb; - __be16 protocol = skb->protocol; - - if (protocol == htons(ETH_P_8021Q) && - !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) { - /* when HW VLAN acceleration is turned off by the user the - * stack sets the protocol to 8021q so that the driver - * can take any steps required to support the SW only - * VLAN handling. In our case the driver doesn't need - * to take any further steps so just set the protocol - * to the encapsulated ethertype. - */ - skb->protocol = vlan_get_protocol(skb); - return 0; - } - /* if we have a HW VLAN tag being added, default to the HW one */ + /* nothing left to do, software offloaded VLAN */ + if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) + return; + + /* currently, we always assume 802.1Q for VLAN insertion as VLAN + * insertion for 802.1AD is not supported + */ if (skb_vlan_tag_present(skb)) { first->tx_flags |= skb_vlan_tag_get(skb) << ICE_TX_FLAGS_VLAN_S; first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; - } else if (protocol == htons(ETH_P_8021Q)) { - struct vlan_hdr *vhdr, _vhdr; - - /* for SW VLAN, check the next protocol and store the tag */ - vhdr = (struct vlan_hdr *)skb_header_pointer(skb, ETH_HLEN, - sizeof(_vhdr), - &_vhdr); - if (!vhdr) - return -EINVAL; - - first->tx_flags |= ntohs(vhdr->h_vlan_TCI) << - ICE_TX_FLAGS_VLAN_S; - first->tx_flags |= ICE_TX_FLAGS_SW_VLAN; } - return ice_tx_prepare_vlan_flags_dcb(tx_ring, first); + ice_tx_prepare_vlan_flags_dcb(tx_ring, first); } /** @@ -2403,8 +2379,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) first->tx_flags = 0; /* prepare the VLAN tagging flags for Tx */ - if (ice_tx_prepare_vlan_flags(tx_ring, first)) - goto out_drop; + ice_tx_prepare_vlan_flags(tx_ring, first); /* set up TSO offload */ tso = ice_tso(first, &offload); -- cgit v1.2.3-59-g8ed1b From c9a12d6d2091175fe2dc1707dd40d6ad781414fe Mon Sep 17 00:00:00 2001 From: Dan Nowlin Date: Fri, 15 May 2020 17:51:20 -0700 Subject: ice: Increase timeout after PFR To allow for resets during package download, increase the timeout period after performing a PFR. The time waited is the global config lock timeout plus the normal PFSWR timeout. Signed-off-by: Dan Nowlin Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_common.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 8c73e161829d..d4a31c734326 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -964,7 +964,12 @@ static enum ice_status ice_pf_reset(struct ice_hw *hw) wr32(hw, PFGEN_CTRL, (reg | PFGEN_CTRL_PFSWR_M)); - for (cnt = 0; cnt < ICE_PF_RESET_WAIT_COUNT; cnt++) { + /* Wait for the PFR to complete. The wait time is the global config lock + * timeout plus the PFR timeout which will account for a possible reset + * that is occurring during a download package operation. + */ + for (cnt = 0; cnt < ICE_GLOBAL_CFG_LOCK_TIMEOUT + + ICE_PF_RESET_WAIT_COUNT; cnt++) { reg = rd32(hw, PFGEN_CTRL); if (!(reg & PFGEN_CTRL_PFSWR_M)) break; -- cgit v1.2.3-59-g8ed1b From bff185e2406e10d2608857171537645714bea1f4 Mon Sep 17 00:00:00 2001 From: Chinh T Cao Date: Fri, 15 May 2020 17:51:21 -0700 Subject: ice: Update ICE_PHY_TYPE_HIGH_MAX_INDEX value As currently, we are supporting only 5 PHY_SPEEDs for phy_type_high. Thus, we should adjust the value of ICE_PHY_TYPE_HIGH_MAX_INDEX to 5. Signed-off-by: Chinh T Cao Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index f04c338fb6e0..50040c5c55ec 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -974,7 +974,7 @@ struct ice_aqc_get_phy_caps { #define ICE_PHY_TYPE_HIGH_100G_CAUI2 BIT_ULL(2) #define ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC BIT_ULL(3) #define ICE_PHY_TYPE_HIGH_100G_AUI2 BIT_ULL(4) -#define ICE_PHY_TYPE_HIGH_MAX_INDEX 19 +#define ICE_PHY_TYPE_HIGH_MAX_INDEX 5 struct ice_aqc_get_phy_caps_data { __le64 phy_type_low; /* Use values from ICE_PHY_TYPE_LOW_* */ -- cgit v1.2.3-59-g8ed1b From cf0bf41dd6cb1d5461c71d2159c7c062fff3c8fd Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:54:58 -0700 Subject: ice: Reset VF for all port VLAN changes from host Currently the PF is modifying the VF's port VLAN on the fly when configured via iproute. This is okay for most cases, but if the VF already has guest VLANs configured the PF has to remove all of those filters so only VLAN tagged traffic that matches the port VLAN will pass. Instead of adding functionality to track which guest VLANs have been added, just reset the VF each time port VLAN parameters are modified. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 42 +++--------------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 9550501f9279..2916cfb9d032 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -3295,7 +3295,6 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, __be16 vlan_proto) { struct ice_pf *pf = ice_netdev_to_pf(netdev); - struct ice_vsi *vsi; struct device *dev; struct ice_vf *vf; u16 vlanprio; @@ -3317,8 +3316,6 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, } vf = &pf->vf[vf_id]; - vsi = pf->vsi[vf->lan_vsi_idx]; - ret = ice_check_vf_ready_for_cfg(vf); if (ret) return ret; @@ -3331,44 +3328,15 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, return 0; } - if (vlan_id || qos) { - /* remove VLAN 0 filter set by default when transitioning from - * no port VLAN to a port VLAN. No change to old port VLAN on - * failure. - */ - ret = ice_vsi_kill_vlan(vsi, 0); - if (ret) - return ret; - ret = ice_vsi_manage_pvid(vsi, vlanprio, true); - if (ret) - return ret; - } else { - /* add VLAN 0 filter back when transitioning from port VLAN to - * no port VLAN. No change to old port VLAN on failure. - */ - ret = ice_vsi_add_vlan(vsi, 0, ICE_FWD_TO_VSI); - if (ret) - return ret; - ret = ice_vsi_manage_pvid(vsi, 0, false); - if (ret) - return ret; - } + vf->port_vlan_info = vlanprio; - if (vlan_id) { + if (vf->port_vlan_info) dev_info(dev, "Setting VLAN %d, QoS 0x%x on VF %d\n", vlan_id, qos, vf_id); + else + dev_info(dev, "Clearing port VLAN on VF %d\n", vf_id); - /* add VLAN filter for the port VLAN */ - ret = ice_vsi_add_vlan(vsi, vlan_id, ICE_FWD_TO_VSI); - if (ret) - return ret; - } - /* remove old port VLAN filter with valid VLAN ID or QoS fields */ - if (vf->port_vlan_info) - ice_vsi_kill_vlan(vsi, vf->port_vlan_info & VLAN_VID_MASK); - - /* keep port VLAN information persistent on resets */ - vf->port_vlan_info = le16_to_cpu(vsi->info.pvid); + ice_vc_reset_vf(vf); return 0; } -- cgit v1.2.3-59-g8ed1b From 401ce33b32812a8fde6789588416d8c5b232138f Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:54:59 -0700 Subject: ice: Always clear QRXFLXP_CNTXT before writing new value Always clear the previous value in QRXFLXP_CNTXT before writing a new value. This will make it so re-used queues will not accidentally take the previously configured settings. Signed-off-by: Brett Creeley Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_base.c | 33 +++++++++++-------------------- drivers/net/ethernet/intel/ice/ice_lib.c | 26 ++++++++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_lib.h | 3 +++ 3 files changed, 40 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index a174911d8994..d620d26d42ed 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -3,6 +3,7 @@ #include #include "ice_base.h" +#include "ice_lib.h" #include "ice_dcb_lib.h" /** @@ -288,7 +289,6 @@ int ice_setup_rx_ctx(struct ice_ring *ring) u32 rxdid = ICE_RXDID_FLEX_NIC; struct ice_rlan_ctx rlan_ctx; struct ice_hw *hw; - u32 regval; u16 pf_q; int err; @@ -385,27 +385,16 @@ int ice_setup_rx_ctx(struct ice_ring *ring) /* Rx queue threshold in units of 64 */ rlan_ctx.lrxqthresh = 1; - /* Enable Flexible Descriptors in the queue context which - * allows this driver to select a specific receive descriptor format - */ - regval = rd32(hw, QRXFLXP_CNTXT(pf_q)); - if (vsi->type != ICE_VSI_VF) { - regval |= (rxdid << QRXFLXP_CNTXT_RXDID_IDX_S) & - QRXFLXP_CNTXT_RXDID_IDX_M; - - /* increasing context priority to pick up profile ID; - * default is 0x01; setting to 0x03 to ensure profile - * is programming if prev context is of same priority - */ - regval |= (0x03 << QRXFLXP_CNTXT_RXDID_PRIO_S) & - QRXFLXP_CNTXT_RXDID_PRIO_M; - - } else { - regval &= ~(QRXFLXP_CNTXT_RXDID_IDX_M | - QRXFLXP_CNTXT_RXDID_PRIO_M | - QRXFLXP_CNTXT_TS_M); - } - wr32(hw, QRXFLXP_CNTXT(pf_q), regval); + /* Enable Flexible Descriptors in the queue context which + * allows this driver to select a specific receive descriptor format + * increasing context priority to pick up profile ID; default is 0x01; + * setting to 0x03 to ensure profile is programming if prev context is + * of same priority + */ + if (vsi->type != ICE_VSI_VF) + ice_write_qrxflxp_cntxt(hw, pf_q, rxdid, 0x3); + else + ice_write_qrxflxp_cntxt(hw, pf_q, ICE_RXDID_LEGACY_1, 0x3); /* Absolute queue number out of 2K needs to be passed */ err = ice_write_rxq_ctx(hw, &rlan_ctx, pf_q); diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 89e8e4f7f56f..ecc04a696e50 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1595,6 +1595,32 @@ void ice_vsi_cfg_frame_size(struct ice_vsi *vsi) } } +/** + * ice_write_qrxflxp_cntxt - write/configure QRXFLXP_CNTXT register + * @hw: HW pointer + * @pf_q: index of the Rx queue in the PF's queue space + * @rxdid: flexible descriptor RXDID + * @prio: priority for the RXDID for this queue + */ +void +ice_write_qrxflxp_cntxt(struct ice_hw *hw, u16 pf_q, u32 rxdid, u32 prio) +{ + int regval = rd32(hw, QRXFLXP_CNTXT(pf_q)); + + /* clear any previous values */ + regval &= ~(QRXFLXP_CNTXT_RXDID_IDX_M | + QRXFLXP_CNTXT_RXDID_PRIO_M | + QRXFLXP_CNTXT_TS_M); + + regval |= (rxdid << QRXFLXP_CNTXT_RXDID_IDX_S) & + QRXFLXP_CNTXT_RXDID_IDX_M; + + regval |= (prio << QRXFLXP_CNTXT_RXDID_PRIO_S) & + QRXFLXP_CNTXT_RXDID_PRIO_M; + + wr32(hw, QRXFLXP_CNTXT(pf_q), regval); +} + /** * ice_vsi_cfg_rxqs - Configure the VSI for Rx * @vsi: the VSI being configured diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h index 076e635e0c9f..d80e6afa4511 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_lib.h @@ -74,6 +74,9 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi); bool ice_is_reset_in_progress(unsigned long *state); +void +ice_write_qrxflxp_cntxt(struct ice_hw *hw, u16 pf_q, u32 rxdid, u32 prio); + void ice_vsi_put_qs(struct ice_vsi *vsi); void ice_vsi_dis_irq(struct ice_vsi *vsi); -- cgit v1.2.3-59-g8ed1b From 765dd7a1827c687b782e6ab3dd6daf4d13a4780f Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Fri, 15 May 2020 17:55:00 -0700 Subject: ice: Fix inability to set channels when down Currently the driver prevents a user from doing modprobe ice ethtool -L eth0 combined 5 ip link set eth0 up The ethtool command fails, because the driver is checking to see if the interface is down before allowing the get_channels to proceed (even for a set_channels). Remove this check and allow the user to configure the interface before bringing it up, which is a much better usability case. Fixes: 87324e747fde ("ice: Implement ethtool ops for channels") Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index fd1849155d85..68c38004a088 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3189,10 +3189,6 @@ ice_get_channels(struct net_device *dev, struct ethtool_channels *ch) struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - /* check to see if VSI is active */ - if (test_bit(__ICE_DOWN, vsi->state)) - return; - /* report maximum channels */ ch->max_rx = ice_get_max_rxq(pf); ch->max_tx = ice_get_max_txq(pf); -- cgit v1.2.3-59-g8ed1b From 7dcc0fb8f64913c783c7c1f06065c47e39b19794 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:55:01 -0700 Subject: ice: Allow VF to request reset as soon as it's initialized A VF driver has the ability to request reset via VIRTCHNL_OP_RESET_VF. This is a required step in VF driver load. Currently, the PF is only allowing a VF to request reset using this method after the VF has already communicated resources via VIRTCHNL_OP_GET_VF_RESOURCES. However, this is incorrect because the VF can request reset before requesting resources. Fix this by allowing the VF to request a reset once it has been initialized. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 2916cfb9d032..16a2f2526ccc 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -2014,7 +2014,7 @@ err: */ static void ice_vc_reset_vf_msg(struct ice_vf *vf) { - if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) ice_reset_vf(vf, false); } -- cgit v1.2.3-59-g8ed1b From ebb462dc21eae79bed5b050afb225534992bd1f0 Mon Sep 17 00:00:00 2001 From: Bruce Allan Date: Fri, 15 May 2020 17:55:02 -0700 Subject: ice: fix function signature style format Where possible, cuddle multiple lines of function signatures to be consistent throughout the code. Signed-off-by: Bruce Allan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_controlq.c | 3 +-- drivers/net/ethernet/intel/ice/ice_sched.c | 12 ++++-------- drivers/net/ethernet/intel/ice/ice_switch.c | 9 +++------ 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c index 479a74efc536..1e18021aa073 100644 --- a/drivers/net/ethernet/intel/ice/ice_controlq.c +++ b/drivers/net/ethernet/intel/ice/ice_controlq.c @@ -769,8 +769,7 @@ enum ice_status ice_create_all_ctrlq(struct ice_hw *hw) * * Destroys the send and receive queue locks for a given control queue. */ -static void -ice_destroy_ctrlq_locks(struct ice_ctl_q_info *cq) +static void ice_destroy_ctrlq_locks(struct ice_ctl_q_info *cq) { mutex_destroy(&cq->sq_lock); mutex_destroy(&cq->rq_lock); diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c index d63acd2fcf79..0475134295e4 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.c +++ b/drivers/net/ethernet/intel/ice/ice_sched.c @@ -1714,8 +1714,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, * This function removes single aggregator VSI info entry from * aggregator list. */ -static void -ice_sched_rm_agg_vsi_info(struct ice_port_info *pi, u16 vsi_handle) +static void ice_sched_rm_agg_vsi_info(struct ice_port_info *pi, u16 vsi_handle) { struct ice_sched_agg_info *agg_info; struct ice_sched_agg_info *atmp; @@ -1947,8 +1946,7 @@ ice_sched_cfg_node_bw_alloc(struct ice_hw *hw, struct ice_sched_node *node, * * Save or clear CIR bandwidth (BW) in the passed param bw_t_info. */ -static void -ice_set_clear_cir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) +static void ice_set_clear_cir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) { if (bw == ICE_SCHED_DFLT_BW) { clear_bit(ICE_BW_TYPE_CIR, bw_t_info->bw_t_bitmap); @@ -1967,8 +1965,7 @@ ice_set_clear_cir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) * * Save or clear EIR bandwidth (BW) in the passed param bw_t_info. */ -static void -ice_set_clear_eir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) +static void ice_set_clear_eir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) { if (bw == ICE_SCHED_DFLT_BW) { clear_bit(ICE_BW_TYPE_EIR, bw_t_info->bw_t_bitmap); @@ -1993,8 +1990,7 @@ ice_set_clear_eir_bw(struct ice_bw_type_info *bw_t_info, u32 bw) * * Save or clear shared bandwidth (BW) in the passed param bw_t_info. */ -static void -ice_set_clear_shared_bw(struct ice_bw_type_info *bw_t_info, u32 bw) +static void ice_set_clear_shared_bw(struct ice_bw_type_info *bw_t_info, u32 bw) { if (bw == ICE_SCHED_DFLT_BW) { clear_bit(ICE_BW_TYPE_SHARED, bw_t_info->bw_t_bitmap); diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 0156b73df1b1..ff7d16ac693e 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -1612,8 +1612,7 @@ exit: * check for duplicates in this case, removing duplicates from a given * list should be taken care of in the caller of this function. */ -enum ice_status -ice_add_mac(struct ice_hw *hw, struct list_head *m_list) +enum ice_status ice_add_mac(struct ice_hw *hw, struct list_head *m_list) { struct ice_aqc_sw_rules_elem *s_rule, *r_iter; struct ice_fltr_list_entry *m_list_itr; @@ -1914,8 +1913,7 @@ exit: * @hw: pointer to the hardware structure * @v_list: list of VLAN entries and forwarding information */ -enum ice_status -ice_add_vlan(struct ice_hw *hw, struct list_head *v_list) +enum ice_status ice_add_vlan(struct ice_hw *hw, struct list_head *v_list) { struct ice_fltr_list_entry *v_list_itr; @@ -2145,8 +2143,7 @@ ice_find_ucast_rule_entry(struct ice_hw *hw, u8 recp_id, * the entries passed into m_list were added previously. It will not attempt to * do a partial remove of entries that were found. */ -enum ice_status -ice_remove_mac(struct ice_hw *hw, struct list_head *m_list) +enum ice_status ice_remove_mac(struct ice_hw *hw, struct list_head *m_list) { struct ice_fltr_list_entry *list_itr, *tmp; struct mutex *rule_lock; /* Lock to protect filter rule list */ -- cgit v1.2.3-59-g8ed1b From 1a9c561aa35534a03c0aa51c7fb1485731202a7c Mon Sep 17 00:00:00 2001 From: Paul M Stillwell Jr Date: Fri, 15 May 2020 17:55:03 -0700 Subject: ice: fix PCI device serial number to be lowercase values Commit ceb2f00707f9 ("ice: Use pci_get_dsn()") changed the code to use a new function to get the Device Serial Number. It also changed the case of the filename for loading a package on a specific NIC from lowercase to uppercase. Change the filename back to lowercase since that is what we specified. Fixes: ceb2f00707f9 ("ice: Use pci_get_dsn()") Signed-off-by: Paul M Stillwell Jr Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index bbf92d2f1ac1..cb72ff32a29b 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3248,7 +3248,7 @@ static char *ice_get_opt_fw_name(struct ice_pf *pf) if (!opt_fw_filename) return NULL; - snprintf(opt_fw_filename, NAME_MAX, "%sice-%016llX.pkg", + snprintf(opt_fw_filename, NAME_MAX, "%sice-%016llx.pkg", ICE_DDP_PKG_PATH, dsn); return opt_fw_filename; -- cgit v1.2.3-59-g8ed1b From a039f6fcba452fba5973798f4c641eee1ef770a1 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 15 May 2020 17:55:04 -0700 Subject: ice: Use coalesce values from q_vector 0 when increasing q_vectors Currently when a VSI is built (i.e. reset, set channels, etc.) the coalesce settings will be preserved in most cases. However, when the number of q_vectors are increased the settings for the new q_vectors will be set to the driver defaults of AIM on, Rx/Tx ITR 50, and INTRL 0. This is causing issues with how the ethtool layer gets the current coalesce settings since it only uses q_vector 0. So, assume that the user set the coalesce settings globally (i.e. ethtool -C eth0) and use q_vector 0's settings for all of the new q_vectors. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_lib.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index ecc04a696e50..28b46cc9f5cb 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2707,15 +2707,13 @@ ice_vsi_rebuild_set_coalesce(struct ice_vsi *vsi, ice_vsi_rebuild_update_coalesce(vsi->q_vectors[i], &coalesce[i]); - for (; i < vsi->num_q_vectors; i++) { - struct ice_coalesce_stored coalesce_dflt = { - .itr_tx = ICE_DFLT_TX_ITR, - .itr_rx = ICE_DFLT_RX_ITR, - .intrl = 0 - }; + /* number of q_vectors increased, so assume coalesce settings were + * changed globally (i.e. ethtool -C eth0 instead of per-queue) and use + * the previous settings from q_vector 0 for all of the new q_vectors + */ + for (; i < vsi->num_q_vectors; i++) ice_vsi_rebuild_update_coalesce(vsi->q_vectors[i], - &coalesce_dflt); - } + &coalesce[0]); } /** -- cgit v1.2.3-59-g8ed1b From d5329be9907723a646f8874388cdaaccba988b4d Mon Sep 17 00:00:00 2001 From: Henry Tieman Date: Fri, 15 May 2020 17:55:05 -0700 Subject: ice: fix aRFS after flow director delete The logic was missing for adding back perfect flows after flow director filter delete. The code now adds perfect flows into the HW tables after filter delete. Signed-off-by: Henry Tieman Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c | 27 ++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c index 42803fc0ed18..d7430ce6af26 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c @@ -1363,6 +1363,31 @@ release_lock: mutex_unlock(&hw->fdir_fltr_lock); } +/** + * ice_fdir_do_rem_flow - delete flow and possibly add perfect flow + * @pf: PF structure + * @flow_type: FDir flow type to release + */ +static void +ice_fdir_do_rem_flow(struct ice_pf *pf, enum ice_fltr_ptype flow_type) +{ + struct ice_hw *hw = &pf->hw; + bool need_perfect = false; + + if (flow_type == ICE_FLTR_PTYPE_NONF_IPV4_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_TCP || + flow_type == ICE_FLTR_PTYPE_NONF_IPV6_UDP) + need_perfect = true; + + if (need_perfect && test_bit(flow_type, hw->fdir_perfect_fltr)) + return; + + ice_fdir_rem_flow(hw, ICE_BLK_FD, flow_type); + if (need_perfect) + ice_create_init_fdir_rule(pf, flow_type); +} + /** * ice_fdir_update_list_entry - add or delete a filter from the filter list * @pf: PF structure @@ -1393,7 +1418,7 @@ ice_fdir_update_list_entry(struct ice_pf *pf, struct ice_fdir_fltr *input, /* we just deleted the last filter of flow_type so we * should also delete the HW filter info. */ - ice_fdir_rem_flow(hw, ICE_BLK_FD, old_fltr->flow_type); + ice_fdir_do_rem_flow(pf, old_fltr->flow_type); list_del(&old_fltr->fltr_node); devm_kfree(ice_hw_to_dev(hw), old_fltr); } -- cgit v1.2.3-59-g8ed1b From b5e19a642b7ed3d9e6de746957226a7ae726d226 Mon Sep 17 00:00:00 2001 From: Chinh T Cao Date: Fri, 15 May 2020 17:55:06 -0700 Subject: ice: Ignore EMODE when setting PHY config When setting the PHY cfg (CQ cmd 0x0601), if the firmware responds with an EMODE error, software will ignore the error as it simply means that manageability (ex: BMC) is in control of the link and that the new setting may not be applied. Signed-off-by: Chinh T Cao Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 1 + drivers/net/ethernet/intel/ice/ice_common.c | 7 ++++++- drivers/net/ethernet/intel/ice/ice_main.c | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 50040c5c55ec..92f82f2a8af4 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -1826,6 +1826,7 @@ enum ice_aq_err { ICE_AQ_RC_EINVAL = 14, /* Invalid argument */ ICE_AQ_RC_ENOSPC = 16, /* No space left or allocation failure */ ICE_AQ_RC_ENOSYS = 17, /* Function not implemented */ + ICE_AQ_RC_EMODE = 21, /* Op not allowed in current dev mode */ ICE_AQ_RC_ENOSEC = 24, /* Missing security manifest */ ICE_AQ_RC_EBADSIG = 25, /* Bad RSA signature */ ICE_AQ_RC_ESVN = 26, /* SVN number prohibits this package */ diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index d4a31c734326..bce0e1281168 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -2232,6 +2232,7 @@ ice_aq_set_phy_cfg(struct ice_hw *hw, u8 lport, struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd) { struct ice_aq_desc desc; + enum ice_status status; if (!cfg) return ICE_ERR_PARAM; @@ -2260,7 +2261,11 @@ ice_aq_set_phy_cfg(struct ice_hw *hw, u8 lport, ice_debug(hw, ICE_DBG_LINK, "eeer_value = 0x%x\n", cfg->eeer_value); ice_debug(hw, ICE_DBG_LINK, "link_fec_opt = 0x%x\n", cfg->link_fec_opt); - return ice_aq_send_cmd(hw, &desc, cfg, sizeof(*cfg), cd); + status = ice_aq_send_cmd(hw, &desc, cfg, sizeof(*cfg), cd); + if (hw->adminq.sq_last_status == ICE_AQ_RC_EMODE) + status = 0; + + return status; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index cb72ff32a29b..082825e3cb39 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5159,6 +5159,8 @@ const char *ice_aq_str(enum ice_aq_err aq_err) return "ICE_AQ_RC_ENOSPC"; case ICE_AQ_RC_ENOSYS: return "ICE_AQ_RC_ENOSYS"; + case ICE_AQ_RC_EMODE: + return "ICE_AQ_RC_EMODE"; case ICE_AQ_RC_ENOSEC: return "ICE_AQ_RC_ENOSEC"; case ICE_AQ_RC_EBADSIG: -- cgit v1.2.3-59-g8ed1b From 4942857b015ede4fab8b262931244a3c1006a2a6 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 29 May 2020 22:46:13 +0800 Subject: Bluetooth: hci_qca: Improve controller ID info log level Controller ID info got by VSC EDL_PATCH_GETVER is very important, so improve its log level from DEBUG to INFO. Signed-off-by: Zijun Hu Reviewed-by: Matthias Kaehlcke Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btqca.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index 3ea866d44568..c5984966f315 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -74,17 +74,21 @@ int qca_read_soc_version(struct hci_dev *hdev, u32 *soc_version, ver = (struct qca_btsoc_version *)(edl->data); - BT_DBG("%s: Product:0x%08x", hdev->name, le32_to_cpu(ver->product_id)); - BT_DBG("%s: Patch :0x%08x", hdev->name, le16_to_cpu(ver->patch_ver)); - BT_DBG("%s: ROM :0x%08x", hdev->name, le16_to_cpu(ver->rom_ver)); - BT_DBG("%s: SOC :0x%08x", hdev->name, le32_to_cpu(ver->soc_id)); + bt_dev_info(hdev, "QCA Product ID :0x%08x", + le32_to_cpu(ver->product_id)); + bt_dev_info(hdev, "QCA SOC Version :0x%08x", + le32_to_cpu(ver->soc_id)); + bt_dev_info(hdev, "QCA ROM Version :0x%08x", + le16_to_cpu(ver->rom_ver)); + bt_dev_info(hdev, "QCA Patch Version:0x%08x", + le16_to_cpu(ver->patch_ver)); /* QCA chipset version can be decided by patch and SoC * version, combination with upper 2 bytes from SoC * and lower 2 bytes from patch will be used. */ *soc_version = (le32_to_cpu(ver->soc_id) << 16) | - (le16_to_cpu(ver->rom_ver) & 0x0000ffff); + (le16_to_cpu(ver->rom_ver) & 0x0000ffff); if (*soc_version == 0) err = -EILSEQ; -- cgit v1.2.3-59-g8ed1b From d3a0fe6b0988241c64ef4f6a1045423cc79a612a Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 29 May 2020 23:58:56 +0800 Subject: Bluetooth: btmtkuart: Use serdev_device_write_buf() instead of serdev_device_write() serdev_device_write() is not appropriate at here because serdev_device_write_wakeup() is not used to release completion hold by the former at @write_wakeup member of struct serdev_device_ops. Fix by using serdev_device_write_buf() instead of serdev_device_write(). Signed-off-by: Zijun Hu Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btmtkuart.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/bluetooth/btmtkuart.c b/drivers/bluetooth/btmtkuart.c index 8a81fbca5c9d..6c40bc75fb5b 100644 --- a/drivers/bluetooth/btmtkuart.c +++ b/drivers/bluetooth/btmtkuart.c @@ -695,8 +695,7 @@ static int btmtkuart_change_baudrate(struct hci_dev *hdev) /* Send a dummy byte 0xff to activate the new baudrate */ param = 0xff; - err = serdev_device_write(bdev->serdev, ¶m, sizeof(param), - MAX_SCHEDULE_TIMEOUT); + err = serdev_device_write_buf(bdev->serdev, ¶m, sizeof(param)); if (err < 0 || err < sizeof(param)) return err; -- cgit v1.2.3-59-g8ed1b From e5aeebddfc312ea7bb55dfe6c7264e71a3b43992 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 29 May 2020 22:38:31 +0800 Subject: Bluetooth: hci_qca: Fix QCA6390 memdump failure QCA6390 memdump VSE sometimes come to bluetooth driver with wrong sequence number as illustrated as follows: frame # in dec: frame data in hex 1396: ff fd 01 08 74 05 00 37 8f 14 1397: ff fd 01 08 75 05 00 ff bf 38 1414: ff fd 01 08 86 05 00 fb 5e 4b 1399: ff fd 01 08 77 05 00 f3 44 0a 1400: ff fd 01 08 78 05 00 ca f7 41 it is mistook for controller missing packets, so results in page fault after overwriting memdump buffer allocated. Fixed by ignoring QCA6390 sequence number check and checking buffer space before writing. Signed-off-by: Zijun Hu Tested-by: Zijun Hu Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_qca.c | 54 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index aa957d749d6f..81c3c38baba1 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -114,6 +114,7 @@ struct qca_memdump_data { char *memdump_buf_tail; u32 current_seq_no; u32 received_dump; + u32 ram_dump_size; }; struct qca_memdump_event_hdr { @@ -976,6 +977,8 @@ static void qca_controller_memdump(struct work_struct *work) char nullBuff[QCA_DUMP_PACKET_SIZE] = { 0 }; u16 seq_no; u32 dump_size; + u32 rx_size; + enum qca_btsoc_type soc_type = qca_soc_type(hu); while ((skb = skb_dequeue(&qca->rx_memdump_q))) { @@ -1025,10 +1028,12 @@ static void qca_controller_memdump(struct work_struct *work) dump_size); queue_delayed_work(qca->workqueue, &qca->ctrl_memdump_timeout, - msecs_to_jiffies(MEMDUMP_TIMEOUT_MS)); + msecs_to_jiffies(MEMDUMP_TIMEOUT_MS) + ); skb_pull(skb, sizeof(dump_size)); memdump_buf = vmalloc(dump_size); + qca_memdump->ram_dump_size = dump_size; qca_memdump->memdump_buf_head = memdump_buf; qca_memdump->memdump_buf_tail = memdump_buf; } @@ -1051,26 +1056,57 @@ static void qca_controller_memdump(struct work_struct *work) * the controller. In such cases let us store the dummy * packets in the buffer. */ + /* For QCA6390, controller does not lost packets but + * sequence number field of packat sometimes has error + * bits, so skip this checking for missing packet. + */ while ((seq_no > qca_memdump->current_seq_no + 1) && - seq_no != QCA_LAST_SEQUENCE_NUM) { + (soc_type != QCA_QCA6390) && + seq_no != QCA_LAST_SEQUENCE_NUM) { bt_dev_err(hu->hdev, "QCA controller missed packet:%d", qca_memdump->current_seq_no); + rx_size = qca_memdump->received_dump; + rx_size += QCA_DUMP_PACKET_SIZE; + if (rx_size > qca_memdump->ram_dump_size) { + bt_dev_err(hu->hdev, + "QCA memdump received %d, no space for missed packet", + qca_memdump->received_dump); + break; + } memcpy(memdump_buf, nullBuff, QCA_DUMP_PACKET_SIZE); memdump_buf = memdump_buf + QCA_DUMP_PACKET_SIZE; qca_memdump->received_dump += QCA_DUMP_PACKET_SIZE; qca_memdump->current_seq_no++; } - memcpy(memdump_buf, (unsigned char *) skb->data, skb->len); - memdump_buf = memdump_buf + skb->len; - qca_memdump->memdump_buf_tail = memdump_buf; - qca_memdump->current_seq_no = seq_no + 1; - qca_memdump->received_dump += skb->len; + rx_size = qca_memdump->received_dump + skb->len; + if (rx_size <= qca_memdump->ram_dump_size) { + if ((seq_no != QCA_LAST_SEQUENCE_NUM) && + (seq_no != qca_memdump->current_seq_no)) + bt_dev_err(hu->hdev, + "QCA memdump unexpected packet %d", + seq_no); + bt_dev_dbg(hu->hdev, + "QCA memdump packet %d with length %d", + seq_no, skb->len); + memcpy(memdump_buf, (unsigned char *)skb->data, + skb->len); + memdump_buf = memdump_buf + skb->len; + qca_memdump->memdump_buf_tail = memdump_buf; + qca_memdump->current_seq_no = seq_no + 1; + qca_memdump->received_dump += skb->len; + } else { + bt_dev_err(hu->hdev, + "QCA memdump received %d, no space for packet %d", + qca_memdump->received_dump, seq_no); + } qca->qca_memdump = qca_memdump; kfree_skb(skb); if (seq_no == QCA_LAST_SEQUENCE_NUM) { - bt_dev_info(hu->hdev, "QCA writing crash dump of size %d bytes", - qca_memdump->received_dump); + bt_dev_info(hu->hdev, + "QCA memdump Done, received %d, total %d", + qca_memdump->received_dump, + qca_memdump->ram_dump_size); memdump_buf = qca_memdump->memdump_buf_head; dev_coredumpv(&hu->serdev->dev, memdump_buf, qca_memdump->received_dump, GFP_KERNEL); -- cgit v1.2.3-59-g8ed1b From dafe2078a75af1abe4780313ef8dd8491ba8598f Mon Sep 17 00:00:00 2001 From: Patrick Eigensatz Date: Mon, 1 Jun 2020 13:12:01 +0200 Subject: ipv4: nexthop: Fix deadcode issue by performing a proper NULL check After allocating the spare nexthop group it should be tested for kzalloc() returning NULL, instead the already used nexthop group (which cannot be NULL at this point) had been tested so far. Additionally, if kzalloc() fails, return ERR_PTR(-ENOMEM) instead of NULL. Coverity-id: 1463885 Reported-by: Coverity Signed-off-by: Patrick Eigensatz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index ebafa5ed91ac..400a9f89ebdb 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1185,10 +1185,10 @@ static struct nexthop *nexthop_create_group(struct net *net, /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); - if (!nhg) { + if (!nhg->spare) { kfree(nhg); kfree(nh); - return NULL; + return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; -- cgit v1.2.3-59-g8ed1b From 53fc685243bd6fb90d90305cea54598b78d3cbfc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 1 Jun 2020 15:58:54 +0300 Subject: bridge: Avoid infinite loop when suppressing NS messages with invalid options When neighbor suppression is enabled the bridge device might reply to Neighbor Solicitation (NS) messages on behalf of remote hosts. In case the NS message includes the "Source link-layer address" option [1], the bridge device will use the specified address as the link-layer destination address in its reply. To avoid an infinite loop, break out of the options parsing loop when encountering an option with length zero and disregard the NS message. This is consistent with the IPv6 ndisc code and RFC 4886 which states that "Nodes MUST silently discard an ND packet that contains an option with length zero" [2]. [1] https://tools.ietf.org/html/rfc4861#section-4.3 [2] https://tools.ietf.org/html/rfc4861#section-4.6 Fixes: ed842faeb2bd ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Signed-off-by: Ido Schimmel Reported-by: Alla Segal Tested-by: Alla Segal Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_arp_nd_proxy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 37908561a64b..b18cdf03edb3 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -276,6 +276,10 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, ns_olen = request->len - (skb_network_offset(request) + sizeof(struct ipv6hdr)) - sizeof(*ns); for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) { + if (!ns->opt[i + 1]) { + kfree_skb(reply); + return; + } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; -- cgit v1.2.3-59-g8ed1b From 8066e6b449e050675df48e7c4b16c29f00507ff0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 1 Jun 2020 15:58:55 +0300 Subject: vxlan: Avoid infinite loop when suppressing NS messages with invalid options When proxy mode is enabled the vxlan device might reply to Neighbor Solicitation (NS) messages on behalf of remote hosts. In case the NS message includes the "Source link-layer address" option [1], the vxlan device will use the specified address as the link-layer destination address in its reply. To avoid an infinite loop, break out of the options parsing loop when encountering an option with length zero and disregard the NS message. This is consistent with the IPv6 ndisc code and RFC 4886 which states that "Nodes MUST silently discard an ND packet that contains an option with length zero" [2]. [1] https://tools.ietf.org/html/rfc4861#section-4.3 [2] https://tools.ietf.org/html/rfc4861#section-4.6 Fixes: 4b29dba9c085 ("vxlan: fix nonfunctional neigh_reduce()") Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 39bc10a7fd2e..d5906b41cdae 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2092,6 +2092,10 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *request, ns_olen = request->len - skb_network_offset(request) - sizeof(struct ipv6hdr) - sizeof(*ns); for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { + if (!ns->opt[i + 1]) { + kfree_skb(reply); + return NULL; + } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; -- cgit v1.2.3-59-g8ed1b From bda6752f3de99e9d765638b89aacfb11c07cee06 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 28 May 2020 15:49:57 +0300 Subject: cxgb4: cleanup error code in setup_sge_queues_uld() The caller doesn't care about the error codes, they only check for zero vs non-zero. Still, it's better to preserve the negative error codes from alloc_uld_rxqs() instead of changing it to 1. We can also return directly if there is a failure. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c index 6b1d3df4b9ba..9e3c6b36cde8 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c @@ -174,13 +174,14 @@ static int setup_sge_queues_uld(struct adapter *adap, unsigned int uld_type, bool lro) { struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; - int i, ret = 0; + int i, ret; - ret = !(!alloc_uld_rxqs(adap, rxq_info, lro)); + ret = alloc_uld_rxqs(adap, rxq_info, lro); + if (ret) + return ret; /* Tell uP to route control queue completions to rdma rspq */ - if (adap->flags & CXGB4_FULL_INIT_DONE && - !ret && uld_type == CXGB4_ULD_RDMA) { + if (adap->flags & CXGB4_FULL_INIT_DONE && uld_type == CXGB4_ULD_RDMA) { struct sge *s = &adap->sge; unsigned int cmplqid; u32 param, cmdop; -- cgit v1.2.3-59-g8ed1b From bfad978116c2aa3b693701059923de4561196f9b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 28 May 2020 17:45:02 +0200 Subject: regmap: provide helpers for simple bit operations In many instances regmap_update_bits() is used for simple bit setting and clearing. In these cases the last argument is redundant and we can hide it with a static inline function. This adds three new helpers for simple bit operations: set_bits, clear_bits and test_bits (the last one defined as a regular function). Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- drivers/base/regmap/regmap.c | 22 ++++++++++++++++++++++ include/linux/regmap.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 59f911e57719..4ad5c5adc0a3 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -2936,6 +2936,28 @@ int regmap_update_bits_base(struct regmap *map, unsigned int reg, } EXPORT_SYMBOL_GPL(regmap_update_bits_base); +/** + * regmap_test_bits() - Check if all specified bits are set in a register. + * + * @map: Register map to operate on + * @reg: Register to read from + * @bits: Bits to test + * + * Returns -1 if the underlying regmap_read() fails, 0 if at least one of the + * tested bits is not set and 1 if all tested bits are set. + */ +int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits) +{ + unsigned int val, ret; + + ret = regmap_read(map, reg, &val); + if (ret) + return ret; + + return (val & bits) == bits; +} +EXPORT_SYMBOL_GPL(regmap_test_bits); + void regmap_async_complete_cb(struct regmap_async *async, int ret) { struct regmap *map = async->map; diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 40b07168fd8e..ddf0baff195d 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1111,6 +1111,21 @@ bool regmap_reg_in_ranges(unsigned int reg, const struct regmap_range *ranges, unsigned int nranges); +static inline int regmap_set_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + return regmap_update_bits_base(map, reg, bits, bits, + NULL, false, false); +} + +static inline int regmap_clear_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + return regmap_update_bits_base(map, reg, bits, 0, NULL, false, false); +} + +int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits); + /** * struct reg_field - Description of an register field * @@ -1410,6 +1425,27 @@ static inline int regmap_update_bits_base(struct regmap *map, unsigned int reg, return -EINVAL; } +static inline int regmap_set_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + +static inline int regmap_clear_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + +static inline int regmap_test_bits(struct regmap *map, + unsigned int reg, unsigned int bits) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + static inline int regmap_field_update_bits_base(struct regmap_field *field, unsigned int mask, unsigned int val, bool *change, bool async, bool force) -- cgit v1.2.3-59-g8ed1b From 240f1ae40c659713a53f8fa5e4899e7b7350bfed Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 28 May 2020 17:45:03 +0200 Subject: net: ethernet: mtk-star-emac: use regmap bitops Shrink the code visually by replacing regmap_update_bits() with appropriate regmap bit operations where applicable. Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_star_emac.c | 80 ++++++++++++--------------- 1 file changed, 35 insertions(+), 45 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index 7df35872c107..f1ace4fec19f 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -413,8 +413,8 @@ static void mtk_star_dma_unmap_tx(struct mtk_star_priv *priv, static void mtk_star_nic_disable_pd(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CFG, - MTK_STAR_BIT_MAC_CFG_NIC_PD, 0); + regmap_clear_bits(priv->regs, MTK_STAR_REG_MAC_CFG, + MTK_STAR_BIT_MAC_CFG_NIC_PD); } /* Unmask the three interrupts we care about, mask all others. */ @@ -434,41 +434,38 @@ static void mtk_star_intr_disable(struct mtk_star_priv *priv) static void mtk_star_intr_enable_tx(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, - MTK_STAR_BIT_INT_STS_TNTC, 0); + regmap_clear_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_BIT_INT_STS_TNTC); } static void mtk_star_intr_enable_rx(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, - MTK_STAR_BIT_INT_STS_FNRC, 0); + regmap_clear_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_BIT_INT_STS_FNRC); } static void mtk_star_intr_enable_stats(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, - MTK_STAR_REG_INT_STS_MIB_CNT_TH, 0); + regmap_clear_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_REG_INT_STS_MIB_CNT_TH); } static void mtk_star_intr_disable_tx(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, - MTK_STAR_BIT_INT_STS_TNTC, - MTK_STAR_BIT_INT_STS_TNTC); + regmap_set_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_BIT_INT_STS_TNTC); } static void mtk_star_intr_disable_rx(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, - MTK_STAR_BIT_INT_STS_FNRC, - MTK_STAR_BIT_INT_STS_FNRC); + regmap_set_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_BIT_INT_STS_FNRC); } static void mtk_star_intr_disable_stats(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, - MTK_STAR_REG_INT_STS_MIB_CNT_TH, - MTK_STAR_REG_INT_STS_MIB_CNT_TH); + regmap_set_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_REG_INT_STS_MIB_CNT_TH); } static unsigned int mtk_star_intr_read(struct mtk_star_priv *priv) @@ -524,12 +521,10 @@ static void mtk_star_dma_init(struct mtk_star_priv *priv) static void mtk_star_dma_start(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL, - MTK_STAR_BIT_TX_DMA_CTRL_START, - MTK_STAR_BIT_TX_DMA_CTRL_START); - regmap_update_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL, - MTK_STAR_BIT_RX_DMA_CTRL_START, - MTK_STAR_BIT_RX_DMA_CTRL_START); + regmap_set_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL, + MTK_STAR_BIT_TX_DMA_CTRL_START); + regmap_set_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL, + MTK_STAR_BIT_RX_DMA_CTRL_START); } static void mtk_star_dma_stop(struct mtk_star_priv *priv) @@ -553,16 +548,14 @@ static void mtk_star_dma_disable(struct mtk_star_priv *priv) static void mtk_star_dma_resume_rx(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL, - MTK_STAR_BIT_RX_DMA_CTRL_RESUME, - MTK_STAR_BIT_RX_DMA_CTRL_RESUME); + regmap_set_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL, + MTK_STAR_BIT_RX_DMA_CTRL_RESUME); } static void mtk_star_dma_resume_tx(struct mtk_star_priv *priv) { - regmap_update_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL, - MTK_STAR_BIT_TX_DMA_CTRL_RESUME, - MTK_STAR_BIT_TX_DMA_CTRL_RESUME); + regmap_set_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL, + MTK_STAR_BIT_TX_DMA_CTRL_RESUME); } static void mtk_star_set_mac_addr(struct net_device *ndev) @@ -842,8 +835,8 @@ static int mtk_star_hash_wait_ok(struct mtk_star_priv *priv) return ret; /* Check the BIST_OK bit. */ - regmap_read(priv->regs, MTK_STAR_REG_HASH_CTRL, &val); - if (!(val & MTK_STAR_BIT_HASH_CTRL_BIST_OK)) + if (!regmap_test_bits(priv->regs, MTK_STAR_REG_HASH_CTRL, + MTK_STAR_BIT_HASH_CTRL_BIST_OK)) return -EIO; return 0; @@ -877,12 +870,10 @@ static int mtk_star_reset_hash_table(struct mtk_star_priv *priv) if (ret) return ret; - regmap_update_bits(priv->regs, MTK_STAR_REG_HASH_CTRL, - MTK_STAR_BIT_HASH_CTRL_BIST_EN, - MTK_STAR_BIT_HASH_CTRL_BIST_EN); - regmap_update_bits(priv->regs, MTK_STAR_REG_TEST1, - MTK_STAR_BIT_TEST1_RST_HASH_MBIST, - MTK_STAR_BIT_TEST1_RST_HASH_MBIST); + regmap_set_bits(priv->regs, MTK_STAR_REG_HASH_CTRL, + MTK_STAR_BIT_HASH_CTRL_BIST_EN); + regmap_set_bits(priv->regs, MTK_STAR_REG_TEST1, + MTK_STAR_BIT_TEST1_RST_HASH_MBIST); return mtk_star_hash_wait_ok(priv); } @@ -1013,13 +1004,13 @@ static int mtk_star_enable(struct net_device *ndev) return ret; /* Setup the hashing algorithm */ - regmap_update_bits(priv->regs, MTK_STAR_REG_ARL_CFG, - MTK_STAR_BIT_ARL_CFG_HASH_ALG | - MTK_STAR_BIT_ARL_CFG_MISC_MODE, 0); + regmap_clear_bits(priv->regs, MTK_STAR_REG_ARL_CFG, + MTK_STAR_BIT_ARL_CFG_HASH_ALG | + MTK_STAR_BIT_ARL_CFG_MISC_MODE); /* Don't strip VLAN tags */ - regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CFG, - MTK_STAR_BIT_MAC_CFG_VLAN_STRIP, 0); + regmap_clear_bits(priv->regs, MTK_STAR_REG_MAC_CFG, + MTK_STAR_BIT_MAC_CFG_VLAN_STRIP); /* Setup DMA */ mtk_star_dma_init(priv); @@ -1201,9 +1192,8 @@ static void mtk_star_set_rx_mode(struct net_device *ndev) int ret; if (ndev->flags & IFF_PROMISC) { - regmap_update_bits(priv->regs, MTK_STAR_REG_ARL_CFG, - MTK_STAR_BIT_ARL_CFG_MISC_MODE, - MTK_STAR_BIT_ARL_CFG_MISC_MODE); + regmap_set_bits(priv->regs, MTK_STAR_REG_ARL_CFG, + MTK_STAR_BIT_ARL_CFG_MISC_MODE); } else if (netdev_mc_count(ndev) > MTK_STAR_HASHTABLE_MC_LIMIT || ndev->flags & IFF_ALLMULTI) { for (i = 0; i < MTK_STAR_HASHTABLE_SIZE_MAX; i++) { -- cgit v1.2.3-59-g8ed1b From a01c245438c59a44f3ece8440046c78675fa8b0b Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 29 May 2020 00:05:32 +0200 Subject: net/sched: fix a couple of splats in the error path of tfc_gate_init() trying to configure TC 'act_gate' rules with invalid control actions, the following splat can be observed: general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 1 PID: 2143 Comm: tc Not tainted 5.7.0-rc6+ #168 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:hrtimer_active+0x56/0x290 [...] Call Trace: hrtimer_try_to_cancel+0x6d/0x330 hrtimer_cancel+0x11/0x20 tcf_gate_cleanup+0x15/0x30 [act_gate] tcf_action_cleanup+0x58/0x170 __tcf_action_put+0xb0/0xe0 __tcf_idr_release+0x68/0x90 tcf_gate_init+0x7c7/0x19a0 [act_gate] tcf_action_init_1+0x60f/0x960 tcf_action_init+0x157/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x121/0x350 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 this is caused by hrtimer_cancel(), running before hrtimer_init(). Fix it ensuring to call hrtimer_cancel() only if clockid is valid, and the timer has been initialized. After fixing this splat, the same error path causes another problem: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 PID: 980 Comm: tc Not tainted 5.7.0-rc6+ #168 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:release_entry_list+0x4a/0x240 [act_gate] [...] Call Trace: tcf_action_cleanup+0x58/0x170 __tcf_action_put+0xb0/0xe0 __tcf_idr_release+0x68/0x90 tcf_gate_init+0x7ab/0x19a0 [act_gate] tcf_action_init_1+0x60f/0x960 tcf_action_init+0x157/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x121/0x350 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 the problem is similar: tcf_action_cleanup() was trying to release a list without initializing it first. Ensure that INIT_LIST_HEAD() is called for every newly created 'act_gate' action, same as what was done to 'act_ife' with commit 44c23d71599f ("net/sched: act_ife: initalize ife->metalist earlier"). Fixes: a51c328df310 ("net: qos: introduce a gate control flow action") CC: Ivan Vecera Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_gate.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 35fc48795541..9c628591f452 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -331,6 +331,10 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + if (ret == ACT_P_CREATED) { + to_gate(*a)->param.tcfg_clockid = -1; + INIT_LIST_HEAD(&(to_gate(*a)->param.entries)); + } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); @@ -377,7 +381,6 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, goto chain_put; } - INIT_LIST_HEAD(&p->entries); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); if (err < 0) @@ -449,9 +452,9 @@ static void tcf_gate_cleanup(struct tc_action *a) struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; - hrtimer_cancel(&gact->hitimer); - p = &gact->param; + if (p->tcfg_clockid != -1) + hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } -- cgit v1.2.3-59-g8ed1b From a8284c6899cf7321abbd258d970a9442978b0a4f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:34 +0200 Subject: netfilter: nf_flowtable: expose nf_flow_table_gc_cleanup() This function schedules the flow teardown state and it forces a gc run. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 2 ++ net/netfilter/nf_flow_table_core.c | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index c54a7f707e50..d7338bfd7b0f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -175,6 +175,8 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple); +void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, + struct net_device *dev); void nf_flow_table_cleanup(struct net_device *dev); int nf_flow_table_init(struct nf_flowtable *flow_table); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 42da6e337276..6a3034f84ab6 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -588,8 +588,8 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) flow_offload_teardown(flow); } -static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, - struct net_device *dev) +void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, + struct net_device *dev) { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); @@ -602,7 +602,7 @@ void nf_flow_table_cleanup(struct net_device *dev) mutex_lock(&flowtable_lock); list_for_each_entry(flowtable, &flowtables, list) - nf_flow_table_iterate_cleanup(flowtable, dev); + nf_flow_table_gc_cleanup(flowtable, dev); mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); -- cgit v1.2.3-59-g8ed1b From 1fac52da5942c58dd3e337fd7c5a550925ca752e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:35 +0200 Subject: net: flow_offload: consolidate indirect flow_block infrastructure Tunnel devices provide no dev->netdev_ops->ndo_setup_tc(...) interface. The tunnel device and route control plane does not provide an obvious way to relate tunnel and physical devices. This patch allows drivers to register a tunnel device offload handler for the tc and netfilter frontends through flow_indr_dev_register() and flow_indr_dev_unregister(). The frontend calls flow_indr_dev_setup_offload() that iterates over the list of drivers that are offering tunnel device hardware offload support and it sets up the flow block for this tunnel device. If the driver module is removed, the indirect flow_block ends up with a stale callback reference. The module removal path triggers the dev_shutdown() path to remove the qdisc and the flow_blocks for the physical devices. However, this is not useful for tunnel devices, where relation between the physical and the tunnel device is not explicit. This patch introduces a cleanup callback that is invoked when the driver module is removed to clean up the tunnel device flow_block. This patch defines struct flow_block_indr and it uses it from flow_block_cb to store the information that front-end requires to perform the flow_block_cb cleanup on module removal. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_offload.h | 19 ++++++ net/core/flow_offload.c | 157 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 95d633785ef9..5493282348fa 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -443,6 +443,16 @@ enum tc_setup_type; typedef int flow_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); +struct flow_block_cb; + +struct flow_block_indr { + struct list_head list; + struct net_device *dev; + enum flow_block_binder_type binder_type; + void *data; + void (*cleanup)(struct flow_block_cb *block_cb); +}; + struct flow_block_cb { struct list_head driver_list; struct list_head list; @@ -450,6 +460,7 @@ struct flow_block_cb { void *cb_ident; void *cb_priv; void (*release)(void *cb_priv); + struct flow_block_indr indr; unsigned int refcnt; }; @@ -523,6 +534,14 @@ static inline void flow_block_init(struct flow_block *flow_block) typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv, enum tc_setup_type type, void *type_data); +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv); +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb); +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)); + typedef void flow_indr_block_cmd_t(struct net_device *dev, flow_indr_block_bind_cb_t *cb, void *cb_priv, enum flow_block_command command); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e64941c526b1..8cd7da2586ae 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -317,6 +317,163 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, } EXPORT_SYMBOL(flow_block_cb_setup_simple); +static DEFINE_MUTEX(flow_indr_block_lock); +static LIST_HEAD(flow_block_indr_list); +static LIST_HEAD(flow_block_indr_dev_list); + +struct flow_indr_dev { + struct list_head list; + flow_indr_block_bind_cb_t *cb; + void *cb_priv; + refcount_t refcnt; + struct rcu_head rcu; +}; + +static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, + void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + indr_dev->cb = cb; + indr_dev->cb_priv = cb_priv; + refcount_set(&indr_dev->refcnt, 1); + + return indr_dev; +} + +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) { + if (indr_dev->cb == cb && + indr_dev->cb_priv == cb_priv) { + refcount_inc(&indr_dev->refcnt); + mutex_unlock(&flow_indr_block_lock); + return 0; + } + } + + indr_dev = flow_indr_dev_alloc(cb, cb_priv); + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return -ENOMEM; + } + + list_add(&indr_dev->list, &flow_block_indr_dev_list); + mutex_unlock(&flow_indr_block_lock); + + return 0; +} +EXPORT_SYMBOL(flow_indr_dev_register); + +static void __flow_block_indr_cleanup(flow_setup_cb_t *setup_cb, void *cb_priv, + struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { + if (this->cb == setup_cb && + this->cb_priv == cb_priv) { + list_move(&this->indr.list, cleanup_list); + return; + } + } +} + +static void flow_block_indr_notify(struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, cleanup_list, indr.list) { + list_del(&this->indr.list); + this->indr.cleanup(this); + } +} + +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb) +{ + struct flow_indr_dev *this, *next, *indr_dev = NULL; + LIST_HEAD(cleanup_list); + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) { + if (this->cb == cb && + this->cb_priv == cb_priv && + refcount_dec_and_test(&this->refcnt)) { + indr_dev = this; + list_del(&indr_dev->list); + break; + } + } + + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return; + } + + __flow_block_indr_cleanup(setup_cb, cb_priv, &cleanup_list); + mutex_unlock(&flow_indr_block_lock); + + flow_block_indr_notify(&cleanup_list); + kfree(indr_dev); +} +EXPORT_SYMBOL(flow_indr_dev_unregister); + +static void flow_block_indr_init(struct flow_block_cb *flow_block, + struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + flow_block->indr.binder_type = bo->binder_type; + flow_block->indr.data = data; + flow_block->indr.dev = dev; + flow_block->indr.cleanup = cleanup; +} + +static void __flow_block_indr_binding(struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, &bo->cb_list, list) { + switch (bo->command) { + case FLOW_BLOCK_BIND: + flow_block_indr_init(block_cb, bo, dev, data, cleanup); + list_add(&block_cb->indr.list, &flow_block_indr_list); + break; + case FLOW_BLOCK_UNBIND: + list_del(&block_cb->indr.list); + break; + } + } +} + +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_indr_dev *this; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(this, &flow_block_indr_dev_list, list) + this->cb(dev, this->cb_priv, type, bo); + + __flow_block_indr_binding(bo, dev, data, cleanup); + mutex_unlock(&flow_indr_block_lock); + + return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; +} +EXPORT_SYMBOL(flow_indr_dev_setup_offload); + static LIST_HEAD(block_cb_list); static struct rhashtable indr_setup_block_ht; -- cgit v1.2.3-59-g8ed1b From 324a823b9962a0f290c40fb6314926d434193276 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:36 +0200 Subject: net: cls_api: add tcf_block_offload_init() Add a helper function to initialize the flow_block_offload structure. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/sched/cls_api.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 752d608f4442..c5a2f16097b6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -693,6 +693,22 @@ static void tc_indr_block_get_and_cmd(struct net_device *dev, tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); } +static void tcf_block_offload_init(struct flow_block_offload *bo, + struct net_device *dev, + enum flow_block_command command, + enum flow_block_binder_type binder_type, + struct flow_block *flow_block, + bool shared, struct netlink_ext_ack *extack) +{ + bo->net = dev_net(dev); + bo->command = command; + bo->binder_type = binder_type; + bo->block = flow_block; + bo->block_shared = shared; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); +} + static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, @@ -727,13 +743,9 @@ static int tcf_block_offload_cmd(struct tcf_block *block, struct flow_block_offload bo = {}; int err; - bo.net = dev_net(dev); - bo.command = command; - bo.binder_type = ei->binder_type; - bo.block = &block->flow_block; - bo.block_shared = tcf_block_shared(block); - bo.extack = extack; - INIT_LIST_HEAD(&bo.cb_list); + tcf_block_offload_init(&bo, dev, command, ei->binder_type, + &block->flow_block, tcf_block_shared(block), + extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) { -- cgit v1.2.3-59-g8ed1b From 0fdcf78d59737939ea449b512d02c3733a22c8e1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:37 +0200 Subject: net: use flow_indr_dev_setup_offload() Update existing frontends to use flow_indr_dev_setup_offload(). This new function must be called if ->ndo_setup_tc is unset to deal with tunnel devices. If there is no driver that is subscribed to new tunnel device flow_block bindings, then this function bails out with EOPNOTSUPP. If the driver module is removed, the ->cleanup() callback removes the entries that belong to this tunnel device. This cleanup procedures is triggered when the device unregisters the tunnel device offload handler. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nf_flow_table_offload.c | 19 +++++++++--- net/netfilter/nf_tables_offload.c | 28 ++++++++++++++--- net/sched/cls_api.c | 58 +++++++++++++++++------------------ 3 files changed, 67 insertions(+), 38 deletions(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 2ff4087007a6..01cfa02c43bd 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -942,6 +942,18 @@ static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, INIT_LIST_HEAD(&bo->cb_list); } +static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb) +{ + struct nf_flowtable *flowtable = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + + nf_flow_table_gc_cleanup(flowtable, dev); + down_write(&flowtable->flow_block_lock); + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + up_write(&flowtable->flow_block_lock); +} + static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, struct nf_flowtable *flowtable, struct net_device *dev, @@ -950,12 +962,9 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, { nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); - flow_indr_block_call(dev, bo, cmd, TC_SETUP_FT); - if (list_empty(&bo->cb_list)) - return -EOPNOTSUPP; - - return 0; + return flow_indr_dev_setup_offload(dev, TC_SETUP_FT, flowtable, bo, + nf_flow_table_indr_cleanup); } static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 954bccb7f32a..1960f11477e8 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -304,21 +304,41 @@ static void nft_indr_block_ing_cmd(struct net_device *dev, nft_block_setup(chain, &bo, cmd); } -static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, +static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) +{ + struct nft_base_chain *basechain = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + struct netlink_ext_ack extack = {}; + struct net *net = dev_net(dev); + struct flow_block_offload bo; + + nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, + basechain, &extack); + mutex_lock(&net->nft.commit_mutex); + list_move(&block_cb->list, &bo.cb_list); + nft_flow_offload_unbind(&bo, basechain); + mutex_unlock(&net->nft.commit_mutex); +} + +static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; + int err; - nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); - flow_indr_block_call(dev, &bo, cmd, TC_SETUP_BLOCK); + err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, basechain, &bo, + nft_indr_block_cleanup); + if (err < 0) + return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; - return nft_block_setup(chain, &bo, cmd); + return nft_block_setup(basechain, &bo, cmd); } #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c5a2f16097b6..760e51d852f5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -709,24 +709,26 @@ static void tcf_block_offload_init(struct flow_block_offload *bo, INIT_LIST_HEAD(&bo->cb_list); } -static void tc_indr_block_call(struct tcf_block *block, - struct net_device *dev, - struct tcf_block_ext_info *ei, - enum flow_block_command command, - struct netlink_ext_ack *extack) +static void tcf_block_unbind(struct tcf_block *block, + struct flow_block_offload *bo); + +static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) { - struct flow_block_offload bo = { - .command = command, - .binder_type = ei->binder_type, - .net = dev_net(dev), - .block = &block->flow_block, - .block_shared = tcf_block_shared(block), - .extack = extack, - }; - INIT_LIST_HEAD(&bo.cb_list); + struct tcf_block *block = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; - flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK); - tcf_block_setup(block, &bo); + tcf_block_offload_init(&bo, dev, FLOW_BLOCK_UNBIND, + block_cb->indr.binder_type, + &block->flow_block, tcf_block_shared(block), + &extack); + down_write(&block->cb_lock); + list_move(&block_cb->list, &bo.cb_list); + up_write(&block->cb_lock); + rtnl_lock(); + tcf_block_unbind(block, &bo); + rtnl_unlock(); } static bool tcf_block_offload_in_use(struct tcf_block *block) @@ -747,7 +749,12 @@ static int tcf_block_offload_cmd(struct tcf_block *block, &block->flow_block, tcf_block_shared(block), extack); - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + if (dev->netdev_ops->ndo_setup_tc) + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + else + err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, + &bo, tc_block_indr_cleanup); + if (err < 0) { if (err != -EOPNOTSUPP) NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); @@ -765,13 +772,13 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_inc; /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. */ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + if (dev->netdev_ops->ndo_setup_tc && + !tc_can_offload(dev) && + tcf_block_offload_in_use(block)) { NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); err = -EOPNOTSUPP; goto err_unlock; @@ -783,18 +790,15 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, if (err) goto err_unlock; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); up_write(&block->cb_lock); return 0; no_offload_dev_inc: - if (tcf_block_offload_in_use(block)) { - err = -EOPNOTSUPP; + if (tcf_block_offload_in_use(block)) goto err_unlock; - } + err = 0; block->nooffloaddevcnt++; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); err_unlock: up_write(&block->cb_lock); return err; @@ -807,10 +811,6 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); - - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_dec; err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; -- cgit v1.2.3-59-g8ed1b From 9eabd188716b2c53d8b9d23e969c6c17049f0fcc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:38 +0200 Subject: mlx5: update indirect block support Register ndo callback via flow_indr_dev_register() and flow_indr_dev_unregister(). No need for mlx5e_rep_indr_clean_block_privs() since flow_block_cb_free() already releases the internal mapping via ->release callback, which in this case is mlx5e_rep_indr_tc_block_unbind(). Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 81 +++------------------- .../net/ethernet/mellanox/mlx5/core/en/rep/tc.h | 4 -- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 5 -- 4 files changed, 8 insertions(+), 83 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index c609a5e50ebc..80713123de5c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -306,20 +306,6 @@ mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv, return NULL; } -static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv, - struct net_device *netdev); - -void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv) -{ - struct mlx5e_rep_indr_block_priv *cb_priv, *temp; - struct list_head *head = &rpriv->uplink_priv.tc_indr_block_priv_list; - - list_for_each_entry_safe(cb_priv, temp, head, list) { - mlx5e_rep_indr_unregister_block(rpriv, cb_priv->netdev); - kfree(cb_priv); - } -} - static int mlx5e_rep_indr_offload(struct net_device *netdev, struct flow_cls_offload *flower, @@ -423,9 +409,14 @@ mlx5e_rep_indr_setup_block(struct net_device *netdev, struct flow_block_offload *f, flow_setup_cb_t *setup_cb) { + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); struct mlx5e_rep_indr_block_priv *indr_priv; struct flow_block_cb *block_cb; + if (!mlx5e_tc_tun_device_to_offload(priv, netdev) && + !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev)) + return -EOPNOTSUPP; + if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) return -EOPNOTSUPP; @@ -492,76 +483,20 @@ int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv, } } -static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv, - struct net_device *netdev) -{ - int err; - - err = __flow_indr_block_cb_register(netdev, rpriv, - mlx5e_rep_indr_setup_cb, - rpriv); - if (err) { - struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); - - mlx5_core_err(priv->mdev, "Failed to register remote block notifier for %s err=%d\n", - netdev_name(netdev), err); - } - return err; -} - -static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv, - struct net_device *netdev) -{ - __flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_cb, - rpriv); -} - -static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr) -{ - struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv, - uplink_priv.netdevice_nb); - struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - - if (!mlx5e_tc_tun_device_to_offload(priv, netdev) && - !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev)) - return NOTIFY_OK; - - switch (event) { - case NETDEV_REGISTER: - mlx5e_rep_indr_register_block(rpriv, netdev); - break; - case NETDEV_UNREGISTER: - mlx5e_rep_indr_unregister_block(rpriv, netdev); - break; - } - return NOTIFY_OK; -} - int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv) { struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; - int err; /* init indirect block notifications */ INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list); - uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event; - err = register_netdevice_notifier_dev_net(rpriv->netdev, - &uplink_priv->netdevice_nb, - &uplink_priv->netdevice_nn); - return err; + return flow_indr_dev_register(mlx5e_rep_indr_setup_cb, rpriv); } void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv) { - struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; - - /* clean indirect TC block notifications */ - unregister_netdevice_notifier_dev_net(rpriv->netdev, - &uplink_priv->netdevice_nb, - &uplink_priv->netdevice_nn); + flow_indr_dev_unregister(mlx5e_rep_indr_setup_cb, rpriv, + mlx5e_rep_indr_setup_tc_cb); } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h index 86f92abf2fdd..fdf9702c2d7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h @@ -33,7 +33,6 @@ void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data); -void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv); bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, struct sk_buff *skb, @@ -65,9 +64,6 @@ static inline int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { return -EOPNOTSUPP; } -static inline void -mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv) {} - struct mlx5e_tc_update_priv; static inline bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index af89a4803c7d..006807e04eda 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1018,7 +1018,6 @@ destroy_tises: static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) { mlx5e_rep_tc_netdevice_event_unregister(rpriv); - mlx5e_rep_indr_clean_block_privs(rpriv); mlx5e_rep_bond_cleanup(rpriv); mlx5e_rep_tc_cleanup(rpriv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index da9f1686d525..1d5669801484 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -69,13 +69,8 @@ struct mlx5_rep_uplink_priv { * tc_indr_block_cb_priv_list is used to lookup indirect callback * private data * - * netdevice_nb is the netdev events notifier - used to register - * tunnel devices for block events - * */ struct list_head tc_indr_block_priv_list; - struct notifier_block netdevice_nb; - struct netdev_net_notifier netdevice_nn; struct mlx5_tun_entropy tun_entropy; -- cgit v1.2.3-59-g8ed1b From 50c1b1c9385fbb35c25b27608e00bcf89368e8ba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:39 +0200 Subject: nfp: update indirect block support Register ndo callback via flow_indr_dev_register() and flow_indr_dev_unregister(). Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/main.c | 11 ++++--- drivers/net/ethernet/netronome/nfp/flower/main.h | 7 +++-- .../net/ethernet/netronome/nfp/flower/offload.c | 35 ++++------------------ 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index ca7032d22196..c39327677a7d 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -830,6 +830,10 @@ static int nfp_flower_init(struct nfp_app *app) if (err) goto err_cleanup; + err = flow_indr_dev_register(nfp_flower_indr_setup_tc_cb, app); + if (err) + goto err_cleanup; + if (app_priv->flower_ext_feats & NFP_FL_FEATS_VF_RLIM) nfp_flower_qos_init(app); @@ -856,6 +860,9 @@ static void nfp_flower_clean(struct nfp_app *app) skb_queue_purge(&app_priv->cmsg_skbs_low); flush_work(&app_priv->cmsg_work); + flow_indr_dev_unregister(nfp_flower_indr_setup_tc_cb, app, + nfp_flower_setup_indr_block_cb); + if (app_priv->flower_ext_feats & NFP_FL_FEATS_VF_RLIM) nfp_flower_qos_cleanup(app); @@ -959,10 +966,6 @@ nfp_flower_netdev_event(struct nfp_app *app, struct net_device *netdev, return ret; } - ret = nfp_flower_reg_indir_block_handler(app, netdev, event); - if (ret & NOTIFY_STOP_MASK) - return ret; - ret = nfp_flower_internal_port_event_handler(app, netdev, event); if (ret & NOTIFY_STOP_MASK) return ret; diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index 59abea2a39ad..6c3dc3baf387 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -458,9 +458,10 @@ void nfp_flower_qos_cleanup(struct nfp_app *app); int nfp_flower_setup_qos_offload(struct nfp_app *app, struct net_device *netdev, struct tc_cls_matchall_offload *flow); void nfp_flower_stats_rlim_reply(struct nfp_app *app, struct sk_buff *skb); -int nfp_flower_reg_indir_block_handler(struct nfp_app *app, - struct net_device *netdev, - unsigned long event); +int nfp_flower_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data); +int nfp_flower_setup_indr_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv); void __nfp_flower_non_repr_priv_get(struct nfp_flower_non_repr_priv *non_repr_priv); diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 6b60771ccb19..695d24b9dd92 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -1619,8 +1619,8 @@ nfp_flower_indr_block_cb_priv_lookup(struct nfp_app *app, return NULL; } -static int nfp_flower_setup_indr_block_cb(enum tc_setup_type type, - void *type_data, void *cb_priv) +int nfp_flower_setup_indr_block_cb(enum tc_setup_type type, + void *type_data, void *cb_priv) { struct nfp_flower_indr_block_cb_priv *priv = cb_priv; struct flow_cls_offload *flower = type_data; @@ -1708,10 +1708,13 @@ nfp_flower_setup_indr_tc_block(struct net_device *netdev, struct nfp_app *app, return 0; } -static int +int nfp_flower_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, enum tc_setup_type type, void *type_data) { + if (!nfp_fl_is_netdev_to_offload(netdev)) + return -EOPNOTSUPP; + switch (type) { case TC_SETUP_BLOCK: return nfp_flower_setup_indr_tc_block(netdev, cb_priv, @@ -1720,29 +1723,3 @@ nfp_flower_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv, return -EOPNOTSUPP; } } - -int nfp_flower_reg_indir_block_handler(struct nfp_app *app, - struct net_device *netdev, - unsigned long event) -{ - int err; - - if (!nfp_fl_is_netdev_to_offload(netdev)) - return NOTIFY_OK; - - if (event == NETDEV_REGISTER) { - err = __flow_indr_block_cb_register(netdev, app, - nfp_flower_indr_setup_tc_cb, - app); - if (err) - nfp_flower_cmsg_warn(app, - "Indirect block reg failed - %s\n", - netdev->name); - } else if (event == NETDEV_UNREGISTER) { - __flow_indr_block_cb_unregister(netdev, - nfp_flower_indr_setup_tc_cb, - app); - } - - return NOTIFY_OK; -} -- cgit v1.2.3-59-g8ed1b From e445e30cf7e6d68566db775ce186cbe63ef286e9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:40 +0200 Subject: bnxt_tc: update indirect block support Register ndo callback via flow_indr_dev_register() and flow_indr_dev_unregister(). Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 - drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 51 +++++++--------------------- 2 files changed, 12 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 5c562e0aac67..9e173d74b72a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1870,7 +1870,6 @@ struct bnxt { u8 dsn[8]; struct bnxt_tc_info *tc_info; struct list_head tc_indr_block_list; - struct notifier_block tc_netdev_nb; struct dentry *debugfs_pdev; struct device *hwmon_dev; }; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 782ea0771221..0eef4f5e4a46 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -1939,53 +1939,25 @@ static int bnxt_tc_setup_indr_block(struct net_device *netdev, struct bnxt *bp, return 0; } -static int bnxt_tc_setup_indr_cb(struct net_device *netdev, void *cb_priv, - enum tc_setup_type type, void *type_data) -{ - switch (type) { - case TC_SETUP_BLOCK: - return bnxt_tc_setup_indr_block(netdev, cb_priv, type_data); - default: - return -EOPNOTSUPP; - } -} - static bool bnxt_is_netdev_indr_offload(struct net_device *netdev) { return netif_is_vxlan(netdev); } -static int bnxt_tc_indr_block_event(struct notifier_block *nb, - unsigned long event, void *ptr) +static int bnxt_tc_setup_indr_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data) { - struct net_device *netdev; - struct bnxt *bp; - int rc; - - netdev = netdev_notifier_info_to_dev(ptr); if (!bnxt_is_netdev_indr_offload(netdev)) - return NOTIFY_OK; - - bp = container_of(nb, struct bnxt, tc_netdev_nb); + return -EOPNOTSUPP; - switch (event) { - case NETDEV_REGISTER: - rc = __flow_indr_block_cb_register(netdev, bp, - bnxt_tc_setup_indr_cb, - bp); - if (rc) - netdev_info(bp->dev, - "Failed to register indirect blk: dev: %s\n", - netdev->name); - break; - case NETDEV_UNREGISTER: - __flow_indr_block_cb_unregister(netdev, - bnxt_tc_setup_indr_cb, - bp); + switch (type) { + case TC_SETUP_BLOCK: + return bnxt_tc_setup_indr_block(netdev, cb_priv, type_data); + default: break; } - return NOTIFY_DONE; + return -EOPNOTSUPP; } static const struct rhashtable_params bnxt_tc_flow_ht_params = { @@ -2074,8 +2046,8 @@ int bnxt_init_tc(struct bnxt *bp) /* init indirect block notifications */ INIT_LIST_HEAD(&bp->tc_indr_block_list); - bp->tc_netdev_nb.notifier_call = bnxt_tc_indr_block_event; - rc = register_netdevice_notifier(&bp->tc_netdev_nb); + + rc = flow_indr_dev_register(bnxt_tc_setup_indr_cb, bp); if (!rc) return 0; @@ -2101,7 +2073,8 @@ void bnxt_shutdown_tc(struct bnxt *bp) if (!bnxt_tc_flower_enabled(bp)) return; - unregister_netdevice_notifier(&bp->tc_netdev_nb); + flow_indr_dev_unregister(bnxt_tc_setup_indr_cb, bp, + bnxt_tc_setup_indr_block_cb); rhashtable_destroy(&tc_info->flow_table); rhashtable_destroy(&tc_info->l2_table); rhashtable_destroy(&tc_info->decap_l2_table); -- cgit v1.2.3-59-g8ed1b From 709ffbe19b777e8fc952e2fdcfd8e6f50c8ef08c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:41 +0200 Subject: net: remove indirect block netdev event registration Drivers do not register to netdev events to set up indirect blocks anymore. Remove __flow_indr_block_cb_register() and __flow_indr_block_cb_unregister(). The frontends set up the callbacks through flow_indr_dev_setup_block() Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_offload.h | 9 -- net/core/flow_offload.c | 238 ---------------------------------- net/netfilter/nf_flow_table_offload.c | 66 ---------- net/netfilter/nf_tables_offload.c | 53 +------- net/sched/cls_api.c | 79 ----------- 5 files changed, 1 insertion(+), 444 deletions(-) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 5493282348fa..69e13c8b6b3a 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -546,15 +546,6 @@ typedef void flow_indr_block_cmd_t(struct net_device *dev, flow_indr_block_bind_cb_t *cb, void *cb_priv, enum flow_block_command command); -struct flow_indr_block_entry { - flow_indr_block_cmd_t *cb; - struct list_head list; -}; - -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry); - -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry); - int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, flow_indr_block_bind_cb_t *cb, void *cb_ident); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 8cd7da2586ae..0cfc35e6be28 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -473,241 +473,3 @@ int flow_indr_dev_setup_offload(struct net_device *dev, return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); - -static LIST_HEAD(block_cb_list); - -static struct rhashtable indr_setup_block_ht; - -struct flow_indr_block_cb { - struct list_head list; - void *cb_priv; - flow_indr_block_bind_cb_t *cb; - void *cb_ident; -}; - -struct flow_indr_block_dev { - struct rhash_head ht_node; - struct net_device *dev; - unsigned int refcnt; - struct list_head cb_list; -}; - -static const struct rhashtable_params flow_indr_setup_block_ht_params = { - .key_offset = offsetof(struct flow_indr_block_dev, dev), - .head_offset = offsetof(struct flow_indr_block_dev, ht_node), - .key_len = sizeof(struct net_device *), -}; - -static struct flow_indr_block_dev * -flow_indr_block_dev_lookup(struct net_device *dev) -{ - return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, - flow_indr_setup_block_ht_params); -} - -static struct flow_indr_block_dev * -flow_indr_block_dev_get(struct net_device *dev) -{ - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (indr_dev) - goto inc_ref; - - indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); - if (!indr_dev) - return NULL; - - INIT_LIST_HEAD(&indr_dev->cb_list); - indr_dev->dev = dev; - if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params)) { - kfree(indr_dev); - return NULL; - } - -inc_ref: - indr_dev->refcnt++; - return indr_dev; -} - -static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev) -{ - if (--indr_dev->refcnt) - return; - - rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params); - kfree(indr_dev); -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - if (indr_block_cb->cb == cb && - indr_block_cb->cb_ident == cb_ident) - return indr_block_cb; - return NULL; -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (indr_block_cb) - return ERR_PTR(-EEXIST); - - indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); - if (!indr_block_cb) - return ERR_PTR(-ENOMEM); - - indr_block_cb->cb_priv = cb_priv; - indr_block_cb->cb = cb; - indr_block_cb->cb_ident = cb_ident; - list_add(&indr_block_cb->list, &indr_dev->cb_list); - - return indr_block_cb; -} - -static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) -{ - list_del(&indr_block_cb->list); - kfree(indr_block_cb); -} - -static DEFINE_MUTEX(flow_indr_block_cb_lock); - -static void flow_block_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command) -{ - struct flow_indr_block_entry *entry; - - mutex_lock(&flow_indr_block_cb_lock); - list_for_each_entry(entry, &block_cb_list, list) { - entry->cb(dev, cb, cb_priv, command); - } - mutex_unlock(&flow_indr_block_cb_lock); -} - -int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - int err; - - indr_dev = flow_indr_block_dev_get(dev); - if (!indr_dev) - return -ENOMEM; - - indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); - err = PTR_ERR_OR_ZERO(indr_block_cb); - if (err) - goto err_dev_put; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_BIND); - - return 0; - -err_dev_put: - flow_indr_block_dev_put(indr_dev); - return err; -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register); - -int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - int err; - - rtnl_lock(); - err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident); - rtnl_unlock(); - - return err; -} -EXPORT_SYMBOL_GPL(flow_indr_block_cb_register); - -void __flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; - - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (!indr_block_cb) - return; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_UNBIND); - - flow_indr_block_cb_del(indr_block_cb); - flow_indr_block_dev_put(indr_dev); -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister); - -void flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - rtnl_lock(); - __flow_indr_block_cb_unregister(dev, cb, cb_ident); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); - -void flow_indr_block_call(struct net_device *dev, - struct flow_block_offload *bo, - enum flow_block_command command, - enum tc_setup_type type) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo); -} -EXPORT_SYMBOL_GPL(flow_indr_block_call); - -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_add_tail(&entry->list, &block_cb_list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_add_block_cb); - -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_del(&entry->list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_del_block_cb); - -static int __init init_flow_indr_rhashtable(void) -{ - return rhashtable_init(&indr_setup_block_ht, - &flow_indr_setup_block_ht_params); -} -subsys_initcall(init_flow_indr_rhashtable); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 01cfa02c43bd..62651e6683f6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -1008,69 +1008,6 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, } EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); -static void nf_flow_table_indr_block_ing_cmd(struct net_device *dev, - struct nf_flowtable *flowtable, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo; - - if (!flowtable) - return; - - nf_flow_table_block_offload_init(&bo, dev_net(dev), cmd, flowtable, - &extack); - - cb(dev, cb_priv, TC_SETUP_FT, &bo); - - nf_flow_table_block_setup(flowtable, &bo, cmd); -} - -static void nf_flow_table_indr_block_cb_cmd(struct nf_flowtable *flowtable, - struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) - return; - - nf_flow_table_indr_block_ing_cmd(dev, flowtable, cb, cb_priv, cmd); -} - -static void nf_flow_table_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_flowtable *nft_ft; - struct nft_table *table; - struct nft_hook *hook; - - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(nft_ft, &table->flowtables, list) { - list_for_each_entry(hook, &nft_ft->hook_list, list) { - if (hook->ops.dev != dev) - continue; - - nf_flow_table_indr_block_cb_cmd(&nft_ft->data, - dev, cb, - cb_priv, cmd); - } - } - } - mutex_unlock(&net->nft.commit_mutex); -} - -static struct flow_indr_block_entry block_ing_entry = { - .cb = nf_flow_table_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - int nf_flow_table_offload_init(void) { nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", @@ -1078,13 +1015,10 @@ int nf_flow_table_offload_init(void) if (!nf_flow_offload_wq) return -ENOMEM; - flow_indr_add_block_cb(&block_ing_entry); - return 0; } void nf_flow_table_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); destroy_workqueue(nf_flow_offload_wq); } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 1960f11477e8..185fc82c99aa 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -285,25 +285,6 @@ static int nft_block_offload_cmd(struct nft_base_chain *chain, return nft_block_setup(chain, &bo, cmd); } -static void nft_indr_block_ing_cmd(struct net_device *dev, - struct nft_base_chain *chain, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo; - - if (!chain) - return; - - nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); - - cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - - nft_block_setup(chain, &bo, cmd); -} - static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; @@ -575,24 +556,6 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) return NULL; } -static void nft_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_chain *chain; - - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); - if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { - struct nft_base_chain *basechain; - - basechain = nft_base_chain(chain); - nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd); - } - mutex_unlock(&net->nft.commit_mutex); -} - static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -614,30 +577,16 @@ static int nft_offload_netdev_event(struct notifier_block *this, return NOTIFY_DONE; } -static struct flow_indr_block_entry block_ing_entry = { - .cb = nft_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { - int err; - - err = register_netdevice_notifier(&nft_offload_netdev_notifier); - if (err < 0) - return err; - - flow_indr_add_block_cb(&block_ing_entry); - - return 0; + return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); unregister_netdevice_notifier(&nft_offload_netdev_notifier); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 760e51d852f5..a00a203b2ef5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -621,78 +621,6 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); -static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command, bool ingress) -{ - struct flow_block_offload bo = { - .command = command, - .binder_type = ingress ? - FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS : - FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, - .net = dev_net(dev), - .block_shared = tcf_block_non_null_shared(block), - }; - INIT_LIST_HEAD(&bo.cb_list); - - if (!block) - return; - - bo.block = &block->flow_block; - - down_write(&block->cb_lock); - cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - - tcf_block_setup(block, &bo); - up_write(&block->cb_lock); -} - -static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress) -{ - const struct Qdisc_class_ops *cops; - const struct Qdisc_ops *ops; - struct Qdisc *qdisc; - - if (!dev_ingress_queue(dev)) - return NULL; - - qdisc = dev_ingress_queue(dev)->qdisc_sleeping; - if (!qdisc) - return NULL; - - ops = qdisc->ops; - if (!ops) - return NULL; - - if (!ingress && !strcmp("ingress", ops->id)) - return NULL; - - cops = ops->cl_ops; - if (!cops) - return NULL; - - if (!cops->tcf_block) - return NULL; - - return cops->tcf_block(qdisc, - ingress ? TC_H_MIN_INGRESS : TC_H_MIN_EGRESS, - NULL); -} - -static void tc_indr_block_get_and_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command command) -{ - struct tcf_block *block; - - block = tc_dev_block(dev, true); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, true); - - block = tc_dev_block(dev, false); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); -} - static void tcf_block_offload_init(struct flow_block_offload *bo, struct net_device *dev, enum flow_block_command command, @@ -3836,11 +3764,6 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; -static struct flow_indr_block_entry block_entry = { - .cb = tc_indr_block_get_and_cmd, - .list = LIST_HEAD_INIT(block_entry.list), -}; - static int __init tc_filter_init(void) { int err; @@ -3853,8 +3776,6 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; - flow_indr_add_block_cb(&block_entry); - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, -- cgit v1.2.3-59-g8ed1b From 0c34bb598c510e070160029f34efeeb217000f8d Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Fri, 29 May 2020 14:17:10 +0200 Subject: net: octeon: mgmt: Repair filling of RX ring The removal of mips_swiotlb_ops exposed a problem in octeon_mgmt Ethernet driver. mips_swiotlb_ops had an mb() after most of the operations and the removal of the ops had broken the receive functionality of the driver. My code inspection has shown no other places except octeon_mgmt_rx_fill_ring() where an explicit barrier would be obviously missing. The latter function however has to make sure that "ringing the bell" doesn't happen before RX ring entry is really written. The patch has been successfully tested on Octeon II. Fixes: a999933db9ed ("MIPS: remove mips_swiotlb_ops") Cc: stable@vger.kernel.org Signed-off-by: Alexander Sverdlin Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/octeon/octeon_mgmt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c index 9d868403d86c..cbaa1924afbe 100644 --- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c +++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c @@ -234,6 +234,11 @@ static void octeon_mgmt_rx_fill_ring(struct net_device *netdev) /* Put it in the ring. */ p->rx_ring[p->rx_next_fill] = re.d64; + /* Make sure there is no reorder of filling the ring and ringing + * the bell + */ + wmb(); + dma_sync_single_for_device(p->dev, p->rx_ring_handle, ring_size_to_bytes(OCTEON_MGMT_RX_RING_SIZE), DMA_BIDIRECTIONAL); -- cgit v1.2.3-59-g8ed1b From a74d19ba7c41b6c1e424ef4fb7d4600f43ff75e5 Mon Sep 17 00:00:00 2001 From: Liu Xiang Date: Fri, 29 May 2020 23:24:56 +0800 Subject: net: fec: disable correct clk in the err path of fec_enet_clk_enable When enable clk_ref failed, clk_ptp should be disabled rather than clk_ref itself. Signed-off-by: Liu Xiang Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 4acb91dce5fc..2d0d313ee7c5 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1981,8 +1981,12 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool enable) return 0; failed_clk_ref: - if (fep->clk_ref) - clk_disable_unprepare(fep->clk_ref); + if (fep->clk_ptp) { + mutex_lock(&fep->ptp_clk_mutex); + clk_disable_unprepare(fep->clk_ptp); + fep->ptp_clk_on = false; + mutex_unlock(&fep->ptp_clk_mutex); + } failed_clk_ptp: if (fep->clk_enet_out) clk_disable_unprepare(fep->clk_enet_out); -- cgit v1.2.3-59-g8ed1b From 678eb199cc9df3bf1cb12fb2da22768b8d1b6bf3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:36 +0300 Subject: devlink: Create dedicated trap group for layer 3 exceptions Packets that hit exceptions during layer 3 forwarding must be trapped to the CPU for the control plane to function properly. Create a dedicated group for them, so that user space could choose to assign a different policer for them. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 7 +++++-- include/net/devlink.h | 3 +++ net/core/devlink.c | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index fe089acb7783..4ca241e70064 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -277,8 +277,11 @@ narrow. The description of these groups must be added to the following table: - Contains packet traps for packets that were dropped by the device during layer 2 forwarding (i.e., bridge) * - ``l3_drops`` - - Contains packet traps for packets that were dropped by the device or hit - an exception (e.g., TTL error) during layer 3 forwarding + - Contains packet traps for packets that were dropped by the device during + layer 3 forwarding + * - ``l3_exceptions`` + - Contains packet traps for packets that hit an exception (e.g., TTL + error) during layer 3 forwarding * - ``buffer_drops`` - Contains packet traps for packets that were dropped by the device due to an enqueue decision diff --git a/include/net/devlink.h b/include/net/devlink.h index 8ffc1b5cd89b..851388c9d795 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -657,6 +657,7 @@ enum devlink_trap_generic_id { enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_L3_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_L3_EXCEPTIONS, DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_DROPS, @@ -730,6 +731,8 @@ enum devlink_trap_group_generic_id { "l2_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_DROPS \ "l3_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_EXCEPTIONS \ + "l3_exceptions" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_BUFFER_DROPS \ "buffer_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_TUNNEL_DROPS \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 7b76e5fffc10..d9fff7083f02 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8505,6 +8505,7 @@ static const struct devlink_trap devlink_trap_generic[] = { static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), + DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), -- cgit v1.2.3-59-g8ed1b From 1e292f5c11c1e2ef38f416b62c5d616f5768057f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:37 +0300 Subject: mlxsw: spectrum_trap: Move layer 3 exceptions to exceptions trap group The layer 3 exceptions are still subject to the same trap policer, so nothing changes, but user space can choose to assign a different one. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 1 + .../net/ethernet/mellanox/mlxsw/spectrum_trap.c | 40 +++++++++++++--------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 38fa7304af0c..030d6f9766d2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -5552,6 +5552,7 @@ enum mlxsw_reg_htgt_trap_group { MLXSW_REG_HTGT_TRAP_GROUP_SP_DUMMY, MLXSW_REG_HTGT_TRAP_GROUP_SP_L2_DISCARDS, MLXSW_REG_HTGT_TRAP_GROUP_SP_L3_DISCARDS, + MLXSW_REG_HTGT_TRAP_GROUP_SP_L3_EXCEPTIONS, MLXSW_REG_HTGT_TRAP_GROUP_SP_TUNNEL_DISCARDS, MLXSW_REG_HTGT_TRAP_GROUP_SP_ACL_DISCARDS, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index f4b812276a5a..dc2217f1a07f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -212,6 +212,11 @@ static const struct mlxsw_sp_trap_group_item mlxsw_sp_trap_group_items_arr[] = { .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_L3_DISCARDS, .priority = 0, }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(L3_EXCEPTIONS, 1), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_L3_EXCEPTIONS, + .priority = 2, + }, { .group = DEVLINK_TRAP_GROUP_GENERIC(TUNNEL_DROPS, 1), .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_TUNNEL_DISCARDS, @@ -332,56 +337,59 @@ static const struct mlxsw_sp_trap_item mlxsw_sp_trap_items_arr[] = { }, }, { - .trap = MLXSW_SP_TRAP_EXCEPTION(MTU_ERROR, L3_DROPS), + .trap = MLXSW_SP_TRAP_EXCEPTION(MTU_ERROR, L3_EXCEPTIONS), .listeners_arr = { - MLXSW_SP_RXL_EXCEPTION(MTUERROR, L3_DISCARDS, + MLXSW_SP_RXL_EXCEPTION(MTUERROR, L3_EXCEPTIONS, TRAP_TO_CPU), }, }, { - .trap = MLXSW_SP_TRAP_EXCEPTION(TTL_ERROR, L3_DROPS), + .trap = MLXSW_SP_TRAP_EXCEPTION(TTL_ERROR, L3_EXCEPTIONS), .listeners_arr = { - MLXSW_SP_RXL_EXCEPTION(TTLERROR, L3_DISCARDS, + MLXSW_SP_RXL_EXCEPTION(TTLERROR, L3_EXCEPTIONS, TRAP_TO_CPU), }, }, { - .trap = MLXSW_SP_TRAP_EXCEPTION(RPF, L3_DROPS), + .trap = MLXSW_SP_TRAP_EXCEPTION(RPF, L3_EXCEPTIONS), .listeners_arr = { - MLXSW_SP_RXL_EXCEPTION(RPF, L3_DISCARDS, TRAP_TO_CPU), + MLXSW_SP_RXL_EXCEPTION(RPF, L3_EXCEPTIONS, TRAP_TO_CPU), }, }, { - .trap = MLXSW_SP_TRAP_EXCEPTION(REJECT_ROUTE, L3_DROPS), + .trap = MLXSW_SP_TRAP_EXCEPTION(REJECT_ROUTE, L3_EXCEPTIONS), .listeners_arr = { - MLXSW_SP_RXL_EXCEPTION(RTR_INGRESS1, L3_DISCARDS, + MLXSW_SP_RXL_EXCEPTION(RTR_INGRESS1, L3_EXCEPTIONS, TRAP_TO_CPU), }, }, { - .trap = MLXSW_SP_TRAP_EXCEPTION(UNRESOLVED_NEIGH, L3_DROPS), + .trap = MLXSW_SP_TRAP_EXCEPTION(UNRESOLVED_NEIGH, + L3_EXCEPTIONS), .listeners_arr = { - MLXSW_SP_RXL_EXCEPTION(HOST_MISS_IPV4, L3_DISCARDS, + MLXSW_SP_RXL_EXCEPTION(HOST_MISS_IPV4, L3_EXCEPTIONS, TRAP_TO_CPU), - MLXSW_SP_RXL_EXCEPTION(HOST_MISS_IPV6, L3_DISCARDS, + MLXSW_SP_RXL_EXCEPTION(HOST_MISS_IPV6, L3_EXCEPTIONS, TRAP_TO_CPU), - MLXSW_SP_RXL_EXCEPTION(DISCARD_ROUTER3, L3_DISCARDS, + MLXSW_SP_RXL_EXCEPTION(DISCARD_ROUTER3, L3_EXCEPTIONS, TRAP_EXCEPTION_TO_CPU), }, }, { .trap = MLXSW_SP_TRAP_EXCEPTION(IPV4_LPM_UNICAST_MISS, - L3_DROPS), + L3_EXCEPTIONS), .listeners_arr = { - MLXSW_SP_RXL_EXCEPTION(DISCARD_ROUTER_LPM4, L3_DISCARDS, + MLXSW_SP_RXL_EXCEPTION(DISCARD_ROUTER_LPM4, + L3_EXCEPTIONS, TRAP_EXCEPTION_TO_CPU), }, }, { .trap = MLXSW_SP_TRAP_EXCEPTION(IPV6_LPM_UNICAST_MISS, - L3_DROPS), + L3_EXCEPTIONS), .listeners_arr = { - MLXSW_SP_RXL_EXCEPTION(DISCARD_ROUTER_LPM6, L3_DISCARDS, + MLXSW_SP_RXL_EXCEPTION(DISCARD_ROUTER_LPM6, + L3_EXCEPTIONS, TRAP_EXCEPTION_TO_CPU), }, }, -- cgit v1.2.3-59-g8ed1b From 85176f19f5ff579c8c1676b4c170a6535b782584 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:38 +0300 Subject: netdevsim: Move layer 3 exceptions to exceptions trap group The layer 3 exceptions are still subject to the same trap policer, so nothing changes, but user space can choose to assign a different one. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/netdevsim/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index dc3ff0e20944..09d947eff980 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -458,6 +458,7 @@ static const struct devlink_trap_policer nsim_trap_policers_arr[] = { static const struct devlink_trap_group nsim_trap_groups_arr[] = { DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 0), DEVLINK_TRAP_GROUP_GENERIC(L3_DROPS, 1), + DEVLINK_TRAP_GROUP_GENERIC(L3_EXCEPTIONS, 1), DEVLINK_TRAP_GROUP_GENERIC(BUFFER_DROPS, 2), DEVLINK_TRAP_GROUP_GENERIC(ACL_DROPS, 3), }; @@ -471,7 +472,7 @@ static const struct devlink_trap nsim_traps_arr[] = { NSIM_TRAP_DROP(PORT_LOOPBACK_FILTER, L2_DROPS), NSIM_TRAP_DRIVER_EXCEPTION(FID_MISS, L2_DROPS), NSIM_TRAP_DROP(BLACKHOLE_ROUTE, L3_DROPS), - NSIM_TRAP_EXCEPTION(TTL_ERROR, L3_DROPS), + NSIM_TRAP_EXCEPTION(TTL_ERROR, L3_EXCEPTIONS), NSIM_TRAP_DROP(TAIL_DROP, BUFFER_DROPS), NSIM_TRAP_DROP_EXT(INGRESS_FLOW_ACTION_DROP, ACL_DROPS, DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE), -- cgit v1.2.3-59-g8ed1b From 9eefeabed6f831018c15bd7e17d34967ee34d9dd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:39 +0300 Subject: devlink: Add 'mirror' trap action The action is used by control traps such as IGMP query. The packet is flooded by the device, but also trapped to the CPU in order for the software bridge to mark the receiving port as a multicast router port. Such packets are marked with 'skb->offload_fwd_mark = 1' in order to prevent the software bridge from flooding them again. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 2 ++ include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 4ca241e70064..5b97327caefc 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -108,6 +108,8 @@ The ``devlink-trap`` mechanism supports the following packet trap actions: * ``trap``: The sole copy of the packet is sent to the CPU. * ``drop``: The packet is dropped by the underlying device and a copy is not sent to the CPU. + * ``mirror``: The packet is forwarded by the underlying device and a copy is + sent to the CPU. Generic Packet Traps ==================== diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1ae90e06c06d..16305932a950 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -233,10 +233,13 @@ enum { * @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not * sent to the CPU. * @DEVLINK_TRAP_ACTION_TRAP: The sole copy of the packet is sent to the CPU. + * @DEVLINK_TRAP_ACTION_MIRROR: Packet is forwarded by the device and a copy is + * sent to the CPU. */ enum devlink_trap_action { DEVLINK_TRAP_ACTION_DROP, DEVLINK_TRAP_ACTION_TRAP, + DEVLINK_TRAP_ACTION_MIRROR, }; /** diff --git a/net/core/devlink.c b/net/core/devlink.c index d9fff7083f02..d6298917b077 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5869,7 +5869,8 @@ devlink_trap_action_get_from_info(struct genl_info *info, val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: /* fall-through */ - case DEVLINK_TRAP_ACTION_TRAP: + case DEVLINK_TRAP_ACTION_TRAP: /* fall-through */ + case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: -- cgit v1.2.3-59-g8ed1b From 30a4e9a29ab9aadfe6c5386ae4aa396b1d2556c2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:40 +0300 Subject: devlink: Add 'control' trap type This type is used for traps that trap control packets such as ARP request and IGMP query to the CPU. Do not report such packets to the kernel's drop monitor as they were not dropped by the device no encountered an exception during forwarding. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 8 +++++++- include/uapi/linux/devlink.h | 6 ++++++ net/core/devlink.c | 7 +++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 5b97327caefc..6c293cfa23ee 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -55,7 +55,7 @@ The following diagram provides a general overview of ``devlink-trap``:: | | +-------^--------+ | - | + | Non-control traps | +----+----+ | | Kernel's Rx path @@ -97,6 +97,12 @@ The ``devlink-trap`` mechanism supports the following packet trap types: processed by ``devlink`` and injected to the kernel's Rx path. Changing the action of such traps is not allowed, as it can easily break the control plane. + * ``control``: Trapped packets were trapped by the device because these are + control packets required for the correct functioning of the control plane. + For example, ARP request and IGMP query packets. Packets are injected to + the kernel's Rx path, but not reported to the kernel's drop monitor. + Changing the action of such traps is not allowed, as it can easily break + the control plane. .. _Trap-Actions: diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 16305932a950..08563e6a424d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -253,10 +253,16 @@ enum devlink_trap_action { * control plane for resolution. Trapped packets * are processed by devlink and injected to * the kernel's Rx path. + * @DEVLINK_TRAP_TYPE_CONTROL: Packet was trapped because it is required for + * the correct functioning of the control plane. + * For example, an ARP request packet. Trapped + * packets are injected to the kernel's Rx path, + * but not reported to drop monitor. */ enum devlink_trap_type { DEVLINK_TRAP_TYPE_DROP, DEVLINK_TRAP_TYPE_EXCEPTION, + DEVLINK_TRAP_TYPE_CONTROL, }; enum { diff --git a/net/core/devlink.c b/net/core/devlink.c index d6298917b077..47c28e0f848f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8847,6 +8847,13 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + /* Control packets were not dropped by the device or encountered an + * exception during forwarding and therefore should not be reported to + * the kernel's drop monitor. + */ + if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL) + return; + devlink_trap_report_metadata_fill(&hw_metadata, trap_item, in_devlink_port, fa_cookie); net_dm_hw_report(skb, &hw_metadata); -- cgit v1.2.3-59-g8ed1b From 515eac677fe119433c2a466443bef95c10c550cc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:41 +0300 Subject: devlink: Add layer 2 control packet traps Add layer 2 control packet traps such as STP and IGMP query, so that capable device drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 45 +++++++++++++++++++++ include/net/devlink.h | 48 +++++++++++++++++++++++ net/core/devlink.c | 16 ++++++++ 3 files changed, 109 insertions(+) diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 6c293cfa23ee..e9fc3c9d7d7a 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -252,6 +252,42 @@ be added to the following table: * - ``egress_flow_action_drop`` - ``drop`` - Traps packets dropped during processing of egress flow action drop + * - ``stp`` + - ``control`` + - Traps STP packets + * - ``lacp`` + - ``control`` + - Traps LACP packets + * - ``lldp`` + - ``control`` + - Traps LLDP packets + * - ``igmp_query`` + - ``control`` + - Traps IGMP Membership Query packets + * - ``igmp_v1_report`` + - ``control`` + - Traps IGMP Version 1 Membership Report packets + * - ``igmp_v2_report`` + - ``control`` + - Traps IGMP Version 2 Membership Report packets + * - ``igmp_v3_report`` + - ``control`` + - Traps IGMP Version 3 Membership Report packets + * - ``igmp_v2_leave`` + - ``control`` + - Traps IGMP Version 2 Leave Group packets + * - ``mld_query`` + - ``control`` + - Traps MLD Multicast Listener Query packets + * - ``mld_v1_report`` + - ``control`` + - Traps MLD Version 1 Multicast Listener Report packets + * - ``mld_v2_report`` + - ``control`` + - Traps MLD Version 2 Multicast Listener Report packets + * - ``mld_v1_done`` + - ``control`` + - Traps MLD Version 1 Multicast Listener Done packets Driver-specific Packet Traps ============================ @@ -299,6 +335,15 @@ narrow. The description of these groups must be added to the following table: * - ``acl_drops`` - Contains packet traps for packets that were dropped by the device during ACL processing + * - ``stp`` + - Contains packet traps for STP packets + * - ``lacp`` + - Contains packet traps for LACP packets + * - ``lldp`` + - Contains packet traps for LLDP packets + * - ``mc_snooping`` + - Contains packet traps for IGMP and MLD packets required for multicast + snooping Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index 851388c9d795..c0061542ad65 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -645,6 +645,18 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_OVERLAY_SMAC_MC, DEVLINK_TRAP_GENERIC_ID_INGRESS_FLOW_ACTION_DROP, DEVLINK_TRAP_GENERIC_ID_EGRESS_FLOW_ACTION_DROP, + DEVLINK_TRAP_GENERIC_ID_STP, + DEVLINK_TRAP_GENERIC_ID_LACP, + DEVLINK_TRAP_GENERIC_ID_LLDP, + DEVLINK_TRAP_GENERIC_ID_IGMP_QUERY, + DEVLINK_TRAP_GENERIC_ID_IGMP_V1_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V2_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V3_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V2_LEAVE, + DEVLINK_TRAP_GENERIC_ID_MLD_QUERY, + DEVLINK_TRAP_GENERIC_ID_MLD_V1_REPORT, + DEVLINK_TRAP_GENERIC_ID_MLD_V2_REPORT, + DEVLINK_TRAP_GENERIC_ID_MLD_V1_DONE, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -661,6 +673,10 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_STP, + DEVLINK_TRAP_GROUP_GENERIC_ID_LACP, + DEVLINK_TRAP_GROUP_GENERIC_ID_LLDP, + DEVLINK_TRAP_GROUP_GENERIC_ID_MC_SNOOPING, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -726,6 +742,30 @@ enum devlink_trap_group_generic_id { "ingress_flow_action_drop" #define DEVLINK_TRAP_GENERIC_NAME_EGRESS_FLOW_ACTION_DROP \ "egress_flow_action_drop" +#define DEVLINK_TRAP_GENERIC_NAME_STP \ + "stp" +#define DEVLINK_TRAP_GENERIC_NAME_LACP \ + "lacp" +#define DEVLINK_TRAP_GENERIC_NAME_LLDP \ + "lldp" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_QUERY \ + "igmp_query" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V1_REPORT \ + "igmp_v1_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V2_REPORT \ + "igmp_v2_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V3_REPORT \ + "igmp_v3_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V2_LEAVE \ + "igmp_v2_leave" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_QUERY \ + "mld_query" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_REPORT \ + "mld_v1_report" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V2_REPORT \ + "mld_v2_report" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_DONE \ + "mld_v1_done" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -739,6 +779,14 @@ enum devlink_trap_group_generic_id { "tunnel_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_DROPS \ "acl_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_STP \ + "stp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LACP \ + "lacp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LLDP \ + "lldp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_MC_SNOOPING \ + "mc_snooping" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 47c28e0f848f..c91ef1b5f738 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8495,6 +8495,18 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), + DEVLINK_TRAP(STP, CONTROL), + DEVLINK_TRAP(LACP, CONTROL), + DEVLINK_TRAP(LLDP, CONTROL), + DEVLINK_TRAP(IGMP_QUERY, CONTROL), + DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), + DEVLINK_TRAP(MLD_QUERY, CONTROL), + DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V1_DONE, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8510,6 +8522,10 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), + DEVLINK_TRAP_GROUP(STP), + DEVLINK_TRAP_GROUP(LACP), + DEVLINK_TRAP_GROUP(LLDP), + DEVLINK_TRAP_GROUP(MC_SNOOPING), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3-59-g8ed1b From d77cfd162a346259222d0207a95bf1a0cc0c2520 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:42 +0300 Subject: devlink: Add layer 3 control packet traps Add layer 3 control packet traps such as ARP and DHCP, so that capable device drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 143 ++++++++++++++++++++++ include/net/devlink.h | 126 +++++++++++++++++++ net/core/devlink.c | 42 +++++++ 3 files changed, 311 insertions(+) diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index e9fc3c9d7d7a..621b634b16be 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -288,6 +288,115 @@ be added to the following table: * - ``mld_v1_done`` - ``control`` - Traps MLD Version 1 Multicast Listener Done packets + * - ``ipv4_dhcp`` + - ``control`` + - Traps IPv4 DHCP packets + * - ``ipv6_dhcp`` + - ``control`` + - Traps IPv6 DHCP packets + * - ``arp_request`` + - ``control`` + - Traps ARP request packets + * - ``arp_response`` + - ``control`` + - Traps ARP response packets + * - ``arp_overlay`` + - ``control`` + - Traps NVE-decapsulated ARP packets that reached the overlay network. + This is required, for example, when the address that needs to be + resolved is a local address + * - ``ipv6_neigh_solicit`` + - ``control`` + - Traps IPv6 Neighbour Solicitation packets + * - ``ipv6_neigh_advert`` + - ``control`` + - Traps IPv6 Neighbour Advertisement packets + * - ``ipv4_bfd`` + - ``control`` + - Traps IPv4 BFD packets + * - ``ipv6_bfd`` + - ``control`` + - Traps IPv6 BFD packets + * - ``ipv4_ospf`` + - ``control`` + - Traps IPv4 OSPF packets + * - ``ipv6_ospf`` + - ``control`` + - Traps IPv6 OSPF packets + * - ``ipv4_bgp`` + - ``control`` + - Traps IPv4 BGP packets + * - ``ipv6_bgp`` + - ``control`` + - Traps IPv6 BGP packets + * - ``ipv4_vrrp`` + - ``control`` + - Traps IPv4 VRRP packets + * - ``ipv6_vrrp`` + - ``control`` + - Traps IPv6 VRRP packets + * - ``ipv4_pim`` + - ``control`` + - Traps IPv4 PIM packets + * - ``ipv6_pim`` + - ``control`` + - Traps IPv6 PIM packets + * - ``uc_loopback`` + - ``control`` + - Traps unicast packets that need to be routed through the same layer 3 + interface from which they were received. Such packets are routed by the + kernel, but also cause it to potentially generate ICMP redirect packets + * - ``local_route`` + - ``control`` + - Traps unicast packets that hit a local route and need to be locally + delivered + * - ``external_route`` + - ``control`` + - Traps packets that should be routed through an external interface (e.g., + management interface) that does not belong to the same device (e.g., + switch ASIC) as the ingress interface + * - ``ipv6_uc_dip_link_local_scope`` + - ``control`` + - Traps unicast IPv6 packets that need to be routed and have a destination + IP address with a link-local scope (i.e., fe80::/10). The trap allows + device drivers to avoid programming link-local routes, but still receive + packets for local delivery + * - ``ipv6_dip_all_nodes`` + - ``control`` + - Traps IPv6 packets that their destination IP address is the "All Nodes + Address" (i.e., ff02::1) + * - ``ipv6_dip_all_routers`` + - ``control`` + - Traps IPv6 packets that their destination IP address is the "All Routers + Address" (i.e., ff02::2) + * - ``ipv6_router_solicit`` + - ``control`` + - Traps IPv6 Router Solicitation packets + * - ``ipv6_router_advert`` + - ``control`` + - Traps IPv6 Router Advertisement packets + * - ``ipv6_redirect`` + - ``control`` + - Traps IPv6 Redirect Message packets + * - ``ipv4_router_alert`` + - ``control`` + - Traps IPv4 packets that need to be routed and include the Router Alert + option. Such packets need to be locally delivered to raw sockets that + have the IP_ROUTER_ALERT socket option set + * - ``ipv6_router_alert`` + - ``control`` + - Traps IPv6 packets that need to be routed and include the Router Alert + option in their Hop-by-Hop extension header. Such packets need to be + locally delivered to raw sockets that have the IPV6_ROUTER_ALERT socket + option set + * - ``ptp_event`` + - ``control`` + - Traps PTP time-critical event messages (Sync, Delay_req, Pdelay_Req and + Pdelay_Resp) + * - ``ptp_general`` + - ``control`` + - Traps PTP general messages (Announce, Follow_Up, Delay_Resp, + Pdelay_Resp_Follow_Up, management and signaling) Driver-specific Packet Traps ============================ @@ -344,6 +453,40 @@ narrow. The description of these groups must be added to the following table: * - ``mc_snooping`` - Contains packet traps for IGMP and MLD packets required for multicast snooping + * - ``dhcp`` + - Contains packet traps for DHCP packets + * - ``neigh_discovery`` + - Contains packet traps for neighbour discovery packets (e.g., ARP, IPv6 + ND) + * - ``bfd`` + - Contains packet traps for BFD packets + * - ``ospf`` + - Contains packet traps for OSPF packets + * - ``bgp`` + - Contains packet traps for BGP packets + * - ``vrrp`` + - Contains packet traps for VRRP packets + * - ``pim`` + - Contains packet traps for PIM packets + * - ``uc_loopback`` + - Contains a packet trap for unicast loopback packets (i.e., + ``uc_loopback``). This trap is singled-out because in cases such as + one-armed router it will be constantly triggered. To limit the impact on + the CPU usage, a packet trap policer with a low rate can be bound to the + group without affecting other traps + * - ``local_delivery`` + - Contains packet traps for packets that should be locally delivered after + routing, but do not match more specific packet traps (e.g., + ``ipv4_bgp``) + * - ``ipv6`` + - Contains packet traps for various IPv6 control packets (e.g., Router + Advertisements) + * - ``ptp_event`` + - Contains packet traps for PTP time-critical event messages (Sync, + Delay_req, Pdelay_Req and Pdelay_Resp) + * - ``ptp_general`` + - Contains packet traps for PTP general messages (Announce, Follow_Up, + Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index c0061542ad65..05a45dea976b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -657,6 +657,36 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_MLD_V1_REPORT, DEVLINK_TRAP_GENERIC_ID_MLD_V2_REPORT, DEVLINK_TRAP_GENERIC_ID_MLD_V1_DONE, + DEVLINK_TRAP_GENERIC_ID_IPV4_DHCP, + DEVLINK_TRAP_GENERIC_ID_IPV6_DHCP, + DEVLINK_TRAP_GENERIC_ID_ARP_REQUEST, + DEVLINK_TRAP_GENERIC_ID_ARP_RESPONSE, + DEVLINK_TRAP_GENERIC_ID_ARP_OVERLAY, + DEVLINK_TRAP_GENERIC_ID_IPV6_NEIGH_SOLICIT, + DEVLINK_TRAP_GENERIC_ID_IPV6_NEIGH_ADVERT, + DEVLINK_TRAP_GENERIC_ID_IPV4_BFD, + DEVLINK_TRAP_GENERIC_ID_IPV6_BFD, + DEVLINK_TRAP_GENERIC_ID_IPV4_OSPF, + DEVLINK_TRAP_GENERIC_ID_IPV6_OSPF, + DEVLINK_TRAP_GENERIC_ID_IPV4_BGP, + DEVLINK_TRAP_GENERIC_ID_IPV6_BGP, + DEVLINK_TRAP_GENERIC_ID_IPV4_VRRP, + DEVLINK_TRAP_GENERIC_ID_IPV6_VRRP, + DEVLINK_TRAP_GENERIC_ID_IPV4_PIM, + DEVLINK_TRAP_GENERIC_ID_IPV6_PIM, + DEVLINK_TRAP_GENERIC_ID_UC_LB, + DEVLINK_TRAP_GENERIC_ID_LOCAL_ROUTE, + DEVLINK_TRAP_GENERIC_ID_EXTERNAL_ROUTE, + DEVLINK_TRAP_GENERIC_ID_IPV6_UC_DIP_LINK_LOCAL_SCOPE, + DEVLINK_TRAP_GENERIC_ID_IPV6_DIP_ALL_NODES, + DEVLINK_TRAP_GENERIC_ID_IPV6_DIP_ALL_ROUTERS, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_SOLICIT, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ADVERT, + DEVLINK_TRAP_GENERIC_ID_IPV6_REDIRECT, + DEVLINK_TRAP_GENERIC_ID_IPV4_ROUTER_ALERT, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ALERT, + DEVLINK_TRAP_GENERIC_ID_PTP_EVENT, + DEVLINK_TRAP_GENERIC_ID_PTP_GENERAL, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -677,6 +707,18 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_LACP, DEVLINK_TRAP_GROUP_GENERIC_ID_LLDP, DEVLINK_TRAP_GROUP_GENERIC_ID_MC_SNOOPING, + DEVLINK_TRAP_GROUP_GENERIC_ID_DHCP, + DEVLINK_TRAP_GROUP_GENERIC_ID_NEIGH_DISCOVERY, + DEVLINK_TRAP_GROUP_GENERIC_ID_BFD, + DEVLINK_TRAP_GROUP_GENERIC_ID_OSPF, + DEVLINK_TRAP_GROUP_GENERIC_ID_BGP, + DEVLINK_TRAP_GROUP_GENERIC_ID_VRRP, + DEVLINK_TRAP_GROUP_GENERIC_ID_PIM, + DEVLINK_TRAP_GROUP_GENERIC_ID_UC_LB, + DEVLINK_TRAP_GROUP_GENERIC_ID_LOCAL_DELIVERY, + DEVLINK_TRAP_GROUP_GENERIC_ID_IPV6, + DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_EVENT, + DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_GENERAL, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -766,6 +808,66 @@ enum devlink_trap_group_generic_id { "mld_v2_report" #define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_DONE \ "mld_v1_done" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_DHCP \ + "ipv4_dhcp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DHCP \ + "ipv6_dhcp" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_REQUEST \ + "arp_request" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_RESPONSE \ + "arp_response" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_OVERLAY \ + "arp_overlay" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_NEIGH_SOLICIT \ + "ipv6_neigh_solicit" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_NEIGH_ADVERT \ + "ipv6_neigh_advert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_BFD \ + "ipv4_bfd" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_BFD \ + "ipv6_bfd" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_OSPF \ + "ipv4_ospf" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_OSPF \ + "ipv6_ospf" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_BGP \ + "ipv4_bgp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_BGP \ + "ipv6_bgp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_VRRP \ + "ipv4_vrrp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_VRRP \ + "ipv6_vrrp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_PIM \ + "ipv4_pim" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_PIM \ + "ipv6_pim" +#define DEVLINK_TRAP_GENERIC_NAME_UC_LB \ + "uc_loopback" +#define DEVLINK_TRAP_GENERIC_NAME_LOCAL_ROUTE \ + "local_route" +#define DEVLINK_TRAP_GENERIC_NAME_EXTERNAL_ROUTE \ + "external_route" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_UC_DIP_LINK_LOCAL_SCOPE \ + "ipv6_uc_dip_link_local_scope" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DIP_ALL_NODES \ + "ipv6_dip_all_nodes" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DIP_ALL_ROUTERS \ + "ipv6_dip_all_routers" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_SOLICIT \ + "ipv6_router_solicit" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_ADVERT \ + "ipv6_router_advert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_REDIRECT \ + "ipv6_redirect" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_ROUTER_ALERT \ + "ipv4_router_alert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_ALERT \ + "ipv6_router_alert" +#define DEVLINK_TRAP_GENERIC_NAME_PTP_EVENT \ + "ptp_event" +#define DEVLINK_TRAP_GENERIC_NAME_PTP_GENERAL \ + "ptp_general" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -787,6 +889,30 @@ enum devlink_trap_group_generic_id { "lldp" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_MC_SNOOPING \ "mc_snooping" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_DHCP \ + "dhcp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_NEIGH_DISCOVERY \ + "neigh_discovery" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_BFD \ + "bfd" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_OSPF \ + "ospf" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_BGP \ + "bgp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_VRRP \ + "vrrp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PIM \ + "pim" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_UC_LB \ + "uc_loopback" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LOCAL_DELIVERY \ + "local_delivery" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_IPV6 \ + "ipv6" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_EVENT \ + "ptp_event" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_GENERAL \ + "ptp_general" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index c91ef1b5f738..f32854c3d0e7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8507,6 +8507,36 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), DEVLINK_TRAP(MLD_V1_DONE, CONTROL), + DEVLINK_TRAP(IPV4_DHCP, CONTROL), + DEVLINK_TRAP(IPV6_DHCP, CONTROL), + DEVLINK_TRAP(ARP_REQUEST, CONTROL), + DEVLINK_TRAP(ARP_RESPONSE, CONTROL), + DEVLINK_TRAP(ARP_OVERLAY, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), + DEVLINK_TRAP(IPV4_BFD, CONTROL), + DEVLINK_TRAP(IPV6_BFD, CONTROL), + DEVLINK_TRAP(IPV4_OSPF, CONTROL), + DEVLINK_TRAP(IPV6_OSPF, CONTROL), + DEVLINK_TRAP(IPV4_BGP, CONTROL), + DEVLINK_TRAP(IPV6_BGP, CONTROL), + DEVLINK_TRAP(IPV4_VRRP, CONTROL), + DEVLINK_TRAP(IPV6_VRRP, CONTROL), + DEVLINK_TRAP(IPV4_PIM, CONTROL), + DEVLINK_TRAP(IPV6_PIM, CONTROL), + DEVLINK_TRAP(UC_LB, CONTROL), + DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), + DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), + DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), + DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), + DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(PTP_EVENT, CONTROL), + DEVLINK_TRAP(PTP_GENERAL, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8526,6 +8556,18 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(LACP), DEVLINK_TRAP_GROUP(LLDP), DEVLINK_TRAP_GROUP(MC_SNOOPING), + DEVLINK_TRAP_GROUP(DHCP), + DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), + DEVLINK_TRAP_GROUP(BFD), + DEVLINK_TRAP_GROUP(OSPF), + DEVLINK_TRAP_GROUP(BGP), + DEVLINK_TRAP_GROUP(VRRP), + DEVLINK_TRAP_GROUP(PIM), + DEVLINK_TRAP_GROUP(UC_LB), + DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), + DEVLINK_TRAP_GROUP(IPV6), + DEVLINK_TRAP_GROUP(PTP_EVENT), + DEVLINK_TRAP_GROUP(PTP_GENERAL), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3-59-g8ed1b From 5eb18a2b6c11bf165271644ef1ab812b10659c8f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:43 +0300 Subject: devlink: Add ACL control packet traps Add packet traps for packets that are sampled / trapped by ACLs, so that capable drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 14 ++++++++++++++ include/net/devlink.h | 12 ++++++++++++ net/core/devlink.c | 4 ++++ 3 files changed, 30 insertions(+) diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 621b634b16be..1e3f3ffee248 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -397,6 +397,14 @@ be added to the following table: - ``control`` - Traps PTP general messages (Announce, Follow_Up, Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) + * - ``flow_action_sample`` + - ``control`` + - Traps packets sampled during processing of flow action sample (e.g., via + tc's sample action) + * - ``flow_action_trap`` + - ``control`` + - Traps packets logged during processing of flow action trap (e.g., via + tc's trap action) Driver-specific Packet Traps ============================ @@ -487,6 +495,12 @@ narrow. The description of these groups must be added to the following table: * - ``ptp_general`` - Contains packet traps for PTP general messages (Announce, Follow_Up, Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) + * - ``acl_sample`` + - Contains packet traps for packets that were sampled by the device during + ACL processing + * - ``acl_trap`` + - Contains packet traps for packets that were trapped (logged) by the + device during ACL processing Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index 05a45dea976b..1df6dfec26c2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -687,6 +687,8 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ALERT, DEVLINK_TRAP_GENERIC_ID_PTP_EVENT, DEVLINK_TRAP_GENERIC_ID_PTP_GENERAL, + DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_SAMPLE, + DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_TRAP, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -719,6 +721,8 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_IPV6, DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_EVENT, DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_GENERAL, + DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_SAMPLE, + DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_TRAP, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -868,6 +872,10 @@ enum devlink_trap_group_generic_id { "ptp_event" #define DEVLINK_TRAP_GENERIC_NAME_PTP_GENERAL \ "ptp_general" +#define DEVLINK_TRAP_GENERIC_NAME_FLOW_ACTION_SAMPLE \ + "flow_action_sample" +#define DEVLINK_TRAP_GENERIC_NAME_FLOW_ACTION_TRAP \ + "flow_action_trap" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -913,6 +921,10 @@ enum devlink_trap_group_generic_id { "ptp_event" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_GENERAL \ "ptp_general" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_SAMPLE \ + "acl_sample" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_TRAP \ + "acl_trap" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index f32854c3d0e7..2cafbc808b09 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8537,6 +8537,8 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(PTP_EVENT, CONTROL), DEVLINK_TRAP(PTP_GENERAL, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8568,6 +8570,8 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(IPV6), DEVLINK_TRAP_GROUP(PTP_EVENT), DEVLINK_TRAP_GROUP(PTP_GENERAL), + DEVLINK_TRAP_GROUP(ACL_SAMPLE), + DEVLINK_TRAP_GROUP(ACL_TRAP), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3-59-g8ed1b From 1897936744f0ab366102170d7c76bfc8f7aeb2ba Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:44 +0300 Subject: netdevsim: Register control traps Register two control traps with devlink. The existing selftest at tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh iterates over all registered traps and checks that the action of non-drop traps cannot be changed. Up until now only exception traps were tested, now control traps will be tested as well. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/netdevsim/dev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 09d947eff980..ec6b6f7818ac 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -431,6 +431,10 @@ enum { DEVLINK_TRAP_GENERIC(EXCEPTION, TRAP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA) +#define NSIM_TRAP_CONTROL(_id, _group_id, _action) \ + DEVLINK_TRAP_GENERIC(CONTROL, _action, _id, \ + DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ + NSIM_TRAP_METADATA) #define NSIM_TRAP_DRIVER_EXCEPTION(_id, _group_id) \ DEVLINK_TRAP_DRIVER(EXCEPTION, TRAP, NSIM_TRAP_ID_##_id, \ NSIM_TRAP_NAME_##_id, \ @@ -461,6 +465,7 @@ static const struct devlink_trap_group nsim_trap_groups_arr[] = { DEVLINK_TRAP_GROUP_GENERIC(L3_EXCEPTIONS, 1), DEVLINK_TRAP_GROUP_GENERIC(BUFFER_DROPS, 2), DEVLINK_TRAP_GROUP_GENERIC(ACL_DROPS, 3), + DEVLINK_TRAP_GROUP_GENERIC(MC_SNOOPING, 3), }; static const struct devlink_trap nsim_traps_arr[] = { @@ -478,6 +483,8 @@ static const struct devlink_trap nsim_traps_arr[] = { DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE), NSIM_TRAP_DROP_EXT(EGRESS_FLOW_ACTION_DROP, ACL_DROPS, DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE), + NSIM_TRAP_CONTROL(IGMP_QUERY, MC_SNOOPING, MIRROR), + NSIM_TRAP_CONTROL(IGMP_V1_REPORT, MC_SNOOPING, TRAP), }; #define NSIM_TRAP_L4_DATA_LEN 100 -- cgit v1.2.3-59-g8ed1b From 45b1c87313cd2ab2843edd4e6467e3d6458e0c68 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:45 +0300 Subject: mlxsw: spectrum_trap: Factor out common Rx listener function We currently have an Rx listener function for exception traps that marks received skbs with 'offload_fwd_mark' and injects them to the kernel's Rx path. The marking is done because all these exceptions occur during L3 forwarding, after the packets were potentially flooded at L2. A subsequent patch will add support for control traps. Packets received via some of these control traps need different handling: 1. Packets might not need to be marked with 'offload_fwd_mark'. For example, if packet was trapped before L2 forwarding 2. Packets might not need to be injected to the kernel's Rx path. For example, sampled packets are reported to user space via the psample module Factor out a common Rx listener function that only reports trapped packets to devlink. Call it from mlxsw_sp_rx_no_mark_listener() and mlxsw_sp_rx_mark_listener() that will inject the packets to the kernel's Rx path, without and with the marking, respectively. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_trap.c | 29 ++++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index dc2217f1a07f..206751963a4f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -125,8 +125,8 @@ static void mlxsw_sp_rx_acl_drop_listener(struct sk_buff *skb, u8 local_port, consume_skb(skb); } -static void mlxsw_sp_rx_exception_listener(struct sk_buff *skb, u8 local_port, - void *trap_ctx) +static int __mlxsw_sp_rx_no_mark_listener(struct sk_buff *skb, u8 local_port, + void *trap_ctx) { struct devlink_port *in_devlink_port; struct mlxsw_sp_port *mlxsw_sp_port; @@ -139,7 +139,7 @@ static void mlxsw_sp_rx_exception_listener(struct sk_buff *skb, u8 local_port, err = mlxsw_sp_rx_listener(mlxsw_sp, skb, local_port, mlxsw_sp_port); if (err) - return; + return err; devlink = priv_to_devlink(mlxsw_sp->core); in_devlink_port = mlxsw_core_port_devlink_port_get(mlxsw_sp->core, @@ -147,10 +147,29 @@ static void mlxsw_sp_rx_exception_listener(struct sk_buff *skb, u8 local_port, skb_push(skb, ETH_HLEN); devlink_trap_report(devlink, skb, trap_ctx, in_devlink_port, NULL); skb_pull(skb, ETH_HLEN); - skb->offload_fwd_mark = 1; + + return 0; +} + +static void mlxsw_sp_rx_no_mark_listener(struct sk_buff *skb, u8 local_port, + void *trap_ctx) +{ + int err; + + err = __mlxsw_sp_rx_no_mark_listener(skb, local_port, trap_ctx); + if (err) + return; + netif_receive_skb(skb); } +static void mlxsw_sp_rx_mark_listener(struct sk_buff *skb, u8 local_port, + void *trap_ctx) +{ + skb->offload_fwd_mark = 1; + mlxsw_sp_rx_no_mark_listener(skb, local_port, trap_ctx); +} + #define MLXSW_SP_TRAP_DROP(_id, _group_id) \ DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ @@ -183,7 +202,7 @@ static void mlxsw_sp_rx_exception_listener(struct sk_buff *skb, u8 local_port, SET_FW_DEFAULT, SP_##_dis_group_id) #define MLXSW_SP_RXL_EXCEPTION(_id, _group_id, _action) \ - MLXSW_RXL(mlxsw_sp_rx_exception_listener, _id, \ + MLXSW_RXL(mlxsw_sp_rx_mark_listener, _id, \ _action, false, SP_##_group_id, SET_FW_DEFAULT) #define MLXSW_SP_TRAP_POLICER(_id, _rate, _burst) \ -- cgit v1.2.3-59-g8ed1b From 39c10350cfc8ce23faae651877171e354b9006d4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:46 +0300 Subject: mlxsw: spectrum_trap: Register layer 2 control traps In a similar fashion to other traps, register layer 2 control traps with devlink. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 37 +---- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 2 + .../net/ethernet/mellanox/mlxsw/spectrum_trap.c | 151 +++++++++++++++++++++ 3 files changed, 159 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index c598ae9ed106..74925826a2cb 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4022,6 +4022,12 @@ static void mlxsw_sp_rx_listener_ptp(struct sk_buff *skb, u8 local_port, mlxsw_sp->ptp_ops->receive(mlxsw_sp, skb, local_port); } +void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, + u8 local_port) +{ + mlxsw_sp->ptp_ops->receive(mlxsw_sp, skb, local_port); +} + #define MLXSW_SP_RXL_NO_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ MLXSW_RXL(mlxsw_sp_rx_listener_no_mark_func, _trap_id, _action, \ _is_ctrl, SP_##_trap_group, DISCARD) @@ -4041,26 +4047,9 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { /* Events */ MLXSW_SP_EVENTL(mlxsw_sp_pude_event_func, PUDE), /* L2 traps */ - MLXSW_SP_RXL_NO_MARK(STP, TRAP_TO_CPU, STP, true), - MLXSW_SP_RXL_NO_MARK(LACP, TRAP_TO_CPU, LACP, true), - MLXSW_RXL(mlxsw_sp_rx_listener_ptp, LLDP, TRAP_TO_CPU, - false, SP_LLDP, DISCARD), - MLXSW_SP_RXL_MARK(IGMP_QUERY, MIRROR_TO_CPU, MC_SNOOPING, false), - MLXSW_SP_RXL_NO_MARK(IGMP_V1_REPORT, TRAP_TO_CPU, MC_SNOOPING, false), - MLXSW_SP_RXL_NO_MARK(IGMP_V2_REPORT, TRAP_TO_CPU, MC_SNOOPING, false), - MLXSW_SP_RXL_NO_MARK(IGMP_V2_LEAVE, TRAP_TO_CPU, MC_SNOOPING, false), - MLXSW_SP_RXL_NO_MARK(IGMP_V3_REPORT, TRAP_TO_CPU, MC_SNOOPING, false), MLXSW_SP_RXL_MARK(ARPBC, MIRROR_TO_CPU, NEIGH_DISCOVERY, false), MLXSW_SP_RXL_MARK(ARPUC, MIRROR_TO_CPU, NEIGH_DISCOVERY, false), MLXSW_SP_RXL_NO_MARK(FID_MISS, TRAP_TO_CPU, FID_MISS, false), - MLXSW_SP_RXL_MARK(IPV6_MLDV12_LISTENER_QUERY, MIRROR_TO_CPU, - MC_SNOOPING, false), - MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_REPORT, TRAP_TO_CPU, - MC_SNOOPING, false), - MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_DONE, TRAP_TO_CPU, MC_SNOOPING, - false), - MLXSW_SP_RXL_NO_MARK(IPV6_MLDV2_LISTENER_REPORT, TRAP_TO_CPU, - MC_SNOOPING, false), /* L3 traps */ MLXSW_SP_RXL_L3_MARK(LBERROR, MIRROR_TO_CPU, LBERROR, false), MLXSW_SP_RXL_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false), @@ -4149,9 +4138,6 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) for (i = 0; i < max_cpu_policers; i++) { is_bytes = false; switch (i) { - case MLXSW_REG_HTGT_TRAP_GROUP_SP_STP: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: case MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR: @@ -4159,10 +4145,6 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) rate = 128; burst_size = 7; break; - case MLXSW_REG_HTGT_TRAP_GROUP_SP_MC_SNOOPING: - rate = 16 * 1024; - burst_size = 10; - break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_NEIGH_DISCOVERY: case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: @@ -4225,9 +4207,6 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) for (i = 0; i < max_trap_groups; i++) { policer_id = i; switch (i) { - case MLXSW_REG_HTGT_TRAP_GROUP_SP_STP: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: case MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP0: @@ -4241,10 +4220,6 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) priority = 4; tc = 4; break; - case MLXSW_REG_HTGT_TRAP_GROUP_SP_MC_SNOOPING: - priority = 3; - tc = 3; - break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_NEIGH_DISCOVERY: case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME: case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6: diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 147a5634244b..9d4dfb22cb7a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -451,6 +451,8 @@ extern struct notifier_block mlxsw_sp_switchdev_notifier; /* spectrum.c */ void mlxsw_sp_rx_listener_no_mark_func(struct sk_buff *skb, u8 local_port, void *priv); +void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, + u8 local_port); int mlxsw_sp_port_speed_get(struct mlxsw_sp_port *mlxsw_sp_port, u32 *speed); int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index 206751963a4f..32b77d5a917d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -170,6 +170,23 @@ static void mlxsw_sp_rx_mark_listener(struct sk_buff *skb, u8 local_port, mlxsw_sp_rx_no_mark_listener(skb, local_port, trap_ctx); } +static void mlxsw_sp_rx_ptp_listener(struct sk_buff *skb, u8 local_port, + void *trap_ctx) +{ + struct mlxsw_sp *mlxsw_sp = devlink_trap_ctx_priv(trap_ctx); + int err; + + err = __mlxsw_sp_rx_no_mark_listener(skb, local_port, trap_ctx); + if (err) + return; + + /* The PTP handler expects skb->data to point to the start of the + * Ethernet header. + */ + skb_push(skb, ETH_HLEN); + mlxsw_sp_ptp_receive(mlxsw_sp, skb, local_port); +} + #define MLXSW_SP_TRAP_DROP(_id, _group_id) \ DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ @@ -191,6 +208,11 @@ static void mlxsw_sp_rx_mark_listener(struct sk_buff *skb, u8 local_port, DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ MLXSW_SP_TRAP_METADATA) +#define MLXSW_SP_TRAP_CONTROL(_id, _group_id, _action) \ + DEVLINK_TRAP_GENERIC(CONTROL, _action, _id, \ + DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ + MLXSW_SP_TRAP_METADATA) + #define MLXSW_SP_RXL_DISCARD(_id, _group_id) \ MLXSW_RXL_DIS(mlxsw_sp_rx_drop_listener, DISCARD_##_id, \ TRAP_EXCEPTION_TO_CPU, false, SP_##_group_id, \ @@ -205,6 +227,14 @@ static void mlxsw_sp_rx_mark_listener(struct sk_buff *skb, u8 local_port, MLXSW_RXL(mlxsw_sp_rx_mark_listener, _id, \ _action, false, SP_##_group_id, SET_FW_DEFAULT) +#define MLXSW_SP_RXL_NO_MARK(_id, _group_id, _action, _is_ctrl) \ + MLXSW_RXL(mlxsw_sp_rx_no_mark_listener, _id, _action, \ + _is_ctrl, SP_##_group_id, DISCARD) + +#define MLXSW_SP_RXL_MARK(_id, _group_id, _action, _is_ctrl) \ + MLXSW_RXL(mlxsw_sp_rx_mark_listener, _id, _action, _is_ctrl, \ + SP_##_group_id, DISCARD) + #define MLXSW_SP_TRAP_POLICER(_id, _rate, _burst) \ DEVLINK_TRAP_POLICER(_id, _rate, _burst, \ MLXSW_REG_QPCR_HIGHEST_CIR, \ @@ -218,6 +248,18 @@ mlxsw_sp_trap_policer_items_arr[] = { { .policer = MLXSW_SP_TRAP_POLICER(1, 10 * 1024, 128), }, + { + .policer = MLXSW_SP_TRAP_POLICER(2, 128, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(3, 128, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(4, 128, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(5, 16 * 1024, 128), + }, }; static const struct mlxsw_sp_trap_group_item mlxsw_sp_trap_group_items_arr[] = { @@ -246,6 +288,26 @@ static const struct mlxsw_sp_trap_group_item mlxsw_sp_trap_group_items_arr[] = { .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_ACL_DISCARDS, .priority = 0, }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(STP, 2), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_STP, + .priority = 5, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(LACP, 3), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP, + .priority = 5, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(LLDP, 4), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP, + .priority = 5, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(MC_SNOOPING, 5), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_MC_SNOOPING, + .priority = 3, + }, }; static const struct mlxsw_sp_trap_item mlxsw_sp_trap_items_arr[] = { @@ -466,6 +528,95 @@ static const struct mlxsw_sp_trap_item mlxsw_sp_trap_items_arr[] = { DUMMY), }, }, + { + .trap = MLXSW_SP_TRAP_CONTROL(STP, STP, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(STP, STP, TRAP_TO_CPU, true), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(LACP, LACP, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(LACP, LACP, TRAP_TO_CPU, true), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(LLDP, LLDP, TRAP), + .listeners_arr = { + MLXSW_RXL(mlxsw_sp_rx_ptp_listener, LLDP, TRAP_TO_CPU, + false, SP_LLDP, DISCARD), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IGMP_QUERY, MC_SNOOPING, MIRROR), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IGMP_QUERY, MC_SNOOPING, + MIRROR_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IGMP_V1_REPORT, MC_SNOOPING, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(IGMP_V1_REPORT, MC_SNOOPING, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IGMP_V2_REPORT, MC_SNOOPING, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(IGMP_V2_REPORT, MC_SNOOPING, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IGMP_V3_REPORT, MC_SNOOPING, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(IGMP_V3_REPORT, MC_SNOOPING, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IGMP_V2_LEAVE, MC_SNOOPING, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(IGMP_V2_LEAVE, MC_SNOOPING, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(MLD_QUERY, MC_SNOOPING, MIRROR), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_MLDV12_LISTENER_QUERY, + MC_SNOOPING, MIRROR_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(MLD_V1_REPORT, MC_SNOOPING, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_REPORT, + MC_SNOOPING, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(MLD_V2_REPORT, MC_SNOOPING, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(IPV6_MLDV2_LISTENER_REPORT, + MC_SNOOPING, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(MLD_V1_DONE, MC_SNOOPING, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_DONE, + MC_SNOOPING, TRAP_TO_CPU, false), + }, + }, }; static struct mlxsw_sp_trap_policer_item * -- cgit v1.2.3-59-g8ed1b From 8110668ecd9a9e704b9b412302e76a9c6e1f4ce2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:47 +0300 Subject: mlxsw: spectrum_trap: Register layer 3 control traps In a similar fashion to layer 2 control traps, register layer 3 control traps with devlink. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 1 - drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 93 ------ .../net/ethernet/mellanox/mlxsw/spectrum_trap.c | 318 +++++++++++++++++++++ 3 files changed, 318 insertions(+), 94 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 030d6f9766d2..fcb88d4271bf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -5536,7 +5536,6 @@ enum mlxsw_reg_htgt_trap_group { MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST, MLXSW_REG_HTGT_TRAP_GROUP_SP_NEIGH_DISCOVERY, MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP, - MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE, MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME, MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP, MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 74925826a2cb..8daeae1384da 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4014,14 +4014,6 @@ out: consume_skb(skb); } -static void mlxsw_sp_rx_listener_ptp(struct sk_buff *skb, u8 local_port, - void *priv) -{ - struct mlxsw_sp *mlxsw_sp = priv; - - mlxsw_sp->ptp_ops->receive(mlxsw_sp, skb, local_port); -} - void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, u8 local_port) { @@ -4047,43 +4039,13 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { /* Events */ MLXSW_SP_EVENTL(mlxsw_sp_pude_event_func, PUDE), /* L2 traps */ - MLXSW_SP_RXL_MARK(ARPBC, MIRROR_TO_CPU, NEIGH_DISCOVERY, false), - MLXSW_SP_RXL_MARK(ARPUC, MIRROR_TO_CPU, NEIGH_DISCOVERY, false), MLXSW_SP_RXL_NO_MARK(FID_MISS, TRAP_TO_CPU, FID_MISS, false), /* L3 traps */ - MLXSW_SP_RXL_L3_MARK(LBERROR, MIRROR_TO_CPU, LBERROR, false), - MLXSW_SP_RXL_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false), MLXSW_SP_RXL_MARK(IPV6_UNSPECIFIED_ADDRESS, TRAP_TO_CPU, ROUTER_EXP, false), - MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_DEST, TRAP_TO_CPU, IP2ME, false), MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_SRC, TRAP_TO_CPU, ROUTER_EXP, false), - MLXSW_SP_RXL_MARK(IPV6_ALL_NODES_LINK, TRAP_TO_CPU, IPV6, false), - MLXSW_SP_RXL_MARK(IPV6_ALL_ROUTERS_LINK, TRAP_TO_CPU, IPV6, - false), - MLXSW_SP_RXL_MARK(IPV4_OSPF, TRAP_TO_CPU, OSPF, false), - MLXSW_SP_RXL_MARK(IPV6_OSPF, TRAP_TO_CPU, OSPF, false), - MLXSW_SP_RXL_MARK(IPV4_DHCP, TRAP_TO_CPU, DHCP, false), - MLXSW_SP_RXL_MARK(IPV6_DHCP, TRAP_TO_CPU, DHCP, false), - MLXSW_SP_RXL_MARK(RTR_INGRESS0, TRAP_TO_CPU, REMOTE_ROUTE, false), - MLXSW_SP_RXL_MARK(IPV4_BGP, TRAP_TO_CPU, BGP, false), - MLXSW_SP_RXL_MARK(IPV6_BGP, TRAP_TO_CPU, BGP, false), - MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_SOLICITATION, TRAP_TO_CPU, IPV6, - false), - MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_ADVERTISEMENT, TRAP_TO_CPU, IPV6, - false), - MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_SOLICITATION, TRAP_TO_CPU, - NEIGH_DISCOVERY, false), - MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_ADVERTISEMENT, TRAP_TO_CPU, - NEIGH_DISCOVERY, false), - MLXSW_SP_RXL_MARK(L3_IPV6_REDIRECTION, TRAP_TO_CPU, IPV6, false), MLXSW_SP_RXL_MARK(IPV6_MC_LINK_LOCAL_DEST, TRAP_TO_CPU, ROUTER_EXP, false), - MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV4, TRAP_TO_CPU, IP2ME, false), - MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, TRAP_TO_CPU, IP2ME, false), - MLXSW_SP_RXL_MARK(IPV4_VRRP, TRAP_TO_CPU, VRRP, false), - MLXSW_SP_RXL_MARK(IPV6_VRRP, TRAP_TO_CPU, VRRP, false), - MLXSW_SP_RXL_MARK(IPV4_BFD, TRAP_TO_CPU, BFD, false), - MLXSW_SP_RXL_MARK(IPV6_BFD, TRAP_TO_CPU, BFD, false), MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_SIP_CLASS_E, FORWARD, ROUTER_EXP, false), MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_MC_DMAC, FORWARD, @@ -4098,18 +4060,10 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { /* ACL trap */ MLXSW_SP_RXL_NO_MARK(ACL0, TRAP_TO_CPU, FLOW_LOGGING, false), /* Multicast Router Traps */ - MLXSW_SP_RXL_MARK(IPV4_PIM, TRAP_TO_CPU, PIM, false), - MLXSW_SP_RXL_MARK(IPV6_PIM, TRAP_TO_CPU, PIM, false), MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), /* NVE traps */ MLXSW_SP_RXL_MARK(NVE_ENCAP_ARP, TRAP_TO_CPU, NEIGH_DISCOVERY, false), - MLXSW_SP_RXL_NO_MARK(NVE_DECAP_ARP, TRAP_TO_CPU, NEIGH_DISCOVERY, - false), - /* PTP traps */ - MLXSW_RXL(mlxsw_sp_rx_listener_ptp, PTP0, TRAP_TO_CPU, - false, SP_PTP0, DISCARD), - MLXSW_SP_RXL_NO_MARK(PTP1, TRAP_TO_CPU, PTP1, false), }; static const struct mlxsw_listener mlxsw_sp1_listener[] = { @@ -4138,41 +4092,13 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) for (i = 0; i < max_cpu_policers; i++) { is_bytes = false; switch (i) { - case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP: - rate = 128; - burst_size = 7; - break; - case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_NEIGH_DISCOVERY: case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6: case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST: case MLXSW_REG_HTGT_TRAP_GROUP_SP_FLOW_LOGGING: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME: case MLXSW_REG_HTGT_TRAP_GROUP_SP_FID_MISS: rate = 1024; burst_size = 7; break; - case MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP0: - rate = 24 * 1024; - burst_size = 12; - break; - case MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP1: - rate = 19 * 1024; - burst_size = 12; - break; - case MLXSW_REG_HTGT_TRAP_GROUP_SP_VRRP: - rate = 360; - burst_size = 7; - break; - case MLXSW_REG_HTGT_TRAP_GROUP_SP_BFD: - rate = 20 * 1024; - burst_size = 10; - break; default: continue; } @@ -4207,36 +4133,17 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) for (i = 0; i < max_trap_groups; i++) { policer_id = i; switch (i) { - case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP0: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_VRRP: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_BFD: - priority = 5; - tc = 5; - break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_FLOW_LOGGING: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP: priority = 4; tc = 4; break; - case MLXSW_REG_HTGT_TRAP_GROUP_SP_NEIGH_DISCOVERY: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP1: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP: - priority = 2; - tc = 2; - break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE: case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST: case MLXSW_REG_HTGT_TRAP_GROUP_SP_FID_MISS: priority = 1; tc = 1; break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_PKT_SAMPLE: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR: priority = 0; tc = 0; break; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index 32b77d5a917d..148a35b7f4f8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -170,6 +170,14 @@ static void mlxsw_sp_rx_mark_listener(struct sk_buff *skb, u8 local_port, mlxsw_sp_rx_no_mark_listener(skb, local_port, trap_ctx); } +static void mlxsw_sp_rx_l3_mark_listener(struct sk_buff *skb, u8 local_port, + void *trap_ctx) +{ + skb->offload_l3_fwd_mark = 1; + skb->offload_fwd_mark = 1; + mlxsw_sp_rx_no_mark_listener(skb, local_port, trap_ctx); +} + static void mlxsw_sp_rx_ptp_listener(struct sk_buff *skb, u8 local_port, void *trap_ctx) { @@ -235,6 +243,10 @@ static void mlxsw_sp_rx_ptp_listener(struct sk_buff *skb, u8 local_port, MLXSW_RXL(mlxsw_sp_rx_mark_listener, _id, _action, _is_ctrl, \ SP_##_group_id, DISCARD) +#define MLXSW_SP_RXL_L3_MARK(_id, _group_id, _action, _is_ctrl) \ + MLXSW_RXL(mlxsw_sp_rx_l3_mark_listener, _id, _action, _is_ctrl, \ + SP_##_group_id, DISCARD) + #define MLXSW_SP_TRAP_POLICER(_id, _rate, _burst) \ DEVLINK_TRAP_POLICER(_id, _rate, _burst, \ MLXSW_REG_QPCR_HIGHEST_CIR, \ @@ -260,6 +272,42 @@ mlxsw_sp_trap_policer_items_arr[] = { { .policer = MLXSW_SP_TRAP_POLICER(5, 16 * 1024, 128), }, + { + .policer = MLXSW_SP_TRAP_POLICER(6, 128, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(7, 1024, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(8, 20 * 1024, 1024), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(9, 128, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(10, 1024, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(11, 360, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(12, 128, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(13, 128, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(14, 1024, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(15, 1024, 128), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(16, 24 * 1024, 4096), + }, + { + .policer = MLXSW_SP_TRAP_POLICER(17, 19 * 1024, 4096), + }, }; static const struct mlxsw_sp_trap_group_item mlxsw_sp_trap_group_items_arr[] = { @@ -308,6 +356,66 @@ static const struct mlxsw_sp_trap_group_item mlxsw_sp_trap_group_items_arr[] = { .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_MC_SNOOPING, .priority = 3, }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(DHCP, 6), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP, + .priority = 2, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(NEIGH_DISCOVERY, 7), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_NEIGH_DISCOVERY, + .priority = 2, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(BFD, 8), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_BFD, + .priority = 5, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(OSPF, 9), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF, + .priority = 5, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(BGP, 10), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP, + .priority = 4, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(VRRP, 11), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_VRRP, + .priority = 5, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(PIM, 12), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM, + .priority = 5, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(UC_LB, 13), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR, + .priority = 0, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(LOCAL_DELIVERY, 14), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME, + .priority = 2, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(IPV6, 15), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6, + .priority = 2, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(PTP_EVENT, 16), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP0, + .priority = 5, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(PTP_GENERAL, 17), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP1, + .priority = 2, + }, }; static const struct mlxsw_sp_trap_item mlxsw_sp_trap_items_arr[] = { @@ -617,6 +725,216 @@ static const struct mlxsw_sp_trap_item mlxsw_sp_trap_items_arr[] = { MC_SNOOPING, TRAP_TO_CPU, false), }, }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV4_DHCP, DHCP, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV4_DHCP, DHCP, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_DHCP, DHCP, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_DHCP, DHCP, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(ARP_REQUEST, NEIGH_DISCOVERY, + MIRROR), + .listeners_arr = { + MLXSW_SP_RXL_MARK(ARPBC, NEIGH_DISCOVERY, MIRROR_TO_CPU, + false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(ARP_RESPONSE, NEIGH_DISCOVERY, + MIRROR), + .listeners_arr = { + MLXSW_SP_RXL_MARK(ARPUC, NEIGH_DISCOVERY, MIRROR_TO_CPU, + false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(ARP_OVERLAY, NEIGH_DISCOVERY, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(NVE_DECAP_ARP, NEIGH_DISCOVERY, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_NEIGH_SOLICIT, + NEIGH_DISCOVERY, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_SOLICITATION, + NEIGH_DISCOVERY, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_NEIGH_ADVERT, + NEIGH_DISCOVERY, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_ADVERTISEMENT, + NEIGH_DISCOVERY, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV4_BFD, BFD, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV4_BFD, BFD, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_BFD, BFD, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_BFD, BFD, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV4_OSPF, OSPF, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV4_OSPF, OSPF, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_OSPF, OSPF, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_OSPF, OSPF, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV4_BGP, BGP, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV4_BGP, BGP, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_BGP, BGP, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_BGP, BGP, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV4_VRRP, VRRP, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV4_VRRP, VRRP, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_VRRP, VRRP, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_VRRP, VRRP, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV4_PIM, PIM, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV4_PIM, PIM, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_PIM, PIM, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_PIM, PIM, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(UC_LB, UC_LB, MIRROR), + .listeners_arr = { + MLXSW_SP_RXL_L3_MARK(LBERROR, LBERROR, MIRROR_TO_CPU, + false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(LOCAL_ROUTE, LOCAL_DELIVERY, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IP2ME, IP2ME, TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(EXTERNAL_ROUTE, LOCAL_DELIVERY, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(RTR_INGRESS0, IP2ME, TRAP_TO_CPU, + false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_UC_DIP_LINK_LOCAL_SCOPE, + LOCAL_DELIVERY, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_DEST, IP2ME, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV4_ROUTER_ALERT, LOCAL_DELIVERY, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV4, IP2ME, TRAP_TO_CPU, + false), + }, + }, + { + /* IPV6_ROUTER_ALERT is defined in uAPI as 22, but it is not + * used in this file, so undefine it. + */ + #undef IPV6_ROUTER_ALERT + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_ROUTER_ALERT, LOCAL_DELIVERY, + TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, IP2ME, TRAP_TO_CPU, + false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_DIP_ALL_NODES, IPV6, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_ALL_NODES_LINK, IPV6, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_DIP_ALL_ROUTERS, IPV6, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(IPV6_ALL_ROUTERS_LINK, IPV6, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_ROUTER_SOLICIT, IPV6, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_SOLICITATION, IPV6, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_ROUTER_ADVERT, IPV6, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_ADVERTISEMENT, IPV6, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(IPV6_REDIRECT, IPV6, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_MARK(L3_IPV6_REDIRECTION, IPV6, + TRAP_TO_CPU, false), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(PTP_EVENT, PTP_EVENT, TRAP), + .listeners_arr = { + MLXSW_RXL(mlxsw_sp_rx_ptp_listener, PTP0, TRAP_TO_CPU, + false, SP_PTP0, DISCARD), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(PTP_GENERAL, PTP_GENERAL, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(PTP1, PTP1, TRAP_TO_CPU, false), + }, + }, }; static struct mlxsw_sp_trap_policer_item * -- cgit v1.2.3-59-g8ed1b From 88e2774961d7854628fa9c403166c4162cebc12e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:48 +0300 Subject: mlxsw: spectrum_trap: Register ACL control traps In a similar fashion to other control traps, register ACL control traps with devlink. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 31 ++++----------- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 2 + .../net/ethernet/mellanox/mlxsw/spectrum_trap.c | 45 ++++++++++++++++++++++ 3 files changed, 55 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 8daeae1384da..5ffa32b75e5f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3987,10 +3987,15 @@ static void mlxsw_sp_rx_listener_l3_mark_func(struct sk_buff *skb, return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); } -static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port, - void *priv) +void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, + u8 local_port) +{ + mlxsw_sp->ptp_ops->receive(mlxsw_sp, skb, local_port); +} + +void mlxsw_sp_sample_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, + u8 local_port) { - struct mlxsw_sp *mlxsw_sp = priv; struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; struct mlxsw_sp_port_sample *sample; u32 size; @@ -4014,12 +4019,6 @@ out: consume_skb(skb); } -void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, - u8 local_port) -{ - mlxsw_sp->ptp_ops->receive(mlxsw_sp, skb, local_port); -} - #define MLXSW_SP_RXL_NO_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ MLXSW_RXL(mlxsw_sp_rx_listener_no_mark_func, _trap_id, _action, \ _is_ctrl, SP_##_trap_group, DISCARD) @@ -4054,11 +4053,6 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { ROUTER_EXP, false), MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_DIP_LINK_LOCAL, FORWARD, ROUTER_EXP, false), - /* PKT Sample trap */ - MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU, - false, SP_PKT_SAMPLE, DISCARD), - /* ACL trap */ - MLXSW_SP_RXL_NO_MARK(ACL0, TRAP_TO_CPU, FLOW_LOGGING, false), /* Multicast Router Traps */ MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), @@ -4094,7 +4088,6 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) switch (i) { case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_FLOW_LOGGING: case MLXSW_REG_HTGT_TRAP_GROUP_SP_FID_MISS: rate = 1024; burst_size = 7; @@ -4133,20 +4126,12 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) for (i = 0; i < max_trap_groups; i++) { policer_id = i; switch (i) { - case MLXSW_REG_HTGT_TRAP_GROUP_SP_FLOW_LOGGING: - priority = 4; - tc = 4; - break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST: case MLXSW_REG_HTGT_TRAP_GROUP_SP_FID_MISS: priority = 1; tc = 1; break; - case MLXSW_REG_HTGT_TRAP_GROUP_SP_PKT_SAMPLE: - priority = 0; - tc = 0; - break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT: priority = MLXSW_REG_HTGT_DEFAULT_PRIORITY; tc = MLXSW_REG_HTGT_DEFAULT_TC; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 9d4dfb22cb7a..6f96ca50c9ba 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -453,6 +453,8 @@ void mlxsw_sp_rx_listener_no_mark_func(struct sk_buff *skb, u8 local_port, void *priv); void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, u8 local_port); +void mlxsw_sp_sample_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, + u8 local_port); int mlxsw_sp_port_speed_get(struct mlxsw_sp_port *mlxsw_sp_port, u32 *speed); int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index 148a35b7f4f8..157a42c63066 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -195,6 +195,23 @@ static void mlxsw_sp_rx_ptp_listener(struct sk_buff *skb, u8 local_port, mlxsw_sp_ptp_receive(mlxsw_sp, skb, local_port); } +static void mlxsw_sp_rx_sample_listener(struct sk_buff *skb, u8 local_port, + void *trap_ctx) +{ + struct mlxsw_sp *mlxsw_sp = devlink_trap_ctx_priv(trap_ctx); + int err; + + err = __mlxsw_sp_rx_no_mark_listener(skb, local_port, trap_ctx); + if (err) + return; + + /* The sample handler expects skb->data to point to the start of the + * Ethernet header. + */ + skb_push(skb, ETH_HLEN); + mlxsw_sp_sample_receive(mlxsw_sp, skb, local_port); +} + #define MLXSW_SP_TRAP_DROP(_id, _group_id) \ DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ @@ -308,6 +325,9 @@ mlxsw_sp_trap_policer_items_arr[] = { { .policer = MLXSW_SP_TRAP_POLICER(17, 19 * 1024, 4096), }, + { + .policer = MLXSW_SP_TRAP_POLICER(18, 1024, 128), + }, }; static const struct mlxsw_sp_trap_group_item mlxsw_sp_trap_group_items_arr[] = { @@ -416,6 +436,16 @@ static const struct mlxsw_sp_trap_group_item mlxsw_sp_trap_group_items_arr[] = { .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP1, .priority = 2, }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(ACL_SAMPLE, 0), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_PKT_SAMPLE, + .priority = 0, + }, + { + .group = DEVLINK_TRAP_GROUP_GENERIC(ACL_TRAP, 18), + .hw_group_id = MLXSW_REG_HTGT_TRAP_GROUP_SP_FLOW_LOGGING, + .priority = 4, + }, }; static const struct mlxsw_sp_trap_item mlxsw_sp_trap_items_arr[] = { @@ -935,6 +965,21 @@ static const struct mlxsw_sp_trap_item mlxsw_sp_trap_items_arr[] = { MLXSW_SP_RXL_NO_MARK(PTP1, PTP1, TRAP_TO_CPU, false), }, }, + { + .trap = MLXSW_SP_TRAP_CONTROL(FLOW_ACTION_SAMPLE, ACL_SAMPLE, + MIRROR), + .listeners_arr = { + MLXSW_RXL(mlxsw_sp_rx_sample_listener, PKT_SAMPLE, + MIRROR_TO_CPU, false, SP_PKT_SAMPLE, DISCARD), + }, + }, + { + .trap = MLXSW_SP_TRAP_CONTROL(FLOW_ACTION_TRAP, ACL_TRAP, TRAP), + .listeners_arr = { + MLXSW_SP_RXL_NO_MARK(ACL0, FLOW_LOGGING, TRAP_TO_CPU, + false), + }, + }, }; static struct mlxsw_sp_trap_policer_item * -- cgit v1.2.3-59-g8ed1b From 9959b389779a9e688d1a9272eed6377d999d8739 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:49 +0300 Subject: selftests: mlxsw: Add test for control packets Generate packets matching the various control traps and check that the traps' stats increase accordingly. Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../drivers/net/mlxsw/devlink_trap_control.sh | 688 +++++++++++++++++++++ .../selftests/net/forwarding/devlink_lib.sh | 23 + 2 files changed, 711 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh new file mode 100755 index 000000000000..a37273473c1b --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh @@ -0,0 +1,688 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap control trap functionality over mlxsw. Each registered +# control packet trap is tested to make sure it is triggered under the right +# conditions. +# +# +---------------------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/24 | +# | | 2001:db8:1::1/64 | +# | | | +# | | default via 192.0.2.2 | +# | | default via 2001:db8:1::2 | +# +----|----------------------------+ +# | +# +----|----------------------------------------------------------------------+ +# | SW | | +# | + $rp1 | +# | 192.0.2.2/24 | +# | 2001:db8:1::2/64 | +# | | +# | 2001:db8:2::2/64 | +# | 198.51.100.2/24 | +# | + $rp2 | +# | | | +# +----|----------------------------------------------------------------------+ +# | +# +----|----------------------------+ +# | | default via 198.51.100.2 | +# | | default via 2001:db8:2::2 | +# | | | +# | | 2001:db8:2::1/64 | +# | | 198.51.100.1/24 | +# | + $h2 | +# | H2 (vrf) | +# +---------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + stp_test + lacp_test + lldp_test + igmp_query_test + igmp_v1_report_test + igmp_v2_report_test + igmp_v3_report_test + igmp_v2_leave_test + mld_query_test + mld_v1_report_test + mld_v2_report_test + mld_v1_done_test + ipv4_dhcp_test + ipv6_dhcp_test + arp_request_test + arp_response_test + ipv6_neigh_solicit_test + ipv6_neigh_advert_test + ipv4_bfd_test + ipv6_bfd_test + ipv4_ospf_test + ipv6_ospf_test + ipv4_bgp_test + ipv6_bgp_test + ipv4_vrrp_test + ipv6_vrrp_test + ipv4_pim_test + ipv6_pim_test + uc_loopback_test + local_route_test + external_route_test + ipv6_uc_dip_link_local_scope_test + ipv4_router_alert_test + ipv6_router_alert_test + ipv6_dip_all_nodes_test + ipv6_dip_all_routers_test + ipv6_router_solicit_test + ipv6_router_advert_test + ipv6_redirect_test + ptp_event_test + ptp_general_test + flow_action_sample_test + flow_action_trap_test +" +NUM_NETIFS=4 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64 + + ip -4 route add default vrf v$h1 nexthop via 192.0.2.2 + ip -6 route add default vrf v$h1 nexthop via 2001:db8:1::2 +} + +h1_destroy() +{ + ip -6 route del default vrf v$h1 nexthop via 2001:db8:1::2 + ip -4 route del default vrf v$h1 nexthop via 192.0.2.2 + + simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64 +} + +h2_create() +{ + simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64 + + ip -4 route add default vrf v$h2 nexthop via 198.51.100.2 + ip -6 route add default vrf v$h2 nexthop via 2001:db8:2::2 +} + +h2_destroy() +{ + ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2 + ip -4 route del default vrf v$h2 nexthop via 198.51.100.2 + + simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64 +} + +router_create() +{ + ip link set dev $rp1 up + ip link set dev $rp2 up + + __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 + __addr_add_del $rp2 add 198.51.100.2/24 2001:db8:2::2/64 +} + +router_destroy() +{ + __addr_add_del $rp2 del 198.51.100.2/24 2001:db8:2::2/64 + __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64 + + ip link set dev $rp2 down + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + router_create +} + +cleanup() +{ + pre_cleanup + + router_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +stp_test() +{ + devlink_trap_stats_test "STP" "stp" $MZ $h1 -c 1 -t bpdu -q +} + +lacp_payload_get() +{ + local source_mac=$1; shift + local p + + p=$(: + )"01:80:C2:00:00:02:"$( : ETH daddr + )"$source_mac:"$( : ETH saddr + )"88:09:"$( : ETH type + ) + echo $p +} + +lacp_test() +{ + local h1mac=$(mac_get $h1) + + devlink_trap_stats_test "LACP" "lacp" $MZ $h1 -c 1 \ + $(lacp_payload_get $h1mac) -p 100 -q +} + +lldp_payload_get() +{ + local source_mac=$1; shift + local p + + p=$(: + )"01:80:C2:00:00:0E:"$( : ETH daddr + )"$source_mac:"$( : ETH saddr + )"88:CC:"$( : ETH type + ) + echo $p +} + +lldp_test() +{ + local h1mac=$(mac_get $h1) + + devlink_trap_stats_test "LLDP" "lldp" $MZ $h1 -c 1 \ + $(lldp_payload_get $h1mac) -p 100 -q +} + +igmp_query_test() +{ + # IGMP (IP Protocol 2) Membership Query (Type 0x11) + devlink_trap_stats_test "IGMP Membership Query" "igmp_query" \ + $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=11 -p 100 -q +} + +igmp_v1_report_test() +{ + # IGMP (IP Protocol 2) Version 1 Membership Report (Type 0x12) + devlink_trap_stats_test "IGMP Version 1 Membership Report" \ + "igmp_v1_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 244.0.0.1 -t ip proto=2,p=12 -p 100 -q +} + +igmp_v2_report_test() +{ + # IGMP (IP Protocol 2) Version 2 Membership Report (Type 0x16) + devlink_trap_stats_test "IGMP Version 2 Membership Report" \ + "igmp_v2_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 244.0.0.1 -t ip proto=2,p=16 -p 100 -q +} + +igmp_v3_report_test() +{ + # IGMP (IP Protocol 2) Version 3 Membership Report (Type 0x22) + devlink_trap_stats_test "IGMP Version 3 Membership Report" \ + "igmp_v3_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 244.0.0.1 -t ip proto=2,p=22 -p 100 -q +} + +igmp_v2_leave_test() +{ + # IGMP (IP Protocol 2) Version 2 Leave Group (Type 0x17) + devlink_trap_stats_test "IGMP Version 2 Leave Group" \ + "igmp_v2_leave" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:02 \ + -A 192.0.2.1 -B 224.0.0.2 -t ip proto=2,p=17 -p 100 -q +} + +mld_payload_get() +{ + local type=$1; shift + local p + + type=$(printf "%x" $type) + p=$(: + )"3A:"$( : Next Header - ICMPv6 + )"00:"$( : Hdr Ext Len + )"00:00:00:00:00:00:"$( : Options and Padding + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"00:"$( : ICMPv6.checksum + ) + echo $p +} + +mld_query_test() +{ + # MLD Multicast Listener Query (Type 130) + devlink_trap_stats_test "MLD Multicast Listener Query" "mld_query" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 130) -p 100 -q +} + +mld_v1_report_test() +{ + # MLD Version 1 Multicast Listener Report (Type 131) + devlink_trap_stats_test "MLD Version 1 Multicast Listener Report" \ + "mld_v1_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 131) -p 100 -q +} + +mld_v2_report_test() +{ + # MLD Version 2 Multicast Listener Report (Type 143) + devlink_trap_stats_test "MLD Version 2 Multicast Listener Report" \ + "mld_v2_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 143) -p 100 -q +} + +mld_v1_done_test() +{ + # MLD Version 1 Multicast Listener Done (Type 132) + devlink_trap_stats_test "MLD Version 1 Multicast Listener Done" \ + "mld_v1_done" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 132) -p 100 -q +} + +ipv4_dhcp_test() +{ + devlink_trap_stats_test "IPv4 DHCP Port 67" "ipv4_dhcp" \ + $MZ $h1 -c 1 -a own -b bcast -A 0.0.0.0 -B 255.255.255.255 \ + -t udp sp=68,dp=67 -p 100 -q + + devlink_trap_stats_test "IPv4 DHCP Port 68" "ipv4_dhcp" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -A 192.0.2.1 \ + -B 255.255.255.255 -t udp sp=67,dp=68 -p 100 -q +} + +ipv6_dhcp_test() +{ + devlink_trap_stats_test "IPv6 DHCP Port 547" "ipv6_dhcp" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1:2 -t udp sp=546,dp=547 \ + -p 100 -q + + devlink_trap_stats_test "IPv6 DHCP Port 546" "ipv6_dhcp" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1:2 -t udp sp=547,dp=546 \ + -p 100 -q +} + +arp_request_test() +{ + devlink_trap_stats_test "ARP Request" "arp_request" \ + $MZ $h1 -c 1 -a own -b bcast -t arp request -p 100 -q +} + +arp_response_test() +{ + devlink_trap_stats_test "ARP Response" "arp_response" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -t arp reply -p 100 -q +} + +icmpv6_header_get() +{ + local type=$1; shift + local p + + type=$(printf "%x" $type) + p=$(: + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"00:"$( : ICMPv6.checksum + ) + echo $p +} + +ipv6_neigh_solicit_test() +{ + devlink_trap_stats_test "IPv6 Neighbour Solicitation" \ + "ipv6_neigh_solicit" $MZ $h1 -6 -c 1 \ + -A fe80::1 -B ff02::1:ff00:02 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 135) -p 100 -q +} + +ipv6_neigh_advert_test() +{ + devlink_trap_stats_test "IPv6 Neighbour Advertisement" \ + "ipv6_neigh_advert" $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 136) -p 100 -q +} + +ipv4_bfd_test() +{ + devlink_trap_stats_test "IPv4 BFD Control - Port 3784" "ipv4_bfd" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3784 -p 100 -q + + devlink_trap_stats_test "IPv4 BFD Echo - Port 3785" "ipv4_bfd" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3785 -p 100 -q +} + +ipv6_bfd_test() +{ + devlink_trap_stats_test "IPv6 BFD Control - Port 3784" "ipv6_bfd" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t udp sp=49153,dp=3784 -p 100 -q + + devlink_trap_stats_test "IPv6 BFD Echo - Port 3785" "ipv6_bfd" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t udp sp=49153,dp=3785 -p 100 -q +} + +ipv4_ospf_test() +{ + devlink_trap_stats_test "IPv4 OSPF - Multicast" "ipv4_ospf" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:05 \ + -A 192.0.2.1 -B 224.0.0.5 -t ip proto=89 -p 100 -q + + devlink_trap_stats_test "IPv4 OSPF - Unicast" "ipv4_ospf" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t ip proto=89 -p 100 -q +} + +ipv6_ospf_test() +{ + devlink_trap_stats_test "IPv6 OSPF - Multicast" "ipv6_ospf" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:05 \ + -A fe80::1 -B ff02::5 -t ip next=89 -p 100 -q + + devlink_trap_stats_test "IPv6 OSPF - Unicast" "ipv6_ospf" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 -t ip next=89 -p 100 -q +} + +ipv4_bgp_test() +{ + devlink_trap_stats_test "IPv4 BGP" "ipv4_bgp" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t tcp sp=54321,dp=179,flags=rst \ + -p 100 -q +} + +ipv6_bgp_test() +{ + devlink_trap_stats_test "IPv6 BGP" "ipv6_bgp" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t tcp sp=54321,dp=179,flags=rst -p 100 -q +} + +ipv4_vrrp_test() +{ + devlink_trap_stats_test "IPv4 VRRP" "ipv4_vrrp" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:12 \ + -A 192.0.2.1 -B 224.0.0.18 -t ip proto=112 -p 100 -q +} + +ipv6_vrrp_test() +{ + devlink_trap_stats_test "IPv6 VRRP" "ipv6_vrrp" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:12 \ + -A fe80::1 -B ff02::12 -t ip next=112 -p 100 -q +} + +ipv4_pim_test() +{ + devlink_trap_stats_test "IPv4 PIM - Multicast" "ipv4_pim" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:0d \ + -A 192.0.2.1 -B 224.0.0.13 -t ip proto=103 -p 100 -q + + devlink_trap_stats_test "IPv4 PIM - Unicast" "ipv4_pim" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t ip proto=103 -p 100 -q +} + +ipv6_pim_test() +{ + devlink_trap_stats_test "IPv6 PIM - Multicast" "ipv6_pim" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:0d \ + -A fe80::1 -B ff02::d -t ip next=103 -p 100 -q + + devlink_trap_stats_test "IPv6 PIM - Unicast" "ipv6_pim" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 -t ip next=103 -p 100 -q +} + +uc_loopback_test() +{ + # Add neighbours to the fake destination IPs, so that the packets are + # routed in the device and not trapped due to an unresolved neighbour + # exception. + ip -4 neigh add 192.0.2.3 lladdr 00:11:22:33:44:55 nud permanent \ + dev $rp1 + ip -6 neigh add 2001:db8:1::3 lladdr 00:11:22:33:44:55 nud permanent \ + dev $rp1 + + devlink_trap_stats_test "IPv4 Unicast Loopback" "uc_loopback" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.3 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 Unicast Loopback" "uc_loopback" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::3 -t udp sp=54321,dp=12345 \ + -p 100 -q + + ip -6 neigh del 2001:db8:1::3 dev $rp1 + ip -4 neigh del 192.0.2.3 dev $rp1 +} + +local_route_test() +{ + # Use a fake source IP to prevent the trap from being triggered twice + # when the router sends back a port unreachable message. + devlink_trap_stats_test "IPv4 Local Route" "local_route" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.3 -B 192.0.2.2 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 Local Route" "local_route" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::3 -B 2001:db8:1::2 -t udp sp=54321,sp=12345 \ + -p 100 -q +} + +external_route_test() +{ + # Add a dummy device through which the incoming packets should be + # routed. + ip link add name dummy10 up type dummy + ip address add 203.0.113.1/24 dev dummy10 + ip -6 address add 2001:db8:10::1/64 dev dummy10 + + devlink_trap_stats_test "IPv4 External Route" "external_route" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 203.0.113.2 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 External Route" "external_route" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:10::2 -t udp sp=54321,sp=12345 \ + -p 100 -q + + ip -6 address del 2001:db8:10::1/64 dev dummy10 + ip address del 203.0.113.1/24 dev dummy10 + ip link del dev dummy10 +} + +ipv6_uc_dip_link_local_scope_test() +{ + # Add a dummy link-local prefix route to allow the packet to be routed. + ip -6 route add fe80:1::/64 dev $rp2 + + devlink_trap_stats_test \ + "IPv6 Unicast Destination IP With Link-Local Scope" \ + "ipv6_uc_dip_link_local_scope" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B fe80:1::2 -t udp sp=54321,sp=12345 \ + -p 100 -q + + ip -6 route del fe80:1::/64 dev $rp2 +} + +ipv4_router_alert_get() +{ + local p + + # https://en.wikipedia.org/wiki/IPv4#Options + p=$(: + )"94:"$( : Option Number + )"04:"$( : Option Length + )"00:00:"$( : Option Data + ) + echo $p +} + +ipv4_router_alert_test() +{ + devlink_trap_stats_test "IPv4 Router Alert" "ipv4_router_alert" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.3 \ + -t ip option=$(ipv4_router_alert_get) -p 100 -q +} + +ipv6_router_alert_get() +{ + local p + + # https://en.wikipedia.org/wiki/IPv6_packet#Hop-by-hop_options_and_destination_options + # https://tools.ietf.org/html/rfc2711#section-2.1 + p=$(: + )"11:"$( : Next Header - UDP + )"00:"$( : Hdr Ext Len + )"05:02:00:00:00:00:"$( : Option Data + ) + echo $p +} + +ipv6_router_alert_test() +{ + devlink_trap_stats_test "IPv6 Router Alert" "ipv6_router_alert" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::3 \ + -t ip next=0,payload=$(ipv6_router_alert_get) -p 100 -q +} + +ipv6_dip_all_nodes_test() +{ + devlink_trap_stats_test "IPv6 Destination IP \"All Nodes Address\"" \ + "ipv6_dip_all_nodes" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \ + -A 2001:db8:1::1 -B ff02::1 -t udp sp=12345,dp=54321 -p 100 -q +} + +ipv6_dip_all_routers_test() +{ + devlink_trap_stats_test "IPv6 Destination IP \"All Routers Address\"" \ + "ipv6_dip_all_routers" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \ + -A 2001:db8:1::1 -B ff02::2 -t udp sp=12345,dp=54321 -p 100 -q +} + +ipv6_router_solicit_test() +{ + devlink_trap_stats_test "IPv6 Router Solicitation" \ + "ipv6_router_solicit" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \ + -A fe80::1 -B ff02::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 133) -p 100 -q +} + +ipv6_router_advert_test() +{ + devlink_trap_stats_test "IPv6 Router Advertisement" \ + "ipv6_router_advert" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \ + -A fe80::1 -B ff02::1 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 134) -p 100 -q +} + +ipv6_redirect_test() +{ + devlink_trap_stats_test "IPv6 Redirect Message" \ + "ipv6_redirect" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 137) -p 100 -q +} + +ptp_event_test() +{ + # PTP is only supported on Spectrum-1, for now. + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return + + # PTP Sync (0) + devlink_trap_stats_test "PTP Time-Critical Event Message" "ptp_event" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \ + -A 192.0.2.1 -B 224.0.1.129 \ + -t udp sp=12345,dp=319,payload=10 -p 100 -q +} + +ptp_general_test() +{ + # PTP is only supported on Spectrum-1, for now. + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return + + # PTP Announce (b) + devlink_trap_stats_test "PTP General Message" "ptp_general" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \ + -A 192.0.2.1 -B 224.0.1.129 \ + -t udp sp=12345,dp=320,payload=1b -p 100 -q +} + +flow_action_sample_test() +{ + # Install a filter that samples every incoming packet. + tc qdisc add dev $rp1 clsact + tc filter add dev $rp1 ingress proto all pref 1 handle 101 matchall \ + skip_sw action sample rate 1 group 1 + + devlink_trap_stats_test "Flow Sampling" "flow_action_sample" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q + + tc filter del dev $rp1 ingress proto all pref 1 handle 101 matchall + tc qdisc del dev $rp1 clsact +} + +flow_action_trap_test() +{ + # Install a filter that traps a specific flow. + tc qdisc add dev $rp1 clsact + tc filter add dev $rp1 ingress proto ip pref 1 handle 101 flower \ + skip_sw ip_proto udp src_port 12345 dst_port 54321 action trap + + devlink_trap_stats_test "Flow Trapping (Logging)" "flow_action_trap" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q + + tc filter del dev $rp1 ingress proto ip pref 1 handle 101 flower + tc qdisc del dev $rp1 clsact +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index e27236109235..f0e6be4c09e9 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -423,6 +423,29 @@ devlink_trap_drop_cleanup() tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower } +devlink_trap_stats_test() +{ + local test_name=$1; shift + local trap_name=$1; shift + local send_one="$@" + local t0_packets + local t1_packets + + RET=0 + + t0_packets=$(devlink_trap_rx_packets_get $trap_name) + + $send_one && sleep 1 + + t1_packets=$(devlink_trap_rx_packets_get $trap_name) + + if [[ $t1_packets -eq $t0_packets ]]; then + check_err 1 "Trap stats did not increase" + fi + + log_test "$test_name" +} + devlink_trap_policers_num_get() { devlink -j -p trap policer show | jq '.[]["'$DEVLINK_DEV'"] | length' -- cgit v1.2.3-59-g8ed1b From 6f197fb63850b26ef8f70f1bfe5900e377910a5a Mon Sep 17 00:00:00 2001 From: Roelof Berg Date: Fri, 29 May 2020 21:30:02 +0200 Subject: lan743x: Added fixed link and RGMII support Microchip lan7431 is frequently connected to a phy. However, it can also be directly connected to a MII remote peer without any phy in between. For supporting such a phyless hardware setup in Linux we utilized phylib, which supports a fixed-link configuration via the device tree. And we added support for defining the connection type R/GMII in the device tree. New behavior: ------------- . The automatic speed and duplex detection of the lan743x silicon between mac and phy is disabled. Instead phylib is used like in other typical Linux drivers. The usage of phylib allows to specify fixed-link parameters in the device tree. . The device tree entry phy-connection-type is supported now with the modes RGMII or (G)MII (default). Development state: ------------------ . Tested with fixed-phy configurations. Not yet tested in normal configurations with phy. Microchip kindly offered testing as soon as the Corona measures allow this. . All review findings of Andrew Lunn are included Example: -------- &pcie { status = "okay"; host@0 { reg = <0 0 0 0 0>; #address-cells = <3>; #size-cells = <2>; ethernet@0 { compatible = "weyland-yutani,noscom1", "microchip,lan743x"; status = "okay"; reg = <0 0 0 0 0>; phy-connection-type = "rgmii"; fixed-link { speed = <100>; full-duplex; }; }; }; }; Signed-off-by: Roelof Berg Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan743x_ethtool.c | 4 +- drivers/net/ethernet/microchip/lan743x_main.c | 81 +++++++++++++++++++++--- drivers/net/ethernet/microchip/lan743x_main.h | 6 ++ drivers/net/ethernet/microchip/lan743x_ptp.c | 2 +- 4 files changed, 80 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c b/drivers/net/ethernet/microchip/lan743x_ethtool.c index 3a0b289d9771..c533d06fbe3a 100644 --- a/drivers/net/ethernet/microchip/lan743x_ethtool.c +++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c @@ -2,11 +2,11 @@ /* Copyright (C) 2018 Microchip Technology Inc. */ #include -#include "lan743x_main.h" -#include "lan743x_ethtool.h" #include #include #include +#include "lan743x_main.h" +#include "lan743x_ethtool.h" /* eeprom */ #define LAN743X_EEPROM_MAGIC (0x74A5) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index a43140f7b5eb..36624e3c633b 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -8,7 +8,10 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -798,9 +801,9 @@ static int lan743x_mac_init(struct lan743x_adapter *adapter) netdev = adapter->netdev; - /* setup auto duplex, and speed detection */ + /* disable auto duplex, and speed detection. Phylib does that */ data = lan743x_csr_read(adapter, MAC_CR); - data |= MAC_CR_ADD_ | MAC_CR_ASD_; + data &= ~(MAC_CR_ADD_ | MAC_CR_ASD_); data |= MAC_CR_CNTR_RST_; lan743x_csr_write(adapter, MAC_CR, data); @@ -946,6 +949,7 @@ static void lan743x_phy_link_status_change(struct net_device *netdev) { struct lan743x_adapter *adapter = netdev_priv(netdev); struct phy_device *phydev = netdev->phydev; + u32 data; phy_print_status(phydev); if (phydev->state == PHY_RUNNING) { @@ -953,6 +957,39 @@ static void lan743x_phy_link_status_change(struct net_device *netdev) int remote_advertisement = 0; int local_advertisement = 0; + data = lan743x_csr_read(adapter, MAC_CR); + + /* set interface mode */ + if (phy_interface_mode_is_rgmii(adapter->phy_mode)) + /* RGMII */ + data &= ~MAC_CR_MII_EN_; + else + /* GMII */ + data |= MAC_CR_MII_EN_; + + /* set duplex mode */ + if (phydev->duplex) + data |= MAC_CR_DPX_; + else + data &= ~MAC_CR_DPX_; + + /* set bus speed */ + switch (phydev->speed) { + case SPEED_10: + data &= ~MAC_CR_CFG_H_; + data &= ~MAC_CR_CFG_L_; + break; + case SPEED_100: + data &= ~MAC_CR_CFG_H_; + data |= MAC_CR_CFG_L_; + break; + case SPEED_1000: + data |= MAC_CR_CFG_H_; + data |= MAC_CR_CFG_L_; + break; + } + lan743x_csr_write(adapter, MAC_CR, data); + memset(&ksettings, 0, sizeof(ksettings)); phy_ethtool_get_link_ksettings(netdev, &ksettings); local_advertisement = @@ -980,20 +1017,44 @@ static void lan743x_phy_close(struct lan743x_adapter *adapter) static int lan743x_phy_open(struct lan743x_adapter *adapter) { struct lan743x_phy *phy = &adapter->phy; + struct device_node *phynode; struct phy_device *phydev; struct net_device *netdev; int ret = -EIO; netdev = adapter->netdev; - phydev = phy_find_first(adapter->mdiobus); - if (!phydev) - goto return_error; + phynode = of_node_get(adapter->pdev->dev.of_node); + adapter->phy_mode = PHY_INTERFACE_MODE_GMII; + + if (phynode) { + of_get_phy_mode(phynode, &adapter->phy_mode); + + if (of_phy_is_fixed_link(phynode)) { + ret = of_phy_register_fixed_link(phynode); + if (ret) { + netdev_err(netdev, + "cannot register fixed PHY\n"); + of_node_put(phynode); + goto return_error; + } + } + phydev = of_phy_connect(netdev, phynode, + lan743x_phy_link_status_change, 0, + adapter->phy_mode); + of_node_put(phynode); + if (!phydev) + goto return_error; + } else { + phydev = phy_find_first(adapter->mdiobus); + if (!phydev) + goto return_error; - ret = phy_connect_direct(netdev, phydev, - lan743x_phy_link_status_change, - PHY_INTERFACE_MODE_GMII); - if (ret) - goto return_error; + ret = phy_connect_direct(netdev, phydev, + lan743x_phy_link_status_change, + adapter->phy_mode); + if (ret) + goto return_error; + } /* MAC doesn't support 1000T Half */ phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT); diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index 3b02eeae5f45..c61a40411317 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -4,6 +4,7 @@ #ifndef _LAN743X_H #define _LAN743X_H +#include #include "lan743x_ptp.h" #define DRIVER_AUTHOR "Bryan Whitehead " @@ -104,10 +105,14 @@ ((value << 0) & FCT_FLOW_CTL_ON_THRESHOLD_) #define MAC_CR (0x100) +#define MAC_CR_MII_EN_ BIT(19) #define MAC_CR_EEE_EN_ BIT(17) #define MAC_CR_ADD_ BIT(12) #define MAC_CR_ASD_ BIT(11) #define MAC_CR_CNTR_RST_ BIT(5) +#define MAC_CR_DPX_ BIT(3) +#define MAC_CR_CFG_H_ BIT(2) +#define MAC_CR_CFG_L_ BIT(1) #define MAC_CR_RST_ BIT(0) #define MAC_RX (0x104) @@ -698,6 +703,7 @@ struct lan743x_rx { struct lan743x_adapter { struct net_device *netdev; struct mii_bus *mdiobus; + phy_interface_t phy_mode; int msg_enable; #ifdef CONFIG_PM u32 wolopts; diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index 9399f6a98748..ab6d719d40f0 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -2,12 +2,12 @@ /* Copyright (C) 2018 Microchip Technology Inc. */ #include -#include "lan743x_main.h" #include #include #include #include +#include "lan743x_main.h" #include "lan743x_ptp.h" -- cgit v1.2.3-59-g8ed1b From 0af413bd3e2de73bcf0742ed556be4af83c71964 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 May 2020 22:13:58 +0200 Subject: flow_dissector: work around stack frame size warning The fl_flow_key structure is around 500 bytes, so having two of them on the stack in one function now exceeds the warning limit after an otherwise correct change: net/sched/cls_flower.c:298:12: error: stack frame size of 1056 bytes in function 'fl_classify' [-Werror,-Wframe-larger-than=] I suspect the fl_classify function could be reworked to only have one of them on the stack and modify it in place, but I could not work out how to do that. As a somewhat hacky workaround, move one of them into an out-of-line function to reduce its scope. This does not necessarily reduce the stack usage of the outer function, but at least the second copy is removed from the stack during most of it and does not add up to whatever is called from there. I now see 552 bytes of stack usage for fl_classify(), plus 528 bytes for fl_mask_lookup(). Fixes: 58cff782cc55 ("flow_dissector: Parse multiple MPLS Label Stack Entries") Signed-off-by: Arnd Bergmann Acked-by: Cong Wang Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 96f5999281e0..030896eadd11 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -272,14 +272,16 @@ static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask, return NULL; } -static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, - struct fl_flow_key *mkey, - struct fl_flow_key *key) +static noinline_for_stack +struct cls_fl_filter *fl_mask_lookup(struct fl_flow_mask *mask, struct fl_flow_key *key) { + struct fl_flow_key mkey; + + fl_set_masked_key(&mkey, key, mask); if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE)) - return fl_lookup_range(mask, mkey, key); + return fl_lookup_range(mask, &mkey, key); - return __fl_lookup(mask, mkey); + return __fl_lookup(mask, &mkey); } static u16 fl_ct_info_to_flower_map[] = { @@ -299,7 +301,6 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - struct fl_flow_key skb_mkey; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@ -319,9 +320,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, ARRAY_SIZE(fl_ct_info_to_flower_map)); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); - fl_set_masked_key(&skb_mkey, &skb_key, mask); - - f = fl_lookup(mask, &skb_mkey, &skb_key); + f = fl_mask_lookup(mask, &skb_key); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); -- cgit v1.2.3-59-g8ed1b From 3e1c6846b9e108740ef8a37be80314053f5dd52a Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 30 May 2020 10:41:50 +0800 Subject: net: vmxnet3: fix possible buffer overflow caused by bad DMA value in vmxnet3_get_rss() The value adapter->rss_conf is stored in DMA memory, and it is assigned to rssConf, so rssConf->indTableSize can be modified at anytime by malicious hardware. Because rssConf->indTableSize is assigned to n, buffer overflow may occur when the code "rssConf->indTable[n]" is executed. To fix this possible bug, n is checked after being used. Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_ethtool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index bfdda0f34b97..6acaafe169de 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -954,6 +954,8 @@ vmxnet3_get_rss(struct net_device *netdev, u32 *p, u8 *key, u8 *hfunc) *hfunc = ETH_RSS_HASH_TOP; if (!p) return 0; + if (n > UPT1_RSS_MAX_IND_TABLE_SIZE) + return 0; while (n--) p[n] = rssConf->indTable[n]; return 0; -- cgit v1.2.3-59-g8ed1b From 7e89ed8ab3f74e0746d3ea80537d7a06b0e27732 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:46 +0000 Subject: bridge: mrp: Update MRP frame type Replace u16/u32 with be16/be32 in the MRP frame types. This fixes sparse warnings like: warning: cast to restricted __be16 Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/mrp_bridge.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index 2600cdf5a284..bcad42128d62 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -55,30 +55,30 @@ struct br_mrp_end_hdr { }; struct br_mrp_common_hdr { - __u16 seq_id; + __be16 seq_id; __u8 domain[MRP_DOMAIN_UUID_LENGTH]; }; struct br_mrp_ring_test_hdr { - __u16 prio; + __be16 prio; __u8 sa[ETH_ALEN]; - __u16 port_role; - __u16 state; - __u16 transitions; - __u32 timestamp; + __be16 port_role; + __be16 state; + __be16 transitions; + __be32 timestamp; }; struct br_mrp_ring_topo_hdr { - __u16 prio; + __be16 prio; __u8 sa[ETH_ALEN]; - __u16 interval; + __be16 interval; }; struct br_mrp_ring_link_hdr { __u8 sa[ETH_ALEN]; - __u16 port_role; - __u16 interval; - __u16 blocked; + __be16 port_role; + __be16 interval; + __be16 blocked; }; #endif -- cgit v1.2.3-59-g8ed1b From 4b3a61b030d1131dcf3633a276158a3d0a435a47 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:47 +0000 Subject: bridge: mrp: Set the priority of MRP instance Each MRP instance has a priority, a lower value means a higher priority. The priority of MRP instance is stored in MRP_Test frame in this way all the MRP nodes in the ring can see other nodes priority. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + include/uapi/linux/if_bridge.h | 2 ++ net/bridge/br_mrp.c | 3 ++- net/bridge/br_mrp_netlink.c | 5 +++++ net/bridge/br_mrp_switchdev.c | 1 + net/bridge/br_private_mrp.h | 1 + 6 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index db519957e134..f82ef4c45f5e 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -116,6 +116,7 @@ struct switchdev_obj_mrp { struct net_device *p_port; struct net_device *s_port; u32 ring_id; + u16 prio; }; #define SWITCHDEV_OBJ_MRP(OBJ) \ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 5a43eb86c93b..0162c1370ecb 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -176,6 +176,7 @@ enum { IFLA_BRIDGE_MRP_INSTANCE_RING_ID, IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX, IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_PRIO, __IFLA_BRIDGE_MRP_INSTANCE_MAX, }; @@ -230,6 +231,7 @@ struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; __u32 s_ifindex; + __u16 prio; }; struct br_mrp_ring_state { diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 8ea59504ef47..f8fd037219fe 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -147,7 +147,7 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); hdr = skb_put(skb, sizeof(*hdr)); - hdr->prio = cpu_to_be16(MRP_DEFAULT_PRIO); + hdr->prio = cpu_to_be16(mrp->prio); ether_addr_copy(hdr->sa, p->br->dev->dev_addr); hdr->port_role = cpu_to_be16(port_role); hdr->state = cpu_to_be16(mrp->ring_state); @@ -290,6 +290,7 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) return -ENOMEM; mrp->ring_id = instance->ring_id; + mrp->prio = instance->prio; p = br_mrp_get_port(br, instance->p_ifindex); spin_lock_bh(&br->lock); diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index d9de780d2ce0..8cb67d9ca44e 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -22,6 +22,7 @@ br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_PRIO] = { .type = NLA_U16 }, }; static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, @@ -49,6 +50,10 @@ static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); + inst.prio = MRP_DEFAULT_PRIO; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]) + inst.prio = nla_get_u16(tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]); if (cmd == RTM_SETLINK) return br_mrp_add(br, &inst); diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 51cb1d5a24b4..3a776043bf80 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -12,6 +12,7 @@ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) .p_port = rtnl_dereference(mrp->p_port)->dev, .s_port = rtnl_dereference(mrp->s_port)->dev, .ring_id = mrp->ring_id, + .prio = mrp->prio, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index a0f53cc3ab85..558941ce2366 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -14,6 +14,7 @@ struct br_mrp { struct net_bridge_port __rcu *s_port; u32 ring_id; + u16 prio; enum br_mrp_ring_role_type ring_role; u8 ring_role_offloaded; -- cgit v1.2.3-59-g8ed1b From c6676e7d62cfb5cb7c1c5320a26f3634a11afdb0 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:48 +0000 Subject: bridge: mrp: Add support for role MRA A node that has the MRA role, it can behave as MRM or MRC. Initially it starts as MRM and sends MRP_Test frames on both ring ports. If it detects that there are MRP_Test send by another MRM, then it checks if these frames have a lower priority than itself. In this case it would send MRP_Nack frames to notify the other node that it needs to stop sending MRP_Test frames. If it receives a MRP_Nack frame then it stops sending MRP_Test frames and starts to behave as a MRC but it would continue to monitor the MRP_Test frames send by MRM. If at a point the MRM stops to send MRP_Test frames it would get the MRM role and start to send MRP_Test frames. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + include/uapi/linux/if_bridge.h | 2 + include/uapi/linux/mrp_bridge.h | 38 ++++++++++++ net/bridge/br_mrp.c | 125 ++++++++++++++++++++++++++++++++++------ net/bridge/br_mrp_netlink.c | 6 ++ net/bridge/br_mrp_switchdev.c | 4 +- net/bridge/br_private_mrp.h | 4 +- 7 files changed, 159 insertions(+), 21 deletions(-) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index f82ef4c45f5e..b8c059b4e06d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -130,6 +130,7 @@ struct switchdev_obj_ring_test_mrp { u8 max_miss; u32 ring_id; u32 period; + bool monitor; }; #define SWITCHDEV_OBJ_RING_TEST_MRP(OBJ) \ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 0162c1370ecb..caa6914a3e53 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -222,6 +222,7 @@ enum { IFLA_BRIDGE_MRP_START_TEST_INTERVAL, IFLA_BRIDGE_MRP_START_TEST_MAX_MISS, IFLA_BRIDGE_MRP_START_TEST_PERIOD, + IFLA_BRIDGE_MRP_START_TEST_MONITOR, __IFLA_BRIDGE_MRP_START_TEST_MAX, }; @@ -249,6 +250,7 @@ struct br_mrp_start_test { __u32 interval; __u32 max_miss; __u32 period; + __u32 monitor; }; struct bridge_stp_xstats { diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index bcad42128d62..84f15f48a7cb 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -11,11 +11,14 @@ #define MRP_DOMAIN_UUID_LENGTH 16 #define MRP_VERSION 1 #define MRP_FRAME_PRIO 7 +#define MRP_OUI_LENGTH 3 +#define MRP_MANUFACTURE_DATA_LENGTH 2 enum br_mrp_ring_role_type { BR_MRP_RING_ROLE_DISABLED, BR_MRP_RING_ROLE_MRC, BR_MRP_RING_ROLE_MRM, + BR_MRP_RING_ROLE_MRA, }; enum br_mrp_ring_state_type { @@ -43,6 +46,13 @@ enum br_mrp_tlv_header_type { BR_MRP_TLV_HEADER_RING_TOPO = 0x3, BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, + BR_MRP_TLV_HEADER_OPTION = 0x7f, +}; + +enum br_mrp_sub_tlv_header_type { + BR_MRP_SUB_TLV_HEADER_TEST_MGR_NACK = 0x1, + BR_MRP_SUB_TLV_HEADER_TEST_PROPAGATE = 0x2, + BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR = 0x3, }; struct br_mrp_tlv_hdr { @@ -50,6 +60,11 @@ struct br_mrp_tlv_hdr { __u8 length; }; +struct br_mrp_sub_tlv_hdr { + __u8 type; + __u8 length; +}; + struct br_mrp_end_hdr { struct br_mrp_tlv_hdr hdr; }; @@ -81,4 +96,27 @@ struct br_mrp_ring_link_hdr { __be16 blocked; }; +struct br_mrp_sub_opt_hdr { + __u8 type; + __u8 manufacture_data[MRP_MANUFACTURE_DATA_LENGTH]; +}; + +struct br_mrp_test_mgr_nack_hdr { + __be16 prio; + __u8 sa[ETH_ALEN]; + __be16 other_prio; + __u8 other_sa[ETH_ALEN]; +}; + +struct br_mrp_test_prop_hdr { + __be16 prio; + __u8 sa[ETH_ALEN]; + __be16 other_prio; + __u8 other_sa[ETH_ALEN]; +}; + +struct br_mrp_oui_hdr { + __u8 oui[MRP_OUI_LENGTH]; +}; + #endif diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index f8fd037219fe..24986ec7d38c 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -160,6 +160,16 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, return skb; } +/* This function is continuously called in the following cases: + * - when node role is MRM, in this case test_monitor is always set to false + * because it needs to notify the userspace that the ring is open and needs to + * send MRP_Test frames + * - when node role is MRA, there are 2 subcases: + * - when MRA behaves as MRM, in this case is similar with MRM role + * - when MRA behaves as MRC, in this case test_monitor is set to true, + * because it needs to detect when it stops seeing MRP_Test frames + * from MRM node but it doesn't need to send MRP_Test frames. + */ static void br_mrp_test_work_expired(struct work_struct *work) { struct delayed_work *del_work = to_delayed_work(work); @@ -177,8 +187,14 @@ static void br_mrp_test_work_expired(struct work_struct *work) /* Notify that the ring is open only if the ring state is * closed, otherwise it would continue to notify at every * interval. + * Also notify that the ring is open when the node has the + * role MRA and behaves as MRC. The reason is that the + * userspace needs to know when the MRM stopped sending + * MRP_Test frames so that the current node to try to take + * the role of a MRM. */ - if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED) + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED || + mrp->test_monitor) notify_open = true; } @@ -186,12 +202,15 @@ static void br_mrp_test_work_expired(struct work_struct *work) p = rcu_dereference(mrp->p_port); if (p) { - skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); - if (!skb) - goto out; - - skb_reset_network_header(skb); - dev_queue_xmit(skb); + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } if (notify_open && !mrp->ring_role_offloaded) br_mrp_port_open(p->dev, true); @@ -199,12 +218,15 @@ static void br_mrp_test_work_expired(struct work_struct *work) p = rcu_dereference(mrp->s_port); if (p) { - skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); - if (!skb) - goto out; - - skb_reset_network_header(skb); - dev_queue_xmit(skb); + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } if (notify_open && !mrp->ring_role_offloaded) br_mrp_port_open(p->dev, true); @@ -227,7 +249,7 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) /* Stop sending MRP_Test frames */ cancel_delayed_work_sync(&mrp->test_work); - br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0); + br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0); br_mrp_switchdev_del(br, mrp); @@ -452,8 +474,8 @@ int br_mrp_set_ring_role(struct net_bridge *br, return 0; } -/* Start to generate MRP test frames, the frames are generated by HW and if it - * fails, they are generated by the SW. +/* Start to generate or monitor MRP test frames, the frames are generated by + * HW and if it fails, they are generated by the SW. * note: already called with rtnl_lock */ int br_mrp_start_test(struct net_bridge *br, @@ -464,16 +486,18 @@ int br_mrp_start_test(struct net_bridge *br, if (!mrp) return -EINVAL; - /* Try to push it to the HW and if it fails then continue to generate in - * SW and if that also fails then return error + /* Try to push it to the HW and if it fails then continue with SW + * implementation and if that also fails then return error. */ if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, - test->max_miss, test->period)) + test->max_miss, test->period, + test->monitor)) return 0; mrp->test_interval = test->interval; mrp->test_end = jiffies + usecs_to_jiffies(test->period); mrp->test_max_miss = test->max_miss; + mrp->test_monitor = test->monitor; mrp->test_count_miss = 0; queue_delayed_work(system_wq, &mrp->test_work, usecs_to_jiffies(test->interval)); @@ -510,6 +534,57 @@ static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, br_mrp_port_open(port->dev, false); } +/* Determin if the test hdr has a better priority than the node */ +static bool br_mrp_test_better_than_own(struct br_mrp *mrp, + struct net_bridge *br, + const struct br_mrp_ring_test_hdr *hdr) +{ + u16 prio = be16_to_cpu(hdr->prio); + + if (prio < mrp->prio || + (prio == mrp->prio && + ether_addr_to_u64(hdr->sa) < ether_addr_to_u64(br->dev->dev_addr))) + return true; + + return false; +} + +/* Process only MRP Test frame. All the other MRP frames are processed by + * userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_ring_test_hdr *test_hdr; + struct br_mrp_ring_test_hdr _test_hdr; + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + test_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr), + sizeof(_test_hdr), &_test_hdr); + if (!test_hdr) + return; + + /* Only frames that have a better priority than the node will + * clear the miss counter because otherwise the node will need to behave + * as MRM. + */ + if (br_mrp_test_better_than_own(mrp, br, test_hdr)) + mrp->test_count_miss = 0; +} + /* This will just forward the frame to the other mrp ring port(MRC role) or will * not do anything. * note: already called with rcu_read_lock @@ -546,6 +621,18 @@ static int br_mrp_rcv(struct net_bridge_port *p, return 1; } + /* If the role is MRA then don't forward the frames if it behaves as + * MRM node + */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { + if (!mrp->test_monitor) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + br_mrp_mra_process(mrp, br, p, skb); + } + /* Clone the frame and forward it on the other MRP port */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 8cb67d9ca44e..34b3a8776991 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -196,6 +196,7 @@ br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MONITOR] = { .type = NLA_U32 }, }; static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, @@ -225,6 +226,11 @@ static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); + test.monitor = false; + + if (tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]) + test.monitor = + nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]); return br_mrp_start_test(br, &test); } diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 3a776043bf80..0da68a0da4b5 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -65,7 +65,8 @@ int br_mrp_switchdev_set_ring_role(struct net_bridge *br, int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, u32 interval, - u8 max_miss, u32 period) + u8 max_miss, u32 period, + bool monitor) { struct switchdev_obj_ring_test_mrp test = { .obj.orig_dev = br->dev, @@ -74,6 +75,7 @@ int br_mrp_switchdev_send_ring_test(struct net_bridge *br, .max_miss = max_miss, .ring_id = mrp->ring_id, .period = period, + .monitor = monitor, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 558941ce2366..33b255e38ffe 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -26,6 +26,7 @@ struct br_mrp { unsigned long test_end; u32 test_count_miss; u32 test_max_miss; + bool test_monitor; u32 seq_id; @@ -52,7 +53,8 @@ int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, enum br_mrp_ring_state_type state); int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period); + u32 interval, u8 max_miss, u32 period, + bool monitor); int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, -- cgit v1.2.3-59-g8ed1b From 4e4f4ce6abf5f6a8df0561776d3a790d60d519d0 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 30 May 2020 20:49:56 +0200 Subject: cls_flower: remove mpls_opts_policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with W=1 gives the following warning: net/sched/cls_flower.c:731:1: warning: ‘mpls_opts_policy’ defined but not used [-Wunused-const-variable=] The TCA_FLOWER_KEY_MPLS_OPTS contains a list of TCA_FLOWER_KEY_MPLS_OPTS_LSE. Therefore, the attributes all have the same type and we can't parse the list with nla_parse*() and have the attributes validated automatically using an nla_policy. fl_set_key_mpls_opts() properly verifies that all attributes in the list are TCA_FLOWER_KEY_MPLS_OPTS_LSE. Then fl_set_key_mpls_lse() uses nla_parse_nested() on all these attributes, thus verifying that they have the NLA_F_NESTED flag. So we can safely drop the mpls_opts_policy. Reported-by: kbuild test robot Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 030896eadd11..b2da37286082 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -726,11 +726,6 @@ erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; -static const struct nla_policy -mpls_opts_policy[TCA_FLOWER_KEY_MPLS_OPTS_MAX + 1] = { - [TCA_FLOWER_KEY_MPLS_OPTS_LSE] = { .type = NLA_NESTED }, -}; - static const struct nla_policy mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, -- cgit v1.2.3-59-g8ed1b From 96aa1b22bd6bb9fccf62f6261f390ed6f3e7967f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 30 May 2020 15:41:31 -0400 Subject: tun: correct header offsets in napi frags mode Tun in IFF_NAPI_FRAGS mode calls napi_gro_frags. Unlike netif_rx and netif_gro_receive, this expects skb->data to point to the mac layer. But skb_probe_transport_header, __skb_get_hash_symmetric, and xdp_do_generic in tun_get_user need skb->data to point to the network header. Flow dissection also needs skb->protocol set, so eth_type_trans has to be called. Ensure the link layer header lies in linear as eth_type_trans pulls ETH_HLEN. Then take the same code paths for frags as for not frags. Push the link layer header back just before calling napi_gro_frags. By pulling up to ETH_HLEN from frag0 into linear, this disables the frag0 optimization in the special case when IFF_NAPI_FRAGS is used with zero length iov[0] (and thus empty skb->linear). Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver") Signed-off-by: Willem de Bruijn Acked-by: Petar Penkov Signed-off-by: David S. Miller --- drivers/net/tun.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c54f967e2c66..b0ab882c021e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1872,8 +1872,11 @@ drop: skb->dev = tun->dev; break; case IFF_TAP: - if (!frags) - skb->protocol = eth_type_trans(skb, tun->dev); + if (frags && !pskb_may_pull(skb, ETH_HLEN)) { + err = -ENOMEM; + goto drop; + } + skb->protocol = eth_type_trans(skb, tun->dev); break; } @@ -1930,9 +1933,12 @@ drop: } if (frags) { + u32 headlen; + /* Exercise flow dissector code path. */ - u32 headlen = eth_get_headlen(tun->dev, skb->data, - skb_headlen(skb)); + skb_push(skb, ETH_HLEN); + headlen = eth_get_headlen(tun->dev, skb->data, + skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { this_cpu_inc(tun->pcpu_stats->rx_dropped); -- cgit v1.2.3-59-g8ed1b From 3190ca3b5f51a0e471ee3f04c898401c81b00385 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Sat, 30 May 2020 22:34:04 +0200 Subject: net: phy: broadcom: don't export RDB/legacy access methods Don't export __bcm_phy_enable_rdb_access() and __bcm_phy_enable_legacy_access() functions. They aren't used outside this module and it was forgotten to provide a prototype for these functions. Just make them static for now. Fixes: 11ecf8c55b91 ("net: phy: broadcom: add cable test support") Reported-by: kbuild test robot Signed-off-by: Michael Walle Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm-phy-lib.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c index cb92786e3ded..ef6825b30323 100644 --- a/drivers/net/phy/bcm-phy-lib.c +++ b/drivers/net/phy/bcm-phy-lib.c @@ -583,18 +583,16 @@ int bcm_phy_enable_jumbo(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(bcm_phy_enable_jumbo); -int __bcm_phy_enable_rdb_access(struct phy_device *phydev) +static int __bcm_phy_enable_rdb_access(struct phy_device *phydev) { return __bcm_phy_write_exp(phydev, BCM54XX_EXP_REG7E, 0); } -EXPORT_SYMBOL_GPL(__bcm_phy_enable_rdb_access); -int __bcm_phy_enable_legacy_access(struct phy_device *phydev) +static int __bcm_phy_enable_legacy_access(struct phy_device *phydev) { return __bcm_phy_write_rdb(phydev, BCM54XX_RDB_REG0087, BCM54XX_ACCESS_MODE_LEGACY_EN); } -EXPORT_SYMBOL_GPL(__bcm_phy_enable_legacy_access); static int _bcm_phy_cable_test_start(struct phy_device *phydev, bool is_rdb) { -- cgit v1.2.3-59-g8ed1b From 685e39eaf4b5bf68167c799fe683e26cdc43a5ea Mon Sep 17 00:00:00 2001 From: Ioana Radulescu Date: Sun, 31 May 2020 00:08:08 +0300 Subject: dpaa2-eth: Add support for Rx traffic classes The firmware reserves for each DPNI a number of RX frame queues equal to the number of configured flows x number of configured traffic classes. Current driver configuration directs all incoming traffic to FQs corresponding to TC0, leaving all other priority levels unused. Start adding support for multiple ingress traffic classes, by configuring the FQs associated with all priority levels, not just TC0. All settings that are per-TC, such as those related to hashing and flow steering, are also updated. Signed-off-by: Ioana Radulescu Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- .../ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c | 7 ++- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 70 +++++++++++++++------- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 4 +- .../net/ethernet/freescale/dpaa2/dpaa2-ethtool.c | 19 ++++-- 4 files changed, 68 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c index 0a31e4268dfb..c453a23045c1 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c @@ -81,8 +81,8 @@ static int dpaa2_dbg_fqs_show(struct seq_file *file, void *offset) int i, err; seq_printf(file, "FQ stats for %s:\n", priv->net_dev->name); - seq_printf(file, "%s%16s%16s%16s%16s\n", - "VFQID", "CPU", "Type", "Frames", "Pending frames"); + seq_printf(file, "%s%16s%16s%16s%16s%16s\n", + "VFQID", "CPU", "TC", "Type", "Frames", "Pending frames"); for (i = 0; i < priv->num_fqs; i++) { fq = &priv->fq[i]; @@ -90,9 +90,10 @@ static int dpaa2_dbg_fqs_show(struct seq_file *file, void *offset) if (err) fcnt = 0; - seq_printf(file, "%5d%16d%16s%16llu%16u\n", + seq_printf(file, "%5d%16d%16d%16s%16llu%16u\n", fq->fqid, fq->target_cpu, + fq->tc, fq_type_to_str(fq), fq->stats.frames, fcnt); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index fe3806d54630..01263e247d39 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1290,6 +1290,7 @@ static void disable_ch_napi(struct dpaa2_eth_priv *priv) static void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, bool enable) { struct dpni_taildrop td = {0}; + struct dpaa2_eth_fq *fq; int i, err; if (priv->rx_td_enabled == enable) @@ -1299,11 +1300,12 @@ static void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, bool enable) td.threshold = DPAA2_ETH_TAILDROP_THRESH; for (i = 0; i < priv->num_fqs; i++) { - if (priv->fq[i].type != DPAA2_RX_FQ) + fq = &priv->fq[i]; + if (fq->type != DPAA2_RX_FQ) continue; err = dpni_set_taildrop(priv->mc_io, 0, priv->mc_token, - DPNI_CP_QUEUE, DPNI_QUEUE_RX, 0, - priv->fq[i].flowid, &td); + DPNI_CP_QUEUE, DPNI_QUEUE_RX, + fq->tc, fq->flowid, &td); if (err) { netdev_err(priv->net_dev, "dpni_set_taildrop() failed\n"); @@ -2407,7 +2409,7 @@ static void set_fq_affinity(struct dpaa2_eth_priv *priv) static void setup_fqs(struct dpaa2_eth_priv *priv) { - int i; + int i, j; /* We have one TxConf FQ per Tx flow. * The number of Tx and Rx queues is the same. @@ -2419,10 +2421,13 @@ static void setup_fqs(struct dpaa2_eth_priv *priv) priv->fq[priv->num_fqs++].flowid = (u16)i; } - for (i = 0; i < dpaa2_eth_queue_count(priv); i++) { - priv->fq[priv->num_fqs].type = DPAA2_RX_FQ; - priv->fq[priv->num_fqs].consume = dpaa2_eth_rx; - priv->fq[priv->num_fqs++].flowid = (u16)i; + for (j = 0; j < dpaa2_eth_tc_count(priv); j++) { + for (i = 0; i < dpaa2_eth_queue_count(priv); i++) { + priv->fq[priv->num_fqs].type = DPAA2_RX_FQ; + priv->fq[priv->num_fqs].consume = dpaa2_eth_rx; + priv->fq[priv->num_fqs].tc = (u8)j; + priv->fq[priv->num_fqs++].flowid = (u16)i; + } } /* For each FQ, decide on which core to process incoming frames */ @@ -2789,7 +2794,7 @@ static int setup_rx_flow(struct dpaa2_eth_priv *priv, int err; err = dpni_get_queue(priv->mc_io, 0, priv->mc_token, - DPNI_QUEUE_RX, 0, fq->flowid, &queue, &qid); + DPNI_QUEUE_RX, fq->tc, fq->flowid, &queue, &qid); if (err) { dev_err(dev, "dpni_get_queue(RX) failed\n"); return err; @@ -2802,7 +2807,7 @@ static int setup_rx_flow(struct dpaa2_eth_priv *priv, queue.destination.priority = 1; queue.user_context = (u64)(uintptr_t)fq; err = dpni_set_queue(priv->mc_io, 0, priv->mc_token, - DPNI_QUEUE_RX, 0, fq->flowid, + DPNI_QUEUE_RX, fq->tc, fq->flowid, DPNI_QUEUE_OPT_USER_CTX | DPNI_QUEUE_OPT_DEST, &queue); if (err) { @@ -2811,6 +2816,10 @@ static int setup_rx_flow(struct dpaa2_eth_priv *priv, } /* xdp_rxq setup */ + /* only once for each channel */ + if (fq->tc > 0) + return 0; + err = xdp_rxq_info_reg(&fq->channel->xdp_rxq, priv->net_dev, fq->flowid); if (err) { @@ -2948,7 +2957,7 @@ static int config_legacy_hash_key(struct dpaa2_eth_priv *priv, dma_addr_t key) { struct device *dev = priv->net_dev->dev.parent; struct dpni_rx_tc_dist_cfg dist_cfg; - int err; + int i, err = 0; memset(&dist_cfg, 0, sizeof(dist_cfg)); @@ -2956,9 +2965,14 @@ static int config_legacy_hash_key(struct dpaa2_eth_priv *priv, dma_addr_t key) dist_cfg.dist_size = dpaa2_eth_queue_count(priv); dist_cfg.dist_mode = DPNI_DIST_MODE_HASH; - err = dpni_set_rx_tc_dist(priv->mc_io, 0, priv->mc_token, 0, &dist_cfg); - if (err) - dev_err(dev, "dpni_set_rx_tc_dist failed\n"); + for (i = 0; i < dpaa2_eth_tc_count(priv); i++) { + err = dpni_set_rx_tc_dist(priv->mc_io, 0, priv->mc_token, + i, &dist_cfg); + if (err) { + dev_err(dev, "dpni_set_rx_tc_dist failed\n"); + break; + } + } return err; } @@ -2968,7 +2982,7 @@ static int config_hash_key(struct dpaa2_eth_priv *priv, dma_addr_t key) { struct device *dev = priv->net_dev->dev.parent; struct dpni_rx_dist_cfg dist_cfg; - int err; + int i, err = 0; memset(&dist_cfg, 0, sizeof(dist_cfg)); @@ -2976,9 +2990,15 @@ static int config_hash_key(struct dpaa2_eth_priv *priv, dma_addr_t key) dist_cfg.dist_size = dpaa2_eth_queue_count(priv); dist_cfg.enable = 1; - err = dpni_set_rx_hash_dist(priv->mc_io, 0, priv->mc_token, &dist_cfg); - if (err) - dev_err(dev, "dpni_set_rx_hash_dist failed\n"); + for (i = 0; i < dpaa2_eth_tc_count(priv); i++) { + dist_cfg.tc = i; + err = dpni_set_rx_hash_dist(priv->mc_io, 0, priv->mc_token, + &dist_cfg); + if (err) { + dev_err(dev, "dpni_set_rx_hash_dist failed\n"); + break; + } + } return err; } @@ -2988,7 +3008,7 @@ static int config_cls_key(struct dpaa2_eth_priv *priv, dma_addr_t key) { struct device *dev = priv->net_dev->dev.parent; struct dpni_rx_dist_cfg dist_cfg; - int err; + int i, err = 0; memset(&dist_cfg, 0, sizeof(dist_cfg)); @@ -2996,9 +3016,15 @@ static int config_cls_key(struct dpaa2_eth_priv *priv, dma_addr_t key) dist_cfg.dist_size = dpaa2_eth_queue_count(priv); dist_cfg.enable = 1; - err = dpni_set_rx_fs_dist(priv->mc_io, 0, priv->mc_token, &dist_cfg); - if (err) - dev_err(dev, "dpni_set_rx_fs_dist failed\n"); + for (i = 0; i < dpaa2_eth_tc_count(priv); i++) { + dist_cfg.tc = i; + err = dpni_set_rx_fs_dist(priv->mc_io, 0, priv->mc_token, + &dist_cfg); + if (err) { + dev_err(dev, "dpni_set_rx_fs_dist failed\n"); + break; + } + } return err; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 0581fbf1f98c..580ad5fd7bd8 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -294,7 +294,9 @@ struct dpaa2_eth_ch_stats { /* Maximum number of queues associated with a DPNI */ #define DPAA2_ETH_MAX_TCS 8 -#define DPAA2_ETH_MAX_RX_QUEUES 16 +#define DPAA2_ETH_MAX_RX_QUEUES_PER_TC 16 +#define DPAA2_ETH_MAX_RX_QUEUES \ + (DPAA2_ETH_MAX_RX_QUEUES_PER_TC * DPAA2_ETH_MAX_TCS) #define DPAA2_ETH_MAX_TX_QUEUES 16 #define DPAA2_ETH_MAX_QUEUES (DPAA2_ETH_MAX_RX_QUEUES + \ DPAA2_ETH_MAX_TX_QUEUES) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c index 049afd1d6252..8bf169783bea 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c @@ -547,7 +547,7 @@ static int do_cls_rule(struct net_device *net_dev, dma_addr_t key_iova; u64 fields = 0; void *key_buf; - int err; + int i, err; if (fs->ring_cookie != RX_CLS_FLOW_DISC && fs->ring_cookie >= dpaa2_eth_queue_count(priv)) @@ -607,11 +607,18 @@ static int do_cls_rule(struct net_device *net_dev, fs_act.options |= DPNI_FS_OPT_DISCARD; else fs_act.flow_id = fs->ring_cookie; - err = dpni_add_fs_entry(priv->mc_io, 0, priv->mc_token, 0, - fs->location, &rule_cfg, &fs_act); - } else { - err = dpni_remove_fs_entry(priv->mc_io, 0, priv->mc_token, 0, - &rule_cfg); + } + for (i = 0; i < dpaa2_eth_tc_count(priv); i++) { + if (add) + err = dpni_add_fs_entry(priv->mc_io, 0, priv->mc_token, + i, fs->location, &rule_cfg, + &fs_act); + else + err = dpni_remove_fs_entry(priv->mc_io, 0, + priv->mc_token, i, + &rule_cfg); + if (err) + break; } dma_unmap_single(dev, key_iova, rule_cfg.key_size * 2, DMA_TO_DEVICE); -- cgit v1.2.3-59-g8ed1b From 6aa90fe2d96745b63d4ccc74c0c37b90d31b699e Mon Sep 17 00:00:00 2001 From: Ioana Radulescu Date: Sun, 31 May 2020 00:08:09 +0300 Subject: dpaa2-eth: Distribute ingress frames based on VLAN prio Configure static ingress classification based on VLAN PCP field. If the DPNI doesn't have enough traffic classes to accommodate all priority levels, the lowest ones end up on TC 0 (default on miss). Signed-off-by: Ioana Radulescu Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 116 ++++++++++++++++++++ drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 1 + drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h | 34 ++++++ drivers/net/ethernet/freescale/dpaa2/dpni.c | 131 +++++++++++++++++++++++ drivers/net/ethernet/freescale/dpaa2/dpni.h | 36 +++++++ 5 files changed, 318 insertions(+) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 01263e247d39..3bf5df92ecfa 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -2696,6 +2696,118 @@ out_err: priv->enqueue = dpaa2_eth_enqueue_qd; } +/* Configure ingress classification based on VLAN PCP */ +static int set_vlan_qos(struct dpaa2_eth_priv *priv) +{ + struct device *dev = priv->net_dev->dev.parent; + struct dpkg_profile_cfg kg_cfg = {0}; + struct dpni_qos_tbl_cfg qos_cfg = {0}; + struct dpni_rule_cfg key_params; + void *dma_mem, *key, *mask; + u8 key_size = 2; /* VLAN TCI field */ + int i, pcp, err; + + /* VLAN-based classification only makes sense if we have multiple + * traffic classes. + * Also, we need to extract just the 3-bit PCP field from the VLAN + * header and we can only do that by using a mask + */ + if (dpaa2_eth_tc_count(priv) == 1 || !dpaa2_eth_fs_mask_enabled(priv)) { + dev_dbg(dev, "VLAN-based QoS classification not supported\n"); + return -EOPNOTSUPP; + } + + dma_mem = kzalloc(DPAA2_CLASSIFIER_DMA_SIZE, GFP_KERNEL); + if (!dma_mem) + return -ENOMEM; + + kg_cfg.num_extracts = 1; + kg_cfg.extracts[0].type = DPKG_EXTRACT_FROM_HDR; + kg_cfg.extracts[0].extract.from_hdr.prot = NET_PROT_VLAN; + kg_cfg.extracts[0].extract.from_hdr.type = DPKG_FULL_FIELD; + kg_cfg.extracts[0].extract.from_hdr.field = NH_FLD_VLAN_TCI; + + err = dpni_prepare_key_cfg(&kg_cfg, dma_mem); + if (err) { + dev_err(dev, "dpni_prepare_key_cfg failed\n"); + goto out_free_tbl; + } + + /* set QoS table */ + qos_cfg.default_tc = 0; + qos_cfg.discard_on_miss = 0; + qos_cfg.key_cfg_iova = dma_map_single(dev, dma_mem, + DPAA2_CLASSIFIER_DMA_SIZE, + DMA_TO_DEVICE); + if (dma_mapping_error(dev, qos_cfg.key_cfg_iova)) { + dev_err(dev, "QoS table DMA mapping failed\n"); + err = -ENOMEM; + goto out_free_tbl; + } + + err = dpni_set_qos_table(priv->mc_io, 0, priv->mc_token, &qos_cfg); + if (err) { + dev_err(dev, "dpni_set_qos_table failed\n"); + goto out_unmap_tbl; + } + + /* Add QoS table entries */ + key = kzalloc(key_size * 2, GFP_KERNEL); + if (!key) { + err = -ENOMEM; + goto out_unmap_tbl; + } + mask = key + key_size; + *(__be16 *)mask = cpu_to_be16(VLAN_PRIO_MASK); + + key_params.key_iova = dma_map_single(dev, key, key_size * 2, + DMA_TO_DEVICE); + if (dma_mapping_error(dev, key_params.key_iova)) { + dev_err(dev, "Qos table entry DMA mapping failed\n"); + err = -ENOMEM; + goto out_free_key; + } + + key_params.mask_iova = key_params.key_iova + key_size; + key_params.key_size = key_size; + + /* We add rules for PCP-based distribution starting with highest + * priority (VLAN PCP = 7). If this DPNI doesn't have enough traffic + * classes to accommodate all priority levels, the lowest ones end up + * on TC 0 which was configured as default + */ + for (i = dpaa2_eth_tc_count(priv) - 1, pcp = 7; i >= 0; i--, pcp--) { + *(__be16 *)key = cpu_to_be16(pcp << VLAN_PRIO_SHIFT); + dma_sync_single_for_device(dev, key_params.key_iova, + key_size * 2, DMA_TO_DEVICE); + + err = dpni_add_qos_entry(priv->mc_io, 0, priv->mc_token, + &key_params, i, i); + if (err) { + dev_err(dev, "dpni_add_qos_entry failed\n"); + dpni_clear_qos_table(priv->mc_io, 0, priv->mc_token); + goto out_unmap_key; + } + } + + priv->vlan_cls_enabled = true; + + /* Table and key memory is not persistent, clean everything up after + * configuration is finished + */ +out_unmap_key: + dma_unmap_single(dev, key_params.key_iova, key_size * 2, DMA_TO_DEVICE); +out_free_key: + kfree(key); +out_unmap_tbl: + dma_unmap_single(dev, qos_cfg.key_cfg_iova, DPAA2_CLASSIFIER_DMA_SIZE, + DMA_TO_DEVICE); +out_free_tbl: + kfree(dma_mem); + + return err; +} + /* Configure the DPNI object this interface is associated with */ static int setup_dpni(struct fsl_mc_device *ls_dev) { @@ -2758,6 +2870,10 @@ static int setup_dpni(struct fsl_mc_device *ls_dev) goto close; } + err = set_vlan_qos(priv); + if (err && err != -EOPNOTSUPP) + goto close; + priv->cls_rules = devm_kzalloc(dev, sizeof(struct dpaa2_eth_cls_rule) * dpaa2_eth_fs_count(priv), GFP_KERNEL); if (!priv->cls_rules) { diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 580ad5fd7bd8..7856f69bcf36 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -427,6 +427,7 @@ struct dpaa2_eth_priv { u64 rx_cls_fields; struct dpaa2_eth_cls_rule *cls_rules; u8 rx_cls_enabled; + u8 vlan_cls_enabled; struct bpf_prog *xdp_prog; #ifdef CONFIG_DEBUG_FS struct dpaa2_debugfs dbg; diff --git a/drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h b/drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h index d9b6918807af..0048e856f85e 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h @@ -59,6 +59,10 @@ #define DPNI_CMDID_SET_RX_TC_DIST DPNI_CMD(0x235) +#define DPNI_CMDID_SET_QOS_TBL DPNI_CMD(0x240) +#define DPNI_CMDID_ADD_QOS_ENT DPNI_CMD(0x241) +#define DPNI_CMDID_REMOVE_QOS_ENT DPNI_CMD(0x242) +#define DPNI_CMDID_CLR_QOS_TBL DPNI_CMD(0x243) #define DPNI_CMDID_ADD_FS_ENT DPNI_CMD(0x244) #define DPNI_CMDID_REMOVE_FS_ENT DPNI_CMD(0x245) #define DPNI_CMDID_CLR_FS_ENT DPNI_CMD(0x246) @@ -567,4 +571,34 @@ struct dpni_cmd_remove_fs_entry { __le64 mask_iova; }; +#define DPNI_DISCARD_ON_MISS_SHIFT 0 +#define DPNI_DISCARD_ON_MISS_SIZE 1 + +struct dpni_cmd_set_qos_table { + __le32 pad; + u8 default_tc; + /* only the LSB */ + u8 discard_on_miss; + __le16 pad1[21]; + __le64 key_cfg_iova; +}; + +struct dpni_cmd_add_qos_entry { + __le16 pad; + u8 tc_id; + u8 key_size; + __le16 index; + __le16 pad1; + __le64 key_iova; + __le64 mask_iova; +}; + +struct dpni_cmd_remove_qos_entry { + u8 pad[3]; + u8 key_size; + __le32 pad1; + __le64 key_iova; + __le64 mask_iova; +}; + #endif /* _FSL_DPNI_CMD_H */ diff --git a/drivers/net/ethernet/freescale/dpaa2/dpni.c b/drivers/net/ethernet/freescale/dpaa2/dpni.c index dd54e6953aeb..78fa325407ca 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpni.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpni.c @@ -1786,3 +1786,134 @@ int dpni_remove_fs_entry(struct fsl_mc_io *mc_io, /* send command to mc*/ return mc_send_command(mc_io, &cmd); } + +/** + * dpni_set_qos_table() - Set QoS mapping table + * @mc_io: Pointer to MC portal's I/O object + * @cmd_flags: Command flags; one or more of 'MC_CMD_FLAG_' + * @token: Token of DPNI object + * @cfg: QoS table configuration + * + * This function and all QoS-related functions require that + *'max_tcs > 1' was set at DPNI creation. + * + * warning: Before calling this function, call dpkg_prepare_key_cfg() to + * prepare the key_cfg_iova parameter + * + * Return: '0' on Success; Error code otherwise. + */ +int dpni_set_qos_table(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token, + const struct dpni_qos_tbl_cfg *cfg) +{ + struct dpni_cmd_set_qos_table *cmd_params; + struct fsl_mc_command cmd = { 0 }; + + /* prepare command */ + cmd.header = mc_encode_cmd_header(DPNI_CMDID_SET_QOS_TBL, + cmd_flags, + token); + cmd_params = (struct dpni_cmd_set_qos_table *)cmd.params; + cmd_params->default_tc = cfg->default_tc; + cmd_params->key_cfg_iova = cpu_to_le64(cfg->key_cfg_iova); + dpni_set_field(cmd_params->discard_on_miss, DISCARD_ON_MISS, + cfg->discard_on_miss); + + /* send command to mc*/ + return mc_send_command(mc_io, &cmd); +} + +/** + * dpni_add_qos_entry() - Add QoS mapping entry (to select a traffic class) + * @mc_io: Pointer to MC portal's I/O object + * @cmd_flags: Command flags; one or more of 'MC_CMD_FLAG_' + * @token: Token of DPNI object + * @cfg: QoS rule to add + * @tc_id: Traffic class selection (0-7) + * @index: Location in the QoS table where to insert the entry. + * Only relevant if MASKING is enabled for QoS classification on + * this DPNI, it is ignored for exact match. + * + * Return: '0' on Success; Error code otherwise. + */ +int dpni_add_qos_entry(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token, + const struct dpni_rule_cfg *cfg, + u8 tc_id, + u16 index) +{ + struct dpni_cmd_add_qos_entry *cmd_params; + struct fsl_mc_command cmd = { 0 }; + + /* prepare command */ + cmd.header = mc_encode_cmd_header(DPNI_CMDID_ADD_QOS_ENT, + cmd_flags, + token); + cmd_params = (struct dpni_cmd_add_qos_entry *)cmd.params; + cmd_params->tc_id = tc_id; + cmd_params->key_size = cfg->key_size; + cmd_params->index = cpu_to_le16(index); + cmd_params->key_iova = cpu_to_le64(cfg->key_iova); + cmd_params->mask_iova = cpu_to_le64(cfg->mask_iova); + + /* send command to mc*/ + return mc_send_command(mc_io, &cmd); +} + +/** + * dpni_remove_qos_entry() - Remove QoS mapping entry + * @mc_io: Pointer to MC portal's I/O object + * @cmd_flags: Command flags; one or more of 'MC_CMD_FLAG_' + * @token: Token of DPNI object + * @cfg: QoS rule to remove + * + * Return: '0' on Success; Error code otherwise. + */ +int dpni_remove_qos_entry(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token, + const struct dpni_rule_cfg *cfg) +{ + struct dpni_cmd_remove_qos_entry *cmd_params; + struct fsl_mc_command cmd = { 0 }; + + /* prepare command */ + cmd.header = mc_encode_cmd_header(DPNI_CMDID_REMOVE_QOS_ENT, + cmd_flags, + token); + cmd_params = (struct dpni_cmd_remove_qos_entry *)cmd.params; + cmd_params->key_size = cfg->key_size; + cmd_params->key_iova = cpu_to_le64(cfg->key_iova); + cmd_params->mask_iova = cpu_to_le64(cfg->mask_iova); + + /* send command to mc*/ + return mc_send_command(mc_io, &cmd); +} + +/** + * dpni_clear_qos_table() - Clear all QoS mapping entries + * @mc_io: Pointer to MC portal's I/O object + * @cmd_flags: Command flags; one or more of 'MC_CMD_FLAG_' + * @token: Token of DPNI object + * + * Following this function call, all frames are directed to + * the default traffic class (0) + * + * Return: '0' on Success; Error code otherwise. + */ +int dpni_clear_qos_table(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token) +{ + struct fsl_mc_command cmd = { 0 }; + + /* prepare command */ + cmd.header = mc_encode_cmd_header(DPNI_CMDID_CLR_QOS_TBL, + cmd_flags, + token); + + /* send command to mc*/ + return mc_send_command(mc_io, &cmd); +} diff --git a/drivers/net/ethernet/freescale/dpaa2/dpni.h b/drivers/net/ethernet/freescale/dpaa2/dpni.h index ee0711d06b3a..8c7ac20bf1a7 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpni.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpni.h @@ -715,6 +715,26 @@ int dpni_set_rx_hash_dist(struct fsl_mc_io *mc_io, u16 token, const struct dpni_rx_dist_cfg *cfg); +/** + * struct dpni_qos_tbl_cfg - Structure representing QOS table configuration + * @key_cfg_iova: I/O virtual address of 256 bytes DMA-able memory filled with + * key extractions to be used as the QoS criteria by calling + * dpkg_prepare_key_cfg() + * @discard_on_miss: Set to '1' to discard frames in case of no match (miss); + * '0' to use the 'default_tc' in such cases + * @default_tc: Used in case of no-match and 'discard_on_miss'= 0 + */ +struct dpni_qos_tbl_cfg { + u64 key_cfg_iova; + int discard_on_miss; + u8 default_tc; +}; + +int dpni_set_qos_table(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token, + const struct dpni_qos_tbl_cfg *cfg); + /** * enum dpni_dest - DPNI destination types * @DPNI_DEST_NONE: Unassigned destination; The queue is set in parked mode and @@ -961,6 +981,22 @@ int dpni_remove_fs_entry(struct fsl_mc_io *mc_io, u8 tc_id, const struct dpni_rule_cfg *cfg); +int dpni_add_qos_entry(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token, + const struct dpni_rule_cfg *cfg, + u8 tc_id, + u16 index); + +int dpni_remove_qos_entry(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token, + const struct dpni_rule_cfg *cfg); + +int dpni_clear_qos_table(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token); + int dpni_get_api_version(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 *major_ver, -- cgit v1.2.3-59-g8ed1b From ad054f265401d8279837a916e9b5a5aee2a1749d Mon Sep 17 00:00:00 2001 From: Ioana Radulescu Date: Sun, 31 May 2020 00:08:10 +0300 Subject: dpaa2-eth: Add helper functions Add convenient helper functions that determines whether Rx/Tx pause frames are enabled based on link state flags received from firmware. Signed-off-by: Ioana Radulescu Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 3 +-- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 11 +++++++++++ drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c | 5 ++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 3bf5df92ecfa..c16c8ea3a174 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1333,8 +1333,7 @@ static int link_state_update(struct dpaa2_eth_priv *priv) * Rx FQ taildrop configuration as well. We configure taildrop * only when pause frame generation is disabled. */ - tx_pause = !!(state.options & DPNI_LINK_OPT_PAUSE) ^ - !!(state.options & DPNI_LINK_OPT_ASYM_PAUSE); + tx_pause = dpaa2_eth_tx_pause_enabled(state.options); dpaa2_eth_set_rx_taildrop(priv, !tx_pause); /* When we manage the MAC/PHY using phylink there is no need diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 7856f69bcf36..6384f6a23349 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -510,6 +510,17 @@ enum dpaa2_eth_rx_dist { (dpaa2_eth_cmp_dpni_ver((priv), DPNI_PAUSE_VER_MAJOR, \ DPNI_PAUSE_VER_MINOR) >= 0) +static inline bool dpaa2_eth_tx_pause_enabled(u64 link_options) +{ + return !!(link_options & DPNI_LINK_OPT_PAUSE) ^ + !!(link_options & DPNI_LINK_OPT_ASYM_PAUSE); +} + +static inline bool dpaa2_eth_rx_pause_enabled(u64 link_options) +{ + return !!(link_options & DPNI_LINK_OPT_PAUSE); +} + static inline unsigned int dpaa2_eth_needed_headroom(struct dpaa2_eth_priv *priv, struct sk_buff *skb) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c index 8bf169783bea..e88269fe3de7 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c @@ -130,9 +130,8 @@ static void dpaa2_eth_get_pauseparam(struct net_device *net_dev, return; } - pause->rx_pause = !!(link_options & DPNI_LINK_OPT_PAUSE); - pause->tx_pause = pause->rx_pause ^ - !!(link_options & DPNI_LINK_OPT_ASYM_PAUSE); + pause->rx_pause = dpaa2_eth_rx_pause_enabled(link_options); + pause->tx_pause = dpaa2_eth_tx_pause_enabled(link_options); pause->autoneg = AUTONEG_DISABLE; } -- cgit v1.2.3-59-g8ed1b From 2c8d1c8d7d62dfedab97927c22e9421f0d72de8e Mon Sep 17 00:00:00 2001 From: Ioana Radulescu Date: Sun, 31 May 2020 00:08:11 +0300 Subject: dpaa2-eth: Add congestion group taildrop The increase in number of ingress frame queues means we now risk depleting the buffer pool before the FQ taildrop kicks in. Congestion group taildrop allows us to control the number of frames that can accumulate on a group of Rx frame queues belonging to the same traffic class. This setting coexists with the frame queue based taildrop: whichever limit gets hit first triggers the frame drop. Signed-off-by: Ioana Radulescu Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 35 ++++++++++++++++++------ drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 13 +++++++-- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index c16c8ea3a174..04eff6308c72 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1287,17 +1287,20 @@ static void disable_ch_napi(struct dpaa2_eth_priv *priv) } } -static void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, bool enable) +static void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, + bool tx_pause) { struct dpni_taildrop td = {0}; struct dpaa2_eth_fq *fq; int i, err; - if (priv->rx_td_enabled == enable) + td.enable = !tx_pause; + if (priv->rx_td_enabled == td.enable) return; - td.enable = enable; - td.threshold = DPAA2_ETH_TAILDROP_THRESH; + /* FQ taildrop: threshold is in bytes, per frame queue */ + td.threshold = DPAA2_ETH_FQ_TAILDROP_THRESH; + td.units = DPNI_CONGESTION_UNIT_BYTES; for (i = 0; i < priv->num_fqs; i++) { fq = &priv->fq[i]; @@ -1308,12 +1311,28 @@ static void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, bool enable) fq->tc, fq->flowid, &td); if (err) { netdev_err(priv->net_dev, - "dpni_set_taildrop() failed\n"); - break; + "dpni_set_taildrop(FQ) failed\n"); + return; + } + } + + /* Congestion group taildrop: threshold is in frames, per group + * of FQs belonging to the same traffic class + */ + td.threshold = DPAA2_ETH_CG_TAILDROP_THRESH(priv); + td.units = DPNI_CONGESTION_UNIT_FRAMES; + for (i = 0; i < dpaa2_eth_tc_count(priv); i++) { + err = dpni_set_taildrop(priv->mc_io, 0, priv->mc_token, + DPNI_CP_GROUP, DPNI_QUEUE_RX, + i, 0, &td); + if (err) { + netdev_err(priv->net_dev, + "dpni_set_taildrop(CG) failed\n"); + return; } } - priv->rx_td_enabled = enable; + priv->rx_td_enabled = td.enable; } static int link_state_update(struct dpaa2_eth_priv *priv) @@ -1334,7 +1353,7 @@ static int link_state_update(struct dpaa2_eth_priv *priv) * only when pause frame generation is disabled. */ tx_pause = dpaa2_eth_tx_pause_enabled(state.options); - dpaa2_eth_set_rx_taildrop(priv, !tx_pause); + dpaa2_eth_set_rx_taildrop(priv, tx_pause); /* When we manage the MAC/PHY using phylink there is no need * to manually update the netif_carrier. diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 6384f6a23349..184d5d83e497 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -40,7 +40,7 @@ * frames in the Rx queues (length of the current frame is not * taken into account when making the taildrop decision) */ -#define DPAA2_ETH_TAILDROP_THRESH (64 * 1024) +#define DPAA2_ETH_FQ_TAILDROP_THRESH (64 * 1024) /* Maximum number of Tx confirmation frames to be processed * in a single NAPI call @@ -52,11 +52,20 @@ * how many 64B frames fit inside the taildrop threshold and add a margin * to accommodate the buffer refill delay. */ -#define DPAA2_ETH_MAX_FRAMES_PER_QUEUE (DPAA2_ETH_TAILDROP_THRESH / 64) +#define DPAA2_ETH_MAX_FRAMES_PER_QUEUE (DPAA2_ETH_FQ_TAILDROP_THRESH / 64) #define DPAA2_ETH_NUM_BUFS (DPAA2_ETH_MAX_FRAMES_PER_QUEUE + 256) #define DPAA2_ETH_REFILL_THRESH \ (DPAA2_ETH_NUM_BUFS - DPAA2_ETH_BUFS_PER_CMD) +/* Congestion group taildrop threshold: number of frames allowed to accumulate + * at any moment in a group of Rx queues belonging to the same traffic class. + * Choose value such that we don't risk depleting the buffer pool before the + * taildrop kicks in + */ +#define DPAA2_ETH_CG_TAILDROP_THRESH(priv) \ + (DPAA2_ETH_MAX_FRAMES_PER_QUEUE * dpaa2_eth_queue_count(priv) / \ + dpaa2_eth_tc_count(priv)) + /* Maximum number of buffers that can be acquired/released through a single * QBMan command */ -- cgit v1.2.3-59-g8ed1b From 3f8b826d705fc6f0f0602fcbe6ee3b646ed3316e Mon Sep 17 00:00:00 2001 From: Ioana Radulescu Date: Sun, 31 May 2020 00:08:12 +0300 Subject: dpaa2-eth: Update FQ taildrop threshold and buffer pool count Now that we have congestion group taildrop configured at all times, we can afford to increase the frame queue taildrop threshold; this will ensure a better response when receiving bursts of large-sized frames. Also decouple the buffer pool count from the Rx FQ taildrop threshold, as above change would increase it too much. Instead, keep the old count as a hardcoded value. With the new limits, we try to ensure that: * we allow enough leeway for large frame bursts (by buffering enough of them in queues to avoid heavy dropping in case of bursty traffic, but when overall ingress bandwidth is manageable) * allow pending frames to be evenly spread between ingress FQs, regardless of frame size * avoid dropping frames due to the buffer pool being empty; this is not a bad behaviour per se, but system overall response is more linear and predictable when frames are dropped at frame queue/group level. Signed-off-by: Ioana Radulescu Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 184d5d83e497..02c0eea69a23 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -36,24 +36,24 @@ /* Convert L3 MTU to L2 MFL */ #define DPAA2_ETH_L2_MAX_FRM(mtu) ((mtu) + VLAN_ETH_HLEN) -/* Set the taildrop threshold (in bytes) to allow the enqueue of several jumbo - * frames in the Rx queues (length of the current frame is not - * taken into account when making the taildrop decision) +/* Set the taildrop threshold (in bytes) to allow the enqueue of a large + * enough number of jumbo frames in the Rx queues (length of the current + * frame is not taken into account when making the taildrop decision) */ -#define DPAA2_ETH_FQ_TAILDROP_THRESH (64 * 1024) +#define DPAA2_ETH_FQ_TAILDROP_THRESH (1024 * 1024) /* Maximum number of Tx confirmation frames to be processed * in a single NAPI call */ #define DPAA2_ETH_TXCONF_PER_NAPI 256 -/* Buffer quota per queue. Must be large enough such that for minimum sized - * frames taildrop kicks in before the bpool gets depleted, so we compute - * how many 64B frames fit inside the taildrop threshold and add a margin - * to accommodate the buffer refill delay. +/* Buffer qouta per channel. We want to keep in check number of ingress frames + * in flight: for small sized frames, congestion group taildrop may kick in + * first; for large sizes, Rx FQ taildrop threshold will ensure only a + * reasonable number of frames will be pending at any given time. + * Ingress frame drop due to buffer pool depletion should be a corner case only */ -#define DPAA2_ETH_MAX_FRAMES_PER_QUEUE (DPAA2_ETH_FQ_TAILDROP_THRESH / 64) -#define DPAA2_ETH_NUM_BUFS (DPAA2_ETH_MAX_FRAMES_PER_QUEUE + 256) +#define DPAA2_ETH_NUM_BUFS 1280 #define DPAA2_ETH_REFILL_THRESH \ (DPAA2_ETH_NUM_BUFS - DPAA2_ETH_BUFS_PER_CMD) @@ -63,8 +63,7 @@ * taildrop kicks in */ #define DPAA2_ETH_CG_TAILDROP_THRESH(priv) \ - (DPAA2_ETH_MAX_FRAMES_PER_QUEUE * dpaa2_eth_queue_count(priv) / \ - dpaa2_eth_tc_count(priv)) + (1024 * dpaa2_eth_queue_count(priv) / dpaa2_eth_tc_count(priv)) /* Maximum number of buffers that can be acquired/released through a single * QBMan command -- cgit v1.2.3-59-g8ed1b From f395b69f40f580491ef56f2395a98e3189baa53c Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Sun, 31 May 2020 00:08:13 +0300 Subject: dpaa2-eth: Add PFC support through DCB ops Add support in dpaa2-eth for PFC (Priority Flow Control) through the DCB ops. Instruct the hardware to respond to received PFC frames. Current firmware doesn't allow us to selectively enable PFC on the Rx side for some priorities only, so we will react to all incoming PFC frames (and stop transmitting on the traffic classes specified in the frame). Also, configure the hardware to generate PFC frames based on Rx congestion notifications. When a certain number of frames accumulate in the ingress queues corresponding to a traffic class, priority flow control frames are generated for that TC. The number of PFC traffic classes available can be queried through lldptool. Also, which of those traffic classes have PFC enabled is also controlled through the same dcbnl_rtnl_ops callbacks. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/Kconfig | 10 ++ drivers/net/ethernet/freescale/dpaa2/Makefile | 1 + .../net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c | 146 +++++++++++++++++++++ drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 9 ++ drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 18 +++ drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h | 25 ++++ drivers/net/ethernet/freescale/dpaa2/dpni.c | 46 +++++++ drivers/net/ethernet/freescale/dpaa2/dpni.h | 61 +++++++++ 8 files changed, 316 insertions(+) create mode 100644 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c diff --git a/drivers/net/ethernet/freescale/dpaa2/Kconfig b/drivers/net/ethernet/freescale/dpaa2/Kconfig index c6fb8e4021ac..feea797cde02 100644 --- a/drivers/net/ethernet/freescale/dpaa2/Kconfig +++ b/drivers/net/ethernet/freescale/dpaa2/Kconfig @@ -9,6 +9,16 @@ config FSL_DPAA2_ETH The driver manages network objects discovered on the Freescale MC bus. +if FSL_DPAA2_ETH +config FSL_DPAA2_ETH_DCB + bool "Data Center Bridging (DCB) Support" + default n + depends on DCB + help + Enable Priority-Based Flow Control (PFC) support for DPAA2 Ethernet + devices. +endif + config FSL_DPAA2_PTP_CLOCK tristate "Freescale DPAA2 PTP Clock" depends on FSL_DPAA2_ETH && PTP_1588_CLOCK_QORIQ diff --git a/drivers/net/ethernet/freescale/dpaa2/Makefile b/drivers/net/ethernet/freescale/dpaa2/Makefile index 69184ca3b7b9..6e7f33c956bf 100644 --- a/drivers/net/ethernet/freescale/dpaa2/Makefile +++ b/drivers/net/ethernet/freescale/dpaa2/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_FSL_DPAA2_ETH) += fsl-dpaa2-eth.o obj-$(CONFIG_FSL_DPAA2_PTP_CLOCK) += fsl-dpaa2-ptp.o fsl-dpaa2-eth-objs := dpaa2-eth.o dpaa2-ethtool.o dpni.o dpaa2-mac.o dpmac.o +fsl-dpaa2-eth-${CONFIG_FSL_DPAA2_ETH_DCB} += dpaa2-eth-dcb.o fsl-dpaa2-eth-${CONFIG_DEBUG_FS} += dpaa2-eth-debugfs.o fsl-dpaa2-ptp-objs := dpaa2-ptp.o dprtc.o diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c new file mode 100644 index 000000000000..7ee07872af4d --- /dev/null +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) +/* Copyright 2020 NXP */ + +#include "dpaa2-eth.h" + +static int dpaa2_eth_dcbnl_ieee_getpfc(struct net_device *net_dev, + struct ieee_pfc *pfc) +{ + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + + if (!(priv->link_state.options & DPNI_LINK_OPT_PFC_PAUSE)) + return 0; + + memcpy(pfc, &priv->pfc, sizeof(priv->pfc)); + pfc->pfc_cap = dpaa2_eth_tc_count(priv); + + return 0; +} + +static inline bool is_prio_enabled(u8 pfc_en, u8 tc) +{ + return !!(pfc_en & (1 << tc)); +} + +static int set_pfc_cn(struct dpaa2_eth_priv *priv, u8 pfc_en) +{ + struct dpni_congestion_notification_cfg cfg = {0}; + int i, err; + + cfg.notification_mode = DPNI_CONG_OPT_FLOW_CONTROL; + cfg.units = DPNI_CONGESTION_UNIT_FRAMES; + cfg.message_iova = 0ULL; + cfg.message_ctx = 0ULL; + + for (i = 0; i < dpaa2_eth_tc_count(priv); i++) { + if (is_prio_enabled(pfc_en, i)) { + cfg.threshold_entry = DPAA2_ETH_CN_THRESH_ENTRY(priv); + cfg.threshold_exit = DPAA2_ETH_CN_THRESH_EXIT(priv); + } else { + /* For priorities not set in the pfc_en mask, we leave + * the congestion thresholds at zero, which effectively + * disables generation of PFC frames for them + */ + cfg.threshold_entry = 0; + cfg.threshold_exit = 0; + } + + err = dpni_set_congestion_notification(priv->mc_io, 0, + priv->mc_token, + DPNI_QUEUE_RX, i, &cfg); + if (err) { + netdev_err(priv->net_dev, + "dpni_set_congestion_notification failed\n"); + return err; + } + } + + return 0; +} + +static int dpaa2_eth_dcbnl_ieee_setpfc(struct net_device *net_dev, + struct ieee_pfc *pfc) +{ + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + struct dpni_link_cfg link_cfg = {0}; + int err; + + if (pfc->mbc || pfc->delay) + return -EOPNOTSUPP; + + /* If same PFC enabled mask, nothing to do */ + if (priv->pfc.pfc_en == pfc->pfc_en) + return 0; + + /* We allow PFC configuration even if it won't have any effect until + * general pause frames are enabled + */ + if (!dpaa2_eth_rx_pause_enabled(priv->link_state.options) || + !dpaa2_eth_tx_pause_enabled(priv->link_state.options)) + netdev_warn(net_dev, "Pause support must be enabled in order for PFC to work!\n"); + + link_cfg.rate = priv->link_state.rate; + link_cfg.options = priv->link_state.options; + if (pfc->pfc_en) + link_cfg.options |= DPNI_LINK_OPT_PFC_PAUSE; + else + link_cfg.options &= ~DPNI_LINK_OPT_PFC_PAUSE; + err = dpni_set_link_cfg(priv->mc_io, 0, priv->mc_token, &link_cfg); + if (err) { + netdev_err(net_dev, "dpni_set_link_cfg failed\n"); + return err; + } + + /* Configure congestion notifications for the enabled priorities */ + err = set_pfc_cn(priv, pfc->pfc_en); + if (err) + return err; + + memcpy(&priv->pfc, pfc, sizeof(priv->pfc)); + + return 0; +} + +static u8 dpaa2_eth_dcbnl_getdcbx(struct net_device *net_dev) +{ + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + + return priv->dcbx_mode; +} + +static u8 dpaa2_eth_dcbnl_setdcbx(struct net_device *net_dev, u8 mode) +{ + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + + return (mode != (priv->dcbx_mode)) ? 1 : 0; +} + +static u8 dpaa2_eth_dcbnl_getcap(struct net_device *net_dev, int capid, u8 *cap) +{ + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); + + switch (capid) { + case DCB_CAP_ATTR_PFC: + *cap = true; + break; + case DCB_CAP_ATTR_PFC_TCS: + *cap = 1 << (dpaa2_eth_tc_count(priv) - 1); + break; + case DCB_CAP_ATTR_DCBX: + *cap = priv->dcbx_mode; + break; + default: + *cap = false; + break; + } + + return 0; +} + +const struct dcbnl_rtnl_ops dpaa2_eth_dcbnl_ops = { + .ieee_getpfc = dpaa2_eth_dcbnl_ieee_getpfc, + .ieee_setpfc = dpaa2_eth_dcbnl_ieee_setpfc, + .getdcbx = dpaa2_eth_dcbnl_getdcbx, + .setdcbx = dpaa2_eth_dcbnl_setdcbx, + .getcap = dpaa2_eth_dcbnl_getcap, +}; diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 04eff6308c72..cde9d0e2dd6d 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -3844,6 +3844,15 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev) if (err) goto err_alloc_rings; +#ifdef CONFIG_FSL_DPAA2_ETH_DCB + if (dpaa2_eth_has_pause_support(priv) && priv->vlan_cls_enabled) { + priv->dcbx_mode = DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE; + net_dev->dcbnl_ops = &dpaa2_eth_dcbnl_ops; + } else { + dev_dbg(dev, "PFC not supported\n"); + } +#endif + err = setup_irqs(dpni_dev); if (err) { netdev_warn(net_dev, "Failed to set link interrupt, fall back to polling\n"); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 02c0eea69a23..31b7b9b52da0 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -6,6 +6,7 @@ #ifndef __DPAA2_ETH_H #define __DPAA2_ETH_H +#include #include #include #include @@ -65,6 +66,17 @@ #define DPAA2_ETH_CG_TAILDROP_THRESH(priv) \ (1024 * dpaa2_eth_queue_count(priv) / dpaa2_eth_tc_count(priv)) +/* Congestion group notification threshold: when this many frames accumulate + * on the Rx queues belonging to the same TC, the MAC is instructed to send + * PFC frames for that TC. + * When number of pending frames drops below exit threshold transmission of + * PFC frames is stopped. + */ +#define DPAA2_ETH_CN_THRESH_ENTRY(priv) \ + (DPAA2_ETH_CG_TAILDROP_THRESH(priv) / 2) +#define DPAA2_ETH_CN_THRESH_EXIT(priv) \ + (DPAA2_ETH_CN_THRESH_ENTRY(priv) * 3 / 4) + /* Maximum number of buffers that can be acquired/released through a single * QBMan command */ @@ -436,6 +448,10 @@ struct dpaa2_eth_priv { struct dpaa2_eth_cls_rule *cls_rules; u8 rx_cls_enabled; u8 vlan_cls_enabled; +#ifdef CONFIG_FSL_DPAA2_ETH_DCB + u8 dcbx_mode; + struct ieee_pfc pfc; +#endif struct bpf_prog *xdp_prog; #ifdef CONFIG_DEBUG_FS struct dpaa2_debugfs dbg; @@ -568,4 +584,6 @@ int dpaa2_eth_cls_key_size(u64 key); int dpaa2_eth_cls_fld_off(int prot, int field); void dpaa2_eth_cls_trim_rule(void *key_mem, u64 fields); +extern const struct dcbnl_rtnl_ops dpaa2_eth_dcbnl_ops; + #endif /* __DPAA2_H */ diff --git a/drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h b/drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h index 0048e856f85e..fd069f67be9b 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpni-cmd.h @@ -601,4 +601,29 @@ struct dpni_cmd_remove_qos_entry { __le64 mask_iova; }; +#define DPNI_DEST_TYPE_SHIFT 0 +#define DPNI_DEST_TYPE_SIZE 4 +#define DPNI_CONG_UNITS_SHIFT 4 +#define DPNI_CONG_UNITS_SIZE 2 + +struct dpni_cmd_set_congestion_notification { + /* cmd word 0 */ + u8 qtype; + u8 tc; + u8 pad[6]; + /* cmd word 1 */ + __le32 dest_id; + __le16 notification_mode; + u8 dest_priority; + /* from LSB: dest_type: 4 units:2 */ + u8 type_units; + /* cmd word 2 */ + __le64 message_iova; + /* cmd word 3 */ + __le64 message_ctx; + /* cmd word 4 */ + __le32 threshold_entry; + __le32 threshold_exit; +}; + #endif /* _FSL_DPNI_CMD_H */ diff --git a/drivers/net/ethernet/freescale/dpaa2/dpni.c b/drivers/net/ethernet/freescale/dpaa2/dpni.c index 78fa325407ca..6b479ba66465 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpni.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpni.c @@ -1354,6 +1354,52 @@ int dpni_set_rx_tc_dist(struct fsl_mc_io *mc_io, return mc_send_command(mc_io, &cmd); } +/** + * dpni_set_congestion_notification() - Set traffic class congestion + * notification configuration + * @mc_io: Pointer to MC portal's I/O object + * @cmd_flags: Command flags; one or more of 'MC_CMD_FLAG_' + * @token: Token of DPNI object + * @qtype: Type of queue - Rx, Tx and Tx confirm types are supported + * @tc_id: Traffic class selection (0-7) + * @cfg: Congestion notification configuration + * + * Return: '0' on Success; error code otherwise. + */ +int dpni_set_congestion_notification( + struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token, + enum dpni_queue_type qtype, + u8 tc_id, + const struct dpni_congestion_notification_cfg *cfg) +{ + struct dpni_cmd_set_congestion_notification *cmd_params; + struct fsl_mc_command cmd = { 0 }; + + /* prepare command */ + cmd.header = + mc_encode_cmd_header(DPNI_CMDID_SET_CONGESTION_NOTIFICATION, + cmd_flags, + token); + cmd_params = (struct dpni_cmd_set_congestion_notification *)cmd.params; + cmd_params->qtype = qtype; + cmd_params->tc = tc_id; + cmd_params->dest_id = cpu_to_le32(cfg->dest_cfg.dest_id); + cmd_params->notification_mode = cpu_to_le16(cfg->notification_mode); + cmd_params->dest_priority = cfg->dest_cfg.priority; + dpni_set_field(cmd_params->type_units, DEST_TYPE, + cfg->dest_cfg.dest_type); + dpni_set_field(cmd_params->type_units, CONG_UNITS, cfg->units); + cmd_params->message_iova = cpu_to_le64(cfg->message_iova); + cmd_params->message_ctx = cpu_to_le64(cfg->message_ctx); + cmd_params->threshold_entry = cpu_to_le32(cfg->threshold_entry); + cmd_params->threshold_exit = cpu_to_le32(cfg->threshold_exit); + + /* send command to mc*/ + return mc_send_command(mc_io, &cmd); +} + /** * dpni_set_queue() - Set queue parameters * @mc_io: Pointer to MC portal's I/O object diff --git a/drivers/net/ethernet/freescale/dpaa2/dpni.h b/drivers/net/ethernet/freescale/dpaa2/dpni.h index 8c7ac20bf1a7..e874d8084142 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpni.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpni.h @@ -513,6 +513,11 @@ int dpni_get_statistics(struct fsl_mc_io *mc_io, */ #define DPNI_LINK_OPT_ASYM_PAUSE 0x0000000000000008ULL +/** + * Enable priority flow control pause frames + */ +#define DPNI_LINK_OPT_PFC_PAUSE 0x0000000000000010ULL + /** * struct - Structure representing DPNI link configuration * @rate: Rate @@ -877,6 +882,62 @@ enum dpni_congestion_point { DPNI_CP_GROUP, }; +/** + * struct dpni_dest_cfg - Structure representing DPNI destination parameters + * @dest_type: Destination type + * @dest_id: Either DPIO ID or DPCON ID, depending on the destination type + * @priority: Priority selection within the DPIO or DPCON channel; valid + * values are 0-1 or 0-7, depending on the number of priorities + * in that channel; not relevant for 'DPNI_DEST_NONE' option + */ +struct dpni_dest_cfg { + enum dpni_dest dest_type; + int dest_id; + u8 priority; +}; + +/* DPNI congestion options */ + +/** + * This congestion will trigger flow control or priority flow control. + * This will have effect only if flow control is enabled with + * dpni_set_link_cfg(). + */ +#define DPNI_CONG_OPT_FLOW_CONTROL 0x00000040 + +/** + * struct dpni_congestion_notification_cfg - congestion notification + * configuration + * @units: Units type + * @threshold_entry: Above this threshold we enter a congestion state. + * set it to '0' to disable it + * @threshold_exit: Below this threshold we exit the congestion state. + * @message_ctx: The context that will be part of the CSCN message + * @message_iova: I/O virtual address (must be in DMA-able memory), + * must be 16B aligned; valid only if 'DPNI_CONG_OPT_WRITE_MEM_' + * is contained in 'options' + * @dest_cfg: CSCN can be send to either DPIO or DPCON WQ channel + * @notification_mode: Mask of available options; use 'DPNI_CONG_OPT_' values + */ + +struct dpni_congestion_notification_cfg { + enum dpni_congestion_unit units; + u32 threshold_entry; + u32 threshold_exit; + u64 message_ctx; + u64 message_iova; + struct dpni_dest_cfg dest_cfg; + u16 notification_mode; +}; + +int dpni_set_congestion_notification( + struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token, + enum dpni_queue_type qtype, + u8 tc_id, + const struct dpni_congestion_notification_cfg *cfg); + /** * struct dpni_taildrop - Structure representing the taildrop * @enable: Indicates whether the taildrop is active or not. -- cgit v1.2.3-59-g8ed1b From 07beb1651adcd324f4d91584d5cab75d5882a9c2 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Sun, 31 May 2020 00:08:14 +0300 Subject: dpaa2-eth: Keep congestion group taildrop enabled when PFC on Leave congestion group taildrop enabled for all traffic classes when PFC is enabled. Notification threshold is low enough such that it will be hit first and this also ensures that FQs on traffic classes which are not PFC enabled won't drain the buffer pool. FQ taildrop threshold is kept disabled as long as any form of flow control is on. Since FQ taildrop works with bytes, not number of frames, we can't guarantee it will not interfere with the congestion notification mechanism for all frame sizes. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- .../net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c | 8 ++++-- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 29 ++++++++++++++++------ drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 7 +++++- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c index 7ee07872af4d..83dee575c2fa 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-dcb.c @@ -63,6 +63,7 @@ static int dpaa2_eth_dcbnl_ieee_setpfc(struct net_device *net_dev, { struct dpaa2_eth_priv *priv = netdev_priv(net_dev); struct dpni_link_cfg link_cfg = {0}; + bool tx_pause; int err; if (pfc->mbc || pfc->delay) @@ -75,8 +76,8 @@ static int dpaa2_eth_dcbnl_ieee_setpfc(struct net_device *net_dev, /* We allow PFC configuration even if it won't have any effect until * general pause frames are enabled */ - if (!dpaa2_eth_rx_pause_enabled(priv->link_state.options) || - !dpaa2_eth_tx_pause_enabled(priv->link_state.options)) + tx_pause = dpaa2_eth_tx_pause_enabled(priv->link_state.options); + if (!dpaa2_eth_rx_pause_enabled(priv->link_state.options) || !tx_pause) netdev_warn(net_dev, "Pause support must be enabled in order for PFC to work!\n"); link_cfg.rate = priv->link_state.rate; @@ -97,6 +98,9 @@ static int dpaa2_eth_dcbnl_ieee_setpfc(struct net_device *net_dev, return err; memcpy(&priv->pfc, pfc, sizeof(priv->pfc)); + priv->pfc_enabled = !!pfc->pfc_en; + + dpaa2_eth_set_rx_taildrop(priv, tx_pause, priv->pfc_enabled); return 0; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index cde9d0e2dd6d..8fb48de5d18c 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1287,18 +1287,22 @@ static void disable_ch_napi(struct dpaa2_eth_priv *priv) } } -static void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, - bool tx_pause) +void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, + bool tx_pause, bool pfc) { struct dpni_taildrop td = {0}; struct dpaa2_eth_fq *fq; int i, err; + /* FQ taildrop: threshold is in bytes, per frame queue. Enabled if + * flow control is disabled (as it might interfere with either the + * buffer pool depletion trigger for pause frames or with the group + * congestion trigger for PFC frames) + */ td.enable = !tx_pause; - if (priv->rx_td_enabled == td.enable) - return; + if (priv->rx_fqtd_enabled == td.enable) + goto set_cgtd; - /* FQ taildrop: threshold is in bytes, per frame queue */ td.threshold = DPAA2_ETH_FQ_TAILDROP_THRESH; td.units = DPNI_CONGESTION_UNIT_BYTES; @@ -1316,9 +1320,20 @@ static void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, } } + priv->rx_fqtd_enabled = td.enable; + +set_cgtd: /* Congestion group taildrop: threshold is in frames, per group * of FQs belonging to the same traffic class + * Enabled if general Tx pause disabled or if PFCs are enabled + * (congestion group threhsold for PFC generation is lower than the + * CG taildrop threshold, so it won't interfere with it; we also + * want frames in non-PFC enabled traffic classes to be kept in check) */ + td.enable = !tx_pause || (tx_pause && pfc); + if (priv->rx_cgtd_enabled == td.enable) + return; + td.threshold = DPAA2_ETH_CG_TAILDROP_THRESH(priv); td.units = DPNI_CONGESTION_UNIT_FRAMES; for (i = 0; i < dpaa2_eth_tc_count(priv); i++) { @@ -1332,7 +1347,7 @@ static void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, } } - priv->rx_td_enabled = td.enable; + priv->rx_cgtd_enabled = td.enable; } static int link_state_update(struct dpaa2_eth_priv *priv) @@ -1353,7 +1368,7 @@ static int link_state_update(struct dpaa2_eth_priv *priv) * only when pause frame generation is disabled. */ tx_pause = dpaa2_eth_tx_pause_enabled(state.options); - dpaa2_eth_set_rx_taildrop(priv, tx_pause); + dpaa2_eth_set_rx_taildrop(priv, tx_pause, priv->pfc_enabled); /* When we manage the MAC/PHY using phylink there is no need * to manually update the netif_carrier. diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 31b7b9b52da0..2d7ada0f0dbd 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -436,7 +436,8 @@ struct dpaa2_eth_priv { struct dpaa2_eth_drv_stats __percpu *percpu_extras; u16 mc_token; - u8 rx_td_enabled; + u8 rx_fqtd_enabled; + u8 rx_cgtd_enabled; struct dpni_link_state link_state; bool do_link_poll; @@ -448,6 +449,7 @@ struct dpaa2_eth_priv { struct dpaa2_eth_cls_rule *cls_rules; u8 rx_cls_enabled; u8 vlan_cls_enabled; + u8 pfc_enabled; #ifdef CONFIG_FSL_DPAA2_ETH_DCB u8 dcbx_mode; struct ieee_pfc pfc; @@ -584,6 +586,9 @@ int dpaa2_eth_cls_key_size(u64 key); int dpaa2_eth_cls_fld_off(int prot, int field); void dpaa2_eth_cls_trim_rule(void *key_mem, u64 fields); +void dpaa2_eth_set_rx_taildrop(struct dpaa2_eth_priv *priv, + bool tx_pause, bool pfc); + extern const struct dcbnl_rtnl_ops dpaa2_eth_dcbnl_ops; #endif /* __DPAA2_H */ -- cgit v1.2.3-59-g8ed1b From 547ce4cfb34cdecfa0ee19c29a5510329a7ac802 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 31 May 2020 02:06:55 +0100 Subject: switch cmsghdr_from_user_compat_to_kern() to copy_from_user() no point getting compat_cmsghdr field-by-field Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/compat.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/compat.c b/net/compat.c index afd7b444e0bf..5e3041a2c37d 100644 --- a/net/compat.c +++ b/net/compat.c @@ -183,20 +183,21 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, memset(kcmsg, 0, kcmlen); ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); while (ucmsg != NULL) { - if (__get_user(ucmlen, &ucmsg->cmsg_len)) + struct compat_cmsghdr cmsg; + if (copy_from_user(&cmsg, ucmsg, sizeof(cmsg))) goto Efault; - if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) + if (!CMSG_COMPAT_OK(cmsg.cmsg_len, ucmsg, kmsg)) goto Einval; - tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); + tmp = ((cmsg.cmsg_len - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) goto Einval; kcmsg->cmsg_len = tmp; + kcmsg->cmsg_level = cmsg.cmsg_level; + kcmsg->cmsg_type = cmsg.cmsg_type; tmp = CMSG_ALIGN(tmp); - if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) || - __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) || - copy_from_user(CMSG_DATA(kcmsg), + if (copy_from_user(CMSG_DATA(kcmsg), CMSG_COMPAT_DATA(ucmsg), - (ucmlen - sizeof(*ucmsg)))) + (cmsg.cmsg_len - sizeof(*ucmsg)))) goto Efault; /* Advance. */ -- cgit v1.2.3-59-g8ed1b From 03eaeda7806dcafb221a66939fcec9748619d16a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sat, 30 May 2020 22:17:20 -0700 Subject: vxlan: fix dereference of nexthop group in nexthop update path fix dereference of nexthop group in fdb nexthop group update validation path. Fixes: 1274e1cc4226 ("vxlan: ecmp support for mac fdb entries") Reported-by: Ido Schimmel Suggested-by: Ido Schimmel Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index d5906b41cdae..5bb448ae6c9c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -881,13 +881,13 @@ static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, goto err_inval; } - if (!nh->is_group || !nh->nh_grp->mpath) { + nhg = rtnl_dereference(nh->nh_grp); + if (!nh->is_group || !nhg->mpath) { NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); goto err_inval; } /* check nexthop group family */ - nhg = rtnl_dereference(nh->nh_grp); switch (vxlan->default_dst.remote_ip.sa.sa_family) { case AF_INET: if (!nhg->has_v4) { -- cgit v1.2.3-59-g8ed1b From eae9d3c0167df840e821317040efcf0ca6789cb9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 31 May 2020 21:25:51 +0300 Subject: net: dsa: sja1105: suppress -Wmissing-prototypes in sja1105_vl.c Newer C compilers are complaining about the fact that there are no function prototypes in sja1105_vl.c for the non-static functions. Give them what they want. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_vl.c | 2 +- drivers/net/dsa/sja1105/sja1105_vl.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index f37611885376..bdfd6c4e190d 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -3,7 +3,7 @@ */ #include #include -#include "sja1105.h" +#include "sja1105_vl.h" #define SJA1105_SIZE_VL_STATUS 8 diff --git a/drivers/net/dsa/sja1105/sja1105_vl.h b/drivers/net/dsa/sja1105/sja1105_vl.h index 323fa0535af7..173d78963fed 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.h +++ b/drivers/net/dsa/sja1105/sja1105_vl.h @@ -4,6 +4,8 @@ #ifndef _SJA1105_VL_H #define _SJA1105_VL_H +#include "sja1105.h" + #if IS_ENABLED(CONFIG_NET_DSA_SJA1105_VL) int sja1105_vl_redirect(struct sja1105_private *priv, int port, -- cgit v1.2.3-59-g8ed1b From 90040351a832acf862c8f1855c29411303d23755 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sat, 23 May 2020 02:07:51 +0100 Subject: tools, bpftool: Clean subcommand help messages This is a clean-up for the formatting of the do_help functions for bpftool's subcommands. The following fixes are included: - Do not use argv[-2] for "iter" help message, as the help is shown by default if no "iter" action is selected, resulting in messages looking like "./bpftool bpftool pin...". - Do not print unused HELP_SPEC_PROGRAM in help message for "bpftool link". - Andrii used argument indexing to avoid having multiple occurrences of bin_name and argv[-2] in the fprintf() for the help message, for "bpftool gen" and "bpftool link". Let's reuse this for all other help functions. We can remove up to thirty arguments for the "bpftool map" help message. - Harmonise all functions, e.g. use ending quotes-comma on a separate line. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200523010751.23465-1-quentin@isovalent.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/btf.c | 8 ++++---- tools/bpf/bpftool/cgroup.c | 14 ++++++-------- tools/bpf/bpftool/feature.c | 6 +++--- tools/bpf/bpftool/gen.c | 6 +++--- tools/bpf/bpftool/iter.c | 8 ++++---- tools/bpf/bpftool/link.c | 1 - tools/bpf/bpftool/map.c | 41 ++++++++++++++++++----------------------- tools/bpf/bpftool/net.c | 12 ++++++------ tools/bpf/bpftool/perf.c | 2 +- tools/bpf/bpftool/prog.c | 27 ++++++++++++--------------- tools/bpf/bpftool/struct_ops.c | 15 +++++++-------- 11 files changed, 64 insertions(+), 76 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 41a1346934a1..c134666591a6 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -951,9 +951,9 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s btf { show | list } [id BTF_ID]\n" - " %s btf dump BTF_SRC [format FORMAT]\n" - " %s btf help\n" + "Usage: %1$s %2$s { show | list } [id BTF_ID]\n" + " %1$s %2$s dump BTF_SRC [format FORMAT]\n" + " %1$s %2$s help\n" "\n" " BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n" " FORMAT := { raw | c }\n" @@ -961,7 +961,7 @@ static int do_help(int argc, char **argv) " " HELP_SPEC_PROGRAM "\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name, bin_name, bin_name); + bin_name, "btf"); return 0; } diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 27931db421d8..d901cc1b904a 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -491,20 +491,18 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } CGROUP [**effective**]\n" - " %s %s tree [CGROUP_ROOT] [**effective**]\n" - " %s %s attach CGROUP ATTACH_TYPE PROG [ATTACH_FLAGS]\n" - " %s %s detach CGROUP ATTACH_TYPE PROG\n" - " %s %s help\n" + "Usage: %1$s %2$s { show | list } CGROUP [**effective**]\n" + " %1$s %2$s tree [CGROUP_ROOT] [**effective**]\n" + " %1$s %2$s attach CGROUP ATTACH_TYPE PROG [ATTACH_FLAGS]\n" + " %1$s %2$s detach CGROUP ATTACH_TYPE PROG\n" + " %1$s %2$s help\n" "\n" HELP_SPEC_ATTACH_TYPES "\n" " " HELP_SPEC_ATTACH_FLAGS "\n" " " HELP_SPEC_PROGRAM "\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2]); return 0; } diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 1b73e63274b5..f05e9e57b593 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -937,12 +937,12 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" - " %s %s help\n" + "Usage: %1$s %2$s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" + " %1$s %2$s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" "", - bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2]); return 0; } diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 0e5f0236cc76..a3c4bb86c05a 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -586,12 +586,12 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %1$s gen skeleton FILE\n" - " %1$s gen help\n" + "Usage: %1$s %2$s skeleton FILE\n" + " %1$s %2$s help\n" "\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name); + bin_name, "gen"); return 0; } diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c index eb5987a0c3b6..33240fcc6319 100644 --- a/tools/bpf/bpftool/iter.c +++ b/tools/bpf/bpftool/iter.c @@ -68,10 +68,10 @@ close_obj: static int do_help(int argc, char **argv) { fprintf(stderr, - "Usage: %s %s pin OBJ PATH\n" - " %s %s help\n" - "\n", - bin_name, argv[-2], bin_name, argv[-2]); + "Usage: %1$s %2$s pin OBJ PATH\n" + " %1$s %2$s help\n" + "", + bin_name, "iter"); return 0; } diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index b6a0b35c78ae..670a561dc31b 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -312,7 +312,6 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_LINK "\n" - " " HELP_SPEC_PROGRAM "\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 85cbe9a19170..c5fac8068ba1 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1561,24 +1561,24 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } [MAP]\n" - " %s %s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" - " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" - " [dev NAME]\n" - " %s %s dump MAP\n" - " %s %s update MAP [key DATA] [value VALUE] [UPDATE_FLAGS]\n" - " %s %s lookup MAP [key DATA]\n" - " %s %s getnext MAP [key DATA]\n" - " %s %s delete MAP key DATA\n" - " %s %s pin MAP FILE\n" - " %s %s event_pipe MAP [cpu N index M]\n" - " %s %s peek MAP\n" - " %s %s push MAP value VALUE\n" - " %s %s pop MAP\n" - " %s %s enqueue MAP value VALUE\n" - " %s %s dequeue MAP\n" - " %s %s freeze MAP\n" - " %s %s help\n" + "Usage: %1$s %2$s { show | list } [MAP]\n" + " %1$s %2$s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" + " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" + " [dev NAME]\n" + " %1$s %2$s dump MAP\n" + " %1$s %2$s update MAP [key DATA] [value VALUE] [UPDATE_FLAGS]\n" + " %1$s %2$s lookup MAP [key DATA]\n" + " %1$s %2$s getnext MAP [key DATA]\n" + " %1$s %2$s delete MAP key DATA\n" + " %1$s %2$s pin MAP FILE\n" + " %1$s %2$s event_pipe MAP [cpu N index M]\n" + " %1$s %2$s peek MAP\n" + " %1$s %2$s push MAP value VALUE\n" + " %1$s %2$s pop MAP\n" + " %1$s %2$s enqueue MAP value VALUE\n" + " %1$s %2$s dequeue MAP\n" + " %1$s %2$s freeze MAP\n" + " %1$s %2$s help\n" "\n" " " HELP_SPEC_MAP "\n" " DATA := { [hex] BYTES }\n" @@ -1593,11 +1593,6 @@ static int do_help(int argc, char **argv) " queue | stack | sk_storage | struct_ops }\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); return 0; diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index c5e3895b7c8b..56c3a2bae3ef 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -458,10 +458,10 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } [dev ]\n" - " %s %s attach ATTACH_TYPE PROG dev [ overwrite ]\n" - " %s %s detach ATTACH_TYPE dev \n" - " %s %s help\n" + "Usage: %1$s %2$s { show | list } [dev ]\n" + " %1$s %2$s attach ATTACH_TYPE PROG dev [ overwrite ]\n" + " %1$s %2$s detach ATTACH_TYPE dev \n" + " %1$s %2$s help\n" "\n" " " HELP_SPEC_PROGRAM "\n" " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n" @@ -470,8 +470,8 @@ static int do_help(int argc, char **argv) " For progs attached to cgroups, use \"bpftool cgroup\"\n" " to dump program attachments. For program types\n" " sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n" - " consult iproute2.\n", - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + " consult iproute2.\n" + "", bin_name, argv[-2]); return 0; diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c index 3341aa14acda..ad23934819c7 100644 --- a/tools/bpf/bpftool/perf.c +++ b/tools/bpf/bpftool/perf.c @@ -231,7 +231,7 @@ static int do_show(int argc, char **argv) static int do_help(int argc, char **argv) { fprintf(stderr, - "Usage: %s %s { show | list | help }\n" + "Usage: %1$s %2$s { show | list | help }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 245f941fdbcf..a5eff83496f2 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1984,24 +1984,24 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } [PROG]\n" - " %s %s dump xlated PROG [{ file FILE | opcodes | visual | linum }]\n" - " %s %s dump jited PROG [{ file FILE | opcodes | linum }]\n" - " %s %s pin PROG FILE\n" - " %s %s { load | loadall } OBJ PATH \\\n" + "Usage: %1$s %2$s { show | list } [PROG]\n" + " %1$s %2$s dump xlated PROG [{ file FILE | opcodes | visual | linum }]\n" + " %1$s %2$s dump jited PROG [{ file FILE | opcodes | linum }]\n" + " %1$s %2$s pin PROG FILE\n" + " %1$s %2$s { load | loadall } OBJ PATH \\\n" " [type TYPE] [dev NAME] \\\n" " [map { idx IDX | name NAME } MAP]\\\n" " [pinmaps MAP_DIR]\n" - " %s %s attach PROG ATTACH_TYPE [MAP]\n" - " %s %s detach PROG ATTACH_TYPE [MAP]\n" - " %s %s run PROG \\\n" + " %1$s %2$s attach PROG ATTACH_TYPE [MAP]\n" + " %1$s %2$s detach PROG ATTACH_TYPE [MAP]\n" + " %1$s %2$s run PROG \\\n" " data_in FILE \\\n" " [data_out FILE [data_size_out L]] \\\n" " [ctx_in FILE [ctx_out FILE [ctx_size_out M]]] \\\n" " [repeat N]\n" - " %s %s profile PROG [duration DURATION] METRICs\n" - " %s %s tracelog\n" - " %s %s help\n" + " %1$s %2$s profile PROG [duration DURATION] METRICs\n" + " %1$s %2$s tracelog\n" + " %1$s %2$s help\n" "\n" " " HELP_SPEC_MAP "\n" " " HELP_SPEC_PROGRAM "\n" @@ -2022,10 +2022,7 @@ static int do_help(int argc, char **argv) " METRIC := { cycles | instructions | l1d_loads | llc_misses }\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2]); return 0; } diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index e17738479edc..b58b91f62ffb 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -566,16 +566,15 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } [STRUCT_OPS_MAP]\n" - " %s %s dump [STRUCT_OPS_MAP]\n" - " %s %s register OBJ\n" - " %s %s unregister STRUCT_OPS_MAP\n" - " %s %s help\n" + "Usage: %1$s %2$s { show | list } [STRUCT_OPS_MAP]\n" + " %1$s %2$s dump [STRUCT_OPS_MAP]\n" + " %1$s %2$s register OBJ\n" + " %1$s %2$s unregister STRUCT_OPS_MAP\n" + " %1$s %2$s help\n" "\n" " OPTIONS := { {-j|--json} [{-p|--pretty}] }\n" - " STRUCT_OPS_MAP := [ id STRUCT_OPS_MAP_ID | name STRUCT_OPS_MAP_NAME ]\n", - bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], + " STRUCT_OPS_MAP := [ id STRUCT_OPS_MAP_ID | name STRUCT_OPS_MAP_NAME ]\n" + "", bin_name, argv[-2]); return 0; -- cgit v1.2.3-59-g8ed1b From 73a4f0407e67cdfdf55dd94f573ed4ee2d0d62fe Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sat, 23 May 2020 02:02:47 +0100 Subject: tools, bpftool: Make capability check account for new BPF caps Following the introduction of CAP_BPF, and the switch from CAP_SYS_ADMIN to other capabilities for various BPF features, update the capability checks (and potentially, drops) in bpftool for feature probes. Because bpftool and/or the system might not know of CAP_BPF yet, some caution is necessary: - If compiled and run on a system with CAP_BPF, check CAP_BPF, CAP_SYS_ADMIN, CAP_PERFMON, CAP_NET_ADMIN. - Guard against CAP_BPF being undefined, to allow compiling bpftool from latest sources on older systems. If the system where feature probes are run does not know of CAP_BPF, stop checking after CAP_SYS_ADMIN, as this should be the only capability required for all the BPF probing. - If compiled from latest sources on a system without CAP_BPF, but later executed on a newer system with CAP_BPF knowledge, then we only test CAP_SYS_ADMIN. Some probes may fail if the bpftool process has CAP_SYS_ADMIN but misses the other capabilities. The alternative would be to redefine the value for CAP_BPF in bpftool, but this does not look clean, and the case sounds relatively rare anyway. Note that libcap offers a cap_to_name() function to retrieve the name of a given capability (e.g. "cap_sys_admin"). We do not use it because deriving the names from the macros looks simpler than using cap_to_name() (doing a strdup() on the string) + cap_free() + handling the case of failed allocations, when we just want to use the name of the capability in an error message. The checks when compiling without libcap (i.e. root versus non-root) are unchanged. v2: - Do not allocate cap_list dynamically. - Drop BPF-related capabilities when running with "unprivileged", even if we didn't have the full set in the first place (in v1, we would skip dropping them in that case). - Keep track of what capabilities we have, print the names of the missing ones for privileged probing. - Attempt to drop only the capabilities we actually have. - Rename a couple variables. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200523010247.20654-1-quentin@isovalent.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/feature.c | 85 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index f05e9e57b593..768bf77df886 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -758,11 +758,29 @@ static void section_misc(const char *define_prefix, __u32 ifindex) print_end_section(); } +#ifdef USE_LIBCAP +#define capability(c) { c, false, #c } +#define capability_msg(a, i) a[i].set ? "" : a[i].name, a[i].set ? "" : ", " +#endif + static int handle_perms(void) { #ifdef USE_LIBCAP - cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; - bool has_sys_admin_cap = false; + struct { + cap_value_t cap; + bool set; + char name[14]; /* strlen("CAP_SYS_ADMIN") */ + } bpf_caps[] = { + capability(CAP_SYS_ADMIN), +#ifdef CAP_BPF + capability(CAP_BPF), + capability(CAP_NET_ADMIN), + capability(CAP_PERFMON), +#endif + }; + cap_value_t cap_list[ARRAY_SIZE(bpf_caps)]; + unsigned int i, nb_bpf_caps = 0; + bool cap_sys_admin_only = true; cap_flag_value_t val; int res = -1; cap_t caps; @@ -774,35 +792,64 @@ static int handle_perms(void) return -1; } - if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &val)) { - p_err("bug: failed to retrieve CAP_SYS_ADMIN status"); - goto exit_free; - } - if (val == CAP_SET) - has_sys_admin_cap = true; +#ifdef CAP_BPF + if (CAP_IS_SUPPORTED(CAP_BPF)) + cap_sys_admin_only = false; +#endif - if (!run_as_unprivileged && !has_sys_admin_cap) { - p_err("full feature probing requires CAP_SYS_ADMIN, run as root or use 'unprivileged'"); - goto exit_free; + for (i = 0; i < ARRAY_SIZE(bpf_caps); i++) { + const char *cap_name = bpf_caps[i].name; + cap_value_t cap = bpf_caps[i].cap; + + if (cap_get_flag(caps, cap, CAP_EFFECTIVE, &val)) { + p_err("bug: failed to retrieve %s status: %s", cap_name, + strerror(errno)); + goto exit_free; + } + + if (val == CAP_SET) { + bpf_caps[i].set = true; + cap_list[nb_bpf_caps++] = cap; + } + + if (cap_sys_admin_only) + /* System does not know about CAP_BPF, meaning that + * CAP_SYS_ADMIN is the only capability required. We + * just checked it, break. + */ + break; } - if ((run_as_unprivileged && !has_sys_admin_cap) || - (!run_as_unprivileged && has_sys_admin_cap)) { + if ((run_as_unprivileged && !nb_bpf_caps) || + (!run_as_unprivileged && nb_bpf_caps == ARRAY_SIZE(bpf_caps)) || + (!run_as_unprivileged && cap_sys_admin_only && nb_bpf_caps)) { /* We are all good, exit now */ res = 0; goto exit_free; } - /* if (run_as_unprivileged && has_sys_admin_cap), drop CAP_SYS_ADMIN */ + if (!run_as_unprivileged) { + if (cap_sys_admin_only) + p_err("missing %s, required for full feature probing; run as root or use 'unprivileged'", + bpf_caps[0].name); + else + p_err("missing %s%s%s%s%s%s%s%srequired for full feature probing; run as root or use 'unprivileged'", + capability_msg(bpf_caps, 0), + capability_msg(bpf_caps, 1), + capability_msg(bpf_caps, 2), + capability_msg(bpf_caps, 3)); + goto exit_free; + } - if (cap_set_flag(caps, CAP_EFFECTIVE, ARRAY_SIZE(cap_list), cap_list, + /* if (run_as_unprivileged && nb_bpf_caps > 0), drop capabilities. */ + if (cap_set_flag(caps, CAP_EFFECTIVE, nb_bpf_caps, cap_list, CAP_CLEAR)) { - p_err("bug: failed to clear CAP_SYS_ADMIN from capabilities"); + p_err("bug: failed to clear capabilities: %s", strerror(errno)); goto exit_free; } if (cap_set_proc(caps)) { - p_err("failed to drop CAP_SYS_ADMIN: %s", strerror(errno)); + p_err("failed to drop capabilities: %s", strerror(errno)); goto exit_free; } @@ -817,7 +864,7 @@ exit_free: return res; #else - /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). + /* Detection assumes user has specific privileges. * We do not use libpcap so let's approximate, and restrict usage to * root user only. */ @@ -901,7 +948,7 @@ static int do_probe(int argc, char **argv) } } - /* Full feature detection requires CAP_SYS_ADMIN privilege. + /* Full feature detection requires specific privileges. * Let's approximate, and warn if user is not root. */ if (handle_perms()) -- cgit v1.2.3-59-g8ed1b From dc3ca5cf3e0be9fb73f4691247367d76a22bf30b Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 25 May 2020 15:54:21 +0200 Subject: tools, bpftool: Print correct error message when failing to load BTF btf__parse_raw and btf__parse_elf return negative error numbers wrapped in an ERR_PTR, so the extracted value needs to be negated before passing them to strerror which expects a positive error number. Before: Error: failed to load BTF from .../vmlinux: Unknown error -2 After: Error: failed to load BTF from .../vmlinux: No such file or directory Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200525135421.4154-1-tklauser@distanz.ch Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index c134666591a6..faac8189b285 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -553,7 +553,7 @@ static int do_dump(int argc, char **argv) btf = btf__parse_elf(*argv, NULL); if (IS_ERR(btf)) { - err = PTR_ERR(btf); + err = -PTR_ERR(btf); btf = NULL; p_err("failed to load BTF from %s: %s", *argv, strerror(err)); -- cgit v1.2.3-59-g8ed1b From fe537393b5795ecbe5746eec0e16124bc998a594 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 25 May 2020 14:29:28 +0200 Subject: bpf: Fix returned error sign when link doesn't support updates System calls encode returned errors as negative values. Fix a typo that breaks this convention for bpf(LINK_UPDATE) when bpf_link doesn't support update operation. Fixes: f9d041271cf4 ("bpf: Refactor bpf_link update handling") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200525122928.1164495-1-jakub@cloudflare.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aaa29fb6f363..d13b804ff045 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3924,7 +3924,7 @@ static int link_update(union bpf_attr *attr) if (link->ops->update_prog) ret = link->ops->update_prog(link, new_prog, old_prog); else - ret = EINVAL; + ret = -EINVAL; out_put_progs: if (old_prog) -- cgit v1.2.3-59-g8ed1b From 2b983b407a3a1f47f7d8595245066854ff352c65 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 25 May 2020 16:15:53 +0200 Subject: MAINTAINERS: Adjust entry in XDP SOCKETS to actual file name Commit 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API") added a new header file include/net/xsk_buff_pool.h, but commit 28bee21dc04b ("MAINTAINERS, xsk: Update AF_XDP section after moves/adds") added a file entry referring to include/net/xsk_buffer_pool.h. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains: warning: no file matches F: include/net/xsk_buffer_pool.h Adjust the entry in XDP SOCKETS to the actual file name. Signed-off-by: Lukas Bulwahn Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200525141553.7035-1-lukas.bulwahn@gmail.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5d81c002232a..66d1a3f10102 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18456,7 +18456,7 @@ L: netdev@vger.kernel.org L: bpf@vger.kernel.org S: Maintained F: include/net/xdp_sock* -F: include/net/xsk_buffer_pool.h +F: include/net/xsk_buff_pool.h F: include/uapi/linux/if_xdp.h F: net/xdp/ F: samples/bpf/xdpsock* -- cgit v1.2.3-59-g8ed1b From 272d51af32890632134845ddf35318c11da20c7b Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 26 May 2020 11:21:42 +0200 Subject: libbpf: Add API to consume the perf ring buffer content This new API, perf_buffer__consume, can be used as follows: - When you have a perf ring where wakeup_events is higher than 1, and you have remaining data in the rings you would like to pull out on exit (or maybe based on a timeout). - For low latency cases where you burn a CPU that constantly polls the queues. Signed-off-by: Eelco Chaudron Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159048487929.89441.7465713173442594608.stgit@ebuild Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 19 +++++++++++++++++++ tools/lib/bpf/libbpf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 21 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fa04cbe547ed..5d60de6fd818 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8456,6 +8456,25 @@ int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms) return cnt < 0 ? -errno : cnt; } +int perf_buffer__consume(struct perf_buffer *pb) +{ + int i, err; + + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + if (!cpu_buf) + continue; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warn("error while processing records: %d\n", err); + return err; + } + } + return 0; +} + struct bpf_prog_info_array_desc { int array_offset; /* e.g. offset of jited_prog_insns */ int count_offset; /* e.g. offset of jited_prog_len */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 8ea69558f0a8..1e2e399a5f2c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -533,6 +533,7 @@ perf_buffer__new_raw(int map_fd, size_t page_cnt, LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms); +LIBBPF_API int perf_buffer__consume(struct perf_buffer *pb); typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 0133d469d30b..381a7342ecfc 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -262,4 +262,5 @@ LIBBPF_0.0.9 { bpf_link_get_fd_by_id; bpf_link_get_next_id; bpf_program__attach_iter; + perf_buffer__consume; } LIBBPF_0.0.8; -- cgit v1.2.3-59-g8ed1b From 93581359e7aeb11358018f2e3a737776d1e899ae Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 26 May 2020 20:46:12 +0300 Subject: libbpf: Install headers as part of make install Current 'make install' results in only pkg-config and library binaries being installed. For consistency also install headers as part of "make install" Signed-off-by: Nikolay Borisov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200526174612.5447-1-nborisov@suse.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index aee7f1a83c77..d02c4d910aad 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -264,7 +264,7 @@ install_pkgconfig: $(PC_FILE) $(call QUIET_INSTALL, $(PC_FILE)) \ $(call do_install,$(PC_FILE),$(libdir_SQ)/pkgconfig,644) -install: install_lib install_pkgconfig +install: install_lib install_pkgconfig install_headers ### Cleaning rules -- cgit v1.2.3-59-g8ed1b From 0142dddcbe965450338076c486d0d757b3184352 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Tue, 26 May 2020 11:00:24 +1200 Subject: bpf: Fix spelling in comment explaining ARG1 in ___bpf_prog_run Change 'handeled' to 'handled'. Signed-off-by: Chris Packham Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200525230025.14470-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c40ff4cf9880..af52ca658c73 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1543,7 +1543,7 @@ select_insn: /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is - * handeled like a helper, that is, bpf_tail_call_proto, + * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; -- cgit v1.2.3-59-g8ed1b From 55983299b7ea94d714c19cdfd8d969ba86e0d7e9 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Mon, 25 May 2020 09:18:46 +0300 Subject: libbpf: Use .so dynamic symbols for abi check Since dynamic symbols are used for dynamic linking it makes sense to use them (readelf --dyn-syms) for abi check. Found with some configuration on powerpc where linker puts local *.plt_call.* symbols into .so. Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200525061846.16524-1-yauheni.kaliuta@redhat.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index d02c4d910aad..bf8ed134cb8a 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -151,7 +151,7 @@ GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN_SHARED) | \ sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}' | \ sort -u | wc -l) -VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \ +VERSIONED_SYM_COUNT = $(shell readelf --dyn-syms --wide $(OUTPUT)libbpf.so | \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l) CMD_TARGETS = $(LIB_TARGET) $(PC_FILE) @@ -218,7 +218,7 @@ check_abi: $(OUTPUT)libbpf.so sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}'| \ sort -u > $(OUTPUT)libbpf_global_syms.tmp; \ - readelf -s --wide $(OUTPUT)libbpf.so | \ + readelf --dyn-syms --wide $(OUTPUT)libbpf.so | \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | \ sort -u > $(OUTPUT)libbpf_versioned_syms.tmp; \ diff -u $(OUTPUT)libbpf_global_syms.tmp \ -- cgit v1.2.3-59-g8ed1b From abe3cac8706bffeda3ebc06e4a9fa6e9cadacf26 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:50:33 -0700 Subject: bpf, sk_msg: Add some generic helpers that may be useful from sk_msg Add these generic helpers that may be useful to use from sk_msg programs. The helpers do not depend on ctx so we can simply add them here, BPF_FUNC_perf_event_output BPF_FUNC_get_current_uid_gid BPF_FUNC_get_current_pid_tgid BPF_FUNC_get_current_cgroup_id BPF_FUNC_get_current_ancestor_cgroup_id BPF_FUNC_get_cgroup_classid Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033903373.12355.15489763099696629346.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index bd2853d23b50..c3b496a19748 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6443,6 +6443,22 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-59-g8ed1b From f470378c7562a2818b45ed11c98973f2b89eedd3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:50:55 -0700 Subject: bpf: Extend bpf_base_func_proto helpers with probe_* and *current_task* Often it is useful when applying policy to know something about the task. If the administrator has CAP_SYS_ADMIN rights then they can use kprobe + networking hook and link the two programs together to accomplish this. However, this is a bit clunky and also means we have to call both the network program and kprobe program when we could just use a single program and avoid passing metadata through sk_msg/skb->cb, socket, maps, etc. To accomplish this add probe_* helpers to bpf_base_func_proto programs guarded by a perfmon_capable() check. New supported helpers are the following, BPF_FUNC_get_current_task BPF_FUNC_probe_read_user BPF_FUNC_probe_read_kernel BPF_FUNC_probe_read_user_str BPF_FUNC_probe_read_kernel_str Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033905529.12355.4368381069655254932.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 24 ++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 10 +++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 886949fdcece..bb4fb634275e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,12 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +const struct bpf_func_proto bpf_get_current_task_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; + const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -647,6 +653,24 @@ bpf_base_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + default: + break; + } + + if (!perfmon_capable()) + return NULL; + + switch (func_id) { + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return &bpf_probe_read_kernel_str_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9531f54d0a3a..187cd6995bbb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -147,7 +147,7 @@ BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_proto = { +const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, .ret_type = RET_INTEGER, @@ -167,7 +167,7 @@ BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_str_proto = { +const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -198,7 +198,7 @@ BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_proto = { .func = bpf_probe_read_kernel, .gpl_only = true, .ret_type = RET_INTEGER, @@ -253,7 +253,7 @@ BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .func = bpf_probe_read_kernel_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -907,7 +907,7 @@ BPF_CALL_0(bpf_get_current_task) return (long) current; } -static const struct bpf_func_proto bpf_get_current_task_proto = { +const struct bpf_func_proto bpf_get_current_task_proto = { .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, -- cgit v1.2.3-59-g8ed1b From 13d70f5a5ecff367db2fb18ed4ebe433eab8a74c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:15 -0700 Subject: bpf, sk_msg: Add get socket storage helpers Add helpers to use local socket storage. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033907577.12355.14740125020572756560.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 3 files changed, 19 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { diff --git a/net/core/filter.c b/net/core/filter.c index c3b496a19748..a6fc23447f12 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6449,6 +6449,10 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; @@ -7273,6 +7277,11 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): @@ -8609,6 +8618,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; + + case offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { -- cgit v1.2.3-59-g8ed1b From 1d9c037a898b3c0344cfe5064ba6c482bf9b46b0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:36 -0700 Subject: bpf, selftests: Add sk_msg helpers load and attach test The test itself is not particularly useful but it encodes a common pattern we have. Namely do a sk storage lookup then depending on data here decide if we need to do more work or alternatively allow packet to PASS. Then if we need to do more work consult task_struct for more information about the running task. Finally based on this additional information drop or pass the data. In this case the suspicious check is not so realisitic but it encodes the general pattern and uses the helpers so we test the workflow. This is a load test to ensure verifier correctly handles this case. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159033909665.12355.6166415847337547879.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 35 ++++++++++++++++ .../selftests/bpf/progs/test_skmsg_load_helpers.c | 47 ++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index aa43e0bd210c..96e7b7f84c65 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare +#include #include "test_progs.h" +#include "test_skmsg_load_helpers.skel.h" #define TCP_REPAIR 19 /* TCP sock is under repair right now */ @@ -70,10 +72,43 @@ out: close(s); } +static void test_skmsg_helpers(enum bpf_map_type map_type) +{ + struct test_skmsg_load_helpers *skel; + int err, map, verdict; + + skel = test_skmsg_load_helpers__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_skmsg_load_helpers__open_and_load"); + return; + } + + verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + map = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach"); + goto out; + } + + err = bpf_prog_detach2(verdict, map, BPF_SK_MSG_VERDICT); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach2"); + goto out; + } +out: + test_skmsg_load_helpers__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP); if (test__start_subtest("sockhash create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKHASH); + if (test__start_subtest("sockmap sk_msg load helpers")) + test_skmsg_helpers(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash sk_msg load helpers")) + test_skmsg_helpers(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c new file mode 100644 index 000000000000..45e8fc75a739 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Isovalent, Inc. +#include "vmlinux.h" +#include + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, __u32); + __type(value, __u64); +} socket_storage SEC(".maps"); + +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + int verdict = SK_PASS; + __u32 pid, tpid; + __u64 *sk_stg; + + pid = bpf_get_current_pid_tgid() >> 32; + sk_stg = bpf_sk_storage_get(&socket_storage, msg->sk, 0, BPF_SK_STORAGE_GET_F_CREATE); + if (!sk_stg) + return SK_DROP; + *sk_stg = pid; + bpf_probe_read_kernel(&tpid , sizeof(tpid), &task->tgid); + if (pid != tpid) + verdict = SK_DROP; + bpf_sk_storage_delete(&socket_storage, (void *)msg->sk); + return verdict; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3-59-g8ed1b From ee103e9f1544e04ecd1db5eb5e9eb9a8b8698879 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:57 -0700 Subject: bpf, selftests: Test probe_* helpers from SCHED_CLS Lets test using probe* in SCHED_CLS network programs as well just to be sure these keep working. Its cheap to add the extra test and provides a second context to test outside of sk_msg after we generalized probe* helpers to all networking types. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159033911685.12355.15951980509828906214.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/skb_helpers.c | 30 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/test_skb_helpers.c | 28 ++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/skb_helpers.c create mode 100644 tools/testing/selftests/bpf/progs/test_skb_helpers.c diff --git a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c new file mode 100644 index 000000000000..f302ad84a298 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +void test_skb_helpers(void) +{ + struct __sk_buff skb = { + .wire_len = 100, + .gso_segs = 8, + .gso_size = 10, + }; + struct bpf_prog_test_run_attr tattr = { + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .ctx_in = &skb, + .ctx_size_in = sizeof(skb), + .ctx_out = &skb, + .ctx_size_out = sizeof(skb), + }; + struct bpf_object *obj; + int err; + + err = bpf_prog_load("./test_skb_helpers.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &tattr.prog_fd); + if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + return; + err = bpf_prog_test_run_xattr(&tattr); + CHECK_ATTR(err, "len", "err %d errno %d\n", err, errno); + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/progs/test_skb_helpers.c b/tools/testing/selftests/bpf/progs/test_skb_helpers.c new file mode 100644 index 000000000000..bb3fbf1a29e3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_skb_helpers.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "vmlinux.h" +#include +#include + +#define TEST_COMM_LEN 16 + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u32); +} cgroup_map SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +SEC("classifier/test_skb_helpers") +int test_skb_helpers(struct __sk_buff *skb) +{ + struct task_struct *task; + char comm[TEST_COMM_LEN]; + __u32 tpid; + + task = (struct task_struct *)bpf_get_current_task(); + bpf_probe_read_kernel(&tpid , sizeof(tpid), &task->tgid); + bpf_probe_read_kernel_str(&comm, sizeof(comm), &task->comm); + return 0; +} -- cgit v1.2.3-59-g8ed1b From 601b05ca6edb0422bf6ce313fbfd55ec7bbbc0fd Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 27 May 2020 10:42:00 +0200 Subject: libbpf: Fix perf_buffer__free() API for sparse allocs In case the cpu_bufs are sparsely allocated they are not all free'ed. These changes will fix this. Fixes: fb84b8224655 ("libbpf: add perf buffer API") Signed-off-by: Eelco Chaudron Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159056888305.330763.9684536967379110349.stgit@ebuild Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5d60de6fd818..74d967619dcf 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8137,9 +8137,12 @@ void perf_buffer__free(struct perf_buffer *pb) if (!pb) return; if (pb->cpu_bufs) { - for (i = 0; i < pb->cpu_cnt && pb->cpu_bufs[i]; i++) { + for (i = 0; i < pb->cpu_cnt; i++) { struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + if (!cpu_buf) + continue; + bpf_map_delete_elem(pb->map_fd, &cpu_buf->map_key); perf_buffer__free_cpu_buf(pb, cpu_buf); } -- cgit v1.2.3-59-g8ed1b From 204fb0413a92342d31f3e2557db0bb5babed586c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:56 +0000 Subject: selftests/bpf: Fix a typo in test_maps Trivial fix to a typo in the test_map_wronly test: "read" -> "write" Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-2-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index c6766b2cff85..f717acc0c68d 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1410,7 +1410,7 @@ static void test_map_wronly(void) fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), MAP_SIZE, map_flags | BPF_F_WRONLY); if (fd < 0) { - printf("Failed to create map for read only test '%s'!\n", + printf("Failed to create map for write only test '%s'!\n", strerror(errno)); exit(1); } -- cgit v1.2.3-59-g8ed1b From 36ef9a2d3f764a37cf3d8e619bfebf5c99c070a0 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:57 +0000 Subject: selftests/bpf: Cleanup some file descriptors in test_maps The test_map_rdonly and test_map_wronly tests should close file descriptors which they open. Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-3-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index f717acc0c68d..46cf2c232964 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1401,6 +1401,8 @@ static void test_map_rdonly(void) /* Check that key=2 is not found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == ENOENT); + + close(fd); } static void test_map_wronly(void) @@ -1423,6 +1425,8 @@ static void test_map_wronly(void) /* Check that key=2 is not found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == EPERM); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); + + close(fd); } static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size, -- cgit v1.2.3-59-g8ed1b From efbc3b8fe1e6259777670aadf931500545073c6c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:58 +0000 Subject: selftests/bpf: Cleanup comments in test_maps Make comments inside the test_map_rdonly and test_map_wronly tests consistent with logic. Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-4-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 46cf2c232964..08d63948514a 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1394,11 +1394,11 @@ static void test_map_rdonly(void) key = 1; value = 1234; - /* Insert key=1 element. */ + /* Try to insert key=1 element. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == -1 && errno == EPERM); - /* Check that key=2 is not found. */ + /* Check that key=1 is not found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == ENOENT); @@ -1422,7 +1422,7 @@ static void test_map_wronly(void) /* Insert key=1 element. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); - /* Check that key=2 is not found. */ + /* Check that reading elements and keys from the map is not allowed. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == EPERM); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); -- cgit v1.2.3-59-g8ed1b From 1ea0f9120c8ce105ca181b070561df5cbd6bc049 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:59 +0000 Subject: bpf: Fix map permissions check The map_lookup_and_delete_elem() function should check for both FMODE_CAN_WRITE and FMODE_CAN_READ permissions because it returns a map element to user space. Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-5-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d13b804ff045..2c969a9b90d3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1472,7 +1472,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } -- cgit v1.2.3-59-g8ed1b From 457f44363a8894135c85b7a9afd2bd8196db24ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:20 -0700 Subject: bpf: Implement BPF ring buffer and verifier support for it This commit adds a new MPSC ring buffer implementation into BPF ecosystem, which allows multiple CPUs to submit data to a single shared ring buffer. On the consumption side, only single consumer is assumed. Motivation ---------- There are two distinctive motivators for this work, which are not satisfied by existing perf buffer, which prompted creation of a new ring buffer implementation. - more efficient memory utilization by sharing ring buffer across CPUs; - preserving ordering of events that happen sequentially in time, even across multiple CPUs (e.g., fork/exec/exit events for a task). These two problems are independent, but perf buffer fails to satisfy both. Both are a result of a choice to have per-CPU perf ring buffer. Both can be also solved by having an MPSC implementation of ring buffer. The ordering problem could technically be solved for perf buffer with some in-kernel counting, but given the first one requires an MPSC buffer, the same solution would solve the second problem automatically. Semantics and APIs ------------------ Single ring buffer is presented to BPF programs as an instance of BPF map of type BPF_MAP_TYPE_RINGBUF. Two other alternatives considered, but ultimately rejected. One way would be to, similar to BPF_MAP_TYPE_PERF_EVENT_ARRAY, make BPF_MAP_TYPE_RINGBUF could represent an array of ring buffers, but not enforce "same CPU only" rule. This would be more familiar interface compatible with existing perf buffer use in BPF, but would fail if application needed more advanced logic to lookup ring buffer by arbitrary key. HASH_OF_MAPS addresses this with current approach. Additionally, given the performance of BPF ringbuf, many use cases would just opt into a simple single ring buffer shared among all CPUs, for which current approach would be an overkill. Another approach could introduce a new concept, alongside BPF map, to represent generic "container" object, which doesn't necessarily have key/value interface with lookup/update/delete operations. This approach would add a lot of extra infrastructure that has to be built for observability and verifier support. It would also add another concept that BPF developers would have to familiarize themselves with, new syntax in libbpf, etc. But then would really provide no additional benefits over the approach of using a map. BPF_MAP_TYPE_RINGBUF doesn't support lookup/update/delete operations, but so doesn't few other map types (e.g., queue and stack; array doesn't support delete, etc). The approach chosen has an advantage of re-using existing BPF map infrastructure (introspection APIs in kernel, libbpf support, etc), being familiar concept (no need to teach users a new type of object in BPF program), and utilizing existing tooling (bpftool). For common scenario of using a single ring buffer for all CPUs, it's as simple and straightforward, as would be with a dedicated "container" object. On the other hand, by being a map, it can be combined with ARRAY_OF_MAPS and HASH_OF_MAPS map-in-maps to implement a wide variety of topologies, from one ring buffer for each CPU (e.g., as a replacement for perf buffer use cases), to a complicated application hashing/sharding of ring buffers (e.g., having a small pool of ring buffers with hashed task's tgid being a look up key to preserve order, but reduce contention). Key and value sizes are enforced to be zero. max_entries is used to specify the size of ring buffer and has to be a power of 2 value. There are a bunch of similarities between perf buffer (BPF_MAP_TYPE_PERF_EVENT_ARRAY) and new BPF ring buffer semantics: - variable-length records; - if there is no more space left in ring buffer, reservation fails, no blocking; - memory-mappable data area for user-space applications for ease of consumption and high performance; - epoll notifications for new incoming data; - but still the ability to do busy polling for new data to achieve the lowest latency, if necessary. BPF ringbuf provides two sets of APIs to BPF programs: - bpf_ringbuf_output() allows to *copy* data from one place to a ring buffer, similarly to bpf_perf_event_output(); - bpf_ringbuf_reserve()/bpf_ringbuf_commit()/bpf_ringbuf_discard() APIs split the whole process into two steps. First, a fixed amount of space is reserved. If successful, a pointer to a data inside ring buffer data area is returned, which BPF programs can use similarly to a data inside array/hash maps. Once ready, this piece of memory is either committed or discarded. Discard is similar to commit, but makes consumer ignore the record. bpf_ringbuf_output() has disadvantage of incurring extra memory copy, because record has to be prepared in some other place first. But it allows to submit records of the length that's not known to verifier beforehand. It also closely matches bpf_perf_event_output(), so will simplify migration significantly. bpf_ringbuf_reserve() avoids the extra copy of memory by providing a memory pointer directly to ring buffer memory. In a lot of cases records are larger than BPF stack space allows, so many programs have use extra per-CPU array as a temporary heap for preparing sample. bpf_ringbuf_reserve() avoid this needs completely. But in exchange, it only allows a known constant size of memory to be reserved, such that verifier can verify that BPF program can't access memory outside its reserved record space. bpf_ringbuf_output(), while slightly slower due to extra memory copy, covers some use cases that are not suitable for bpf_ringbuf_reserve(). The difference between commit and discard is very small. Discard just marks a record as discarded, and such records are supposed to be ignored by consumer code. Discard is useful for some advanced use-cases, such as ensuring all-or-nothing multi-record submission, or emulating temporary malloc()/free() within single BPF program invocation. Each reserved record is tracked by verifier through existing reference-tracking logic, similar to socket ref-tracking. It is thus impossible to reserve a record, but forget to submit (or discard) it. bpf_ringbuf_query() helper allows to query various properties of ring buffer. Currently 4 are supported: - BPF_RB_AVAIL_DATA returns amount of unconsumed data in ring buffer; - BPF_RB_RING_SIZE returns the size of ring buffer; - BPF_RB_CONS_POS/BPF_RB_PROD_POS returns current logical possition of consumer/producer, respectively. Returned values are momentarily snapshots of ring buffer state and could be off by the time helper returns, so this should be used only for debugging/reporting reasons or for implementing various heuristics, that take into account highly-changeable nature of some of those characteristics. One such heuristic might involve more fine-grained control over poll/epoll notifications about new data availability in ring buffer. Together with BPF_RB_NO_WAKEUP/BPF_RB_FORCE_WAKEUP flags for output/commit/discard helpers, it allows BPF program a high degree of control and, e.g., more efficient batched notifications. Default self-balancing strategy, though, should be adequate for most applications and will work reliable and efficiently already. Design and implementation ------------------------- This reserve/commit schema allows a natural way for multiple producers, either on different CPUs or even on the same CPU/in the same BPF program, to reserve independent records and work with them without blocking other producers. This means that if BPF program was interruped by another BPF program sharing the same ring buffer, they will both get a record reserved (provided there is enough space left) and can work with it and submit it independently. This applies to NMI context as well, except that due to using a spinlock during reservation, in NMI context, bpf_ringbuf_reserve() might fail to get a lock, in which case reservation will fail even if ring buffer is not full. The ring buffer itself internally is implemented as a power-of-2 sized circular buffer, with two logical and ever-increasing counters (which might wrap around on 32-bit architectures, that's not a problem): - consumer counter shows up to which logical position consumer consumed the data; - producer counter denotes amount of data reserved by all producers. Each time a record is reserved, producer that "owns" the record will successfully advance producer counter. At that point, data is still not yet ready to be consumed, though. Each record has 8 byte header, which contains the length of reserved record, as well as two extra bits: busy bit to denote that record is still being worked on, and discard bit, which might be set at commit time if record is discarded. In the latter case, consumer is supposed to skip the record and move on to the next one. Record header also encodes record's relative offset from the beginning of ring buffer data area (in pages). This allows bpf_ringbuf_commit()/bpf_ringbuf_discard() to accept only the pointer to the record itself, without requiring also the pointer to ring buffer itself. Ring buffer memory location will be restored from record metadata header. This significantly simplifies verifier, as well as improving API usability. Producer counter increments are serialized under spinlock, so there is a strict ordering between reservations. Commits, on the other hand, are completely lockless and independent. All records become available to consumer in the order of reservations, but only after all previous records where already committed. It is thus possible for slow producers to temporarily hold off submitted records, that were reserved later. Reservation/commit/consumer protocol is verified by litmus tests in Documentation/litmus-test/bpf-rb. One interesting implementation bit, that significantly simplifies (and thus speeds up as well) implementation of both producers and consumers is how data area is mapped twice contiguously back-to-back in the virtual memory. This allows to not take any special measures for samples that have to wrap around at the end of the circular buffer data area, because the next page after the last data page would be first data page again, and thus the sample will still appear completely contiguous in virtual memory. See comment and a simple ASCII diagram showing this visually in bpf_ringbuf_area_alloc(). Another feature that distinguishes BPF ringbuf from perf ring buffer is a self-pacing notifications of new data being availability. bpf_ringbuf_commit() implementation will send a notification of new record being available after commit only if consumer has already caught up right up to the record being committed. If not, consumer still has to catch up and thus will see new data anyways without needing an extra poll notification. Benchmarks (see tools/testing/selftests/bpf/benchs/bench_ringbuf.c) show that this allows to achieve a very high throughput without having to resort to tricks like "notify only every Nth sample", which are necessary with perf buffer. For extreme cases, when BPF program wants more manual control of notifications, commit/discard/output helpers accept BPF_RB_NO_WAKEUP and BPF_RB_FORCE_WAKEUP flags, which give full control over notifications of data availability, but require extra caution and diligence in using this API. Comparison to alternatives -------------------------- Before considering implementing BPF ring buffer from scratch existing alternatives in kernel were evaluated, but didn't seem to meet the needs. They largely fell into few categores: - per-CPU buffers (perf, ftrace, etc), which don't satisfy two motivations outlined above (ordering and memory consumption); - linked list-based implementations; while some were multi-producer designs, consuming these from user-space would be very complicated and most probably not performant; memory-mapping contiguous piece of memory is simpler and more performant for user-space consumers; - io_uring is SPSC, but also requires fixed-sized elements. Naively turning SPSC queue into MPSC w/ lock would have subpar performance compared to locked reserve + lockless commit, as with BPF ring buffer. Fixed sized elements would be too limiting for BPF programs, given existing BPF programs heavily rely on variable-sized perf buffer already; - specialized implementations (like a new printk ring buffer, [0]) with lots of printk-specific limitations and implications, that didn't seem to fit well for intended use with BPF programs. [0] https://lwn.net/Articles/779550/ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-2-andriin@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 + include/linux/bpf_types.h | 1 + include/linux/bpf_verifier.h | 4 + include/uapi/linux/bpf.h | 84 +++- kernel/bpf/Makefile | 2 +- kernel/bpf/helpers.c | 10 + kernel/bpf/ringbuf.c | 501 +++++++++++++++++++++ kernel/bpf/syscall.c | 12 + kernel/bpf/verifier.c | 195 ++++++-- kernel/trace/bpf_trace.c | 10 + tools/include/uapi/linux/bpf.h | 84 +++- tools/testing/selftests/bpf/verifier/and.c | 4 +- .../testing/selftests/bpf/verifier/array_access.c | 4 +- tools/testing/selftests/bpf/verifier/bounds.c | 6 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- .../selftests/bpf/verifier/direct_value_access.c | 4 +- .../selftests/bpf/verifier/helper_access_var_len.c | 2 +- .../selftests/bpf/verifier/helper_value_access.c | 6 +- .../selftests/bpf/verifier/value_ptr_arith.c | 8 +- 19 files changed, 882 insertions(+), 70 deletions(-) create mode 100644 kernel/bpf/ringbuf.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index efe8836b5c48..e5884f7f801c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -90,6 +90,8 @@ struct bpf_map_ops { int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); + __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts); }; struct bpf_map_memory { @@ -244,6 +246,9 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ + ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ + ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ + ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ }; /* type of values returned from helper functions */ @@ -255,6 +260,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ + RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -322,6 +328,8 @@ enum bpf_reg_type { PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ PTR_TO_BTF_ID, /* reg points to kernel struct */ PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ + PTR_TO_MEM, /* reg points to valid memory region */ + PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -1611,6 +1619,11 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_event_output_data_proto; +extern const struct bpf_func_proto bpf_ringbuf_output_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_proto; +extern const struct bpf_func_proto bpf_ringbuf_query_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 29d22752fc87..fa8e1b552acd 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -118,6 +118,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) #if defined(CONFIG_BPF_JIT) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ea833087e853..ca08db4ffb5f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -54,6 +54,8 @@ struct bpf_reg_state { u32 btf_id; /* for PTR_TO_BTF_ID */ + u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ + /* Max size from any of the above. */ unsigned long raw; }; @@ -63,6 +65,8 @@ struct bpf_reg_state { * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. + * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation + * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 375b933010dd..8fca02f64811 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb4fb634275e..be43ab3e619f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c new file mode 100644 index 000000000000..180414bb0d3e --- /dev/null +++ b/kernel/bpf/ringbuf.c @@ -0,0 +1,501 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) + +/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ +#define RINGBUF_PGOFF \ + (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) +/* consumer page and producer page */ +#define RINGBUF_POS_PAGES 2 + +#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) + +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and take + * into account few extra pages for consumer/producer pages and + * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single + * ring buffer. + */ +#define RINGBUF_MAX_DATA_SZ \ + (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + +struct bpf_ringbuf { + wait_queue_head_t waitq; + struct irq_work work; + u64 mask; + struct page **pages; + int nr_pages; + spinlock_t spinlock ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to allow + * mapping consumer page as r/w, but restrict producer page to r/o. + * This protects producer position from being modified by user-space + * application and ruining in-kernel position tracking. + */ + unsigned long consumer_pos __aligned(PAGE_SIZE); + unsigned long producer_pos __aligned(PAGE_SIZE); + char data[] __aligned(PAGE_SIZE); +}; + +struct bpf_ringbuf_map { + struct bpf_map map; + struct bpf_map_memory memory; + struct bpf_ringbuf *rb; +}; + +/* 8-byte ring buffer record header structure */ +struct bpf_ringbuf_hdr { + u32 len; + u32 pg_off; +}; + +static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) +{ + const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO; + int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_data_pages = data_sz >> PAGE_SHIFT; + int nr_pages = nr_meta_pages + nr_data_pages; + struct page **pages, *page; + struct bpf_ringbuf *rb; + size_t array_size; + int i; + + /* Each data page is mapped twice to allow "virtual" + * continuous read of samples wrapping around the end of ring + * buffer area: + * ------------------------------------------------------ + * | meta pages | real data pages | same data pages | + * ------------------------------------------------------ + * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | + * ------------------------------------------------------ + * | | TA DA | TA DA | + * ------------------------------------------------------ + * ^^^^^^^ + * | + * Here, no need to worry about special handling of wrapped-around + * data due to double-mapped data pages. This works both in kernel and + * when mmap()'ed in user-space, simplifying both kernel and + * user-space implementations significantly. + */ + array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); + if (array_size > PAGE_SIZE) + pages = vmalloc_node(array_size, numa_node); + else + pages = kmalloc_node(array_size, flags, numa_node); + if (!pages) + return NULL; + + for (i = 0; i < nr_pages; i++) { + page = alloc_pages_node(numa_node, flags, 0); + if (!page) { + nr_pages = i; + goto err_free_pages; + } + pages[i] = page; + if (i >= nr_meta_pages) + pages[nr_data_pages + i] = page; + } + + rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, + VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + if (rb) { + rb->pages = pages; + rb->nr_pages = nr_pages; + return rb; + } + +err_free_pages: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); + return NULL; +} + +static void bpf_ringbuf_notify(struct irq_work *work) +{ + struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); + + wake_up_all(&rb->waitq); +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +{ + struct bpf_ringbuf *rb; + + if (!data_sz || !PAGE_ALIGNED(data_sz)) + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (data_sz > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + + rb = bpf_ringbuf_area_alloc(data_sz, numa_node); + if (!rb) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&rb->spinlock); + init_waitqueue_head(&rb->waitq); + init_irq_work(&rb->work, bpf_ringbuf_notify); + + rb->mask = data_sz - 1; + rb->consumer_pos = 0; + rb->producer_pos = 0; + + return rb; +} + +static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) +{ + struct bpf_ringbuf_map *rb_map; + u64 cost; + int err; + + if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size || attr->value_size || + attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + return ERR_PTR(-EINVAL); + + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + if (!rb_map) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rb_map->map, attr); + + cost = sizeof(struct bpf_ringbuf_map) + + sizeof(struct bpf_ringbuf) + + attr->max_entries; + err = bpf_map_charge_init(&rb_map->map.memory, cost); + if (err) + goto err_free_map; + + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + if (IS_ERR(rb_map->rb)) { + err = PTR_ERR(rb_map->rb); + goto err_uncharge; + } + + return &rb_map->map; + +err_uncharge: + bpf_map_charge_finish(&rb_map->map.memory); +err_free_map: + kfree(rb_map); + return ERR_PTR(err); +} + +static void bpf_ringbuf_free(struct bpf_ringbuf *rb) +{ + /* copy pages pointer and nr_pages to local variable, as we are going + * to unmap rb itself with vunmap() below + */ + struct page **pages = rb->pages; + int i, nr_pages = rb->nr_pages; + + vunmap(rb); + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); +} + +static void ringbuf_map_free(struct bpf_map *map) +{ + struct bpf_ringbuf_map *rb_map; + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + bpf_ringbuf_free(rb_map->rb); + kfree(rb_map); +} + +static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-ENOTSUPP); +} + +static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) +{ + size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; + + /* consumer page + producer page + 2 x data pages */ + return RINGBUF_POS_PAGES + 2 * data_pages; +} + +static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + size_t mmap_sz; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; + + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) + return -EINVAL; + + return remap_vmalloc_range(vma, rb_map->rb, + vma->vm_pgoff + RINGBUF_PGOFF); +} + +static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = smp_load_acquire(&rb->consumer_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; +} + +static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +const struct bpf_map_ops ringbuf_map_ops = { + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap, + .map_poll = ringbuf_map_poll, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, +}; + +/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, + * calculate offset from record metadata to ring buffer in pages, rounded + * down. This page offset is stored as part of record metadata and allows to + * restore struct bpf_ringbuf * from record pointer. This page offset is + * stored at offset 4 of record metadata header. + */ +static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, + struct bpf_ringbuf_hdr *hdr) +{ + return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; +} + +/* Given pointer to ring buffer record header, restore pointer to struct + * bpf_ringbuf itself by using page offset stored at offset 4 + */ +static struct bpf_ringbuf * +bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) +{ + unsigned long addr = (unsigned long)(void *)hdr; + unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; + + return (void*)((addr & PAGE_MASK) - off); +} + +static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) +{ + unsigned long cons_pos, prod_pos, new_prod_pos, flags; + u32 len, pg_off; + struct bpf_ringbuf_hdr *hdr; + + if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) + return NULL; + + len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + cons_pos = smp_load_acquire(&rb->consumer_pos); + + if (in_nmi()) { + if (!spin_trylock_irqsave(&rb->spinlock, flags)) + return NULL; + } else { + spin_lock_irqsave(&rb->spinlock, flags); + } + + prod_pos = rb->producer_pos; + new_prod_pos = prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + + hdr = (void *)rb->data + (prod_pos & rb->mask); + pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pg_off = pg_off; + + /* pairs with consumer's smp_load_acquire() */ + smp_store_release(&rb->producer_pos, new_prod_pos); + + spin_unlock_irqrestore(&rb->spinlock, flags); + + return (void *)hdr + BPF_RINGBUF_HDR_SZ; +} + +BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + + if (unlikely(flags)) + return 0; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); +} + +const struct bpf_func_proto bpf_ringbuf_reserve_proto = { + .func = bpf_ringbuf_reserve, + .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) +{ + unsigned long rec_pos, cons_pos; + struct bpf_ringbuf_hdr *hdr; + struct bpf_ringbuf *rb; + u32 new_len; + + hdr = sample - BPF_RINGBUF_HDR_SZ; + rb = bpf_ringbuf_restore_from_rec(hdr); + new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* update record header with correct final size prefix */ + xchg(&hdr->len, new_len); + + /* if consumer caught up and is waiting for our record, notify about + * new data availability + */ + rec_pos = (void *)hdr - (void *)rb->data; + cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) + irq_work_queue(&rb->work); +} + +BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_proto = { + .func = bpf_ringbuf_submit, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, true /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_proto = { + .func = bpf_ringbuf_discard, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, + u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + void *rec; + + if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) + return -EINVAL; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + rec = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!rec) + return -EAGAIN; + + memcpy(rec, data, size); + bpf_ringbuf_commit(rec, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) +{ + struct bpf_ringbuf *rb; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + switch (flags) { + case BPF_RB_AVAIL_DATA: + return ringbuf_avail_data_sz(rb); + case BPF_RB_RING_SIZE: + return rb->mask + 1; + case BPF_RB_CONS_POS: + return smp_load_acquire(&rb->consumer_pos); + case BPF_RB_PROD_POS: + return smp_load_acquire(&rb->producer_pos); + default: + return 0; + } +} + +const struct bpf_func_proto bpf_ringbuf_query_proto = { + .func = bpf_ringbuf_query, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c969a9b90d3..9de3540fa90c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -662,6 +663,16 @@ out: return err; } +static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_poll) + return map->ops->map_poll(map, filp, pts); + + return EPOLLERR; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = { .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, + .poll = bpf_map_poll, }; int bpf_map_new_fd(struct bpf_map *map, int flags) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d725a26f66e..5c7bbaac81ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -233,6 +233,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + int mem_size; u64 msize_max_value; int ref_obj_id; int func_id; @@ -408,7 +409,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL; + type == PTR_TO_BTF_ID_OR_NULL || + type == PTR_TO_MEM_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -422,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_MEM || + type == PTR_TO_MEM_OR_NULL; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -436,7 +440,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ringbuf_submit || + func_id == BPF_FUNC_ringbuf_discard; } static bool may_be_acquire_function(enum bpf_func_id func_id) @@ -444,7 +450,8 @@ static bool may_be_acquire_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem; + func_id == BPF_FUNC_map_lookup_elem || + func_id == BPF_FUNC_ringbuf_reserve; } static bool is_acquire_function(enum bpf_func_id func_id, @@ -454,7 +461,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp) + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ringbuf_reserve) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -494,6 +502,8 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_MEM] = "mem", + [PTR_TO_MEM_OR_NULL] = "mem_or_null", }; static char slot_type_char[] = { @@ -2468,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, return 0; } -/* check read/write into map element returned by bpf_map_lookup_elem() */ -static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, bool zero_size_allowed) +/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ +static int __check_mem_access(struct bpf_verifier_env *env, int regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + bool size_ok = size > 0 || (size == 0 && zero_size_allowed); + struct bpf_reg_state *reg; + + if (off >= 0 && size_ok && (u64)off + size <= mem_size) + return 0; - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - off + size > map->value_size) { + reg = &cur_regs(env)[regno]; + switch (reg->type) { + case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", - map->value_size, off, size); - return -EACCES; + mem_size, off, size); + break; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET_END: + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, off, mem_size); + break; + case PTR_TO_MEM: + default: + verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", + mem_size, off, size); } - return 0; + + return -EACCES; } -/* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) +/* check read/write into a memory region with possible variable offset */ +static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We may have adjusted the register to this map value, so we + /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ @@ -2514,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size, - zero_size_allowed); + err = __check_mem_access(env, regno, reg->smin_value + off, size, + mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the array range\n", + verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } @@ -2527,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size, - zero_size_allowed); - if (err) - verbose(env, "R%d max value is outside of the array range\n", + err = __check_mem_access(env, regno, reg->umax_value + off, size, + mem_size, zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of the allowed memory range\n", regno); + return err; + } + + return 0; +} - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, bool zero_size_allowed) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_mem_region_access(env, regno, off, size, map->value_size, + zero_size_allowed); + if (err) + return err; + + if (map_value_has_spin_lock(map)) { + u32 lock = map->spin_lock_off; /* if any part of struct bpf_spin_lock can be touched by * load/store reject this program. @@ -2596,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, reg->off, reg->range); - return -EACCES; - } - return 0; -} - static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { @@ -2631,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size, zero_size_allowed); + err = __check_mem_access(env, regno, off, size, reg->range, + zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; } - /* __check_packet_access has made sure "off + size - 1" is within u16. + /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info - * that __check_packet_access would have rejected this pkt access. + * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = @@ -3220,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } + } else if (reg->type == PTR_TO_MEM) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; u32 btf_id = 0; @@ -3557,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MEM: + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3661,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_ALLOC_MEM || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + +static bool arg_type_is_alloc_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; +} + static bool arg_type_is_int_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_INT || @@ -3720,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + arg_type == ARG_CONST_SIZE_OR_ZERO || + arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { expected_type = SCALAR_VALUE; if (type != expected_type) goto err_type; @@ -3791,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * happens during stack boundary checking. */ if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + (arg_type == ARG_PTR_TO_MEM_OR_NULL || + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_alloc_mem_ptr(arg_type)) { + expected_type = PTR_TO_MEM; + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) + /* final test in check_stack_boundary() */; + else if (type != expected_type) + goto err_type; + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -3893,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); + } else if (arg_type_is_alloc_size(arg_type)) { + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + meta->mem_size = reg->var_off.value; } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -3929,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_xdp_output) goto error; break; + case BPF_MAP_TYPE_RINGBUF: + if (func_id != BPF_FUNC_ringbuf_output && + func_id != BPF_FUNC_ringbuf_reserve && + func_id != BPF_FUNC_ringbuf_submit && + func_id != BPF_FUNC_ringbuf_discard && + func_id != BPF_FUNC_ringbuf_query) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -4655,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].mem_size = meta.mem_size; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -6611,6 +6706,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_TCP_SOCK; } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { reg->type = PTR_TO_BTF_ID; + } else if (reg->type == PTR_TO_MEM_OR_NULL) { + reg->type = PTR_TO_MEM; } if (is_null) { /* We don't need id and ref_obj_id from this point diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187cd6995bbb..3767d34114c0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_value_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c index e0fad1548737..d781bc86e100 100644 --- a/tools/testing/selftests/bpf/verifier/and.c +++ b/tools/testing/selftests/bpf/verifier/and.c @@ -15,7 +15,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -44,7 +44,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index f3c33e128709..1c4b1939f5a8 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -117,7 +117,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -137,7 +137,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R0 unbounded memory access, make sure to bounds check any such access", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index 58f4aa593b1b..4d6645f2874c 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -20,7 +20,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, }, { @@ -146,7 +146,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT }, { @@ -354,7 +354,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT }, { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 7629a0cebb9b..94258c6b5235 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -105,7 +105,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_hash_8b = { 16 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "calls: overlapping caller/callee", diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index b9fb28e8e224..988f46a1a4c7 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -68,7 +68,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 7", @@ -220,7 +220,7 @@ }, .fixup_map_array_small = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 19", diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 67ab12410050..5a605ae131a9 100644 --- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -318,7 +318,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 4 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c index 7572e403ddb9..961f28139b96 100644 --- a/tools/testing/selftests/bpf/verifier/helper_value_access.c +++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c @@ -280,7 +280,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -415,7 +415,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -926,7 +926,7 @@ }, .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, - .errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R2 unbounded memory access, make sure to bounds check any such access", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index a53d99cebd9f..97ee658e1242 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -50,7 +50,7 @@ .fixup_map_array_48b = { 8 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R0 min value is outside of the array range", + .errstr_unpriv = "R0 min value is outside of the allowed memory range", .retval = 1, }, { @@ -325,7 +325,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result_unpriv = REJECT, .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", }, @@ -601,7 +601,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R1 max value is outside of the array range", + .errstr = "R1 max value is outside of the allowed memory range", .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -726,7 +726,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "map access: value_ptr -= known scalar, 2", -- cgit v1.2.3-59-g8ed1b From bf99c936f9478a05d51e9f101f90de70bee9a89c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:21 -0700 Subject: libbpf: Add BPF ring buffer support Declaring and instantiating BPF ring buffer doesn't require any changes to libbpf, as it's just another type of maps. So using existing BTF-defined maps syntax with __uint(type, BPF_MAP_TYPE_RINGBUF) and __uint(max_elements, ) is all that's necessary to create and use BPF ring buffer. This patch adds BPF ring buffer consumer to libbpf. It is very similar to perf_buffer implementation in terms of API, but also attempts to fix some minor problems and inconveniences with existing perf_buffer API. ring_buffer support both single ring buffer use case (with just using ring_buffer__new()), as well as allows to add more ring buffers, each with its own callback and context. This allows to efficiently poll and consume multiple, potentially completely independent, ring buffers, using single epoll instance. The latter is actually a problem in practice for applications that are using multiple sets of perf buffers. They have to create multiple instances for struct perf_buffer and poll them independently or in a loop, each approach having its own problems (e.g., inability to use a common poll timeout). struct ring_buffer eliminates this problem by aggregating many independent ring buffer instances under the single "ring buffer manager". Second, perf_buffer's callback can't return error, so applications that need to stop polling due to error in data or data signalling the end, have to use extra mechanisms to signal that polling has to stop. ring_buffer's callback can return error, which will be passed through back to user code and can be acted upon appropariately. Two APIs allow to consume ring buffer data: - ring_buffer__poll(), which will wait for data availability notification and will consume data only from reported ring buffer(s); this API allows to efficiently use resources by reading data only when it becomes available; - ring_buffer__consume(), will attempt to read new records regardless of data availablity notification sub-system. This API is useful for cases when lowest latency is required, in expense of burning CPU resources. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-3-andriin@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/libbpf.h | 21 ++++ tools/lib/bpf/libbpf.map | 5 + tools/lib/bpf/libbpf_probes.c | 5 + tools/lib/bpf/ringbuf.c | 285 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 tools/lib/bpf/ringbuf.c diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index e3962cfbc9a6..190366d05588 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,3 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o + btf_dump.o ringbuf.o diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 1e2e399a5f2c..8528a02d5af8 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -478,6 +478,27 @@ LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, size_t info_size, __u32 flags); +/* Ring buffer APIs */ +struct ring_buffer; + +typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); + +struct ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatiblity */ +}; + +#define ring_buffer_opts__last_field sz + +LIBBPF_API struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts); +LIBBPF_API void ring_buffer__free(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx); +LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); +LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); + +/* Perf buffer APIs */ struct perf_buffer; typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 381a7342ecfc..c18860200abb 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -263,4 +263,9 @@ LIBBPF_0.0.9 { bpf_link_get_next_id; bpf_program__attach_iter; perf_buffer__consume; + ring_buffer__add; + ring_buffer__consume; + ring_buffer__free; + ring_buffer__new; + ring_buffer__poll; } LIBBPF_0.0.8; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 2c92059c0c90..10cd8d1891f5 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -238,6 +238,11 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) if (btf_fd < 0) return false; break; + case BPF_MAP_TYPE_RINGBUF: + key_size = 0; + value_size = 0; + max_entries = 4096; + break; case BPF_MAP_TYPE_UNSPEC: case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_ARRAY: diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c new file mode 100644 index 000000000000..bc10fa1d43c7 --- /dev/null +++ b/tools/lib/bpf/ringbuf.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Ring buffer operations. + * + * Copyright (C) 2020 Facebook, Inc. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf.h" +#include "libbpf_internal.h" +#include "bpf.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +struct ring { + ring_buffer_sample_fn sample_cb; + void *ctx; + void *data; + unsigned long *consumer_pos; + unsigned long *producer_pos; + unsigned long mask; + int map_fd; +}; + +struct ring_buffer { + struct epoll_event *events; + struct ring *rings; + size_t page_size; + int epoll_fd; + int ring_cnt; +}; + +static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) +{ + if (r->consumer_pos) { + munmap(r->consumer_pos, rb->page_size); + r->consumer_pos = NULL; + } + if (r->producer_pos) { + munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1)); + r->producer_pos = NULL; + } +} + +/* Add extra RINGBUF maps to this ring buffer manager */ +int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + struct epoll_event *e; + struct ring *r; + void *tmp; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("ringbuf: failed to get map info for fd=%d: %d\n", + map_fd, err); + return err; + } + + if (info.type != BPF_MAP_TYPE_RINGBUF) { + pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n", + map_fd); + return -EINVAL; + } + + tmp = reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings)); + if (!tmp) + return -ENOMEM; + rb->rings = tmp; + + tmp = reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events)); + if (!tmp) + return -ENOMEM; + rb->events = tmp; + + r = &rb->rings[rb->ring_cnt]; + memset(r, 0, sizeof(*r)); + + r->map_fd = map_fd; + r->sample_cb = sample_cb; + r->ctx = ctx; + r->mask = info.max_entries - 1; + + /* Map writable consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, + map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return err; + } + r->consumer_pos = tmp; + + /* Map read-only producer page and data pages. We map twice as big + * data size to allow simple reading of samples that wrap around the + * end of a ring buffer. See kernel implementation for details. + * */ + tmp = mmap(NULL, rb->page_size + 2 * info.max_entries, PROT_READ, + MAP_SHARED, map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return err; + } + r->producer_pos = tmp; + r->data = tmp + rb->page_size; + + e = &rb->events[rb->ring_cnt]; + memset(e, 0, sizeof(*e)); + + e->events = EPOLLIN; + e->data.fd = rb->ring_cnt; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n", + map_fd, err); + return err; + } + + rb->ring_cnt++; + return 0; +} + +void ring_buffer__free(struct ring_buffer *rb) +{ + int i; + + if (!rb) + return; + + for (i = 0; i < rb->ring_cnt; ++i) + ringbuf_unmap_ring(rb, &rb->rings[i]); + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb->events); + free(rb->rings); + free(rb); +} + +struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts) +{ + struct ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, ring_buffer_opts)) + return NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = ring_buffer__add(rb, map_fd, sample_cb, ctx); + if (err) + goto err_out; + + return rb; + +err_out: + ring_buffer__free(rb); + return NULL; +} + +static inline int roundup_len(__u32 len) +{ + /* clear out top 2 bits (discard and busy, if set) */ + len <<= 2; + len >>= 2; + /* add length prefix */ + len += BPF_RINGBUF_HDR_SZ; + /* round up to 8 byte alignment */ + return (len + 7) / 8 * 8; +} + +static int ringbuf_process_ring(struct ring* r) +{ + int *len_ptr, len, err, cnt = 0; + unsigned long cons_pos, prod_pos; + bool got_new_data; + void *sample; + + cons_pos = smp_load_acquire(r->consumer_pos); + do { + got_new_data = false; + prod_pos = smp_load_acquire(r->producer_pos); + while (cons_pos < prod_pos) { + len_ptr = r->data + (cons_pos & r->mask); + len = smp_load_acquire(len_ptr); + + /* sample not committed yet, bail out for now */ + if (len & BPF_RINGBUF_BUSY_BIT) + goto done; + + got_new_data = true; + cons_pos += roundup_len(len); + + if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) { + sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ; + err = r->sample_cb(r->ctx, sample, len); + if (err) { + /* update consumer pos and bail out */ + smp_store_release(r->consumer_pos, + cons_pos); + return err; + } + cnt++; + } + + smp_store_release(r->consumer_pos, cons_pos); + } + } while (got_new_data); +done: + return cnt; +} + +/* Consume available ring buffer(s) data without event polling. + * Returns number of records consumed across all registered ring buffers, or + * negative number if any of the callbacks return error. + */ +int ring_buffer__consume(struct ring_buffer *rb) +{ + int i, err, res = 0; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = &rb->rings[i]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return err; + res += err; + } + return res; +} + +/* Poll for available data and consume records, if any are available. + * Returns number of records consumed, or negative number, if any of the + * registered callbacks returned error. + */ +int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) +{ + int i, cnt, err, res = 0; + + cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms); + for (i = 0; i < cnt; i++) { + __u32 ring_id = rb->events[i].data.fd; + struct ring *ring = &rb->rings[ring_id]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return err; + res += cnt; + } + return cnt < 0 ? -errno : res; +} -- cgit v1.2.3-59-g8ed1b From cb1c9ddd552520abd49031d47397c6e95bad882e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:22 -0700 Subject: selftests/bpf: Add BPF ringbuf selftests Both singleton BPF ringbuf and BPF ringbuf with map-in-map use cases are tested. Also reserve+submit/discards and output variants of API are validated. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-4-andriin@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 211 +++++++++++++++++++++ .../selftests/bpf/prog_tests/ringbuf_multi.c | 102 ++++++++++ tools/testing/selftests/bpf/progs/test_ringbuf.c | 78 ++++++++ .../selftests/bpf/progs/test_ringbuf_multi.c | 77 ++++++++ 4 files changed, 468 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/ringbuf.c create mode 100644 tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf.c create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_multi.c diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c new file mode 100644 index 000000000000..bb8541f240e2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "test_ringbuf.skel.h" + +#define EDONE 7777 + +static int duration = 0; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +static int sample_cnt; + +static int process_sample(void *ctx, void *data, size_t len) +{ + struct sample *s = data; + + sample_cnt++; + + switch (s->seq) { + case 0: + CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n", + 333L, s->value); + return 0; + case 1: + CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n", + 777L, s->value); + return -EDONE; + default: + /* we don't care about the rest */ + return 0; + } +} + +static struct test_ringbuf *skel; +static struct ring_buffer *ringbuf; + +static void trigger_samples() +{ + skel->bss->dropped = 0; + skel->bss->total = 0; + skel->bss->discarded = 0; + + /* trigger exactly two samples */ + skel->bss->value = 333; + syscall(__NR_getpgid); + skel->bss->value = 777; + syscall(__NR_getpgid); +} + +static void *poll_thread(void *input) +{ + long timeout = (long)input; + + return (void *)(long)ring_buffer__poll(ringbuf, timeout); +} + +void test_ringbuf(void) +{ + const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample); + pthread_t thread; + long bg_ret = -1; + int err; + + skel = test_ringbuf__open_and_load(); + if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + return; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf), + process_sample, NULL, NULL); + if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) + goto cleanup; + + err = test_ringbuf__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err)) + goto cleanup; + + trigger_samples(); + + /* 2 submitted + 1 discarded records */ + CHECK(skel->bss->avail_data != 3 * rec_sz, + "err_avail_size", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->avail_data); + CHECK(skel->bss->ring_size != 4096, + "err_ring_size", "exp %ld, got %ld\n", + 4096L, skel->bss->ring_size); + CHECK(skel->bss->cons_pos != 0, + "err_cons_pos", "exp %ld, got %ld\n", + 0L, skel->bss->cons_pos); + CHECK(skel->bss->prod_pos != 3 * rec_sz, + "err_prod_pos", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->prod_pos); + + /* poll for samples */ + err = ring_buffer__poll(ringbuf, -1); + + /* -EDONE is used as an indicator that we are done */ + if (CHECK(err != -EDONE, "err_done", "done err: %d\n", err)) + goto cleanup; + + /* we expect extra polling to return nothing */ + err = ring_buffer__poll(ringbuf, 0); + if (CHECK(err != 0, "extra_samples", "poll result: %d\n", err)) + goto cleanup; + + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + /* now validate consumer position is updated and returned */ + trigger_samples(); + CHECK(skel->bss->cons_pos != 3 * rec_sz, + "err_cons_pos", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->cons_pos); + err = ring_buffer__poll(ringbuf, -1); + CHECK(err <= 0, "poll_err", "err %d\n", err); + + /* start poll in background w/ long timeout */ + err = pthread_create(&thread, NULL, poll_thread, (void *)(long)10000); + if (CHECK(err, "bg_poll", "pthread_create failed: %d\n", err)) + goto cleanup; + + /* turn off notifications now */ + skel->bss->flags = BPF_RB_NO_WAKEUP; + + /* give background thread a bit of a time */ + usleep(50000); + trigger_samples(); + /* sleeping arbitrarily is bad, but no better way to know that + * epoll_wait() **DID NOT** unblock in background thread + */ + usleep(50000); + /* background poll should still be blocked */ + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) + goto cleanup; + + /* BPF side did everything right */ + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + /* clear flags to return to "adaptive" notification mode */ + skel->bss->flags = 0; + + /* produce new samples, no notification should be triggered, because + * consumer is now behind + */ + trigger_samples(); + + /* background poll should still be blocked */ + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) + goto cleanup; + + /* now force notifications */ + skel->bss->flags = BPF_RB_FORCE_WAKEUP; + sample_cnt = 0; + trigger_samples(); + + /* now we should get a pending notification */ + usleep(50000); + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err, "join_bg", "err %d\n", err)) + goto cleanup; + + if (CHECK(bg_ret != 1, "bg_ret", "epoll_wait result: %ld", bg_ret)) + goto cleanup; + + /* 3 rounds, 2 samples each */ + CHECK(sample_cnt != 6, "wrong_sample_cnt", + "expected to see %d samples, got %d\n", 6, sample_cnt); + + /* BPF side did everything right */ + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + test_ringbuf__detach(skel); +cleanup: + ring_buffer__free(ringbuf); + test_ringbuf__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c new file mode 100644 index 000000000000..78e450609803 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include "test_ringbuf_multi.skel.h" + +static int duration = 0; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +static int process_sample(void *ctx, void *data, size_t len) +{ + int ring = (unsigned long)ctx; + struct sample *s = data; + + switch (s->seq) { + case 0: + CHECK(ring != 1, "sample1_ring", "exp %d, got %d\n", 1, ring); + CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n", + 333L, s->value); + break; + case 1: + CHECK(ring != 2, "sample2_ring", "exp %d, got %d\n", 2, ring); + CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n", + 777L, s->value); + break; + default: + CHECK(true, "extra_sample", "unexpected sample seq %d, val %ld\n", + s->seq, s->value); + return -1; + } + + return 0; +} + +void test_ringbuf_multi(void) +{ + struct test_ringbuf_multi *skel; + struct ring_buffer *ringbuf; + int err; + + skel = test_ringbuf_multi__open_and_load(); + if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + return; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf1), + process_sample, (void *)(long)1, NULL); + if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) + goto cleanup; + + err = ring_buffer__add(ringbuf, bpf_map__fd(skel->maps.ringbuf2), + process_sample, (void *)(long)2); + if (CHECK(err, "ringbuf_add", "failed to add another ring\n")) + goto cleanup; + + err = test_ringbuf_multi__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err)) + goto cleanup; + + /* trigger few samples, some will be skipped */ + skel->bss->target_ring = 0; + skel->bss->value = 333; + syscall(__NR_getpgid); + + /* skipped, no ringbuf in slot 1 */ + skel->bss->target_ring = 1; + skel->bss->value = 555; + syscall(__NR_getpgid); + + skel->bss->target_ring = 2; + skel->bss->value = 777; + syscall(__NR_getpgid); + + /* poll for samples, should get 2 ringbufs back */ + err = ring_buffer__poll(ringbuf, -1); + if (CHECK(err != 4, "poll_res", "expected 4 records, got %d\n", err)) + goto cleanup; + + /* expect extra polling to return nothing */ + err = ring_buffer__poll(ringbuf, 0); + if (CHECK(err < 0, "extra_samples", "poll result: %d\n", err)) + goto cleanup; + + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->skipped != 1, "err_skipped", "exp %ld, got %ld\n", + 1L, skel->bss->skipped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + +cleanup: + ring_buffer__free(ringbuf); + test_ringbuf_multi__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c new file mode 100644 index 000000000000..8ba9959b036b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf SEC(".maps"); + +/* inputs */ +int pid = 0; +long value = 0; +long flags = 0; + +/* outputs */ +long total = 0; +long discarded = 0; +long dropped = 0; + +long avail_data = 0; +long ring_size = 0; +long cons_pos = 0; +long prod_pos = 0; + +/* inner state */ +long seq = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int test_ringbuf(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + int zero = 0; + + if (cur_pid != pid) + return 0; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) { + __sync_fetch_and_add(&dropped, 1); + return 1; + } + + sample->pid = pid; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + sample->value = value; + + sample->seq = seq++; + __sync_fetch_and_add(&total, 1); + + if (sample->seq & 1) { + /* copy from reserved sample to a new one... */ + bpf_ringbuf_output(&ringbuf, sample, sizeof(*sample), flags); + /* ...and then discard reserved sample */ + bpf_ringbuf_discard(sample, flags); + __sync_fetch_and_add(&discarded, 1); + } else { + bpf_ringbuf_submit(sample, flags); + } + + avail_data = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + ring_size = bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE); + cons_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS); + prod_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c new file mode 100644 index 000000000000..edf3b6953533 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct ringbuf_map { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf1 SEC(".maps"), + ringbuf2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 4); + __type(key, int); + __array(values, struct ringbuf_map); +} ringbuf_arr SEC(".maps") = { + .values = { + [0] = &ringbuf1, + [2] = &ringbuf2, + }, +}; + +/* inputs */ +int pid = 0; +int target_ring = 0; +long value = 0; + +/* outputs */ +long total = 0; +long dropped = 0; +long skipped = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int test_ringbuf(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + void *rb; + int zero = 0; + + if (cur_pid != pid) + return 0; + + rb = bpf_map_lookup_elem(&ringbuf_arr, &target_ring); + if (!rb) { + skipped += 1; + return 1; + } + + sample = bpf_ringbuf_reserve(rb, sizeof(*sample), 0); + if (!sample) { + dropped += 1; + return 1; + } + + sample->pid = pid; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + sample->value = value; + + sample->seq = total; + total += 1; + + bpf_ringbuf_submit(sample, 0); + + return 0; +} -- cgit v1.2.3-59-g8ed1b From c97099b0f22722be7d0f290278a26d297cc4b7ca Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:23 -0700 Subject: bpf: Add BPF ringbuf and perf buffer benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend bench framework with ability to have benchmark-provided child argument parser for custom benchmark-specific parameters. This makes bench generic code modular and independent from any specific benchmark. Also implement a set of benchmarks for new BPF ring buffer and existing perf buffer. 4 benchmarks were implemented: 2 variations for each of BPF ringbuf and perfbuf:, - rb-libbpf utilizes stock libbpf ring_buffer manager for reading data; - rb-custom implements custom ring buffer setup and reading code, to eliminate overheads inherent in generic libbpf code due to callback functions and the need to update consumer position after each consumed record, instead of batching updates (due to pessimistic assumption that user callback might take long time and thus could unnecessarily hold ring buffer space for too long); - pb-libbpf uses stock libbpf perf_buffer code with all the default settings, though uses higher-performance raw event callback to minimize unnecessary overhead; - pb-custom implements its own custom consumer code to minimize any possible overhead of generic libbpf implementation and indirect function calls. All of the test support default, no data notification skipped, mode, as well as sampled mode (with --rb-sampled flag), which allows to trigger epoll notification less frequently and reduce overhead. As will be shown, this mode is especially critical for perf buffer, which suffers from high overhead of wakeups in kernel. Otherwise, all benchamrks implement similar way to generate a batch of records by using fentry/sys_getpgid BPF program, which pushes a bunch of records in a tight loop and records number of successful and dropped samples. Each record is a small 8-byte integer, to minimize the effect of memory copying with bpf_perf_event_output() and bpf_ringbuf_output(). Benchmarks that have only one producer implement optional back-to-back mode, in which record production and consumption is alternating on the same CPU. This is the highest-throughput happy case, showing ultimate performance achievable with either BPF ringbuf or perfbuf. All the below scenarios are implemented in a script in benchs/run_bench_ringbufs.sh. Tests were performed on 28-core/56-thread Intel Xeon CPU E5-2680 v4 @ 2.40GHz CPU. Single-producer, parallel producer ================================== rb-libbpf 12.054 ± 0.320M/s (drops 0.000 ± 0.000M/s) rb-custom 8.158 ± 0.118M/s (drops 0.001 ± 0.003M/s) pb-libbpf 0.931 ± 0.007M/s (drops 0.000 ± 0.000M/s) pb-custom 0.965 ± 0.003M/s (drops 0.000 ± 0.000M/s) Single-producer, parallel producer, sampled notification ======================================================== rb-libbpf 11.563 ± 0.067M/s (drops 0.000 ± 0.000M/s) rb-custom 15.895 ± 0.076M/s (drops 0.000 ± 0.000M/s) pb-libbpf 9.889 ± 0.032M/s (drops 0.000 ± 0.000M/s) pb-custom 9.866 ± 0.028M/s (drops 0.000 ± 0.000M/s) Single producer on one CPU, consumer on another one, both running at full speed. Curiously, rb-libbpf has higher throughput than objectively faster (due to more lightweight consumer code path) rb-custom. It appears that faster consumer causes kernel to send notifications more frequently, because consumer appears to be caught up more frequently. Performance of perfbuf suffers from default "no sampling" policy and huge overhead that causes. In sampled mode, rb-custom is winning very significantly eliminating too frequent in-kernel wakeups, the gain appears to be more than 2x. Perf buffer achieves even more impressive wins, compared to stock perfbuf settings, with 10x improvements in throughput with 1:500 sampling rate. The trade-off is that with sampling, application might not get next X events until X+1st arrives, which is not always acceptable. With steady influx of events, though, this shouldn't be a problem. Overall, single-producer performance of ring buffers seems to be better no matter the sampled/non-sampled modes, but it especially beats ring buffer without sampling due to its adaptive notification approach. Single-producer, back-to-back mode ================================== rb-libbpf 15.507 ± 0.247M/s (drops 0.000 ± 0.000M/s) rb-libbpf-sampled 14.692 ± 0.195M/s (drops 0.000 ± 0.000M/s) rb-custom 21.449 ± 0.157M/s (drops 0.000 ± 0.000M/s) rb-custom-sampled 20.024 ± 0.386M/s (drops 0.000 ± 0.000M/s) pb-libbpf 1.601 ± 0.015M/s (drops 0.000 ± 0.000M/s) pb-libbpf-sampled 8.545 ± 0.064M/s (drops 0.000 ± 0.000M/s) pb-custom 1.607 ± 0.022M/s (drops 0.000 ± 0.000M/s) pb-custom-sampled 8.988 ± 0.144M/s (drops 0.000 ± 0.000M/s) Here we test a back-to-back mode, which is arguably best-case scenario both for BPF ringbuf and perfbuf, because there is no contention and for ringbuf also no excessive notification, because consumer appears to be behind after the first record. For ringbuf, custom consumer code clearly wins with 21.5 vs 16 million records per second exchanged between producer and consumer. Sampled mode actually hurts a bit due to slightly slower producer logic (it needs to fetch amount of data available to decide whether to skip or force notification). Perfbuf with wakeup sampling gets 5.5x throughput increase, compared to no-sampling version. There also doesn't seem to be noticeable overhead from generic libbpf handling code. Perfbuf back-to-back, effect of sample rate =========================================== pb-sampled-1 1.035 ± 0.012M/s (drops 0.000 ± 0.000M/s) pb-sampled-5 3.476 ± 0.087M/s (drops 0.000 ± 0.000M/s) pb-sampled-10 5.094 ± 0.136M/s (drops 0.000 ± 0.000M/s) pb-sampled-25 7.118 ± 0.153M/s (drops 0.000 ± 0.000M/s) pb-sampled-50 8.169 ± 0.156M/s (drops 0.000 ± 0.000M/s) pb-sampled-100 8.887 ± 0.136M/s (drops 0.000 ± 0.000M/s) pb-sampled-250 9.180 ± 0.209M/s (drops 0.000 ± 0.000M/s) pb-sampled-500 9.353 ± 0.281M/s (drops 0.000 ± 0.000M/s) pb-sampled-1000 9.411 ± 0.217M/s (drops 0.000 ± 0.000M/s) pb-sampled-2000 9.464 ± 0.167M/s (drops 0.000 ± 0.000M/s) pb-sampled-3000 9.575 ± 0.273M/s (drops 0.000 ± 0.000M/s) This benchmark shows the effect of event sampling for perfbuf. Back-to-back mode for highest throughput. Just doing every 5th record notification gives 3.5x speed up. 250-500 appears to be the point of diminishing return, with almost 9x speed up. Most benchmarks use 500 as the default sampling for pb-raw and pb-custom. Ringbuf back-to-back, effect of sample rate =========================================== rb-sampled-1 1.106 ± 0.010M/s (drops 0.000 ± 0.000M/s) rb-sampled-5 4.746 ± 0.149M/s (drops 0.000 ± 0.000M/s) rb-sampled-10 7.706 ± 0.164M/s (drops 0.000 ± 0.000M/s) rb-sampled-25 12.893 ± 0.273M/s (drops 0.000 ± 0.000M/s) rb-sampled-50 15.961 ± 0.361M/s (drops 0.000 ± 0.000M/s) rb-sampled-100 18.203 ± 0.445M/s (drops 0.000 ± 0.000M/s) rb-sampled-250 19.962 ± 0.786M/s (drops 0.000 ± 0.000M/s) rb-sampled-500 20.881 ± 0.551M/s (drops 0.000 ± 0.000M/s) rb-sampled-1000 21.317 ± 0.532M/s (drops 0.000 ± 0.000M/s) rb-sampled-2000 21.331 ± 0.535M/s (drops 0.000 ± 0.000M/s) rb-sampled-3000 21.688 ± 0.392M/s (drops 0.000 ± 0.000M/s) Similar benchmark for ring buffer also shows a great advantage (in terms of throughput) of skipping notifications. Skipping every 5th one gives 4x boost. Also similar to perfbuf case, 250-500 seems to be the point of diminishing returns, giving roughly 20x better results. Keep in mind, for this test, notifications are controlled manually with BPF_RB_NO_WAKEUP and BPF_RB_FORCE_WAKEUP. As can be seen from previous benchmarks, adaptive notifications based on consumer's positions provides same (or even slightly better due to simpler load generator on BPF side) benefits in favorable back-to-back scenario. Over zealous and fast consumer, which is almost always caught up, will make thoughput numbers smaller. That's the case when manual notification control might prove to be extremely beneficial. Ringbuf back-to-back, reserve+commit vs output ============================================== reserve 22.819 ± 0.503M/s (drops 0.000 ± 0.000M/s) output 18.906 ± 0.433M/s (drops 0.000 ± 0.000M/s) Ringbuf sampled, reserve+commit vs output ========================================= reserve-sampled 15.350 ± 0.132M/s (drops 0.000 ± 0.000M/s) output-sampled 14.195 ± 0.144M/s (drops 0.000 ± 0.000M/s) BPF ringbuf supports two sets of APIs with various usability and performance tradeoffs: bpf_ringbuf_reserve()+bpf_ringbuf_commit() vs bpf_ringbuf_output(). This benchmark clearly shows superiority of reserve+commit approach, despite using a small 8-byte record size. Single-producer, consumer/producer competing on the same CPU, low batch count ============================================================================= rb-libbpf 3.045 ± 0.020M/s (drops 3.536 ± 0.148M/s) rb-custom 3.055 ± 0.022M/s (drops 3.893 ± 0.066M/s) pb-libbpf 1.393 ± 0.024M/s (drops 0.000 ± 0.000M/s) pb-custom 1.407 ± 0.016M/s (drops 0.000 ± 0.000M/s) This benchmark shows one of the worst-case scenarios, in which producer and consumer do not coordinate *and* fight for the same CPU. No batch count and sampling settings were able to eliminate drops for ringbuffer, producer is just too fast for consumer to keep up. But ringbuf and perfbuf still able to pass through quite a lot of messages, which is more than enough for a lot of applications. Ringbuf, multi-producer contention ================================== rb-libbpf nr_prod 1 10.916 ± 0.399M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 4.931 ± 0.030M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 4.880 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 3.926 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 4.011 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 3.967 ± 0.016M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 2.604 ± 0.030M/s (drops 0.001 ± 0.002M/s) rb-libbpf nr_prod 20 2.233 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 2.085 ± 0.015M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 2.055 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 1.962 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 2.089 ± 0.005M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 2.118 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 2.105 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 2.120 ± 0.058M/s (drops 0.000 ± 0.001M/s) rb-libbpf nr_prod 52 2.074 ± 0.024M/s (drops 0.007 ± 0.014M/s) Ringbuf uses a very short-duration spinlock during reservation phase, to check few invariants, increment producer count and set record header. This is the biggest point of contention for ringbuf implementation. This benchmark evaluates the effect of multiple competing writers on overall throughput of a single shared ringbuffer. Overall throughput drops almost 2x when going from single to two highly-contended producers, gradually dropping with additional competing producers. Performance drop stabilizes at around 20 producers and hovers around 2mln even with 50+ fighting producers, which is a 5x drop compared to non-contended case. Good kernel implementation in kernel helps maintain decent performance here. Note, that in the intended real-world scenarios, it's not expected to get even close to such a high levels of contention. But if contention will become a problem, there is always an option of sharding few ring buffers across a set of CPUs. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-5-andriin@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 5 +- tools/testing/selftests/bpf/bench.c | 16 + .../testing/selftests/bpf/benchs/bench_ringbufs.c | 566 +++++++++++++++++++++ .../selftests/bpf/benchs/run_bench_ringbufs.sh | 75 +++ tools/testing/selftests/bpf/progs/perfbuf_bench.c | 33 ++ tools/testing/selftests/bpf/progs/ringbuf_bench.c | 60 +++ 6 files changed, 754 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_ringbufs.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh create mode 100644 tools/testing/selftests/bpf/progs/perfbuf_bench.c create mode 100644 tools/testing/selftests/bpf/progs/ringbuf_bench.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e716e931d0c9..3ce548eff8a8 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -413,12 +413,15 @@ $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ $(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h +$(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ + $(OUTPUT)/perfbuf_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ $(OUTPUT)/bench_count.o \ $(OUTPUT)/bench_rename.o \ - $(OUTPUT)/bench_trigger.o + $(OUTPUT)/bench_trigger.o \ + $(OUTPUT)/bench_ringbufs.o $(call msg,BINARY,,$@) $(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 14390689ef90..944ad4721c83 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -130,6 +130,13 @@ static const struct argp_option opts[] = { {}, }; +extern struct argp bench_ringbufs_argp; + +static const struct argp_child bench_parsers[] = { + { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, + {}, +}; + static error_t parse_arg(int key, char *arg, struct argp_state *state) { static int pos_args; @@ -208,6 +215,7 @@ static void parse_cmdline_args(int argc, char **argv) .options = opts, .parser = parse_arg, .doc = argp_program_doc, + .children = bench_parsers, }; if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) exit(1); @@ -310,6 +318,10 @@ extern const struct bench bench_trig_rawtp; extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fmodret; +extern const struct bench bench_rb_libbpf; +extern const struct bench bench_rb_custom; +extern const struct bench bench_pb_libbpf; +extern const struct bench bench_pb_custom; static const struct bench *benchs[] = { &bench_count_global, @@ -327,6 +339,10 @@ static const struct bench *benchs[] = { &bench_trig_kprobe, &bench_trig_fentry, &bench_trig_fmodret, + &bench_rb_libbpf, + &bench_rb_custom, + &bench_pb_libbpf, + &bench_pb_custom, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c new file mode 100644 index 000000000000..da87c7f31891 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include +#include +#include +#include "bench.h" +#include "ringbuf_bench.skel.h" +#include "perfbuf_bench.skel.h" + +static struct { + bool back2back; + int batch_cnt; + bool sampled; + int sample_rate; + int ringbuf_sz; /* per-ringbuf, in bytes */ + bool ringbuf_use_output; /* use slower output API */ + int perfbuf_sz; /* per-CPU size, in pages */ +} args = { + .back2back = false, + .batch_cnt = 500, + .sampled = false, + .sample_rate = 500, + .ringbuf_sz = 512 * 1024, + .ringbuf_use_output = false, + .perfbuf_sz = 128, +}; + +enum { + ARG_RB_BACK2BACK = 2000, + ARG_RB_USE_OUTPUT = 2001, + ARG_RB_BATCH_CNT = 2002, + ARG_RB_SAMPLED = 2003, + ARG_RB_SAMPLE_RATE = 2004, +}; + +static const struct argp_option opts[] = { + { "rb-b2b", ARG_RB_BACK2BACK, NULL, 0, "Back-to-back mode"}, + { "rb-use-output", ARG_RB_USE_OUTPUT, NULL, 0, "Use bpf_ringbuf_output() instead of bpf_ringbuf_reserve()"}, + { "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record batch count"}, + { "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"}, + { "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample rate"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_RB_BACK2BACK: + args.back2back = true; + break; + case ARG_RB_USE_OUTPUT: + args.ringbuf_use_output = true; + break; + case ARG_RB_BATCH_CNT: + args.batch_cnt = strtol(arg, NULL, 10); + if (args.batch_cnt < 0) { + fprintf(stderr, "Invalid batch count."); + argp_usage(state); + } + break; + case ARG_RB_SAMPLED: + args.sampled = true; + break; + case ARG_RB_SAMPLE_RATE: + args.sample_rate = strtol(arg, NULL, 10); + if (args.sample_rate < 0) { + fprintf(stderr, "Invalid perfbuf sample rate."); + argp_usage(state); + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +/* exported into benchmark runner */ +const struct argp bench_ringbufs_argp = { + .options = opts, + .parser = parse_arg, +}; + +/* RINGBUF-LIBBPF benchmark */ + +static struct counter buf_hits; + +static inline void bufs_trigger_batch() +{ + (void)syscall(__NR_getpgid); +} + +static void bufs_validate() +{ + if (env.consumer_cnt != 1) { + fprintf(stderr, "rb-libbpf benchmark doesn't support multi-consumer!\n"); + exit(1); + } + + if (args.back2back && env.producer_cnt > 1) { + fprintf(stderr, "back-to-back mode makes sense only for single-producer case!\n"); + exit(1); + } +} + +static void *bufs_sample_producer(void *input) +{ + if (args.back2back) { + /* initial batch to get everything started */ + bufs_trigger_batch(); + return NULL; + } + + while (true) + bufs_trigger_batch(); + return NULL; +} + +static struct ringbuf_libbpf_ctx { + struct ringbuf_bench *skel; + struct ring_buffer *ringbuf; +} ringbuf_libbpf_ctx; + +static void ringbuf_libbpf_measure(struct bench_res *res) +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static struct ringbuf_bench *ringbuf_setup_skeleton() +{ + struct ringbuf_bench *skel; + + setup_libbpf(); + + skel = ringbuf_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + skel->rodata->batch_cnt = args.batch_cnt; + skel->rodata->use_output = args.ringbuf_use_output ? 1 : 0; + + if (args.sampled) + /* record data + header take 16 bytes */ + skel->rodata->wakeup_data_size = args.sample_rate * 16; + + bpf_map__resize(skel->maps.ringbuf, args.ringbuf_sz); + + if (ringbuf_bench__load(skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + + return skel; +} + +static int buf_process_sample(void *ctx, void *data, size_t len) +{ + atomic_inc(&buf_hits.value); + return 0; +} + +static void ringbuf_libbpf_setup() +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + struct bpf_link *link; + + ctx->skel = ringbuf_setup_skeleton(); + ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf), + buf_process_sample, NULL, NULL); + if (!ctx->ringbuf) { + fprintf(stderr, "failed to create ringbuf\n"); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } +} + +static void *ringbuf_libbpf_consumer(void *input) +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + + while (ring_buffer__poll(ctx->ringbuf, -1) >= 0) { + if (args.back2back) + bufs_trigger_batch(); + } + fprintf(stderr, "ringbuf polling failed!\n"); + return NULL; +} + +/* RINGBUF-CUSTOM benchmark */ +struct ringbuf_custom { + __u64 *consumer_pos; + __u64 *producer_pos; + __u64 mask; + void *data; + int map_fd; +}; + +static struct ringbuf_custom_ctx { + struct ringbuf_bench *skel; + struct ringbuf_custom ringbuf; + int epoll_fd; + struct epoll_event event; +} ringbuf_custom_ctx; + +static void ringbuf_custom_measure(struct bench_res *res) +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static void ringbuf_custom_setup() +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + const size_t page_size = getpagesize(); + struct bpf_link *link; + struct ringbuf_custom *r; + void *tmp; + int err; + + ctx->skel = ringbuf_setup_skeleton(); + + ctx->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (ctx->epoll_fd < 0) { + fprintf(stderr, "failed to create epoll fd: %d\n", -errno); + exit(1); + } + + r = &ctx->ringbuf; + r->map_fd = bpf_map__fd(ctx->skel->maps.ringbuf); + r->mask = args.ringbuf_sz - 1; + + /* Map writable consumer page */ + tmp = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, + r->map_fd, 0); + if (tmp == MAP_FAILED) { + fprintf(stderr, "failed to mmap consumer page: %d\n", -errno); + exit(1); + } + r->consumer_pos = tmp; + + /* Map read-only producer page and data pages. */ + tmp = mmap(NULL, page_size + 2 * args.ringbuf_sz, PROT_READ, MAP_SHARED, + r->map_fd, page_size); + if (tmp == MAP_FAILED) { + fprintf(stderr, "failed to mmap data pages: %d\n", -errno); + exit(1); + } + r->producer_pos = tmp; + r->data = tmp + page_size; + + ctx->event.events = EPOLLIN; + err = epoll_ctl(ctx->epoll_fd, EPOLL_CTL_ADD, r->map_fd, &ctx->event); + if (err < 0) { + fprintf(stderr, "failed to epoll add ringbuf: %d\n", -errno); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program\n"); + exit(1); + } +} + +#define RINGBUF_BUSY_BIT (1 << 31) +#define RINGBUF_DISCARD_BIT (1 << 30) +#define RINGBUF_META_LEN 8 + +static inline int roundup_len(__u32 len) +{ + /* clear out top 2 bits */ + len <<= 2; + len >>= 2; + /* add length prefix */ + len += RINGBUF_META_LEN; + /* round up to 8 byte alignment */ + return (len + 7) / 8 * 8; +} + +static void ringbuf_custom_process_ring(struct ringbuf_custom *r) +{ + unsigned long cons_pos, prod_pos; + int *len_ptr, len; + bool got_new_data; + + cons_pos = smp_load_acquire(r->consumer_pos); + while (true) { + got_new_data = false; + prod_pos = smp_load_acquire(r->producer_pos); + while (cons_pos < prod_pos) { + len_ptr = r->data + (cons_pos & r->mask); + len = smp_load_acquire(len_ptr); + + /* sample not committed yet, bail out for now */ + if (len & RINGBUF_BUSY_BIT) + return; + + got_new_data = true; + cons_pos += roundup_len(len); + + atomic_inc(&buf_hits.value); + } + if (got_new_data) + smp_store_release(r->consumer_pos, cons_pos); + else + break; + }; +} + +static void *ringbuf_custom_consumer(void *input) +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + int cnt; + + do { + if (args.back2back) + bufs_trigger_batch(); + cnt = epoll_wait(ctx->epoll_fd, &ctx->event, 1, -1); + if (cnt > 0) + ringbuf_custom_process_ring(&ctx->ringbuf); + } while (cnt >= 0); + fprintf(stderr, "ringbuf polling failed!\n"); + return 0; +} + +/* PERFBUF-LIBBPF benchmark */ +static struct perfbuf_libbpf_ctx { + struct perfbuf_bench *skel; + struct perf_buffer *perfbuf; +} perfbuf_libbpf_ctx; + +static void perfbuf_measure(struct bench_res *res) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static struct perfbuf_bench *perfbuf_setup_skeleton() +{ + struct perfbuf_bench *skel; + + setup_libbpf(); + + skel = perfbuf_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + skel->rodata->batch_cnt = args.batch_cnt; + + if (perfbuf_bench__load(skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + + return skel; +} + +static enum bpf_perf_event_ret +perfbuf_process_sample_raw(void *input_ctx, int cpu, + struct perf_event_header *e) +{ + switch (e->type) { + case PERF_RECORD_SAMPLE: + atomic_inc(&buf_hits.value); + break; + case PERF_RECORD_LOST: + break; + default: + return LIBBPF_PERF_EVENT_ERROR; + } + return LIBBPF_PERF_EVENT_CONT; +} + +static void perfbuf_libbpf_setup() +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + struct perf_event_attr attr; + struct perf_buffer_raw_opts pb_opts = { + .event_cb = perfbuf_process_sample_raw, + .ctx = (void *)(long)0, + .attr = &attr, + }; + struct bpf_link *link; + + ctx->skel = perfbuf_setup_skeleton(); + + memset(&attr, 0, sizeof(attr)); + attr.config = PERF_COUNT_SW_BPF_OUTPUT, + attr.type = PERF_TYPE_SOFTWARE; + attr.sample_type = PERF_SAMPLE_RAW; + /* notify only every Nth sample */ + if (args.sampled) { + attr.sample_period = args.sample_rate; + attr.wakeup_events = args.sample_rate; + } else { + attr.sample_period = 1; + attr.wakeup_events = 1; + } + + if (args.sample_rate > args.batch_cnt) { + fprintf(stderr, "sample rate %d is too high for given batch count %d\n", + args.sample_rate, args.batch_cnt); + exit(1); + } + + ctx->perfbuf = perf_buffer__new_raw(bpf_map__fd(ctx->skel->maps.perfbuf), + args.perfbuf_sz, &pb_opts); + if (!ctx->perfbuf) { + fprintf(stderr, "failed to create perfbuf\n"); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_perfbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program\n"); + exit(1); + } +} + +static void *perfbuf_libbpf_consumer(void *input) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + + while (perf_buffer__poll(ctx->perfbuf, -1) >= 0) { + if (args.back2back) + bufs_trigger_batch(); + } + fprintf(stderr, "perfbuf polling failed!\n"); + return NULL; +} + +/* PERFBUF-CUSTOM benchmark */ + +/* copies of internal libbpf definitions */ +struct perf_cpu_buf { + struct perf_buffer *pb; + void *base; /* mmap()'ed memory */ + void *buf; /* for reconstructing segmented data */ + size_t buf_size; + int fd; + int cpu; + int map_key; +}; + +struct perf_buffer { + perf_buffer_event_fn event_cb; + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; /* passed into callbacks */ + + size_t page_size; + size_t mmap_size; + struct perf_cpu_buf **cpu_bufs; + struct epoll_event *events; + int cpu_cnt; /* number of allocated CPU buffers */ + int epoll_fd; /* perf event FD */ + int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ +}; + +static void *perfbuf_custom_consumer(void *input) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + struct perf_buffer *pb = ctx->perfbuf; + struct perf_cpu_buf *cpu_buf; + struct perf_event_mmap_page *header; + size_t mmap_mask = pb->mmap_size - 1; + struct perf_event_header *ehdr; + __u64 data_head, data_tail; + size_t ehdr_size; + void *base; + int i, cnt; + + while (true) { + if (args.back2back) + bufs_trigger_batch(); + cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, -1); + if (cnt <= 0) { + fprintf(stderr, "perf epoll failed: %d\n", -errno); + exit(1); + } + + for (i = 0; i < cnt; ++i) { + cpu_buf = pb->events[i].data.ptr; + header = cpu_buf->base; + base = ((void *)header) + pb->page_size; + + data_head = ring_buffer_read_head(header); + data_tail = header->data_tail; + while (data_head != data_tail) { + ehdr = base + (data_tail & mmap_mask); + ehdr_size = ehdr->size; + + if (ehdr->type == PERF_RECORD_SAMPLE) + atomic_inc(&buf_hits.value); + + data_tail += ehdr_size; + } + ring_buffer_write_tail(header, data_tail); + } + } + return NULL; +} + +const struct bench bench_rb_libbpf = { + .name = "rb-libbpf", + .validate = bufs_validate, + .setup = ringbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = ringbuf_libbpf_consumer, + .measure = ringbuf_libbpf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rb_custom = { + .name = "rb-custom", + .validate = bufs_validate, + .setup = ringbuf_custom_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = ringbuf_custom_consumer, + .measure = ringbuf_custom_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_pb_libbpf = { + .name = "pb-libbpf", + .validate = bufs_validate, + .setup = perfbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = perfbuf_libbpf_consumer, + .measure = perfbuf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_pb_custom = { + .name = "pb-custom", + .validate = bufs_validate, + .setup = perfbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = perfbuf_custom_consumer, + .measure = perfbuf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh new file mode 100755 index 000000000000..af4aa04caba6 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +set -eufo pipefail + +RUN_BENCH="sudo ./bench -w3 -d10 -a" + +function hits() +{ + echo "$*" | sed -E "s/.*hits\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" +} + +function drops() +{ + echo "$*" | sed -E "s/.*drops\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" +} + +function header() +{ + local len=${#1} + + printf "\n%s\n" "$1" + for i in $(seq 1 $len); do printf '='; done + printf '\n' +} + +function summarize() +{ + bench="$1" + summary=$(echo $2 | tail -n1) + printf "%-20s %s (drops %s)\n" "$bench" "$(hits $summary)" "$(drops $summary)" +} + +header "Single-producer, parallel producer" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH $b)" +done + +header "Single-producer, parallel producer, sampled notification" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH --rb-sampled $b)" +done + +header "Single-producer, back-to-back mode" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH --rb-b2b $b)" + summarize $b-sampled "$($RUN_BENCH --rb-sampled --rb-b2b $b)" +done + +header "Ringbuf back-to-back, effect of sample rate" +for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do + summarize "rb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b rb-custom)" +done +header "Perfbuf back-to-back, effect of sample rate" +for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do + summarize "pb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b pb-custom)" +done + +header "Ringbuf back-to-back, reserve+commit vs output" +summarize "reserve" "$($RUN_BENCH --rb-b2b rb-custom)" +summarize "output" "$($RUN_BENCH --rb-b2b --rb-use-output rb-custom)" + +header "Ringbuf sampled, reserve+commit vs output" +summarize "reserve-sampled" "$($RUN_BENCH --rb-sampled rb-custom)" +summarize "output-sampled" "$($RUN_BENCH --rb-sampled --rb-use-output rb-custom)" + +header "Single-producer, consumer/producer competing on the same CPU, low batch count" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH --rb-batch-cnt 1 --rb-sample-rate 1 --prod-affinity 0 --cons-affinity 0 $b)" +done + +header "Ringbuf, multi-producer contention" +for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do + summarize "rb-libbpf nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)" +done + diff --git a/tools/testing/selftests/bpf/progs/perfbuf_bench.c b/tools/testing/selftests/bpf/progs/perfbuf_bench.c new file mode 100644 index 000000000000..e5ab4836a641 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/perfbuf_bench.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(value_size, sizeof(int)); + __uint(key_size, sizeof(int)); +} perfbuf SEC(".maps"); + +const volatile int batch_cnt = 0; + +long sample_val = 42; +long dropped __attribute__((aligned(128))) = 0; + +SEC("fentry/__x64_sys_getpgid") +int bench_perfbuf(void *ctx) +{ + __u64 *sample; + int i; + + for (i = 0; i < batch_cnt; i++) { + if (bpf_perf_event_output(ctx, &perfbuf, BPF_F_CURRENT_CPU, + &sample_val, sizeof(sample_val))) + __sync_add_and_fetch(&dropped, 1); + } + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/ringbuf_bench.c b/tools/testing/selftests/bpf/progs/ringbuf_bench.c new file mode 100644 index 000000000000..123607d314d6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/ringbuf_bench.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +const volatile int batch_cnt = 0; +const volatile long use_output = 0; + +long sample_val = 42; +long dropped __attribute__((aligned(128))) = 0; + +const volatile long wakeup_data_size = 0; + +static __always_inline long get_flags() +{ + long sz; + + if (!wakeup_data_size) + return 0; + + sz = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + return sz >= wakeup_data_size ? BPF_RB_FORCE_WAKEUP : BPF_RB_NO_WAKEUP; +} + +SEC("fentry/__x64_sys_getpgid") +int bench_ringbuf(void *ctx) +{ + long *sample, flags; + int i; + + if (!use_output) { + for (i = 0; i < batch_cnt; i++) { + sample = bpf_ringbuf_reserve(&ringbuf, + sizeof(sample_val), 0); + if (!sample) { + __sync_add_and_fetch(&dropped, 1); + } else { + *sample = sample_val; + flags = get_flags(); + bpf_ringbuf_submit(sample, flags); + } + } + } else { + for (i = 0; i < batch_cnt; i++) { + flags = get_flags(); + if (bpf_ringbuf_output(&ringbuf, &sample_val, + sizeof(sample_val), flags)) + __sync_add_and_fetch(&dropped, 1); + } + } + return 0; +} -- cgit v1.2.3-59-g8ed1b From 97abb2b396821f21c21cee2d537bb4e0a0eef31b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:24 -0700 Subject: docs/bpf: Add BPF ring buffer design notes Add commit description from patch #1 as a stand-alone documentation under Documentation/bpf, as it might be more convenient format, in long term perspective. Suggested-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-6-andriin@fb.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/ringbuf.rst | 209 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 Documentation/bpf/ringbuf.rst diff --git a/Documentation/bpf/ringbuf.rst b/Documentation/bpf/ringbuf.rst new file mode 100644 index 000000000000..75f943f0009d --- /dev/null +++ b/Documentation/bpf/ringbuf.rst @@ -0,0 +1,209 @@ +=============== +BPF ring buffer +=============== + +This document describes BPF ring buffer design, API, and implementation details. + +.. contents:: + :local: + :depth: 2 + +Motivation +---------- + +There are two distinctive motivators for this work, which are not satisfied by +existing perf buffer, which prompted creation of a new ring buffer +implementation. + +- more efficient memory utilization by sharing ring buffer across CPUs; +- preserving ordering of events that happen sequentially in time, even across + multiple CPUs (e.g., fork/exec/exit events for a task). + +These two problems are independent, but perf buffer fails to satisfy both. +Both are a result of a choice to have per-CPU perf ring buffer. Both can be +also solved by having an MPSC implementation of ring buffer. The ordering +problem could technically be solved for perf buffer with some in-kernel +counting, but given the first one requires an MPSC buffer, the same solution +would solve the second problem automatically. + +Semantics and APIs +------------------ + +Single ring buffer is presented to BPF programs as an instance of BPF map of +type ``BPF_MAP_TYPE_RINGBUF``. Two other alternatives considered, but +ultimately rejected. + +One way would be to, similar to ``BPF_MAP_TYPE_PERF_EVENT_ARRAY``, make +``BPF_MAP_TYPE_RINGBUF`` could represent an array of ring buffers, but not +enforce "same CPU only" rule. This would be more familiar interface compatible +with existing perf buffer use in BPF, but would fail if application needed more +advanced logic to lookup ring buffer by arbitrary key. +``BPF_MAP_TYPE_HASH_OF_MAPS`` addresses this with current approach. +Additionally, given the performance of BPF ringbuf, many use cases would just +opt into a simple single ring buffer shared among all CPUs, for which current +approach would be an overkill. + +Another approach could introduce a new concept, alongside BPF map, to represent +generic "container" object, which doesn't necessarily have key/value interface +with lookup/update/delete operations. This approach would add a lot of extra +infrastructure that has to be built for observability and verifier support. It +would also add another concept that BPF developers would have to familiarize +themselves with, new syntax in libbpf, etc. But then would really provide no +additional benefits over the approach of using a map. ``BPF_MAP_TYPE_RINGBUF`` +doesn't support lookup/update/delete operations, but so doesn't few other map +types (e.g., queue and stack; array doesn't support delete, etc). + +The approach chosen has an advantage of re-using existing BPF map +infrastructure (introspection APIs in kernel, libbpf support, etc), being +familiar concept (no need to teach users a new type of object in BPF program), +and utilizing existing tooling (bpftool). For common scenario of using a single +ring buffer for all CPUs, it's as simple and straightforward, as would be with +a dedicated "container" object. On the other hand, by being a map, it can be +combined with ``ARRAY_OF_MAPS`` and ``HASH_OF_MAPS`` map-in-maps to implement +a wide variety of topologies, from one ring buffer for each CPU (e.g., as +a replacement for perf buffer use cases), to a complicated application +hashing/sharding of ring buffers (e.g., having a small pool of ring buffers +with hashed task's tgid being a look up key to preserve order, but reduce +contention). + +Key and value sizes are enforced to be zero. ``max_entries`` is used to specify +the size of ring buffer and has to be a power of 2 value. + +There are a bunch of similarities between perf buffer +(``BPF_MAP_TYPE_PERF_EVENT_ARRAY``) and new BPF ring buffer semantics: + +- variable-length records; +- if there is no more space left in ring buffer, reservation fails, no + blocking; +- memory-mappable data area for user-space applications for ease of + consumption and high performance; +- epoll notifications for new incoming data; +- but still the ability to do busy polling for new data to achieve the + lowest latency, if necessary. + +BPF ringbuf provides two sets of APIs to BPF programs: + +- ``bpf_ringbuf_output()`` allows to *copy* data from one place to a ring + buffer, similarly to ``bpf_perf_event_output()``; +- ``bpf_ringbuf_reserve()``/``bpf_ringbuf_commit()``/``bpf_ringbuf_discard()`` + APIs split the whole process into two steps. First, a fixed amount of space + is reserved. If successful, a pointer to a data inside ring buffer data + area is returned, which BPF programs can use similarly to a data inside + array/hash maps. Once ready, this piece of memory is either committed or + discarded. Discard is similar to commit, but makes consumer ignore the + record. + +``bpf_ringbuf_output()`` has disadvantage of incurring extra memory copy, +because record has to be prepared in some other place first. But it allows to +submit records of the length that's not known to verifier beforehand. It also +closely matches ``bpf_perf_event_output()``, so will simplify migration +significantly. + +``bpf_ringbuf_reserve()`` avoids the extra copy of memory by providing a memory +pointer directly to ring buffer memory. In a lot of cases records are larger +than BPF stack space allows, so many programs have use extra per-CPU array as +a temporary heap for preparing sample. bpf_ringbuf_reserve() avoid this needs +completely. But in exchange, it only allows a known constant size of memory to +be reserved, such that verifier can verify that BPF program can't access memory +outside its reserved record space. bpf_ringbuf_output(), while slightly slower +due to extra memory copy, covers some use cases that are not suitable for +``bpf_ringbuf_reserve()``. + +The difference between commit and discard is very small. Discard just marks +a record as discarded, and such records are supposed to be ignored by consumer +code. Discard is useful for some advanced use-cases, such as ensuring +all-or-nothing multi-record submission, or emulating temporary +``malloc()``/``free()`` within single BPF program invocation. + +Each reserved record is tracked by verifier through existing +reference-tracking logic, similar to socket ref-tracking. It is thus +impossible to reserve a record, but forget to submit (or discard) it. + +``bpf_ringbuf_query()`` helper allows to query various properties of ring +buffer. Currently 4 are supported: + +- ``BPF_RB_AVAIL_DATA`` returns amount of unconsumed data in ring buffer; +- ``BPF_RB_RING_SIZE`` returns the size of ring buffer; +- ``BPF_RB_CONS_POS``/``BPF_RB_PROD_POS`` returns current logical possition + of consumer/producer, respectively. + +Returned values are momentarily snapshots of ring buffer state and could be +off by the time helper returns, so this should be used only for +debugging/reporting reasons or for implementing various heuristics, that take +into account highly-changeable nature of some of those characteristics. + +One such heuristic might involve more fine-grained control over poll/epoll +notifications about new data availability in ring buffer. Together with +``BPF_RB_NO_WAKEUP``/``BPF_RB_FORCE_WAKEUP`` flags for output/commit/discard +helpers, it allows BPF program a high degree of control and, e.g., more +efficient batched notifications. Default self-balancing strategy, though, +should be adequate for most applications and will work reliable and efficiently +already. + +Design and Implementation +------------------------- + +This reserve/commit schema allows a natural way for multiple producers, either +on different CPUs or even on the same CPU/in the same BPF program, to reserve +independent records and work with them without blocking other producers. This +means that if BPF program was interruped by another BPF program sharing the +same ring buffer, they will both get a record reserved (provided there is +enough space left) and can work with it and submit it independently. This +applies to NMI context as well, except that due to using a spinlock during +reservation, in NMI context, ``bpf_ringbuf_reserve()`` might fail to get +a lock, in which case reservation will fail even if ring buffer is not full. + +The ring buffer itself internally is implemented as a power-of-2 sized +circular buffer, with two logical and ever-increasing counters (which might +wrap around on 32-bit architectures, that's not a problem): + +- consumer counter shows up to which logical position consumer consumed the + data; +- producer counter denotes amount of data reserved by all producers. + +Each time a record is reserved, producer that "owns" the record will +successfully advance producer counter. At that point, data is still not yet +ready to be consumed, though. Each record has 8 byte header, which contains the +length of reserved record, as well as two extra bits: busy bit to denote that +record is still being worked on, and discard bit, which might be set at commit +time if record is discarded. In the latter case, consumer is supposed to skip +the record and move on to the next one. Record header also encodes record's +relative offset from the beginning of ring buffer data area (in pages). This +allows ``bpf_ringbuf_commit()``/``bpf_ringbuf_discard()`` to accept only the +pointer to the record itself, without requiring also the pointer to ring buffer +itself. Ring buffer memory location will be restored from record metadata +header. This significantly simplifies verifier, as well as improving API +usability. + +Producer counter increments are serialized under spinlock, so there is +a strict ordering between reservations. Commits, on the other hand, are +completely lockless and independent. All records become available to consumer +in the order of reservations, but only after all previous records where +already committed. It is thus possible for slow producers to temporarily hold +off submitted records, that were reserved later. + +Reservation/commit/consumer protocol is verified by litmus tests in +Documentation/litmus_tests/bpf-rb/_. + +One interesting implementation bit, that significantly simplifies (and thus +speeds up as well) implementation of both producers and consumers is how data +area is mapped twice contiguously back-to-back in the virtual memory. This +allows to not take any special measures for samples that have to wrap around +at the end of the circular buffer data area, because the next page after the +last data page would be first data page again, and thus the sample will still +appear completely contiguous in virtual memory. See comment and a simple ASCII +diagram showing this visually in ``bpf_ringbuf_area_alloc()``. + +Another feature that distinguishes BPF ringbuf from perf ring buffer is +a self-pacing notifications of new data being availability. +``bpf_ringbuf_commit()`` implementation will send a notification of new record +being available after commit only if consumer has already caught up right up to +the record being committed. If not, consumer still has to catch up and thus +will see new data anyways without needing an extra poll notification. +Benchmarks (see tools/testing/selftests/bpf/benchs/bench_ringbuf.c_) show that +this allows to achieve a very high throughput without having to resort to +tricks like "notify only every Nth sample", which are necessary with perf +buffer. For extreme cases, when BPF program wants more manual control of +notifications, commit/discard/output helpers accept ``BPF_RB_NO_WAKEUP`` and +``BPF_RB_FORCE_WAKEUP`` flags, which give full control over notifications of +data availability, but require extra caution and diligence in using this API. -- cgit v1.2.3-59-g8ed1b From 43dd115b1fffdd8d2c4cc15659c00b2a1addbc43 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:57:00 +0000 Subject: selftests/bpf: Add tests for write-only stacks/queues For write-only stacks and queues bpf_map_update_elem should be allowed, but bpf_map_lookup_elem and bpf_map_lookup_and_delete_elem should fail with EPERM. Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-6-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 40 ++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 08d63948514a..6a12a0e01e07 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1405,7 +1405,7 @@ static void test_map_rdonly(void) close(fd); } -static void test_map_wronly(void) +static void test_map_wronly_hash(void) { int fd, key = 0, value = 0; @@ -1429,6 +1429,44 @@ static void test_map_wronly(void) close(fd); } +static void test_map_wronly_stack_or_queue(enum bpf_map_type map_type) +{ + int fd, value = 0; + + assert(map_type == BPF_MAP_TYPE_QUEUE || + map_type == BPF_MAP_TYPE_STACK); + fd = bpf_create_map(map_type, 0, sizeof(value), MAP_SIZE, + map_flags | BPF_F_WRONLY); + /* Stack/Queue maps do not support BPF_F_NO_PREALLOC */ + if (map_flags & BPF_F_NO_PREALLOC) { + assert(fd < 0 && errno == EINVAL); + return; + } + if (fd < 0) { + printf("Failed to create map '%s'!\n", strerror(errno)); + exit(1); + } + + value = 1234; + assert(bpf_map_update_elem(fd, NULL, &value, BPF_ANY) == 0); + + /* Peek element should fail */ + assert(bpf_map_lookup_elem(fd, NULL, &value) == -1 && errno == EPERM); + + /* Pop element should fail */ + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &value) == -1 && + errno == EPERM); + + close(fd); +} + +static void test_map_wronly(void) +{ + test_map_wronly_hash(); + test_map_wronly_stack_or_queue(BPF_MAP_TYPE_STACK); + test_map_wronly_stack_or_queue(BPF_MAP_TYPE_QUEUE); +} + static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size, __s64 *fds64, __u64 *sk_cookies, unsigned int n) -- cgit v1.2.3-59-g8ed1b From c3c16f2ea6d20159903cf93afbb1155f3d8348d5 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Tue, 26 May 2020 17:34:36 -0700 Subject: bpf: Add rx_queue_mapping to bpf_sock Add "rx_queue_mapping" to bpf_sock. This gives read access for the existing field (sk_rx_queue_mapping) of struct sock from bpf_sock. Semantics for the bpf_sock rx_queue_mapping access are similar to sk_rx_queue_get(), i.e the value NO_QUEUE_MAPPING is not allowed and -1 is returned in that case. This is useful for transmit queue selection based on the received queue index which is cached in the socket in the receive path. v3: Addressed review comments to add usecase in patch description, and fixed default value for rx_queue_mapping. v2: fixed build error for CONFIG_XPS wrapping, reported by kbuild test robot Signed-off-by: Amritha Nambiar Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 974ca6e948e3..630432c5c292 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3612,6 +3612,7 @@ struct bpf_sock { __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; + __s32 rx_queue_mapping; }; struct bpf_tcp_sock { diff --git a/net/core/filter.c b/net/core/filter.c index a6fc23447f12..0008b029d644 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6849,6 +6849,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): + case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): @@ -7897,6 +7898,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_state), target_size)); break; + case offsetof(struct bpf_sock, rx_queue_mapping): +#ifdef CONFIG_XPS + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_rx_queue_mapping, + sizeof_field(struct sock, + sk_rx_queue_mapping), + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, + 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); + *target_size = 2; +#endif + break; } return insn - insn_buf; -- cgit v1.2.3-59-g8ed1b From 7f1c04269fe7b3293dea38ea65da4fd6614d6f80 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:12 -0600 Subject: devmap: Formalize map value as a named struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 'struct bpf_devmap_val' to formalize the expected values that can be passed in for a DEVMAP. Update devmap code to use the struct. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-2-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a51d9fb7a359..a1459de0914e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,12 +60,18 @@ struct xdp_dev_bulk_queue { unsigned int count; }; +/* DEVMAP values */ +struct bpf_devmap_val { + u32 ifindex; /* device index */ +}; + struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; struct rcu_head rcu; unsigned int idx; + struct bpf_devmap_val val; }; struct bpf_dtab { @@ -472,18 +478,15 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); - struct net_device *dev = obj ? obj->dev : NULL; - - return dev ? &dev->ifindex : NULL; + return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) @@ -541,7 +544,7 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, - u32 ifindex, + struct bpf_devmap_val *val, unsigned int idx) { struct bpf_dtab_netdev *dev; @@ -551,16 +554,18 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev) return ERR_PTR(-ENOMEM); - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - kfree(dev); - return ERR_PTR(-EINVAL); - } + dev->dev = dev_get_by_index(net, val->ifindex); + if (!dev->dev) + goto err_out; dev->idx = idx; dev->dtab = dtab; + dev->val.ifindex = val->ifindex; return dev; +err_out: + kfree(dev); + return ERR_PTR(-EINVAL); } static int __dev_map_update_elem(struct net *net, struct bpf_map *map, @@ -568,7 +573,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -578,10 +583,13 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (!ifindex) { + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (!val.ifindex) { dev = NULL; } else { - dev = __dev_map_alloc_node(net, dtab, ifindex, i); + dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } @@ -609,12 +617,15 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; + struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; - if (unlikely(map_flags > BPF_EXIST || !ifindex)) + /* already verified value_size <= sizeof val */ + memcpy(&val, value, map->value_size); + + if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); @@ -623,7 +634,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; - dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; -- cgit v1.2.3-59-g8ed1b From b36e62eb85215a60916f910070f6d494b4f3e73a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 28 May 2020 17:48:10 -0700 Subject: bpf: Use strncpy_from_unsafe_strict() in bpf_seq_printf() helper In bpf_seq_printf() helper, when user specified a "%s" in the format string, strncpy_from_unsafe() is used to read the actual string to a buffer. The string could be a format string or a string in the kernel data structure. It is really unlikely that the string will reside in the user memory. This is different from Commit b2a5212fb634 ("bpf: Restrict bpf_trace_printk()'s %s usage and add %pks, %pus specifier") which still used strncpy_from_unsafe() for "%s" to preserve the old behavior. If in the future, bpf_seq_printf() indeed needs to read user memory, we can implement "%pus" format string. Based on discussion in [1], if the intent is to read kernel memory, strncpy_from_unsafe_strict() should be used. So this patch changed to use strncpy_from_unsafe_strict(). [1]: https://lore.kernel.org/bpf/20200521152301.2587579-1-hch@lst.de/T/ Fixes: 492e639f0c22 ("bpf: Add bpf_seq_printf and bpf_seq_write helpers") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Cc: Christoph Hellwig Link: https://lore.kernel.org/bpf/20200529004810.3352219-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3767d34114c0..b6c24be5ff53 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -585,9 +585,9 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, goto out; } - err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - MAX_SEQ_PRINTF_STR_LEN); + err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); if (err < 0) bufs->buf[memcpy_cnt][0] = '\0'; params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; -- cgit v1.2.3-59-g8ed1b From fbee97feed9b3e4acdf9590e1f6b4a2eefecfffe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:13 -0600 Subject: bpf: Add support to attach bpf program to a devmap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add BPF_XDP_DEVMAP attach type for use with programs associated with a DEVMAP entry. Allow DEVMAPs to associate a program with a device entry by adding a bpf_prog.fd to 'struct bpf_devmap_val'. Values read show the program id, so the fd and id are a union. bpf programs can get access to the struct via vmlinux.h. The program associated with the fd must have type XDP with expected attach type BPF_XDP_DEVMAP. When a program is associated with a device index, the program is run on an XDP_REDIRECT and before the buffer is added to the per-cpu queue. At this point rxq data is still valid; the next patch adds tx device information allowing the prorgam to see both ingress and egress device indices. XDP generic is skb based and XDP programs do not work with skb's. Block the use case by walking maps used by a program that is to be attached via xdpgeneric and fail if any of them are DEVMAP / DEVMAP_HASH with Block attach of BPF_XDP_DEVMAP programs to devices. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-3-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 88 ++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 18 +++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 109 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5884f7f801c..e042311f991f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1250,6 +1250,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); +bool dev_map_can_have_prog(struct bpf_map *map); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); @@ -1363,6 +1364,10 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map { return NULL; } +static inline bool dev_map_can_have_prog(struct bpf_map *map) +{ + return false; +} static inline void __dev_flush(void) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 630432c5c292..f1e364d69007 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a1459de0914e..0089d56617ec 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -63,12 +63,17 @@ struct xdp_dev_bulk_queue { /* DEVMAP values */ struct bpf_devmap_val { u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; @@ -111,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -223,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -237,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -323,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -447,6 +472,30 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -458,6 +507,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -494,6 +548,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -547,6 +603,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -558,11 +615,31 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); @@ -572,8 +649,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -588,6 +665,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) @@ -616,8 +696,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; diff --git a/net/core/dev.c b/net/core/dev.c index ae37586f6ee8..10684833f864 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5420,6 +5420,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; + if (new) { + u32 i; + + /* generic XDP does not work with DEVMAPs that can + * have a bpf_prog installed on an entry + */ + for (i = 0; i < new->aux->used_map_cnt; i++) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) + return -EINVAL; + } + } + switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -8835,6 +8847,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974ca6e948e3..65d7717bce2f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3-59-g8ed1b From 64b59025c15b244c0954cf52b24fbabfcf5ed8f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:14 -0600 Subject: xdp: Add xdp_txq_info to xdp_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the moment only the device is added. Other fields (queue_index) can be added as use cases arise. >From a UAPI perspective, add egress_ifindex to xdp context for bpf programs to see the Tx device. Update the verifier to only allow accesses to egress_ifindex by XDP programs with BPF_XDP_DEVMAP expected attach type. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-4-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 5 +++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/devmap.c | 3 +++ net/core/filter.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 5 files changed, 29 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index 90f11760bd12..d54022959491 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -61,12 +61,17 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_txq_info { + struct net_device *dev; +}; + struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; + struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1e364d69007..f862a58fb567 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3707,6 +3707,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0089d56617ec..c04fb1c72f5e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -476,8 +476,11 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp->txq = &txq; + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: diff --git a/net/core/filter.c b/net/core/filter.c index 0008b029d644..85ff827aab73 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7015,6 +7015,13 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + if (type == BPF_WRITE) { if (bpf_prog_is_dev_bound(prog->aux)) { switch (off) { @@ -7985,6 +7992,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_rxq_info, queue_index)); break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 65d7717bce2f..f74bc4a2385e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3706,6 +3706,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { -- cgit v1.2.3-59-g8ed1b From 2778797037a658be71a6c55b54700bf58ba21eb7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:15 -0600 Subject: libbpf: Add SEC name for xdp programs attached to device map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support SEC("xdp_devmap*") as a short cut for loading the program with type BPF_PROG_TYPE_XDP and expected attach type BPF_XDP_DEVMAP. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-5-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 74d967619dcf..85d4f1c5fc52 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6657,6 +6657,8 @@ static const struct bpf_sec_def section_defs[] = { .expected_attach_type = BPF_TRACE_ITER, .is_attach_btf = true, .attach_fn = attach_iter), + BPF_EAPROG_SEC("xdp_devmap", BPF_PROG_TYPE_XDP, + BPF_XDP_DEVMAP), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), -- cgit v1.2.3-59-g8ed1b From ca2f5f21dbbd5e3a00cd3e97f728aa2ca0b2e011 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:41 -0700 Subject: bpf: Refactor sockmap redirect code so its easy to reuse We will need this block of code called from tls context shortly lets refactor the redirect logic so its easy to use. This also cleans up the switch stmt so we have fewer fallthrough cases. No logic changes are intended. Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Acked-by: Song Liu Link: https://lore.kernel.org/bpf/159079360110.5745.7024009076049029819.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- net/core/skmsg.c | 55 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index c479372f2cd2..9d72f71e9b47 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -682,13 +682,43 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp) return container_of(parser, struct sk_psock, parser); } -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; bool ingress; + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) { + kfree_skb(skb); + return; + } + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + kfree_skb(skb); + return; + } + + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + } else { + kfree_skb(skb); + } +} + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sock *sk_other; + switch (verdict) { case __SK_PASS: sk_other = psock->sk; @@ -707,25 +737,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } goto out_free; case __SK_REDIRECT: - sk_other = tcp_skb_bpf_redirect_fetch(skb); - if (unlikely(!sk_other)) - goto out_free; - psock_other = sk_psock(sk_other); - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) - goto out_free; - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - break; - } - /* fall-through */ + sk_psock_skb_redirect(psock, skb); + break; case __SK_DROP: /* fall-through */ default: -- cgit v1.2.3-59-g8ed1b From d39aec79e5923bf984df991ffe51d4a2b7a9e746 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:16 -0600 Subject: selftest: Add tests for XDP programs in devmap entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add tests to verify ability to add an XDP program to a entry in a DEVMAP. Add negative tests to show DEVMAP programs can not be attached to devices as a normal XDP program, and accesses to egress_ifindex require BPF_XDP_DEVMAP attach type. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-6-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_devmap_attach.c | 97 ++++++++++++++++++++++ .../selftests/bpf/progs/test_xdp_devmap_helpers.c | 22 +++++ .../bpf/progs/test_xdp_with_devmap_helpers.c | 44 ++++++++++ 3 files changed, 163 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c new file mode 100644 index 000000000000..d19dbd668f6a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "test_xdp_devmap_helpers.skel.h" +#include "test_xdp_with_devmap_helpers.skel.h" + +#define IFINDEX_LO 1 + +struct bpf_devmap_val { + u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; +}; + +void test_xdp_with_devmap_helpers(void) +{ + struct test_xdp_with_devmap_helpers *skel; + struct bpf_prog_info info = {}; + struct bpf_devmap_val val = { + .ifindex = IFINDEX_LO, + }; + __u32 len = sizeof(info); + __u32 duration = 0, idx = 0; + int err, dm_fd, map_fd; + + + skel = test_xdp_with_devmap_helpers__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_xdp_with_devmap_helpers__open_and_load"); + return; + } + + /* can not attach program with DEVMAPs that allow programs + * as xdp generic + */ + dm_fd = bpf_program__fd(skel->progs.xdp_redir_prog); + err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); + CHECK(err == 0, "Generic attach of program with 8-byte devmap", + "should have failed\n"); + + dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm); + map_fd = bpf_map__fd(skel->maps.dm_ports); + err = bpf_obj_get_info_by_fd(dm_fd, &info, &len); + if (CHECK_FAIL(err)) + goto out_close; + + val.bpf_prog.fd = dm_fd; + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + CHECK(err, "Add program to devmap entry", + "err %d errno %d\n", err, errno); + + err = bpf_map_lookup_elem(map_fd, &idx, &val); + CHECK(err, "Read devmap entry", "err %d errno %d\n", err, errno); + CHECK(info.id != val.bpf_prog.id, "Expected program id in devmap entry", + "expected %u read %u\n", info.id, val.bpf_prog.id); + + /* can not attach BPF_XDP_DEVMAP program to a device */ + err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); + CHECK(err == 0, "Attach of BPF_XDP_DEVMAP program", + "should have failed\n"); + + val.ifindex = 1; + val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + CHECK(err == 0, "Add non-BPF_XDP_DEVMAP program to devmap entry", + "should have failed\n"); + +out_close: + test_xdp_with_devmap_helpers__destroy(skel); +} + +void test_neg_xdp_devmap_helpers(void) +{ + struct test_xdp_devmap_helpers *skel; + __u32 duration = 0; + + skel = test_xdp_devmap_helpers__open_and_load(); + if (CHECK(skel, + "Load of XDP program accessing egress ifindex without attach type", + "should have failed\n")) { + test_xdp_devmap_helpers__destroy(skel); + } +} + + +void test_xdp_devmap_attach(void) +{ + if (test__start_subtest("DEVMAP with programs in entries")) + test_xdp_with_devmap_helpers(); + + if (test__start_subtest("Verifier check of DEVMAP programs")) + test_neg_xdp_devmap_helpers(); +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c new file mode 100644 index 000000000000..e5c0f131c8a7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* fails to load without expected_attach_type = BPF_XDP_DEVMAP + * because of access to egress_ifindex + */ +#include "vmlinux.h" +#include + +SEC("xdp_dm_log") +int xdpdm_devlog(struct xdp_md *ctx) +{ + char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n"; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + unsigned int len = data_end - data; + + bpf_trace_printk(fmt, sizeof(fmt), + ctx->ingress_ifindex, ctx->egress_ifindex, len); + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c new file mode 100644 index 000000000000..deef0e050863 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 4); +} dm_ports SEC(".maps"); + +SEC("xdp_redir") +int xdp_redir_prog(struct xdp_md *ctx) +{ + return bpf_redirect_map(&dm_ports, 1, 0); +} + +/* invalid program on DEVMAP entry; + * SEC name means expected attach type not set + */ +SEC("xdp_dummy") +int xdp_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +/* valid program on DEVMAP entry via SEC name; + * has access to egress and ingress ifindex + */ +SEC("xdp_devmap") +int xdp_dummy_dm(struct xdp_md *ctx) +{ + char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n"; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + unsigned int len = data_end - data; + + bpf_trace_printk(fmt, sizeof(fmt), + ctx->ingress_ifindex, ctx->egress_ifindex, len); + + return XDP_PASS; +} +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3-59-g8ed1b From e91de6afa81c10e9f855c5695eb9a53168d96b73 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:59 -0700 Subject: bpf: Fix running sk_skb program types with ktls KTLS uses a stream parser to collect TLS messages and send them to the upper layer tls receive handler. This ensures the tls receiver has a full TLS header to parse when it is run. However, when a socket has BPF_SK_SKB_STREAM_VERDICT program attached before KTLS is enabled we end up with two stream parsers running on the same socket. The result is both try to run on the same socket. First the KTLS stream parser runs and calls read_sock() which will tcp_read_sock which in turn calls tcp_rcv_skb(). This dequeues the skb from the sk_receive_queue. When this is done KTLS code then data_ready() callback which because we stacked KTLS on top of the bpf stream verdict program has been replaced with sk_psock_start_strp(). This will in turn kick the stream parser again and eventually do the same thing KTLS did above calling into tcp_rcv_skb() and dequeuing a skb from the sk_receive_queue. At this point the data stream is broke. Part of the stream was handled by the KTLS side some other bytes may have been handled by the BPF side. Generally this results in either missing data or more likely a "Bad Message" complaint from the kTLS receive handler as the BPF program steals some bytes meant to be in a TLS header and/or the TLS header length is no longer correct. We've already broke the idealized model where we can stack ULPs in any order with generic callbacks on the TX side to handle this. So in this patch we do the same thing but for RX side. We add a sk_psock_strp_enabled() helper so TLS can learn a BPF verdict program is running and add a tls_sw_has_ctx_rx() helper so BPF side can learn there is a TLS ULP on the socket. Then on BPF side we omit calling our stream parser to avoid breaking the data stream for the KTLS receiver. Then on the KTLS side we call BPF_SK_SKB_STREAM_VERDICT once the KTLS receiver is done with the packet but before it posts the msg to userspace. This gives us symmetry between the TX and RX halfs and IMO makes it usable again. On the TX side we process packets in this order BPF -> TLS -> TCP and on the receive side in the reverse order TCP -> TLS -> BPF. Discovered while testing OpenSSL 3.0 Alpha2.0 release. Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159079361946.5745.605854335665044485.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 8 ++++++++ include/net/tls.h | 9 +++++++++ net/core/skmsg.c | 43 ++++++++++++++++++++++++++++++++++++++++--- net/tls/tls_sw.c | 20 ++++++++++++++++++-- 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index ad31c9fb7158..08674cd14d5a 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -437,4 +437,12 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->skb_verdict, NULL); } +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); + +static inline bool sk_psock_strp_enabled(struct sk_psock *psock) +{ + if (!psock) + return false; + return psock->parser.enabled; +} #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tls.h b/include/net/tls.h index 3e7b44cae0d9..3212d3c214a9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -571,6 +571,15 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk) return !!tls_sw_ctx_tx(ctx); } +static inline bool tls_sw_has_ctx_rx(const struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (!ctx) + return false; + return !!tls_sw_ctx_rx(ctx); +} + void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); void tls_device_write_space(struct sock *sk, struct tls_context *ctx); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9d72f71e9b47..351afbf6bfba 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -7,6 +7,7 @@ #include #include +#include static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -714,6 +715,38 @@ static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) } } +static void sk_psock_tls_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + switch (verdict) { + case __SK_REDIRECT: + sk_psock_skb_redirect(psock, skb); + break; + case __SK_PASS: + case __SK_DROP: + default: + break; + } +} + +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog; + int ret = __SK_PASS; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_tls_verdict_apply(psock, skb, ret); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); + static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { @@ -792,9 +825,13 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); - write_unlock_bh(&sk->sk_callback_lock); + if (tls_sw_has_ctx_rx(sk)) { + psock->parser.saved_data_ready(sk); + } else { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } } rcu_read_unlock(); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8c2763eb6aae..24f64bc0de18 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1742,6 +1742,7 @@ int tls_sw_recvmsg(struct sock *sk, long timeo; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; + bool bpf_strp_enabled; int num_async = 0; int pending; @@ -1752,6 +1753,7 @@ int tls_sw_recvmsg(struct sock *sk, psock = sk_psock_get(sk); lock_sock(sk); + bpf_strp_enabled = sk_psock_strp_enabled(psock); /* Process pending decrypted records. It must be non-zero-copy */ err = process_rx_list(ctx, msg, &control, &cmsg, 0, len, false, @@ -1805,11 +1807,12 @@ int tls_sw_recvmsg(struct sock *sk, if (to_decrypt <= len && !is_kvec && !is_peek && ctx->control == TLS_RECORD_TYPE_DATA && - prot->version != TLS_1_3_VERSION) + prot->version != TLS_1_3_VERSION && + !bpf_strp_enabled) zc = true; /* Do not use async mode if record is non-data */ - if (ctx->control == TLS_RECORD_TYPE_DATA) + if (ctx->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled) async_capable = ctx->async_capable; else async_capable = false; @@ -1859,6 +1862,19 @@ int tls_sw_recvmsg(struct sock *sk, goto pick_next_record; if (!zc) { + if (bpf_strp_enabled) { + err = sk_psock_tls_strp_read(psock, skb); + if (err != __SK_PASS) { + rxm->offset = rxm->offset + rxm->full_len; + rxm->full_len = 0; + if (err == __SK_DROP) + consume_skb(skb); + ctx->recv_pkt = NULL; + __strp_unpause(&ctx->strp); + continue; + } + } + if (rxm->full_len > len) { retain_skb = true; chunk = len; -- cgit v1.2.3-59-g8ed1b From 463bac5f1ca79fcd964bf50426eab024fb4dd8a4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:07:19 -0700 Subject: bpf, selftests: Add test for ktls with skb bpf ingress policy This adds a test for bpf ingress policy. To ensure data writes happen as expected with extra TLS headers we run these tests with data verification enabled by default. This will test receive packets have "PASS" stamped into the front of the payload. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159079363965.5745.3390806911628980210.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/test_sockmap_kern.h | 46 +++++- tools/testing/selftests/bpf/test_sockmap.c | 163 ++++++++++++++++++--- 2 files changed, 187 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index a443d3637db3..057036ca1111 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -79,11 +79,18 @@ struct { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); + __uint(max_entries, 2); __type(key, int); __type(value, int); } sock_skb_opts SEC(".maps"); +struct { + __uint(type, TEST_MAP_TYPE); + __uint(max_entries, 20); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} tls_sock_map SEC(".maps"); + SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { @@ -118,6 +125,43 @@ int bpf_prog2(struct __sk_buff *skb) } +SEC("sk_skb3") +int bpf_prog3(struct __sk_buff *skb) +{ + const int one = 1; + int err, *f, ret = SK_PASS; + void *data_end; + char *c; + + err = bpf_skb_pull_data(skb, 19); + if (err) + goto tls_out; + + c = (char *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + + if (c + 18 < data_end) + memcpy(&c[13], "PASS", 4); + f = bpf_map_lookup_elem(&sock_skb_opts, &one); + if (f && *f) { + __u64 flags = 0; + + ret = 0; + flags = *f; +#ifdef SOCKMAP + return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags); +#else + return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); +#endif + } + + f = bpf_map_lookup_elem(&sock_skb_opts, &one); + if (f && *f) + ret = SK_DROP; +tls_out: + return ret; +} + SEC("sockops") int bpf_sockmap(struct bpf_sock_ops *skops) { diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index c80643828b82..37695fc8096a 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -63,8 +63,8 @@ int s1, s2, c1, c2, p1, p2; int test_cnt; int passed; int failed; -int map_fd[8]; -struct bpf_map *maps[8]; +int map_fd[9]; +struct bpf_map *maps[9]; int prog_fd[11]; int txmsg_pass; @@ -79,7 +79,10 @@ int txmsg_end_push; int txmsg_start_pop; int txmsg_pop; int txmsg_ingress; -int txmsg_skb; +int txmsg_redir_skb; +int txmsg_ktls_skb; +int txmsg_ktls_skb_drop; +int txmsg_ktls_skb_redir; int ktls; int peek_flag; @@ -104,7 +107,7 @@ static const struct option long_options[] = { {"txmsg_start_pop", required_argument, NULL, 'w'}, {"txmsg_pop", required_argument, NULL, 'x'}, {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, - {"txmsg_skb", no_argument, &txmsg_skb, 1 }, + {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, {"whitelist", required_argument, NULL, 'n' }, @@ -169,7 +172,8 @@ static void test_reset(void) txmsg_start_push = txmsg_end_push = 0; txmsg_pass = txmsg_drop = txmsg_redir = 0; txmsg_apply = txmsg_cork = 0; - txmsg_ingress = txmsg_skb = 0; + txmsg_ingress = txmsg_redir_skb = 0; + txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; } static int test_start_subtest(const struct _test *t, struct sockmap_options *o) @@ -502,14 +506,41 @@ unwind_iov: static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz) { - int i, j, bytes_cnt = 0; + int i, j = 0, bytes_cnt = 0; unsigned char k = 0; for (i = 0; i < msg->msg_iovlen; i++) { unsigned char *d = msg->msg_iov[i].iov_base; - for (j = 0; - j < msg->msg_iov[i].iov_len && size; j++) { + /* Special case test for skb ingress + ktls */ + if (i == 0 && txmsg_ktls_skb) { + if (msg->msg_iov[i].iov_len < 4) + return -EIO; + if (txmsg_ktls_skb_redir) { + if (memcmp(&d[13], "PASS", 4) != 0) { + fprintf(stderr, + "detected redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]); + return -EIO; + } + d[13] = 0; + d[14] = 1; + d[15] = 2; + d[16] = 3; + j = 13; + } else if (txmsg_ktls_skb) { + if (memcmp(d, "PASS", 4) != 0) { + fprintf(stderr, + "detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]); + return -EIO; + } + d[0] = 0; + d[1] = 1; + d[2] = 2; + d[3] = 3; + } + } + + for (; j < msg->msg_iov[i].iov_len && size; j++) { if (d[j] != k++) { fprintf(stderr, "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n", @@ -724,7 +755,7 @@ static int sendmsg_test(struct sockmap_options *opt) rxpid = fork(); if (rxpid == 0) { iov_buf -= (txmsg_pop - txmsg_start_pop + 1); - if (opt->drop_expected) + if (opt->drop_expected || txmsg_ktls_skb_drop) _exit(0); if (!iov_buf) /* zero bytes sent case */ @@ -911,8 +942,28 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) return err; } + /* Attach programs to TLS sockmap */ + if (txmsg_ktls_skb) { + err = bpf_prog_attach(prog_fd[0], map_fd[8], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[8], err, strerror(errno)); + return err; + } + + err = bpf_prog_attach(prog_fd[2], map_fd[8], + BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (TLS sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + } + /* Attach to cgroups */ - err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0); + err = bpf_prog_attach(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS, 0); if (err) { fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n", err, strerror(errno)); @@ -928,15 +979,15 @@ run: /* Attach txmsg program to sockmap */ if (txmsg_pass) - tx_prog_fd = prog_fd[3]; - else if (txmsg_redir) tx_prog_fd = prog_fd[4]; - else if (txmsg_apply) + else if (txmsg_redir) tx_prog_fd = prog_fd[5]; - else if (txmsg_cork) + else if (txmsg_apply) tx_prog_fd = prog_fd[6]; - else if (txmsg_drop) + else if (txmsg_cork) tx_prog_fd = prog_fd[7]; + else if (txmsg_drop) + tx_prog_fd = prog_fd[8]; else tx_prog_fd = 0; @@ -1108,7 +1159,35 @@ run: } } - if (txmsg_skb) { + if (txmsg_ktls_skb) { + int ingress = BPF_F_INGRESS; + + i = 0; + err = bpf_map_update_elem(map_fd[8], &i, &p2, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", + err, strerror(errno)); + } + + if (txmsg_ktls_skb_redir) { + i = 1; + err = bpf_map_update_elem(map_fd[7], + &i, &ingress, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", + err, strerror(errno)); + } + } + + if (txmsg_ktls_skb_drop) { + i = 1; + err = bpf_map_update_elem(map_fd[7], &i, &i, BPF_ANY); + } + } + + if (txmsg_redir_skb) { int skb_fd = (test == SENDMSG || test == SENDPAGE) ? p2 : p1; int ingress = BPF_F_INGRESS; @@ -1123,8 +1202,7 @@ run: } i = 3; - err = bpf_map_update_elem(map_fd[0], - &i, &skb_fd, BPF_ANY); + err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY); if (err) { fprintf(stderr, "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", @@ -1158,9 +1236,12 @@ run: fprintf(stderr, "unknown test\n"); out: /* Detatch and zero all the maps */ - bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS); + bpf_prog_detach2(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS); bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER); bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT); + bpf_prog_detach2(prog_fd[0], map_fd[8], BPF_SK_SKB_STREAM_PARSER); + bpf_prog_detach2(prog_fd[2], map_fd[8], BPF_SK_SKB_STREAM_VERDICT); + if (tx_prog_fd >= 0) bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT); @@ -1229,8 +1310,10 @@ static void test_options(char *options) } if (txmsg_ingress) strncat(options, "ingress,", OPTSTRING); - if (txmsg_skb) - strncat(options, "skb,", OPTSTRING); + if (txmsg_redir_skb) + strncat(options, "redir_skb,", OPTSTRING); + if (txmsg_ktls_skb) + strncat(options, "ktls_skb,", OPTSTRING); if (ktls) strncat(options, "ktls,", OPTSTRING); if (peek_flag) @@ -1362,6 +1445,40 @@ static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt) test_send(opt, cgrp); } +static void test_txmsg_skb(int cgrp, struct sockmap_options *opt) +{ + bool data = opt->data_test; + int k = ktls; + + opt->data_test = true; + ktls = 1; + + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 0; + txmsg_ktls_skb = 1; + txmsg_pass = 1; + + /* Using data verification so ensure iov layout is + * expected from test receiver side. e.g. has enough + * bytes to write test code. + */ + opt->iov_length = 100; + opt->iov_count = 1; + opt->rate = 1; + test_exec(cgrp, opt); + + txmsg_ktls_skb_drop = 1; + test_exec(cgrp, opt); + + txmsg_ktls_skb_drop = 0; + txmsg_ktls_skb_redir = 1; + test_exec(cgrp, opt); + + opt->data_test = data; + ktls = k; +} + + /* Test cork with hung data. This tests poor usage patterns where * cork can leave data on the ring if user program is buggy and * doesn't flush them somehow. They do take some time however @@ -1542,11 +1659,13 @@ char *map_names[] = { "sock_bytes", "sock_redir_flags", "sock_skb_opts", + "tls_sock_map", }; int prog_attach_type[] = { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_SOCK_OPS, BPF_SK_MSG_VERDICT, BPF_SK_MSG_VERDICT, @@ -1558,6 +1677,7 @@ int prog_attach_type[] = { }; int prog_type[] = { + BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SOCK_OPS, @@ -1620,6 +1740,7 @@ struct _test test[] = { {"txmsg test redirect", test_txmsg_redir}, {"txmsg test drop", test_txmsg_drop}, {"txmsg test ingress redirect", test_txmsg_ingress_redir}, + {"txmsg test skb", test_txmsg_skb}, {"txmsg test apply", test_txmsg_apply}, {"txmsg test cork", test_txmsg_cork}, {"txmsg test hanging corks", test_txmsg_cork_hangs}, -- cgit v1.2.3-59-g8ed1b From df8fe57c071c58f355d0a4985ecd2fcaf99b050f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Jun 2020 14:13:52 -0700 Subject: tools/bpf: sync bpf.h Sync bpf.h into tool/include/uapi/ Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f74bc4a2385e..f862a58fb567 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3613,6 +3613,7 @@ struct bpf_sock { __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; + __s32 rx_queue_mapping; }; struct bpf_tcp_sock { -- cgit v1.2.3-59-g8ed1b From bb2359f4dbe98e8863b4e885fc09269ef4682ec3 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 1 Jun 2020 19:28:14 +0300 Subject: bpf: Change kvfree to kfree in generic_map_lookup_batch() buf_prevkey in generic_map_lookup_batch() is allocated with kmalloc(). It's safe to free it with kfree(). Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op") Signed-off-by: Denis Efremov Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200601162814.17426-1-efremov@linux.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9de3540fa90c..e83b0818b529 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1399,7 +1399,7 @@ int generic_map_lookup_batch(struct bpf_map *map, buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { - kvfree(buf_prevkey); + kfree(buf_prevkey); return -ENOMEM; } -- cgit v1.2.3-59-g8ed1b From 8ea204c2b658eaef55b4716fde469fb66c589a3d Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:00 +0200 Subject: net: Make locking in sock_bindtoindex optional The sock_bindtoindex intended for kernel wide usage however it will lock the socket regardless of the context. This modification relax this behavior optionally: locking the socket will be optional by calling the sock_bindtoindex with lock_sk = true. The modification applied to all users of the sock_bindtoindex. Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/bee6355da40d9e991b2f2d12b67d55ebb5f5b207.1590871065.git.fejes@inf.elte.hu --- include/net/sock.h | 2 +- net/core/sock.c | 10 ++++++---- net/ipv4/udp_tunnel.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 6e9f713a7860..c53cc42b5ab9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2690,7 +2690,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); -int sock_bindtoindex(struct sock *sk, int ifindex); +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 61ec573221a6..6c4acf1f0220 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -594,13 +594,15 @@ out: return ret; } -int sock_bindtoindex(struct sock *sk, int ifindex) +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; - lock_sock(sk); + if (lock_sk) + lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); - release_sock(sk); + if (lock_sk) + release_sock(sk); return ret; } @@ -646,7 +648,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - return sock_bindtoindex(sk, index); + return sock_bindtoindex(sk, index, true); out: #endif diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 2158e8bddf41..3eecba0874aa 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,7 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 2e0ad1bc84a8..cdc4d4ee2420 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -30,7 +30,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; } if (cfg->bind_ifindex) { - err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } -- cgit v1.2.3-59-g8ed1b From 70c58997c1e864c96dfdf072572047303db8f42a Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:01 +0200 Subject: bpf: Allow SO_BINDTODEVICE opt in bpf_setsockopt Extending the supported sockopts in bpf_setsockopt with SO_BINDTODEVICE. We call sock_bindtoindex with parameter lock_sk = false in this context because we already owning the socket. Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/4149e304867b8d5a606a305bc59e29b063e51f49.1590871065.git.fejes@inf.elte.hu --- net/core/filter.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 85ff827aab73..ae82bcb03124 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4248,6 +4248,9 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { + char devname[IFNAMSIZ]; + struct net *net; + int ifindex; int ret = 0; int val; @@ -4257,7 +4260,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sock_owned_by_me(sk); if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) + if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); @@ -4298,6 +4301,29 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk_dst_reset(sk); } break; + case SO_BINDTODEVICE: + ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + optlen = min_t(long, optlen, IFNAMSIZ - 1); + strncpy(devname, optval, optlen); + devname[optlen] = 0; + + ifindex = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + ret = -ENODEV; + + net = sock_net(sk); + dev = dev_get_by_name(net, devname); + if (!dev) + break; + ifindex = dev->ifindex; + dev_put(dev); + } + ret = sock_bindtoindex(sk, ifindex, false); +#endif + break; default: ret = -EINVAL; } -- cgit v1.2.3-59-g8ed1b From 9c441fe4c06a553ad770b6f21616327a3badf793 Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:02 +0200 Subject: selftests/bpf: Add test for SO_BINDTODEVICE opt of bpf_setsockopt This test intended to verify if SO_BINDTODEVICE option works in bpf_setsockopt. Because we already in the SOL_SOCKET level in this connect bpf prog its safe to verify the sanity in the beginning of the connect_v4_prog by calling the bind_to_device test helper. The testing environment already created by the test_sock_addr.sh script so this test assume that two netdevices already existing in the system: veth pair with names test_sock_addr1 and test_sock_addr2. The test will try to bind the socket to those devices first. Then the test assume there are no netdevice with "nonexistent_dev" name so the bpf_setsockopt will give use ENODEV error. At the end the test remove the device binding from the socket by binding it to an empty name. Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/3f055b8e45c65639c5c73d0b4b6c589e60b86f15.1590871065.git.fejes@inf.elte.hu --- tools/testing/selftests/bpf/progs/connect4_prog.c | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index c2c85c31cffd..1ab2c5eba86c 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include @@ -21,6 +23,10 @@ #define TCP_CA_NAME_MAX 16 #endif +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + int _version SEC("version") = 1; __attribute__ ((noinline)) @@ -75,6 +81,29 @@ static __inline int set_cc(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_to_device(struct bpf_sock_addr *ctx) +{ + char veth1[IFNAMSIZ] = "test_sock_addr1"; + char veth2[IFNAMSIZ] = "test_sock_addr2"; + char missing[IFNAMSIZ] = "nonexistent_dev"; + char del_bind[IFNAMSIZ] = ""; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth1, sizeof(veth1))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth2, sizeof(veth2))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -88,6 +117,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) tuple.ipv4.daddr = bpf_htonl(DST_REWRITE_IP4); tuple.ipv4.dport = bpf_htons(DST_REWRITE_PORT4); + /* Bind to device and unbind it. */ + if (bind_to_device(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) -- cgit v1.2.3-59-g8ed1b From fc37987265b5979129a72c672b20245119768fb8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 28 May 2020 22:47:28 +0200 Subject: xdp: Introduce xdp_convert_frame_to_buff utility routine Introduce xdp_convert_frame_to_buff utility routine to initialize xdp_buff fields from xdp_frames ones. Rely on xdp_convert_frame_to_buff in veth xdp code. Suggested-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/87acf133073c4b2d4cbb8097e8c2480c0a0fac32.1590698295.git.lorenzo@kernel.org --- drivers/net/veth.c | 6 +----- include/net/xdp.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index b586d2fa5551..fb5c17361f64 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -575,11 +575,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, struct xdp_buff xdp; u32 act; - xdp.data_hard_start = hard_start; - xdp.data = frame->data; - xdp.data_end = frame->data + frame->len; - xdp.data_meta = frame->data - frame->metasize; - xdp.frame_sz = frame->frame_sz; + xdp_convert_frame_to_buff(frame, &xdp); xdp.rxq = &rq->xdp_rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/include/net/xdp.h b/include/net/xdp.h index d54022959491..db5c2c687f48 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -111,6 +111,16 @@ void xdp_warn(const char *msg, const char *func, const int line); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); +static inline +void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) +{ + xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); + xdp->data = frame->data; + xdp->data_end = frame->data + frame->len; + xdp->data_meta = frame->data - frame->metasize; + xdp->frame_sz = frame->frame_sz; +} + /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) -- cgit v1.2.3-59-g8ed1b From 1b698fa5d8ef958007c455e316aa44c37ab3c5fb Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 28 May 2020 22:47:29 +0200 Subject: xdp: Rename convert_to_xdp_frame in xdp_convert_buff_to_frame In order to use standard 'xdp' prefix, rename convert_to_xdp_frame utility routine in xdp_convert_buff_to_frame and replace all the occurrences Signed-off-by: Lorenzo Bianconi Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/6344f739be0d1a08ab2b9607584c4d5478c8c083.1590698295.git.lorenzo@kernel.org --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +- drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2 +- drivers/net/ethernet/marvell/mvneta.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 10 +++++----- drivers/net/ethernet/sfc/rx.c | 2 +- drivers/net/ethernet/socionext/netsec.c | 2 +- drivers/net/ethernet/ti/cpsw_priv.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/veth.c | 2 +- drivers/net/virtio_net.c | 4 ++-- include/net/xdp.h | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/devmap.c | 2 +- 16 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 46865d5bd7e7..a0af74c93971 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -263,7 +263,7 @@ static int ena_xdp_tx_map_buff(struct ena_ring *xdp_ring, dma_addr_t dma = 0; u32 size; - tx_info->xdpf = convert_to_xdp_frame(xdp); + tx_info->xdpf = xdp_convert_buff_to_frame(xdp); size = tx_info->xdpf->len; ena_buf = tx_info->bufs; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index f613782f2f56..f9555c847f73 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2167,7 +2167,7 @@ static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf, int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring) { - struct xdp_frame *xdpf = convert_to_xdp_frame(xdp); + struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return I40E_XDP_CONSUMED; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index ab2031b1c635..02b12736ea80 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -254,7 +254,7 @@ int ice_xmit_xdp_ring(void *data, u16 size, struct ice_ring *xdp_ring) */ int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_ring *xdp_ring) { - struct xdp_frame *xdpf = convert_to_xdp_frame(xdp); + struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return ICE_XDP_CONSUMED; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index a59c166f794f..f162b8b8f345 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2215,7 +2215,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, case XDP_PASS: break; case XDP_TX: - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) { result = IXGBE_XDP_CONSUMED; break; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 86add9fbd36c..be9d2a8da515 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -107,7 +107,7 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, case XDP_PASS: break; case XDP_TX: - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) { result = IXGBE_XDP_CONSUMED; break; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 15e42a7f8a86..011cd26953d9 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2073,7 +2073,7 @@ mvneta_xdp_xmit_back(struct mvneta_port *pp, struct xdp_buff *xdp) int cpu; u32 ret; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return MVNETA_XDP_DROPPED; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 3bea1d4be53b..c9d308e91965 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -64,7 +64,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, struct xdp_frame *xdpf; dma_addr_t dma_addr; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return false; @@ -97,10 +97,10 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, xdpi.frame.xdpf = xdpf; xdpi.frame.dma_addr = dma_addr; } else { - /* Driver assumes that convert_to_xdp_frame returns an xdp_frame - * that points to the same memory region as the original - * xdp_buff. It allows to map the memory only once and to use - * the DMA_BIDIRECTIONAL mode. + /* Driver assumes that xdp_convert_buff_to_frame returns + * an xdp_frame that points to the same memory region as + * the original xdp_buff. It allows to map the memory only + * once and to use the DMA_BIDIRECTIONAL mode. */ xdpi.mode = MLX5E_XDP_XMIT_MODE_PAGE; diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 68c47a8c71df..c01916cff507 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -329,7 +329,7 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, case XDP_TX: /* Buffer ownership passes to tx on success. */ - xdpf = convert_to_xdp_frame(&xdp); + xdpf = xdp_convert_buff_to_frame(&xdp); err = efx_xdp_tx_buffers(efx, 1, &xdpf, true); if (unlikely(err != 1)) { efx_free_rx_buffers(rx_queue, rx_buf, 1); diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index e1f4be4b3d69..328bc38848bb 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -867,7 +867,7 @@ static u32 netsec_xdp_queue_one(struct netsec_priv *priv, static u32 netsec_xdp_xmit_back(struct netsec_priv *priv, struct xdp_buff *xdp) { struct netsec_desc_ring *tx_ring = &priv->desc_ring[NETSEC_RING_TX]; - struct xdp_frame *xdpf = convert_to_xdp_frame(xdp); + struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); u32 ret; if (unlikely(!xdpf)) diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index d940628bff8d..a399f3659346 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1355,7 +1355,7 @@ int cpsw_run_xdp(struct cpsw_priv *priv, int ch, struct xdp_buff *xdp, ret = CPSW_XDP_PASS; break; case XDP_TX: - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) goto drop; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b0ab882c021e..858b012074bd 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1295,7 +1295,7 @@ resample: static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { - struct xdp_frame *frame = convert_to_xdp_frame(xdp); + struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); if (unlikely(!frame)) return -EOVERFLOW; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index fb5c17361f64..b594f03eeddb 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -541,7 +541,7 @@ out: static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, struct veth_xdp_tx_bq *bq) { - struct xdp_frame *frame = convert_to_xdp_frame(xdp); + struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); if (unlikely(!frame)) return -EOVERFLOW; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b6951aa76295..ba38765dc490 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -703,7 +703,7 @@ static struct sk_buff *receive_small(struct net_device *dev, break; case XDP_TX: stats->xdp_tx++; - xdpf = convert_to_xdp_frame(&xdp); + xdpf = xdp_convert_buff_to_frame(&xdp); if (unlikely(!xdpf)) goto err_xdp; err = virtnet_xdp_xmit(dev, 1, &xdpf, 0); @@ -892,7 +892,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, break; case XDP_TX: stats->xdp_tx++; - xdpf = convert_to_xdp_frame(&xdp); + xdpf = xdp_convert_buff_to_frame(&xdp); if (unlikely(!xdpf)) goto err_xdp; err = virtnet_xdp_xmit(dev, 1, &xdpf, 0); diff --git a/include/net/xdp.h b/include/net/xdp.h index db5c2c687f48..609f819ed08b 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -123,7 +123,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) /* Convert xdp_buff to xdp_frame */ static inline -struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) +struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) { struct xdp_frame *xdp_frame; int metasize; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8b85bfddfac7..27595fc6da56 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -621,7 +621,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, { struct xdp_frame *xdpf; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index c04fb1c72f5e..854b09beb16b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -465,7 +465,7 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(err)) return err; - xdpf = convert_to_xdp_frame(xdp); + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; -- cgit v1.2.3-59-g8ed1b From 958a3f2d2aff896ae2a622878e456114f4a4cd15 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 31 May 2020 17:42:55 +0200 Subject: bpf: Use tracing helpers for lsm programs Currenty lsm uses bpf_tracing_func_proto helpers which do not include stack trace or perf event output. It's useful to have those for bpftrace lsm support [1]. Using tracing_prog_func_proto helpers for lsm programs. [1] https://github.com/iovisor/bpftrace/pull/1347 Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Cc: KP Singh Link: https://lore.kernel.org/bpf/20200531154255.896551-1-jolsa@kernel.org --- include/linux/bpf.h | 3 +++ kernel/bpf/bpf_lsm.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e042311f991f..07052d44bca1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1633,6 +1633,9 @@ extern const struct bpf_func_proto bpf_ringbuf_query_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); +const struct bpf_func_proto *tracing_prog_func_proto( + enum bpf_func_id func_id, const struct bpf_prog *prog); + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 19636703b24e..fb278144e9fd 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -49,6 +49,6 @@ const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = bpf_tracing_func_proto, + .get_func_proto = tracing_prog_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b6c24be5ff53..c41186417d93 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1467,7 +1467,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -static const struct bpf_func_proto * +const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { -- cgit v1.2.3-59-g8ed1b From febeb6dff7beafcaf89521f6c8ff7b0adac08d54 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Jun 2020 13:26:01 -0700 Subject: libbpf: Add _GNU_SOURCE for reallocarray to ringbuf.c On systems with recent enough glibc, reallocarray compat won't kick in, so reallocarray() itself has to come from stdlib.h include. But _GNU_SOURCE is necessary to enable it. So add it. Fixes: bf99c936f947 ("libbpf: Add BPF ring buffer support") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200601202601.2139477-1-andriin@fb.com --- tools/lib/bpf/ringbuf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index bc10fa1d43c7..4fc6c6cbb4eb 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -4,6 +4,9 @@ * * Copyright (C) 2020 Facebook, Inc. */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #include #include #include -- cgit v1.2.3-59-g8ed1b From 171526f6fee84de0c39e2b7aa7e666ba0bbfd173 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:35 +0200 Subject: flow_dissector: Pull locking up from prog attach callback Split out the part of attach callback that happens with attach/detach lock acquired. This structures the prog attach callback in a way that opens up doors for moving the locking out of flow_dissector and into generic callbacks for attaching/detaching progs to netns in subsequent patches. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-2-jakub@cloudflare.com --- net/core/flow_dissector.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0aeb33572feb..b64a44a083fd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -109,15 +109,10 @@ int skb_flow_dissector_prog_query(const union bpf_attr *attr, return 0; } -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +static int flow_dissector_bpf_prog_attach(struct net *net, + struct bpf_prog *prog) { struct bpf_prog *attached; - struct net *net; - int ret = 0; - - net = current->nsproxy->net_ns; - mutex_lock(&flow_dissector_mutex); if (net == &init_net) { /* BPF flow dissector in the root namespace overrides @@ -130,33 +125,38 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(ns->flow_dissector_prog)) + return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(init_net.flow_dissector_prog)) + return -EEXIST; } attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); - if (attached == prog) { + if (attached == prog) /* The same program cannot be attached twice */ - ret = -EINVAL; - goto out; - } + return -EINVAL; + rcu_assign_pointer(net->flow_dissector_prog, prog); if (attached) bpf_prog_put(attached); -out: + return 0; +} + +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + int ret; + + mutex_lock(&flow_dissector_mutex); + ret = flow_dissector_bpf_prog_attach(current->nsproxy->net_ns, prog); mutex_unlock(&flow_dissector_mutex); + return ret; } -- cgit v1.2.3-59-g8ed1b From a3fd7ceee05431d2c51ed86c6cae015d236a51f0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:36 +0200 Subject: net: Introduce netns_bpf for BPF programs attached to netns In order to: (1) attach more than one BPF program type to netns, or (2) support attaching BPF programs to netns with bpf_link, or (3) support multi-prog attach points for netns we will need to keep more state per netns than a single pointer like we have now for BPF flow dissector program. Prepare for the above by extracting netns_bpf that is part of struct net, for storing all state related to BPF programs attached to netns. Turn flow dissector callbacks for querying/attaching/detaching a program into generic ones that operate on netns_bpf. Next patch will move the generic callbacks into their own module. This is similar to how it is organized for cgroup with cgroup_bpf. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Cc: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 56 +++++++++++++++++++++++ include/linux/skbuff.h | 26 ----------- include/net/net_namespace.h | 4 +- include/net/netns/bpf.h | 17 +++++++ kernel/bpf/syscall.c | 7 +-- net/core/flow_dissector.c | 105 +++++++++++++++++++++++++++++--------------- 6 files changed, 149 insertions(+), 66 deletions(-) create mode 100644 include/linux/bpf-netns.h create mode 100644 include/net/netns/bpf.h diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h new file mode 100644 index 000000000000..f3aec3d79824 --- /dev/null +++ b/include/linux/bpf-netns.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_NETNS_H +#define _BPF_NETNS_H + +#include +#include + +enum netns_bpf_attach_type { + NETNS_BPF_INVALID = -1, + NETNS_BPF_FLOW_DISSECTOR = 0, + MAX_NETNS_BPF_ATTACH_TYPE +}; + +static inline enum netns_bpf_attach_type +to_netns_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + case BPF_FLOW_DISSECTOR: + return NETNS_BPF_FLOW_DISSECTOR; + default: + return NETNS_BPF_INVALID; + } +} + +/* Protects updates to netns_bpf */ +extern struct mutex netns_bpf_mutex; + +union bpf_attr; +struct bpf_prog; + +#ifdef CONFIG_NET +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); +int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog); +int netns_bpf_prog_detach(const union bpf_attr *attr); +#else +static inline int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} +#endif + +#endif /* _BPF_NETNS_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 531843952809..a0d5c2760103 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1283,32 +1283,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); -#ifdef CONFIG_NET -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr); -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog); - -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); -#else -static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) -{ - return -EOPNOTSUPP; -} -#endif - struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8e001e049497..2ee5901bec7a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,8 @@ struct net { #endif struct net_generic __rcu *gen; - struct bpf_prog __rcu *flow_dissector_prog; + /* Used to store attached BPF programs */ + struct netns_bpf bpf; /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h new file mode 100644 index 000000000000..a858d1c5b166 --- /dev/null +++ b/include/net/netns/bpf.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF programs attached to network namespace + */ + +#ifndef __NETNS_BPF_H__ +#define __NETNS_BPF_H__ + +#include + +struct bpf_prog; + +struct netns_bpf { + struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; +}; + +#endif /* __NETNS_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e83b0818b529..c77ab9c76f7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2868,7 +2869,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2908,7 +2909,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return skb_flow_dissector_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2961,7 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_prog_query(attr, uattr); + return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b64a44a083fd..6c1b8e43d611 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -31,8 +31,10 @@ #include #include #endif +#include -static DEFINE_MUTEX(flow_dissector_mutex); +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -70,23 +72,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; if (attr->query.query_flags) return -EINVAL; + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + net = get_net_ns_by_fd(attr->query.target_fd); if (IS_ERR(net)) return PTR_ERR(net); rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { prog_cnt = 1; prog_id = attached->aux->id; @@ -112,6 +119,7 @@ int skb_flow_dissector_prog_query(const union bpf_attr *attr, static int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; if (net == &init_net) { @@ -125,74 +133,97 @@ static int flow_dissector_bpf_prog_attach(struct net *net, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) + if (rcu_access_pointer(ns->bpf.progs[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.flow_dissector_prog)) + if (rcu_access_pointer(init_net.bpf.progs[type])) return -EEXIST; } - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); if (attached == prog) /* The same program cannot be attached twice */ return -EINVAL; - rcu_assign_pointer(net->flow_dissector_prog, prog); + rcu_assign_pointer(net->bpf.progs[type], prog); if (attached) bpf_prog_put(attached); return 0; } -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + enum netns_bpf_attach_type type; + struct net *net; int ret; - mutex_lock(&flow_dissector_mutex); - ret = flow_dissector_bpf_prog_attach(current->nsproxy->net_ns, prog); - mutex_unlock(&flow_dissector_mutex); + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); return ret; } -static int flow_dissector_bpf_prog_detach(struct net *net) +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) { struct bpf_prog *attached; - mutex_lock(&flow_dissector_mutex); - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (!attached) { - mutex_unlock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) return -ENOENT; - } - RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + RCU_INIT_POINTER(net->bpf.progs[type], NULL); bpf_prog_put(attached); - mutex_unlock(&flow_dissector_mutex); return 0; } -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr) { - return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns); + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; } -static void __net_exit flow_dissector_pernet_pre_exit(struct net *net) +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { - /* We're not racing with attach/detach because there are no - * references to netns left when pre_exit gets called. - */ - if (rcu_access_pointer(net->flow_dissector_prog)) - flow_dissector_bpf_prog_detach(net); + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); } -static struct pernet_operations flow_dissector_pernet_ops __net_initdata = { - .pre_exit = flow_dissector_pernet_pre_exit, +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, }; /** @@ -1044,11 +1075,13 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + rcu_read_lock(); - attached = rcu_dereference(init_net.flow_dissector_prog); + attached = rcu_dereference(init_net.bpf.progs[type]); if (!attached) - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { struct bpf_flow_keys flow_keys; @@ -1870,6 +1903,6 @@ static int __init init_default_flow_dissectors(void) flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - return register_pernet_subsys(&flow_dissector_pernet_ops); + return register_pernet_subsys(&netns_bpf_pernet_ops); } core_initcall(init_default_flow_dissectors); -- cgit v1.2.3-59-g8ed1b From b27f7bb590ba835b32ef122389db158e44cfda1e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:37 +0200 Subject: flow_dissector: Move out netns_bpf prog callbacks Move functions to manage BPF programs attached to netns that are not specific to flow dissector to a dedicated module named bpf/net_namespace.c. The set of functions will grow with the addition of bpf_link support for netns attached programs. This patch prepares ground by creating a place for it. This is a code move with no functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-4-jakub@cloudflare.com --- include/net/flow_dissector.h | 6 ++ kernel/bpf/Makefile | 1 + kernel/bpf/net_namespace.c | 133 +++++++++++++++++++++++++++++++++++++++++++ net/core/flow_dissector.c | 125 ++-------------------------------------- 4 files changed, 144 insertions(+), 121 deletions(-) create mode 100644 kernel/bpf/net_namespace.c diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 4fb1a69c6ecf..a7eba43fe4e4 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -8,6 +8,8 @@ #include #include +struct bpf_prog; +struct net; struct sk_buff; /** @@ -369,4 +371,8 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control, memset(key_basic, 0, sizeof(*key_basic)); } +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog); +#endif /* CONFIG_BPF_SYSCALL */ + #endif diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 8fca02f64811..1131a921e1a6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,6 +13,7 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o +obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c new file mode 100644 index 000000000000..b37d81450c3a --- /dev/null +++ b/kernel/bpf/net_namespace.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +/* + * Functions to manage BPF programs attached to netns + */ + +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); + +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; + struct bpf_prog *attached; + struct net *net; + + if (attr->query.query_flags) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + rcu_read_lock(); + attached = rcu_dereference(net->bpf.progs[type]); + if (attached) { + prog_cnt = 1; + prog_id = attached->aux->id; + } + rcu_read_unlock(); + + put_net(net); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) + return -EFAULT; + + return 0; +} + +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type type; + struct net *net; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *attached; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) + return -ENOENT; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + bpf_prog_put(attached); + return 0; +} + +int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) +{ + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); +} + +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, +}; + +static int __init netns_bpf_init(void) +{ + return register_pernet_subsys(&netns_bpf_pernet_ops); +} + +subsys_initcall(netns_bpf_init); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6c1b8e43d611..d02df0b6d0d9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -33,9 +33,6 @@ #endif #include -/* Protects updates to netns_bpf */ -DEFINE_MUTEX(netns_bpf_mutex); - static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { @@ -72,52 +69,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int netns_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; - enum netns_bpf_attach_type type; - struct bpf_prog *attached; - struct net *net; - - if (attr->query.query_flags) - return -EINVAL; - - type = to_netns_bpf_attach_type(attr->query.attach_type); - if (type < 0) - return -EINVAL; - - net = get_net_ns_by_fd(attr->query.target_fd); - if (IS_ERR(net)) - return PTR_ERR(net); - - rcu_read_lock(); - attached = rcu_dereference(net->bpf.progs[type]); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); - - put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; -} - -static int flow_dissector_bpf_prog_attach(struct net *net, - struct bpf_prog *prog) +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; @@ -155,76 +108,7 @@ static int flow_dissector_bpf_prog_attach(struct net *net, bpf_prog_put(attached); return 0; } - -int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) -{ - enum netns_bpf_attach_type type; - struct net *net; - int ret; - - type = to_netns_bpf_attach_type(attr->attach_type); - if (type < 0) - return -EINVAL; - - net = current->nsproxy->net_ns; - mutex_lock(&netns_bpf_mutex); - switch (type) { - case NETNS_BPF_FLOW_DISSECTOR: - ret = flow_dissector_bpf_prog_attach(net, prog); - break; - default: - ret = -EINVAL; - break; - } - mutex_unlock(&netns_bpf_mutex); - - return ret; -} - -/* Must be called with netns_bpf_mutex held. */ -static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) -{ - struct bpf_prog *attached; - - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (!attached) - return -ENOENT; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); - bpf_prog_put(attached); - return 0; -} - -int netns_bpf_prog_detach(const union bpf_attr *attr) -{ - enum netns_bpf_attach_type type; - int ret; - - type = to_netns_bpf_attach_type(attr->attach_type); - if (type < 0) - return -EINVAL; - - mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); - mutex_unlock(&netns_bpf_mutex); - - return ret; -} - -static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) -{ - enum netns_bpf_attach_type type; - - mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); - mutex_unlock(&netns_bpf_mutex); -} - -static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { - .pre_exit = netns_bpf_pernet_pre_exit, -}; +#endif /* CONFIG_BPF_SYSCALL */ /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -1902,7 +1786,6 @@ static int __init init_default_flow_dissectors(void) skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - - return register_pernet_subsys(&netns_bpf_pernet_ops); + return 0; } core_initcall(init_default_flow_dissectors); -- cgit v1.2.3-59-g8ed1b From 7f045a49fee04b5662cbdeaf0838f9322ae8c63a Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:38 +0200 Subject: bpf: Add link-based BPF program attachment to network namespace Extend bpf() syscall subcommands that operate on bpf_link, that is LINK_CREATE, LINK_UPDATE, OBJ_GET_INFO, to accept attach types tied to network namespaces (only flow dissector at the moment). Link-based and prog-based attachment can be used interchangeably, but only one can exist at a time. Attempts to attach a link when a prog is already attached directly, and the other way around, will be met with -EEXIST. Attempts to detach a program when link exists result in -EINVAL. Attachment of multiple links of same attach type to one netns is not supported with the intention to lift the restriction when a use-case presents itself. Because of that link create returns -E2BIG when trying to create another netns link, when one already exists. Link-based attachments to netns don't keep a netns alive by holding a ref to it. Instead links get auto-detached from netns when the latter is being destroyed, using a pernet pre_exit callback. When auto-detached, link lives in defunct state as long there are open FDs for it. -ENOLINK is returned if a user tries to update a defunct link. Because bpf_link to netns doesn't hold a ref to struct net, special care is taken when releasing, updating, or filling link info. The netns might be getting torn down when any of these link operations are in progress. That is why auto-detach and update/release/fill_info are synchronized by the same mutex. Also, link ops have to always check if auto-detach has not happened yet and if netns is still alive (refcnt > 0). Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-5-jakub@cloudflare.com --- include/linux/bpf-netns.h | 8 ++ include/linux/bpf_types.h | 3 + include/net/netns/bpf.h | 1 + include/uapi/linux/bpf.h | 5 + kernel/bpf/net_namespace.c | 244 ++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 3 + tools/include/uapi/linux/bpf.h | 5 + 7 files changed, 267 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index f3aec3d79824..4052d649f36d 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -34,6 +34,8 @@ int netns_bpf_prog_query(const union bpf_attr *attr, int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netns_bpf_prog_detach(const union bpf_attr *attr); +int netns_bpf_link_create(const union bpf_attr *attr, + struct bpf_prog *prog); #else static inline int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -51,6 +53,12 @@ static inline int netns_bpf_prog_detach(const union bpf_attr *attr) { return -EOPNOTSUPP; } + +static inline int netns_bpf_link_create(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif #endif /* _BPF_NETNS_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa8e1b552acd..a18ae82a298a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,3 +126,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) #endif BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) +#ifdef CONFIG_NET +BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) +#endif diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a858d1c5b166..a8dce2a380c8 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -12,6 +12,7 @@ struct bpf_prog; struct netns_bpf { struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; + struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; }; #endif /* __NETNS_BPF_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f862a58fb567..b9ed9f14f2a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -237,6 +237,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, MAX_BPF_LINK_TYPE, }; @@ -3839,6 +3840,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index b37d81450c3a..78cf061f8179 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -8,9 +8,140 @@ * Functions to manage BPF programs attached to netns */ +struct bpf_netns_link { + struct bpf_link link; + enum bpf_attach_type type; + enum netns_bpf_attach_type netns_type; + + /* We don't hold a ref to net in order to auto-detach the link + * when netns is going away. Instead we rely on pernet + * pre_exit callback to clear this pointer. Must be accessed + * with netns_bpf_mutex held. + */ + struct net *net; +}; + /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); +/* Must be called with netns_bpf_mutex held. */ +static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + net_link->net = NULL; +} + +static void bpf_netns_link_release(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + + /* Link auto-detached by dying netns. */ + if (!net_link->net) + return; + + mutex_lock(&netns_bpf_mutex); + + /* Recheck after potential sleep. We can race with cleanup_net + * here, but if we see a non-NULL struct net pointer pre_exit + * has not happened yet and will block on netns_bpf_mutex. + */ + net = net_link->net; + if (!net) + goto out_unlock; + + net->bpf.links[type] = NULL; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); +} + +static void bpf_netns_link_dealloc(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + kfree(net_link); +} + +static int bpf_netns_link_update_prog(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + int ret = 0; + + if (old_prog && old_prog != link->prog) + return -EPERM; + if (new_prog->type != link->prog->type) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + + net = net_link->net; + if (!net || !check_net(net)) { + /* Link auto-detached or netns dying */ + ret = -ENOLINK; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + rcu_assign_pointer(net->bpf.progs[type], new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return ret; +} + +static int bpf_netns_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + unsigned int inum = 0; + struct net *net; + + mutex_lock(&netns_bpf_mutex); + net = net_link->net; + if (net && check_net(net)) + inum = net->ns.inum; + mutex_unlock(&netns_bpf_mutex); + + info->netns.netns_ino = inum; + info->netns.attach_type = net_link->type; + return 0; +} + +static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_link_info info = {}; + + bpf_netns_link_fill_info(link, &info); + seq_printf(seq, + "netns_ino:\t%u\n" + "attach_type:\t%u\n", + info.netns.netns_ino, + info.netns.attach_type); +} + +static const struct bpf_link_ops bpf_netns_link_ops = { + .release = bpf_netns_link_release, + .dealloc = bpf_netns_link_dealloc, + .update_prog = bpf_netns_link_update_prog, + .fill_link_info = bpf_netns_link_fill_info, + .show_fdinfo = bpf_netns_link_show_fdinfo, +}; + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -67,6 +198,13 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); + + /* Attaching prog directly is not compatible with links */ + if (net->bpf.links[type]) { + ret = -EEXIST; + goto out_unlock; + } + switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach(net, prog); @@ -75,6 +213,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) ret = -EINVAL; break; } +out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; @@ -86,6 +225,10 @@ static int __netns_bpf_prog_detach(struct net *net, { struct bpf_prog *attached; + /* Progs attached via links cannot be detached */ + if (net->bpf.links[type]) + return -EINVAL; + attached = rcu_dereference_protected(net->bpf.progs[type], lockdep_is_held(&netns_bpf_mutex)); if (!attached) @@ -111,13 +254,110 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) return ret; } +static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *prog; + int err; + + mutex_lock(&netns_bpf_mutex); + + /* Allow attaching only one prog or link for now */ + if (net->bpf.links[type]) { + err = -E2BIG; + goto out_unlock; + } + /* Links are not compatible with attaching prog directly */ + prog = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (prog) { + err = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + err = flow_dissector_bpf_prog_attach(net, link->prog); + break; + default: + err = -EINVAL; + break; + } + if (err) + goto out_unlock; + + net->bpf.links[type] = link; + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return err; +} + +int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type netns_type; + struct bpf_link_primer link_primer; + struct bpf_netns_link *net_link; + enum bpf_attach_type type; + struct net *net; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + type = attr->link_create.attach_type; + netns_type = to_netns_bpf_attach_type(type); + if (netns_type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->link_create.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + net_link = kzalloc(sizeof(*net_link), GFP_USER); + if (!net_link) { + err = -ENOMEM; + goto out_put_net; + } + bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, + &bpf_netns_link_ops, prog); + net_link->net = net; + net_link->type = type; + net_link->netns_type = netns_type; + + err = bpf_link_prime(&net_link->link, &link_primer); + if (err) { + kfree(net_link); + goto out_put_net; + } + + err = netns_bpf_link_attach(net, &net_link->link, netns_type); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_net; + } + + put_net(net); + return bpf_link_settle(&link_primer); + +out_put_net: + put_net(net); + return err; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; + struct bpf_link *link; mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + link = net->bpf.links[type]; + if (link) + bpf_netns_link_auto_detach(link); + else + __netns_bpf_prog_detach(net, type); + } mutex_unlock(&netns_bpf_mutex); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c77ab9c76f7b..e14a842d7e0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3887,6 +3887,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_TRACING: ret = tracing_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = netns_bpf_link_create(attr, prog); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f862a58fb567..b9ed9f14f2a2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -237,6 +237,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, MAX_BPF_LINK_TYPE, }; @@ -3839,6 +3840,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; }; } __attribute__((aligned(8))); -- cgit v1.2.3-59-g8ed1b From 0c047ecbb7bab4c1d2136f5f04bb47a66a9a12b8 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:39 +0200 Subject: bpf, cgroup: Return ENOLINK for auto-detached links on update Failure to update a bpf_link because it has been auto-detached by a dying cgroup currently results in EINVAL error, even though the arguments passed to bpf() syscall are not wrong. bpf_links attaching to netns in this case will return ENOLINK, which carries the message that the link is no longer attached to anything. Change cgroup bpf_links to do the same to keep the uAPI errors consistent. Fixes: 0c991ebc8c69 ("bpf: Implement bpf_prog replacement for an active bpf_cgroup_link") Suggested-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-6-jakub@cloudflare.com --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 5c0e964105ac..fdf7836750a3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -595,7 +595,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, mutex_lock(&cgroup_mutex); /* link might have been auto-released by dying cgroup, so fail */ if (!cg_link->cgroup) { - ret = -EINVAL; + ret = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { -- cgit v1.2.3-59-g8ed1b From d60d81acc2c180e33244857e35ef60072573b000 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:40 +0200 Subject: libbpf: Add support for bpf_link-based netns attachment Add bpf_program__attach_nets(), which uses LINK_CREATE subcommand to create an FD-based kernel bpf_link, for attach types tied to network namespace, that is BPF_FLOW_DISSECTOR for the moment. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-7-jakub@cloudflare.com --- tools/lib/bpf/libbpf.c | 23 ++++++++++++++++++----- tools/lib/bpf/libbpf.h | 2 ++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 85d4f1c5fc52..7f01be2b88b8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7896,8 +7896,9 @@ static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, return bpf_program__attach_iter(prog, NULL); } -struct bpf_link * -bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) +static struct bpf_link * +bpf_program__attach_fd(struct bpf_program *prog, int target_fd, + const char *target_name) { enum bpf_attach_type attach_type; char errmsg[STRERR_BUFSIZE]; @@ -7917,12 +7918,12 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) link->detach = &bpf_link__detach_fd; attach_type = bpf_program__get_expected_attach_type(prog); - link_fd = bpf_link_create(prog_fd, cgroup_fd, attach_type, NULL); + link_fd = bpf_link_create(prog_fd, target_fd, attach_type, NULL); if (link_fd < 0) { link_fd = -errno; free(link); - pr_warn("program '%s': failed to attach to cgroup: %s\n", - bpf_program__title(prog, false), + pr_warn("program '%s': failed to attach to %s: %s\n", + bpf_program__title(prog, false), target_name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); return ERR_PTR(link_fd); } @@ -7930,6 +7931,18 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) return link; } +struct bpf_link * +bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) +{ + return bpf_program__attach_fd(prog, cgroup_fd, "cgroup"); +} + +struct bpf_link * +bpf_program__attach_netns(struct bpf_program *prog, int netns_fd) +{ + return bpf_program__attach_fd(prog, netns_fd, "netns"); +} + struct bpf_link * bpf_program__attach_iter(struct bpf_program *prog, const struct bpf_iter_attach_opts *opts) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 8528a02d5af8..334437af3014 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -253,6 +253,8 @@ LIBBPF_API struct bpf_link * bpf_program__attach_lsm(struct bpf_program *prog); LIBBPF_API struct bpf_link * bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd); +LIBBPF_API struct bpf_link * +bpf_program__attach_netns(struct bpf_program *prog, int netns_fd); struct bpf_map; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index c18860200abb..f732c77b7ed0 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -262,6 +262,7 @@ LIBBPF_0.0.9 { bpf_link_get_fd_by_id; bpf_link_get_next_id; bpf_program__attach_iter; + bpf_program__attach_netns; perf_buffer__consume; ring_buffer__add; ring_buffer__consume; -- cgit v1.2.3-59-g8ed1b From be6e19818ba626eb1b354490aee40a2cfc1a219f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:41 +0200 Subject: bpftool: Extract helpers for showing link attach type Code for printing link attach_type is duplicated in a couple of places, and likely will be duplicated for future link types as well. Create helpers to prevent duplication. Suggested-by: Andrii Nakryiko Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-8-jakub@cloudflare.com --- tools/bpf/bpftool/link.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 670a561dc31b..1ff416eff3d7 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -62,6 +62,15 @@ show_link_header_json(struct bpf_link_info *info, json_writer_t *wtr) jsonw_uint_field(json_wtr, "prog_id", info->prog_id); } +static void show_link_attach_type_json(__u32 attach_type, json_writer_t *wtr) +{ + if (attach_type < ARRAY_SIZE(attach_type_name)) + jsonw_string_field(wtr, "attach_type", + attach_type_name[attach_type]); + else + jsonw_uint_field(wtr, "attach_type", attach_type); +} + static int get_prog_info(int prog_id, struct bpf_prog_info *info) { __u32 len = sizeof(*info); @@ -105,22 +114,13 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) jsonw_uint_field(json_wtr, "prog_type", prog_info.type); - if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name)) - jsonw_string_field(json_wtr, "attach_type", - attach_type_name[info->tracing.attach_type]); - else - jsonw_uint_field(json_wtr, "attach_type", - info->tracing.attach_type); + show_link_attach_type_json(info->tracing.attach_type, + json_wtr); break; case BPF_LINK_TYPE_CGROUP: jsonw_lluint_field(json_wtr, "cgroup_id", info->cgroup.cgroup_id); - if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name)) - jsonw_string_field(json_wtr, "attach_type", - attach_type_name[info->cgroup.attach_type]); - else - jsonw_uint_field(json_wtr, "attach_type", - info->cgroup.attach_type); + show_link_attach_type_json(info->cgroup.attach_type, json_wtr); break; default: break; @@ -153,6 +153,14 @@ static void show_link_header_plain(struct bpf_link_info *info) printf("prog %u ", info->prog_id); } +static void show_link_attach_type_plain(__u32 attach_type) +{ + if (attach_type < ARRAY_SIZE(attach_type_name)) + printf("attach_type %s ", attach_type_name[attach_type]); + else + printf("attach_type %u ", attach_type); +} + static int show_link_close_plain(int fd, struct bpf_link_info *info) { struct bpf_prog_info prog_info; @@ -176,19 +184,11 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) else printf("\n\tprog_type %u ", prog_info.type); - if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name)) - printf("attach_type %s ", - attach_type_name[info->tracing.attach_type]); - else - printf("attach_type %u ", info->tracing.attach_type); + show_link_attach_type_plain(info->tracing.attach_type); break; case BPF_LINK_TYPE_CGROUP: printf("\n\tcgroup_id %zu ", (size_t)info->cgroup.cgroup_id); - if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name)) - printf("attach_type %s ", - attach_type_name[info->cgroup.attach_type]); - else - printf("attach_type %u ", info->cgroup.attach_type); + show_link_attach_type_plain(info->cgroup.attach_type); break; default: break; -- cgit v1.2.3-59-g8ed1b From e948947a6e111b3d4bbe538105ee2f3611e032ad Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:42 +0200 Subject: bpftool: Support link show for netns-attached links Make `bpf link show` aware of new link type, that is links attached to netns. When listing netns-attached links, display netns inode number as its identifier and link attach type. Sample session: # readlink /proc/self/ns/net net:[4026532251] # bpftool prog show 357: flow_dissector tag a04f5eef06a7f555 gpl loaded_at 2020-05-30T16:53:51+0200 uid 0 xlated 16B jited 37B memlock 4096B 358: flow_dissector tag a04f5eef06a7f555 gpl loaded_at 2020-05-30T16:53:51+0200 uid 0 xlated 16B jited 37B memlock 4096B # bpftool link show 108: netns prog 357 netns_ino 4026532251 attach_type flow_dissector # bpftool link -jp show [{ "id": 108, "type": "netns", "prog_id": 357, "netns_ino": 4026532251, "attach_type": "flow_dissector" } ] (... after netns is gone ...) # bpftool link show 108: netns prog 357 netns_ino 0 attach_type flow_dissector # bpftool link -jp show [{ "id": 108, "type": "netns", "prog_id": 357, "netns_ino": 0, "attach_type": "flow_dissector" } ] Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-9-jakub@cloudflare.com --- tools/bpf/bpftool/link.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 1ff416eff3d7..fca57ee8fafe 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -17,6 +17,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_TRACING] = "tracing", [BPF_LINK_TYPE_CGROUP] = "cgroup", [BPF_LINK_TYPE_ITER] = "iter", + [BPF_LINK_TYPE_NETNS] = "netns", }; static int link_parse_fd(int *argc, char ***argv) @@ -122,6 +123,11 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) info->cgroup.cgroup_id); show_link_attach_type_json(info->cgroup.attach_type, json_wtr); break; + case BPF_LINK_TYPE_NETNS: + jsonw_uint_field(json_wtr, "netns_ino", + info->netns.netns_ino); + show_link_attach_type_json(info->netns.attach_type, json_wtr); + break; default: break; } @@ -190,6 +196,10 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) printf("\n\tcgroup_id %zu ", (size_t)info->cgroup.cgroup_id); show_link_attach_type_plain(info->cgroup.attach_type); break; + case BPF_LINK_TYPE_NETNS: + printf("\n\tnetns_ino %u ", info->netns.netns_ino); + show_link_attach_type_plain(info->netns.attach_type); + break; default: break; } -- cgit v1.2.3-59-g8ed1b From 1f043f87bb595bbe6c7e6b291d115284840a6c33 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:43 +0200 Subject: selftests/bpf: Add tests for attaching bpf_link to netns Extend the existing test case for flow dissector attaching to cover: - link creation, - link updates, - link info querying, - mixing links with direct prog attachment. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-10-jakub@cloudflare.com --- .../bpf/prog_tests/flow_dissector_reattach.c | 588 +++++++++++++++++++-- 1 file changed, 551 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c index 1f51ba66b98b..15cb554a66d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -18,21 +19,30 @@ #include "test_progs.h" -static bool is_attached(int netns) +static int init_net = -1; + +static __u32 query_attached_prog_id(int netns) { - __u32 cnt; + __u32 prog_ids[1] = {}; + __u32 prog_cnt = ARRAY_SIZE(prog_ids); int err; - err = bpf_prog_query(netns, BPF_FLOW_DISSECTOR, 0, NULL, NULL, &cnt); + err = bpf_prog_query(netns, BPF_FLOW_DISSECTOR, 0, NULL, + prog_ids, &prog_cnt); if (CHECK_FAIL(err)) { perror("bpf_prog_query"); - return true; /* fail-safe */ + return 0; } - return cnt > 0; + return prog_cnt == 1 ? prog_ids[0] : 0; +} + +static bool prog_is_attached(int netns) +{ + return query_attached_prog_id(netns) > 0; } -static int load_prog(void) +static int load_prog(enum bpf_prog_type type) { struct bpf_insn prog[] = { BPF_MOV64_IMM(BPF_REG_0, BPF_OK), @@ -40,61 +50,566 @@ static int load_prog(void) }; int fd; - fd = bpf_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog, - ARRAY_SIZE(prog), "GPL", 0, NULL, 0); + fd = bpf_load_program(type, prog, ARRAY_SIZE(prog), "GPL", 0, NULL, 0); if (CHECK_FAIL(fd < 0)) perror("bpf_load_program"); return fd; } -static void do_flow_dissector_reattach(void) +static __u32 query_prog_id(int prog) { - int prog_fd[2] = { -1, -1 }; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); int err; - prog_fd[0] = load_prog(); - if (prog_fd[0] < 0) - return; + err = bpf_obj_get_info_by_fd(prog, &info, &info_len); + if (CHECK_FAIL(err || info_len != sizeof(info))) { + perror("bpf_obj_get_info_by_fd"); + return 0; + } - prog_fd[1] = load_prog(); - if (prog_fd[1] < 0) - goto out_close; + return info.id; +} + +static int unshare_net(int old_net) +{ + int err, new_net; - err = bpf_prog_attach(prog_fd[0], 0, BPF_FLOW_DISSECTOR, 0); + err = unshare(CLONE_NEWNET); if (CHECK_FAIL(err)) { - perror("bpf_prog_attach-0"); - goto out_close; + perror("unshare(CLONE_NEWNET)"); + return -1; + } + new_net = open("/proc/self/ns/net", O_RDONLY); + if (CHECK_FAIL(new_net < 0)) { + perror("open(/proc/self/ns/net)"); + setns(old_net, CLONE_NEWNET); + return -1; } + return new_net; +} + +static void test_prog_attach_prog_attach(int netns, int prog1, int prog2) +{ + int err; + + err = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); /* Expect success when attaching a different program */ - err = bpf_prog_attach(prog_fd[1], 0, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0); if (CHECK_FAIL(err)) { - perror("bpf_prog_attach-1"); + perror("bpf_prog_attach(prog2) #1"); goto out_detach; } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); /* Expect failure when attaching the same program twice */ - err = bpf_prog_attach(prog_fd[1], 0, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0); if (CHECK_FAIL(!err || errno != EINVAL)) - perror("bpf_prog_attach-2"); + perror("bpf_prog_attach(prog2) #2"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); out_detach: err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); if (CHECK_FAIL(err)) perror("bpf_prog_detach"); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_link_create(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int link1, link2; + + link1 = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure creating link when another link exists */ + errno = 0; + link2 = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link2 != -1 || errno != E2BIG)) + perror("bpf_prog_attach(prog2) expected E2BIG"); + if (link2 != -1) + close(link2); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link1); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_prog_attach_link_create(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + err = bpf_prog_attach(prog1, -1, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure creating link when prog attached */ + errno = 0; + link = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link != -1 || errno != EEXIST)) + perror("bpf_link_create(prog2) expected EEXIST"); + if (link != -1) + close(link); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(err)) + perror("bpf_prog_detach"); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_prog_attach(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure attaching prog when link exists */ + errno = 0; + err = bpf_prog_attach(prog2, -1, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(!err || errno != EEXIST)) + perror("bpf_prog_attach(prog2) expected EEXIST"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_prog_detach(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure detaching prog when link exists */ + errno = 0; + err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_prog_detach expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_prog_attach_detach_query(int netns, int prog1, int prog2) +{ + int err; + + err = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach"); + return; + } + + /* Expect no prog attached after successful detach */ + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_close_query(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + /* Expect no prog attached after closing last link FD */ + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_no_old_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect success replacing the prog when old prog not specified */ + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) + perror("bpf_link_update"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_replace_old_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect success F_REPLACE and old prog specified to succeed */ + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = prog1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) + perror("bpf_link_update"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_invalid_opts(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail w/ old prog FD but w/o F_REPLACE*/ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = prog1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) { + perror("bpf_link_update expected EINVAL"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail on old prog FD mismatch */ + errno = 0; + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = prog2; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EPERM)) { + perror("bpf_link_update expected EPERM"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail for invalid old prog FD */ + errno = 0; + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = -1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EBADF)) { + perror("bpf_link_update expected EBADF"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail with invalid flags */ + errno = 0; + update_opts.flags = BPF_F_ALLOW_MULTI; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_link_update expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + +out_close: + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_invalid_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link, prog3; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure when new prog FD is not valid */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, -1, &update_opts); + if (CHECK_FAIL(!err || errno != EBADF)) { + perror("bpf_link_update expected EINVAL"); + goto out_close_link; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + prog3 = load_prog(BPF_PROG_TYPE_SOCKET_FILTER); + if (prog3 < 0) + goto out_close_link; + + /* Expect failure when new prog FD type doesn't match */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog3, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_link_update expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(prog3); +out_close_link: + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_netns_gone(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link, old_net; + + old_net = netns; + netns = unshare_net(old_net); + if (netns < 0) + return; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(netns); + err = setns(old_net, CLONE_NEWNET); + if (CHECK_FAIL(err)) { + perror("setns(CLONE_NEWNET)"); + close(link); + return; + } + + /* Expect failure when netns destroyed */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != ENOLINK)) + perror("bpf_link_update"); + + close(link); +} + +static void test_link_get_info(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + struct bpf_link_info info = {}; + struct stat netns_stat = {}; + __u32 info_len, link_id; + int err, link, old_net; + + old_net = netns; + netns = unshare_net(old_net); + if (netns < 0) + return; + + err = fstat(netns, &netns_stat); + if (CHECK_FAIL(err)) { + perror("stat(netns)"); + goto out_resetns; + } + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + goto out_resetns; + } + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect link info to be sane and match prog and netns details */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id == 0); + CHECK_FAIL(info.prog_id != query_prog_id(prog1)); + CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) { + perror("bpf_link_update(prog2)"); + goto out_unlink; + } + + link_id = info.id; + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect no info change after update except in prog id */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id != link_id); + CHECK_FAIL(info.prog_id != query_prog_id(prog2)); + CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + + /* Leave netns link is attached to and close last FD to it */ + err = setns(old_net, CLONE_NEWNET); + if (CHECK_FAIL(err)) { + perror("setns(NEWNET)"); + goto out_unlink; + } + close(netns); + old_net = -1; + netns = -1; + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect netns_ino to change to 0 */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id != link_id); + CHECK_FAIL(info.prog_id != query_prog_id(prog2)); + CHECK_FAIL(info.netns.netns_ino != 0); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + +out_unlink: + close(link); +out_resetns: + if (old_net != -1) + setns(old_net, CLONE_NEWNET); + if (netns != -1) + close(netns); +} + +static void run_tests(int netns) +{ + struct test { + const char *test_name; + void (*test_func)(int netns, int prog1, int prog2); + } tests[] = { + { "prog attach, prog attach", + test_prog_attach_prog_attach }, + { "link create, link create", + test_link_create_link_create }, + { "prog attach, link create", + test_prog_attach_link_create }, + { "link create, prog attach", + test_link_create_prog_attach }, + { "link create, prog detach", + test_link_create_prog_detach }, + { "prog attach, detach, query", + test_prog_attach_detach_query }, + { "link create, close, query", + test_link_create_close_query }, + { "link update no old prog", + test_link_update_no_old_prog }, + { "link update with replace old prog", + test_link_update_replace_old_prog }, + { "link update invalid opts", + test_link_update_invalid_opts }, + { "link update invalid prog", + test_link_update_invalid_prog }, + { "link update netns gone", + test_link_update_netns_gone }, + { "link get info", + test_link_get_info }, + }; + int i, progs[2] = { -1, -1 }; + char test_name[80]; + + for (i = 0; i < ARRAY_SIZE(progs); i++) { + progs[i] = load_prog(BPF_PROG_TYPE_FLOW_DISSECTOR); + if (progs[i] < 0) + goto out_close; + } + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + snprintf(test_name, sizeof(test_name), + "flow dissector %s%s", + tests[i].test_name, + netns == init_net ? " (init_net)" : ""); + if (test__start_subtest(test_name)) + tests[i].test_func(netns, progs[0], progs[1]); + } out_close: - close(prog_fd[1]); - close(prog_fd[0]); + for (i = 0; i < ARRAY_SIZE(progs); i++) { + if (progs[i] != -1) + CHECK_FAIL(close(progs[i])); + } } void test_flow_dissector_reattach(void) { - int init_net, self_net, err; + int err, new_net, saved_net; - self_net = open("/proc/self/ns/net", O_RDONLY); - if (CHECK_FAIL(self_net < 0)) { + saved_net = open("/proc/self/ns/net", O_RDONLY); + if (CHECK_FAIL(saved_net < 0)) { perror("open(/proc/self/ns/net"); return; } @@ -111,30 +626,29 @@ void test_flow_dissector_reattach(void) goto out_close; } - if (is_attached(init_net)) { + if (prog_is_attached(init_net)) { test__skip(); printf("Can't test with flow dissector attached to init_net\n"); goto out_setns; } /* First run tests in root network namespace */ - do_flow_dissector_reattach(); + run_tests(init_net); /* Then repeat tests in a non-root namespace */ - err = unshare(CLONE_NEWNET); - if (CHECK_FAIL(err)) { - perror("unshare(CLONE_NEWNET)"); + new_net = unshare_net(init_net); + if (new_net < 0) goto out_setns; - } - do_flow_dissector_reattach(); + run_tests(new_net); + close(new_net); out_setns: /* Move back to netns we started in. */ - err = setns(self_net, CLONE_NEWNET); + err = setns(saved_net, CLONE_NEWNET); if (CHECK_FAIL(err)) perror("setns(/proc/self/ns/net)"); out_close: close(init_net); - close(self_net); + close(saved_net); } -- cgit v1.2.3-59-g8ed1b From b8215dce7dfd817ca38807f55165bf502146cd68 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:44 +0200 Subject: selftests/bpf, flow_dissector: Close TAP device FD after the test test_flow_dissector leaves a TAP device after it's finished, potentially interfering with other tests that will run after it. Fix it by closing the TAP descriptor on cleanup. Fixes: 0905beec9f52 ("selftests/bpf: run flow dissector tests in skb-less mode") Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-11-jakub@cloudflare.com --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 2301c4d3ecec..ef5aab2f60b5 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -524,6 +524,7 @@ void test_flow_dissector(void) CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); } + close(tap_fd); bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); bpf_object__close(obj); } -- cgit v1.2.3-59-g8ed1b From b4b8a3bf9ef0fbbf343b624d68ea328dd4edd5c4 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:45 +0200 Subject: selftests/bpf: Convert test_flow_dissector to use BPF skeleton Switch flow dissector test setup from custom BPF object loader to BPF skeleton to save boilerplate and prepare for testing higher-level API for attaching flow dissector with bpf_link. To avoid depending on program order in the BPF object when populating the flow dissector PROG_ARRAY map, change the program section names to contain the program index into the map. This follows the example set by tailcall tests. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-12-jakub@cloudflare.com --- .../selftests/bpf/prog_tests/flow_dissector.c | 50 +++++++++++++++++++--- tools/testing/selftests/bpf/progs/bpf_flow.c | 20 ++++----- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index ef5aab2f60b5..b6370c0b3b7a 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -6,6 +6,8 @@ #include #include +#include "bpf_flow.skel.h" + #ifndef IP_MF #define IP_MF 0x2000 #endif @@ -444,17 +446,54 @@ static int ifup(const char *ifname) return 0; } +static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array) +{ + int i, err, map_fd, prog_fd; + struct bpf_program *prog; + char prog_name[32]; + + map_fd = bpf_map__fd(prog_array); + if (map_fd < 0) + return -1; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "flow_dissector/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (!prog) + return -1; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) + return -1; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (err) + return -1; + } + return 0; +} + void test_flow_dissector(void) { int i, err, prog_fd, keys_fd = -1, tap_fd; - struct bpf_object *obj; + struct bpf_flow *skel; __u32 duration = 0; - err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector", - "jmp_table", "last_dissection", &prog_fd, &keys_fd); - if (CHECK_FAIL(err)) + skel = bpf_flow__open_and_load(); + if (CHECK(!skel, "skel", "failed to open/load skeleton\n")) return; + prog_fd = bpf_program__fd(skel->progs._dissect); + if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) + goto out_destroy_skel; + keys_fd = bpf_map__fd(skel->maps.last_dissection); + if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) + goto out_destroy_skel; + err = init_prog_array(skel->obj, skel->maps.jmp_table); + if (CHECK(err, "init_prog_array", "err %d\n", err)) + goto out_destroy_skel; + for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_flow_keys flow_keys; struct bpf_prog_test_run_attr tattr = { @@ -526,5 +565,6 @@ void test_flow_dissector(void) close(tap_fd); bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); - bpf_object__close(obj); +out_destroy_skel: + bpf_flow__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 9941f0ba471e..de6de9221518 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -20,20 +20,20 @@ #include int _version SEC("version") = 1; -#define PROG(F) SEC(#F) int bpf_func_##F +#define PROG(F) PROG_(F, _##F) +#define PROG_(NUM, NAME) SEC("flow_dissector/"#NUM) int bpf_func##NAME /* These are the identifiers of the BPF programs that will be used in tail * calls. Name is limited to 16 characters, with the terminating character and * bpf_func_ above, we have only 6 to work with, anything after will be cropped. */ -enum { - IP, - IPV6, - IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */ - IPV6FR, /* Fragmentation IPv6 Extension Header */ - MPLS, - VLAN, -}; +#define IP 0 +#define IPV6 1 +#define IPV6OP 2 /* Destination/Hop-by-Hop Options IPv6 Ext. Header */ +#define IPV6FR 3 /* Fragmentation IPv6 Extension Header */ +#define MPLS 4 +#define VLAN 5 +#define MAX_PROG 6 #define IP_MF 0x2000 #define IP_OFFSET 0x1FFF @@ -59,7 +59,7 @@ struct frag_hdr { struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); - __uint(max_entries, 8); + __uint(max_entries, MAX_PROG); __uint(key_size, sizeof(__u32)); __uint(value_size, sizeof(__u32)); } jmp_table SEC(".maps"); -- cgit v1.2.3-59-g8ed1b From 06716e04a043aa5e010f952a823ad038054b0e5c Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:46 +0200 Subject: selftests/bpf: Extend test_flow_dissector to cover link creation Extend the existing flow_dissector test case to run tests once using direct prog attachments, and then for the second time using indirect attachment via link. The intention is to exercises the newly added high-level API for attaching programs to network namespace with links (bpf_program__attach_netns). Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-13-jakub@cloudflare.com --- .../selftests/bpf/prog_tests/flow_dissector.c | 115 +++++++++++++++------ 1 file changed, 82 insertions(+), 33 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index b6370c0b3b7a..ea14e3ece812 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -103,6 +103,7 @@ struct test { #define VLAN_HLEN 4 +static __u32 duration; struct test tests[] = { { .name = "ipv4", @@ -474,11 +475,87 @@ static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array) return 0; } +static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) +{ + int i, err, keys_fd; + + keys_fd = bpf_map__fd(keys); + if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + /* Keep in sync with 'flags' from eth_get_headlen. */ + __u32 eth_get_headlen_flags = + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; + struct bpf_prog_test_run_attr tattr = {}; + struct bpf_flow_keys flow_keys = {}; + __u32 key = (__u32)(tests[i].keys.sport) << 16 | + tests[i].keys.dport; + + /* For skb-less case we can't pass input flags; run + * only the tests that have a matching set of flags. + */ + + if (tests[i].flags != eth_get_headlen_flags) + continue; + + err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); + CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); + + err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); + CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err); + + CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err); + CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); + + err = bpf_map_delete_elem(keys_fd, &key); + CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); + } +} + +static void test_skb_less_prog_attach(struct bpf_flow *skel, int tap_fd) +{ + int err, prog_fd; + + prog_fd = bpf_program__fd(skel->progs._dissect); + if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) + return; + + err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno)) + return; + + run_tests_skb_less(tap_fd, skel->maps.last_dissection); + + err = bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); + CHECK(err, "bpf_prog_detach", "err %d errno %d\n", err, errno); +} + +static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) +{ + struct bpf_link *link; + int err, net_fd; + + net_fd = open("/proc/self/ns/net", O_RDONLY); + if (CHECK(net_fd < 0, "open(/proc/self/ns/net)", "err %d\n", errno)) + return; + + link = bpf_program__attach_netns(skel->progs._dissect, net_fd); + if (CHECK(IS_ERR(link), "attach_netns", "err %ld\n", PTR_ERR(link))) + goto out_close; + + run_tests_skb_less(tap_fd, skel->maps.last_dissection); + + err = bpf_link__destroy(link); + CHECK(err, "bpf_link__destroy", "err %d\n", err); +out_close: + close(net_fd); +} + void test_flow_dissector(void) { int i, err, prog_fd, keys_fd = -1, tap_fd; struct bpf_flow *skel; - __u32 duration = 0; skel = bpf_flow__open_and_load(); if (CHECK(!skel, "skel", "failed to open/load skeleton\n")) @@ -526,45 +603,17 @@ void test_flow_dissector(void) * via BPF map in this case. */ - err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); - CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno); - tap_fd = create_tap("tap0"); CHECK(tap_fd < 0, "create_tap", "tap_fd %d errno %d\n", tap_fd, errno); err = ifup("tap0"); CHECK(err, "ifup", "err %d errno %d\n", err, errno); - for (i = 0; i < ARRAY_SIZE(tests); i++) { - /* Keep in sync with 'flags' from eth_get_headlen. */ - __u32 eth_get_headlen_flags = - BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; - struct bpf_prog_test_run_attr tattr = {}; - struct bpf_flow_keys flow_keys = {}; - __u32 key = (__u32)(tests[i].keys.sport) << 16 | - tests[i].keys.dport; - - /* For skb-less case we can't pass input flags; run - * only the tests that have a matching set of flags. - */ - - if (tests[i].flags != eth_get_headlen_flags) - continue; - - err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); - CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); - - err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); - CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err); - - CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err); - CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); - - err = bpf_map_delete_elem(keys_fd, &key); - CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); - } + /* Test direct prog attachment */ + test_skb_less_prog_attach(skel, tap_fd); + /* Test indirect prog attachment via link */ + test_skb_less_link_create(skel, tap_fd); close(tap_fd); - bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); out_destroy_skel: bpf_flow__destroy(skel); } -- cgit v1.2.3-59-g8ed1b From 4c21daae3dbc9f8536cc18e6e53627821fa2c90c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 May 2020 22:34:07 +0800 Subject: tipc: Fix NULL pointer dereference in __tipc_sendstream() tipc_sendstream() may send zero length packet, then tipc_msg_append() do not alloc skb, skb_peek_tail() will get NULL, msg_set_ack_required will trigger NULL pointer dereference. Reported-by: syzbot+8eac6d030e7807c21d32@syzkaller.appspotmail.com Fixes: 0a3e060f340d ("tipc: add test for Nagle algorithm effectiveness") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/tipc/socket.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3734cdbedc9c..26123f4177fd 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1588,8 +1588,12 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) tsk->pkt_cnt += skb_queue_len(txq); } else { skb = skb_peek_tail(txq); - msg_set_ack_required(buf_msg(skb)); - tsk->expect_ack = true; + if (skb) { + msg_set_ack_required(buf_msg(skb)); + tsk->expect_ack = true; + } else { + tsk->expect_ack = false; + } tsk->msg_acc = 0; tsk->pkt_cnt = 0; } -- cgit v1.2.3-59-g8ed1b From 79a1f0ccdbb4ad700590f61b00525b390cb53905 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 1 Jun 2020 11:55:03 +0800 Subject: ipv6: fix IPV6_ADDRFORM operation logic Socket option IPV6_ADDRFORM supports UDP/UDPLITE and TCP at present. Previously the checking logic looks like: if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) do_some_check; else if (sk->sk_protocol != IPPROTO_TCP) break; After commit b6f6118901d1 ("ipv6: restrict IPV6_ADDRFORM operation"), TCP was blocked as the logic changed to: if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) do_some_check; else if (sk->sk_protocol == IPPROTO_TCP) do_some_check; break; else break; Then after commit 82c9ae440857 ("ipv6: fix restrict IPV6_ADDRFORM operation") UDP/UDPLITE were blocked as the logic changed to: if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) do_some_check; if (sk->sk_protocol == IPPROTO_TCP) do_some_check; if (sk->sk_protocol != IPPROTO_TCP) break; Fix it by using Eric's code and simply remove the break in TCP check, which looks like: if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) do_some_check; else if (sk->sk_protocol == IPPROTO_TCP) do_some_check; else break; Fixes: 82c9ae440857 ("ipv6: fix restrict IPV6_ADDRFORM operation") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index adbfed6adf11..2c843ff5e3a9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -218,14 +218,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EBUSY; break; } - } - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_prot != &tcpv6_prot) { - retv = -EBUSY; + } else if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_prot != &tcpv6_prot) { + retv = -EBUSY; + break; + } + } else { break; } - if (sk->sk_protocol != IPPROTO_TCP) - break; + if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; -- cgit v1.2.3-59-g8ed1b From a3ac249a1ab57552cb2a63e70556ee87610a591d Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 1 Jun 2020 13:08:29 +0530 Subject: cxgb4/chcr: Enable ktls settings at run time Current design enables ktls setting from start, which is not efficient. Now the feature will be enabled when user demands TLS offload on any interface. v1->v2: - taking ULD module refcount till any single connection exists. - taking rtnl_lock() before clearing tls_devops. v2->v3: - cxgb4 is now registering to tlsdev_ops. - module refcount inc/dec in chcr. - refcount is only for connections. - removed new code from cxgb_set_feature(). v3->v4: - fixed warning message. Signed-off-by: Rohit Maheshwari Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chcr_core.c | 23 +++--- drivers/crypto/chelsio/chcr_core.h | 10 ++- drivers/crypto/chelsio/chcr_ktls.c | 59 ++++----------- drivers/crypto/chelsio/chcr_ktls.h | 9 ++- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 4 + drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 2 + drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 85 +++++++++++++++++++++- drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 71 ++++++++++++------ drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 7 ++ drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 10 ++- 10 files changed, 195 insertions(+), 85 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c index ffd4ec0c7374..bd8dac806e7a 100644 --- a/drivers/crypto/chelsio/chcr_core.c +++ b/drivers/crypto/chelsio/chcr_core.c @@ -33,6 +33,13 @@ static int cpl_fw6_pld_handler(struct adapter *adap, unsigned char *input); static void *chcr_uld_add(const struct cxgb4_lld_info *lld); static int chcr_uld_state_change(void *handle, enum cxgb4_state state); +#if defined(CONFIG_CHELSIO_TLS_DEVICE) +static const struct tlsdev_ops chcr_ktls_ops = { + .tls_dev_add = chcr_ktls_dev_add, + .tls_dev_del = chcr_ktls_dev_del, +}; +#endif + #ifdef CONFIG_CHELSIO_IPSEC_INLINE static void update_netdev_features(void); #endif /* CONFIG_CHELSIO_IPSEC_INLINE */ @@ -56,6 +63,9 @@ static struct cxgb4_uld_info chcr_uld_info = { #if defined(CONFIG_CHELSIO_IPSEC_INLINE) || defined(CONFIG_CHELSIO_TLS_DEVICE) .tx_handler = chcr_uld_tx_handler, #endif /* CONFIG_CHELSIO_IPSEC_INLINE || CONFIG_CHELSIO_TLS_DEVICE */ +#if defined(CONFIG_CHELSIO_TLS_DEVICE) + .tlsdev_ops = &chcr_ktls_ops, +#endif }; static void detach_work_fn(struct work_struct *work) @@ -207,11 +217,6 @@ static void *chcr_uld_add(const struct cxgb4_lld_info *lld) } u_ctx->lldi = *lld; chcr_dev_init(u_ctx); - -#ifdef CONFIG_CHELSIO_TLS_DEVICE - if (lld->ulp_crypto & ULP_CRYPTO_KTLS_INLINE) - chcr_enable_ktls(padap(&u_ctx->dev)); -#endif out: return u_ctx; } @@ -348,20 +353,12 @@ static void __exit chcr_crypto_exit(void) list_for_each_entry_safe(u_ctx, tmp, &drv_data.act_dev, entry) { adap = padap(&u_ctx->dev); memset(&adap->chcr_stats, 0, sizeof(adap->chcr_stats)); -#ifdef CONFIG_CHELSIO_TLS_DEVICE - if (u_ctx->lldi.ulp_crypto & ULP_CRYPTO_KTLS_INLINE) - chcr_disable_ktls(adap); -#endif list_del(&u_ctx->entry); kfree(u_ctx); } list_for_each_entry_safe(u_ctx, tmp, &drv_data.inact_dev, entry) { adap = padap(&u_ctx->dev); memset(&adap->chcr_stats, 0, sizeof(adap->chcr_stats)); -#ifdef CONFIG_CHELSIO_TLS_DEVICE - if (u_ctx->lldi.ulp_crypto & ULP_CRYPTO_KTLS_INLINE) - chcr_disable_ktls(adap); -#endif list_del(&u_ctx->entry); kfree(u_ctx); } diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h index 2c09672e00a4..67d77abd6775 100644 --- a/drivers/crypto/chelsio/chcr_core.h +++ b/drivers/crypto/chelsio/chcr_core.h @@ -37,6 +37,7 @@ #define __CHCR_CORE_H__ #include +#include #include "t4_hw.h" #include "cxgb4.h" #include "t4_msg.h" @@ -223,10 +224,15 @@ int chcr_handle_resp(struct crypto_async_request *req, unsigned char *input, int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev); void chcr_add_xfrmops(const struct cxgb4_lld_info *lld); #ifdef CONFIG_CHELSIO_TLS_DEVICE -void chcr_enable_ktls(struct adapter *adap); -void chcr_disable_ktls(struct adapter *adap); int chcr_ktls_cpl_act_open_rpl(struct adapter *adap, unsigned char *input); int chcr_ktls_cpl_set_tcb_rpl(struct adapter *adap, unsigned char *input); int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev); +extern int chcr_ktls_dev_add(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn); +extern void chcr_ktls_dev_del(struct net_device *netdev, + struct tls_context *tls_ctx, + enum tls_offload_ctx_dir direction); #endif #endif /* __CHCR_CORE_H__ */ diff --git a/drivers/crypto/chelsio/chcr_ktls.c b/drivers/crypto/chelsio/chcr_ktls.c index 43d9e2420110..f55b87152166 100644 --- a/drivers/crypto/chelsio/chcr_ktls.c +++ b/drivers/crypto/chelsio/chcr_ktls.c @@ -373,9 +373,9 @@ static int chcr_ktls_mark_tcb_close(struct chcr_ktls_info *tx_info) * @tls_cts - tls context. * @direction - TX/RX crypto direction */ -static void chcr_ktls_dev_del(struct net_device *netdev, - struct tls_context *tls_ctx, - enum tls_offload_ctx_dir direction) +void chcr_ktls_dev_del(struct net_device *netdev, + struct tls_context *tls_ctx, + enum tls_offload_ctx_dir direction) { struct chcr_ktls_ofld_ctx_tx *tx_ctx = chcr_get_ktls_tx_context(tls_ctx); @@ -411,6 +411,8 @@ static void chcr_ktls_dev_del(struct net_device *netdev, atomic64_inc(&tx_info->adap->chcr_stats.ktls_tx_connection_close); kvfree(tx_info); tx_ctx->chcr_info = NULL; + /* release module refcount */ + module_put(THIS_MODULE); } /* @@ -422,10 +424,10 @@ static void chcr_ktls_dev_del(struct net_device *netdev, * @direction - TX/RX crypto direction * return: SUCCESS/FAILURE. */ -static int chcr_ktls_dev_add(struct net_device *netdev, struct sock *sk, - enum tls_offload_ctx_dir direction, - struct tls_crypto_info *crypto_info, - u32 start_offload_tcp_sn) +int chcr_ktls_dev_add(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct chcr_ktls_ofld_ctx_tx *tx_ctx; @@ -528,6 +530,12 @@ static int chcr_ktls_dev_add(struct net_device *netdev, struct sock *sk, if (ret) goto out2; + /* Driver shouldn't be removed until any single connection exists */ + if (!try_module_get(THIS_MODULE)) { + ret = -EINVAL; + goto out2; + } + atomic64_inc(&adap->chcr_stats.ktls_tx_connection_open); return 0; out2: @@ -537,43 +545,6 @@ out: return ret; } -static const struct tlsdev_ops chcr_ktls_ops = { - .tls_dev_add = chcr_ktls_dev_add, - .tls_dev_del = chcr_ktls_dev_del, -}; - -/* - * chcr_enable_ktls: add NETIF_F_HW_TLS_TX flag in all the ports. - */ -void chcr_enable_ktls(struct adapter *adap) -{ - struct net_device *netdev; - int i; - - for_each_port(adap, i) { - netdev = adap->port[i]; - netdev->features |= NETIF_F_HW_TLS_TX; - netdev->hw_features |= NETIF_F_HW_TLS_TX; - netdev->tlsdev_ops = &chcr_ktls_ops; - } -} - -/* - * chcr_disable_ktls: remove NETIF_F_HW_TLS_TX flag from all the ports. - */ -void chcr_disable_ktls(struct adapter *adap) -{ - struct net_device *netdev; - int i; - - for_each_port(adap, i) { - netdev = adap->port[i]; - netdev->features &= ~NETIF_F_HW_TLS_TX; - netdev->hw_features &= ~NETIF_F_HW_TLS_TX; - netdev->tlsdev_ops = NULL; - } -} - /* * chcr_init_tcb_fields: Initialize tcb fields to handle TCP seq number * handling. diff --git a/drivers/crypto/chelsio/chcr_ktls.h b/drivers/crypto/chelsio/chcr_ktls.h index 5a7ae2ca446e..5cbd84b1da05 100644 --- a/drivers/crypto/chelsio/chcr_ktls.h +++ b/drivers/crypto/chelsio/chcr_ktls.h @@ -89,10 +89,15 @@ static inline int chcr_get_first_rx_qid(struct adapter *adap) return u_ctx->lldi.rxq_ids[0]; } -void chcr_enable_ktls(struct adapter *adap); -void chcr_disable_ktls(struct adapter *adap); int chcr_ktls_cpl_act_open_rpl(struct adapter *adap, unsigned char *input); int chcr_ktls_cpl_set_tcb_rpl(struct adapter *adap, unsigned char *input); int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev); +int chcr_ktls_dev_add(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn); +void chcr_ktls_dev_del(struct net_device *netdev, + struct tls_context *tls_ctx, + enum tls_offload_ctx_dir direction); #endif /* CONFIG_CHELSIO_TLS_DEVICE */ #endif /* __CHCR_KTLS_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 5a41801acb6a..cf69c6edcfec 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -1099,6 +1099,7 @@ struct adapter { /* TC u32 offload */ struct cxgb4_tc_u32_table *tc_u32; + struct chcr_ktls chcr_ktls; struct chcr_stats_debug chcr_stats; /* TC flower offload */ @@ -2060,4 +2061,7 @@ int cxgb_open(struct net_device *dev); int cxgb_close(struct net_device *dev); void cxgb4_enable_rx(struct adapter *adap, struct sge_rspq *q); void cxgb4_quiesce_rx(struct sge_rspq *q); +#ifdef CONFIG_CHELSIO_TLS_DEVICE +int cxgb4_set_ktls_feature(struct adapter *adap, bool enable); +#endif #endif /* __CXGB4_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index c3dd50b45c48..41315712deb8 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -3491,6 +3491,8 @@ static int chcr_stats_show(struct seq_file *seq, void *v) atomic_read(&adap->chcr_stats.tls_key)); #ifdef CONFIG_CHELSIO_TLS_DEVICE seq_puts(seq, "\nChelsio KTLS Crypto Accelerator Stats\n"); + seq_printf(seq, "Tx TLS offload refcount: %20u\n", + refcount_read(&adap->chcr_ktls.ktls_refcount)); seq_printf(seq, "Tx HW offload contexts added: %20llu\n", atomic64_read(&adap->chcr_stats.ktls_tx_ctx)); seq_printf(seq, "Tx connection created: %20llu\n", diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 7a0414f379be..854b1717a70d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -66,6 +66,9 @@ #include #include #include +#if defined(CONFIG_CHELSIO_TLS_DEVICE) +#include +#endif #include "cxgb4.h" #include "cxgb4_filter.h" @@ -6064,6 +6067,79 @@ static int cxgb4_iov_configure(struct pci_dev *pdev, int num_vfs) } #endif /* CONFIG_PCI_IOV */ +#if defined(CONFIG_CHELSIO_TLS_DEVICE) + +static int cxgb4_ktls_dev_add(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 tcp_sn) +{ + struct adapter *adap = netdev2adap(netdev); + int ret = 0; + + mutex_lock(&uld_mutex); + if (!adap->uld[CXGB4_ULD_CRYPTO].handle) { + dev_err(adap->pdev_dev, "chcr driver is not loaded\n"); + ret = -EOPNOTSUPP; + goto out_unlock; + } + + if (!adap->uld[CXGB4_ULD_CRYPTO].tlsdev_ops) { + dev_err(adap->pdev_dev, + "chcr driver has no registered tlsdev_ops()\n"); + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = cxgb4_set_ktls_feature(adap, FW_PARAMS_PARAM_DEV_KTLS_HW_ENABLE); + if (ret) + goto out_unlock; + + ret = adap->uld[CXGB4_ULD_CRYPTO].tlsdev_ops->tls_dev_add(netdev, sk, + direction, + crypto_info, + tcp_sn); + /* if there is a failure, clear the refcount */ + if (ret) + cxgb4_set_ktls_feature(adap, + FW_PARAMS_PARAM_DEV_KTLS_HW_DISABLE); +out_unlock: + mutex_unlock(&uld_mutex); + return ret; +} + +static void cxgb4_ktls_dev_del(struct net_device *netdev, + struct tls_context *tls_ctx, + enum tls_offload_ctx_dir direction) +{ + struct adapter *adap = netdev2adap(netdev); + + mutex_lock(&uld_mutex); + if (!adap->uld[CXGB4_ULD_CRYPTO].handle) { + dev_err(adap->pdev_dev, "chcr driver is not loaded\n"); + goto out_unlock; + } + + if (!adap->uld[CXGB4_ULD_CRYPTO].tlsdev_ops) { + dev_err(adap->pdev_dev, + "chcr driver has no registered tlsdev_ops\n"); + goto out_unlock; + } + + adap->uld[CXGB4_ULD_CRYPTO].tlsdev_ops->tls_dev_del(netdev, tls_ctx, + direction); + cxgb4_set_ktls_feature(adap, FW_PARAMS_PARAM_DEV_KTLS_HW_DISABLE); + +out_unlock: + mutex_unlock(&uld_mutex); +} + +static const struct tlsdev_ops cxgb4_ktls_ops = { + .tls_dev_add = cxgb4_ktls_dev_add, + .tls_dev_del = cxgb4_ktls_dev_del, +}; +#endif /* CONFIG_CHELSIO_TLS_DEVICE */ + static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { struct net_device *netdev; @@ -6313,7 +6389,14 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->hw_features |= NETIF_F_HIGHDMA; netdev->features |= netdev->hw_features; netdev->vlan_features = netdev->features & VLAN_FEAT; - +#if defined(CONFIG_CHELSIO_TLS_DEVICE) + if (pi->adapter->params.crypto & FW_CAPS_CONFIG_TLS_HW) { + netdev->hw_features |= NETIF_F_HW_TLS_TX; + netdev->tlsdev_ops = &cxgb4_ktls_ops; + /* initialize the refcount */ + refcount_set(&pi->adapter->chcr_ktls.ktls_refcount, 0); + } +#endif netdev->priv_flags |= IFF_UNICAST_FLT; /* MTU range: 81 - 9600 */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c index 9e3c6b36cde8..0307e9c69a47 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c @@ -663,22 +663,64 @@ static int uld_attach(struct adapter *adap, unsigned int uld) return 0; } +static bool cxgb4_uld_in_use(struct adapter *adap) +{ + const struct tid_info *t = &adap->tids; + + return (atomic_read(&t->conns_in_use) || t->stids_in_use); +} + #ifdef CONFIG_CHELSIO_TLS_DEVICE /* cxgb4_set_ktls_feature: request FW to enable/disable ktls settings. * @adap: adapter info * @enable: 1 to enable / 0 to disable ktls settings. */ -static void cxgb4_set_ktls_feature(struct adapter *adap, bool enable) +int cxgb4_set_ktls_feature(struct adapter *adap, bool enable) { - u32 params = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | - FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_KTLS_TX_HW) | - FW_PARAMS_PARAM_Y_V(enable)); int ret = 0; + u32 params = + FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_KTLS_HW) | + FW_PARAMS_PARAM_Y_V(enable) | + FW_PARAMS_PARAM_Z_V(FW_PARAMS_PARAM_DEV_KTLS_HW_USER_ENABLE); + + if (enable) { + if (!refcount_read(&adap->chcr_ktls.ktls_refcount)) { + /* At this moment if ULD connection are up means, other + * ULD is/are already active, return failure. + */ + if (cxgb4_uld_in_use(adap)) { + dev_warn(adap->pdev_dev, + "ULD connections (tid/stid) active. Can't enable kTLS\n"); + return -EINVAL; + } + ret = t4_set_params(adap, adap->mbox, adap->pf, + 0, 1, ¶ms, ¶ms); + if (ret) + return ret; + refcount_set(&adap->chcr_ktls.ktls_refcount, 1); + pr_info("kTLS has been enabled. Restrictions placed on ULD support\n"); + } else { + /* ktls settings already up, just increment refcount. */ + refcount_inc(&adap->chcr_ktls.ktls_refcount); + } + } else { + /* return failure if refcount is already 0. */ + if (!refcount_read(&adap->chcr_ktls.ktls_refcount)) + return -EINVAL; + /* decrement refcount and test, if 0, disable ktls feature, + * else return command success. + */ + if (refcount_dec_and_test(&adap->chcr_ktls.ktls_refcount)) { + ret = t4_set_params(adap, adap->mbox, adap->pf, + 0, 1, ¶ms, ¶ms); + if (ret) + return ret; + pr_info("kTLS is disabled. Restrictions on ULD support removed\n"); + } + } - ret = t4_set_params(adap, adap->mbox, adap->pf, 0, 1, ¶ms, ¶ms); - /* if fw returns failure, clear the ktls flag */ - if (ret) - adap->params.crypto &= ~ULP_CRYPTO_KTLS_INLINE; + return ret; } #endif @@ -706,12 +748,6 @@ static void cxgb4_uld_alloc_resources(struct adapter *adap, } if (adap->flags & CXGB4_FULL_INIT_DONE) enable_rx_uld(adap, type); -#ifdef CONFIG_CHELSIO_TLS_DEVICE - /* send mbox to enable ktls related settings. */ - if (type == CXGB4_ULD_CRYPTO && - (adap->params.crypto & FW_CAPS_CONFIG_TX_TLS_HW)) - cxgb4_set_ktls_feature(adap, 1); -#endif if (adap->uld[type].add) goto free_irq; ret = setup_sge_txq_uld(adap, type, p); @@ -805,13 +841,6 @@ int cxgb4_unregister_uld(enum cxgb4_uld type) continue; cxgb4_shutdown_uld_adapter(adap, type); - -#ifdef CONFIG_CHELSIO_TLS_DEVICE - /* send mbox to disable ktls related settings. */ - if (type == CXGB4_ULD_CRYPTO && - (adap->params.crypto & FW_CAPS_CONFIG_TX_TLS_HW)) - cxgb4_set_ktls_feature(adap, 0); -#endif } list_for_each_entry_safe(uld_entry, tmp, &uld_list, list_node) { diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index 085fa1424f9a..dbce99b209d6 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -268,6 +268,10 @@ struct filter_ctx { u32 tid; /* to store tid */ }; +struct chcr_ktls { + refcount_t ktls_refcount; +}; + struct ch_filter_specification; int cxgb4_get_free_ftid(struct net_device *dev, u8 family, bool hash_en, @@ -464,6 +468,9 @@ struct cxgb4_uld_info { struct napi_struct *napi); void (*lro_flush)(struct t4_lro_mgr *); int (*tx_handler)(struct sk_buff *skb, struct net_device *dev); +#if IS_ENABLED(CONFIG_TLS_DEVICE) + const struct tlsdev_ops *tlsdev_ops; +#endif }; void cxgb4_uld_enable(struct adapter *adap); diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h index 68fe734b9b37..0a326c054707 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h @@ -1205,7 +1205,7 @@ enum fw_caps_config_crypto { FW_CAPS_CONFIG_CRYPTO_LOOKASIDE = 0x00000001, FW_CAPS_CONFIG_TLS_INLINE = 0x00000002, FW_CAPS_CONFIG_IPSEC_INLINE = 0x00000004, - FW_CAPS_CONFIG_TX_TLS_HW = 0x00000008, + FW_CAPS_CONFIG_TLS_HW = 0x00000008, }; enum fw_caps_config_fcoe { @@ -1329,7 +1329,7 @@ enum fw_params_param_dev { FW_PARAMS_PARAM_DEV_DBQ_TIMERTICK = 0x2A, FW_PARAMS_PARAM_DEV_NUM_TM_CLASS = 0x2B, FW_PARAMS_PARAM_DEV_FILTER = 0x2E, - FW_PARAMS_PARAM_DEV_KTLS_TX_HW = 0x31, + FW_PARAMS_PARAM_DEV_KTLS_HW = 0x31, }; /* @@ -1412,6 +1412,12 @@ enum fw_params_param_dmaq { FW_PARAMS_PARAM_DMAQ_CONM_CTXT = 0x20, }; +enum fw_params_param_dev_ktls_hw { + FW_PARAMS_PARAM_DEV_KTLS_HW_DISABLE = 0x00, + FW_PARAMS_PARAM_DEV_KTLS_HW_ENABLE = 0x01, + FW_PARAMS_PARAM_DEV_KTLS_HW_USER_ENABLE = 0x01, +}; + enum fw_params_param_dev_phyfw { FW_PARAMS_PARAM_DEV_PHYFW_DOWNLOAD = 0x00, FW_PARAMS_PARAM_DEV_PHYFW_VERSION = 0x01, -- cgit v1.2.3-59-g8ed1b From 76d7728db724466490c2c3dd4f84c3357f550615 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 1 Jun 2020 19:33:32 +0530 Subject: crypto/chcr: IPV6 code needs to be in CONFIG_IPV6 Error messages seen while building kernel with CONFIG_IPV6 disabled. Signed-off-by: Rohit Maheshwari Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chcr_ktls.c | 48 ++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_ktls.c b/drivers/crypto/chelsio/chcr_ktls.c index f55b87152166..91dee616d15e 100644 --- a/drivers/crypto/chelsio/chcr_ktls.c +++ b/drivers/crypto/chelsio/chcr_ktls.c @@ -221,6 +221,7 @@ static int chcr_ktls_act_open_req(struct sock *sk, return cxgb4_l2t_send(tx_info->netdev, skb, tx_info->l2te); } +#if IS_ENABLED(CONFIG_IPV6) /* * chcr_ktls_act_open_req6: creates TCB entry for ipv6 connection. * @sk - tcp socket. @@ -270,6 +271,7 @@ static int chcr_ktls_act_open_req6(struct sock *sk, return cxgb4_l2t_send(tx_info->netdev, skb, tx_info->l2te); } +#endif /* #if IS_ENABLED(CONFIG_IPV6) */ /* * chcr_setup_connection: create a TCB entry so that TP will form tcp packets. @@ -290,20 +292,26 @@ static int chcr_setup_connection(struct sock *sk, tx_info->atid = atid; tx_info->ip_family = sk->sk_family; - if (sk->sk_family == AF_INET || - (sk->sk_family == AF_INET6 && !sk->sk_ipv6only && - ipv6_addr_type(&sk->sk_v6_daddr) == IPV6_ADDR_MAPPED)) { + if (sk->sk_family == AF_INET) { tx_info->ip_family = AF_INET; ret = chcr_ktls_act_open_req(sk, tx_info, atid); +#if IS_ENABLED(CONFIG_IPV6) } else { - tx_info->ip_family = AF_INET6; - ret = - cxgb4_clip_get(tx_info->netdev, - (const u32 *)&sk->sk_v6_rcv_saddr.in6_u.u6_addr8, - 1); - if (ret) - goto out; - ret = chcr_ktls_act_open_req6(sk, tx_info, atid); + if (!sk->sk_ipv6only && + ipv6_addr_type(&sk->sk_v6_daddr) == IPV6_ADDR_MAPPED) { + tx_info->ip_family = AF_INET; + ret = chcr_ktls_act_open_req(sk, tx_info, atid); + } else { + tx_info->ip_family = AF_INET6; + ret = cxgb4_clip_get(tx_info->netdev, + (const u32 *) + &sk->sk_v6_rcv_saddr.s6_addr, + 1); + if (ret) + goto out; + ret = chcr_ktls_act_open_req6(sk, tx_info, atid); + } +#endif } /* if return type is NET_XMIT_CN, msg will be sent but delayed, mark ret @@ -394,11 +402,13 @@ void chcr_ktls_dev_del(struct net_device *netdev, if (tx_info->l2te) cxgb4_l2t_release(tx_info->l2te); +#if IS_ENABLED(CONFIG_IPV6) /* clear clip entry */ if (tx_info->ip_family == AF_INET6) cxgb4_clip_release(netdev, (const u32 *)&sk->sk_v6_daddr.in6_u.u6_addr8, 1); +#endif /* clear tid */ if (tx_info->tid != -1) { @@ -491,12 +501,16 @@ int chcr_ktls_dev_add(struct net_device *netdev, struct sock *sk, goto out2; /* get peer ip */ - if (sk->sk_family == AF_INET || - (sk->sk_family == AF_INET6 && !sk->sk_ipv6only && - ipv6_addr_type(&sk->sk_v6_daddr) == IPV6_ADDR_MAPPED)) { + if (sk->sk_family == AF_INET) { memcpy(daaddr, &sk->sk_daddr, 4); +#if IS_ENABLED(CONFIG_IPV6) } else { - memcpy(daaddr, sk->sk_v6_daddr.in6_u.u6_addr8, 16); + if (!sk->sk_ipv6only && + ipv6_addr_type(&sk->sk_v6_daddr) == IPV6_ADDR_MAPPED) + memcpy(daaddr, &sk->sk_daddr, 4); + else + memcpy(daaddr, sk->sk_v6_daddr.in6_u.u6_addr8, 16); +#endif } /* get the l2t index */ @@ -903,7 +917,9 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct fw_eth_tx_pkt_wr *wr; struct cpl_tx_pkt_core *cpl; u32 ctrl, iplen, maclen; +#if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *ip6; +#endif unsigned int ndesc; struct tcphdr *tcp; int len16, pktlen; @@ -958,9 +974,11 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, /* we need to correct ip header len */ ip = (struct iphdr *)(buf + maclen); ip->tot_len = htons(pktlen - maclen); +#if IS_ENABLED(CONFIG_IPV6) } else { ip6 = (struct ipv6hdr *)(buf + maclen); ip6->payload_len = htons(pktlen - maclen - iplen); +#endif } /* now take care of the tcp header, if fin is not set then clear push * bit as well, and if fin is set, it will be sent at the last so we -- cgit v1.2.3-59-g8ed1b From f3b140ad8575b74c78be50f34079e1cf43b0302d Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Mon, 1 Jun 2020 23:11:58 +0530 Subject: Crypto/chcr: Fixes compilations warnings This patch fixes the compilation warnings displayed by sparse tool for chcr driver. V1->V2 Avoid type casting by using get_unaligned_be32() and put_unaligned_be16/32() functions. The key which comes from stack is an u8 byte stream so we store it in an unsigned char array(ablkctx->key). The function get_aes_decrypt_key() is a used to calculate the reverse round key for decryption, for this operation the key has to be divided into 4 bytes, so to extract 4 bytes from an u8 byte stream and store it in an u32 variable, get_aligned_be32() is used. Similarly for copying back the key from u32 variable to the original u8 key stream, put_aligned_be32() is used. Signed-off-by: Ayush Sawal Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chcr_algo.c | 10 ++++------ drivers/crypto/chelsio/chcr_ipsec.c | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index b8c1c4dd3ef0..94cf04e5aacf 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -256,7 +256,7 @@ static void get_aes_decrypt_key(unsigned char *dec_key, return; } for (i = 0; i < nk; i++) - w_ring[i] = be32_to_cpu(*(u32 *)&key[4 * i]); + w_ring[i] = get_unaligned_be32(&key[i * 4]); i = 0; temp = w_ring[nk - 1]; @@ -275,7 +275,7 @@ static void get_aes_decrypt_key(unsigned char *dec_key, } i--; for (k = 0, j = i % nk; k < nk; k++) { - *((u32 *)dec_key + k) = htonl(w_ring[j]); + put_unaligned_be32(w_ring[j], &dec_key[k * 4]); j--; if (j < 0) j += nk; @@ -2926,8 +2926,7 @@ static int ccm_format_packet(struct aead_request *req, memcpy(ivptr, req->iv, 16); } if (assoclen) - *((unsigned short *)(reqctx->scratch_pad + 16)) = - htons(assoclen); + put_unaligned_be16(assoclen, &reqctx->scratch_pad[16]); rc = generate_b0(req, ivptr, op_type); /* zero the ctr value */ @@ -3201,8 +3200,7 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req, } else { memcpy(ivptr, req->iv, GCM_AES_IV_SIZE); } - *((unsigned int *)(ivptr + 12)) = htonl(0x01); - + put_unaligned_be32(0x01, &ivptr[12]); ulptx = (struct ulptx_sgl *)(ivptr + 16); chcr_add_aead_dst_ent(req, phys_cpl, qid); diff --git a/drivers/crypto/chelsio/chcr_ipsec.c b/drivers/crypto/chelsio/chcr_ipsec.c index d25689837b26..3a10f51ad6fd 100644 --- a/drivers/crypto/chelsio/chcr_ipsec.c +++ b/drivers/crypto/chelsio/chcr_ipsec.c @@ -403,7 +403,7 @@ inline void *copy_esn_pktxt(struct sk_buff *skb, xo = xfrm_offload(skb); aadiv->spi = (esphdr->spi); - seqlo = htonl(esphdr->seq_no); + seqlo = ntohl(esphdr->seq_no); seqno = cpu_to_be64(seqlo + ((u64)xo->seq.hi << 32)); memcpy(aadiv->seq_no, &seqno, 8); iv = skb_transport_header(skb) + sizeof(struct ip_esp_hdr); -- cgit v1.2.3-59-g8ed1b From 055be6865dea6743b090d1c55c8d21a5e01df201 Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Mon, 1 Jun 2020 23:11:59 +0530 Subject: Crypto/chcr: Fixes a coccinile check error This fixes an error observed after running coccinile check. drivers/crypto/chelsio/chcr_algo.c:1462:5-8: Unneeded variable: "err". Return "0" on line 1480 This line is missed in the commit 567be3a5d227 ("crypto: chelsio - Use multiple txq/rxq per tfm to process the requests"). Fixes: 567be3a5d227 ("crypto: chelsio - Use multiple txq/rxq per tfm to process the requests"). V1->V2 -Modified subject. Signed-off-by: Ayush Sawal Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chcr_algo.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 94cf04e5aacf..2080b2ec6639 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -1464,6 +1464,7 @@ static int chcr_device_init(struct chcr_context *ctx) if (!ctx->dev) { u_ctx = assign_chcr_device(); if (!u_ctx) { + err = -ENXIO; pr_err("chcr device assignment fails\n"); goto out; } -- cgit v1.2.3-59-g8ed1b From 6abde0b241224347cd88e2ae75902e07f55c42cb Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Tue, 2 Jun 2020 00:07:05 +0530 Subject: crypto/chtls: IPv6 support for inline TLS Extends support to IPv6 for Inline TLS server. Signed-off-by: Vinay Kumar Yadav v1->v2: - cc'd tcp folks. v2->v3: - changed EXPORT_SYMBOL() to EXPORT_SYMBOL_GPL() Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chtls/chtls_cm.c | 195 ++++++++++++++++++++++++------ drivers/crypto/chelsio/chtls/chtls_cm.h | 1 + drivers/crypto/chelsio/chtls/chtls_main.c | 14 ++- net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 168 insertions(+), 43 deletions(-) diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c index d5720a859443..9a642c79a657 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.c +++ b/drivers/crypto/chelsio/chtls/chtls_cm.c @@ -18,13 +18,20 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include +#include +#include #include "chtls.h" #include "chtls_cm.h" +#include "clip_tbl.h" /* * State transitions and actions for close. Note that if we are in SYN_SENT @@ -82,15 +89,36 @@ static void chtls_sock_release(struct kref *ref) kfree(csk); } -static struct net_device *chtls_ipv4_netdev(struct chtls_dev *cdev, +static struct net_device *chtls_find_netdev(struct chtls_dev *cdev, struct sock *sk) { struct net_device *ndev = cdev->ports[0]; + struct net_device *temp; + int addr_type; + + switch (sk->sk_family) { + case PF_INET: + if (likely(!inet_sk(sk)->inet_rcv_saddr)) + return ndev; + ndev = ip_dev_find(&init_net, inet_sk(sk)->inet_rcv_saddr); + break; + case PF_INET6: + addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + if (likely(addr_type == IPV6_ADDR_ANY)) + return ndev; + + for_each_netdev_rcu(&init_net, temp) { + if (ipv6_chk_addr(&init_net, (struct in6_addr *) + &sk->sk_v6_rcv_saddr, temp, 1)) { + ndev = temp; + break; + } + } + break; + default: + return NULL; + } - if (likely(!inet_sk(sk)->inet_rcv_saddr)) - return ndev; - - ndev = ip_dev_find(&init_net, inet_sk(sk)->inet_rcv_saddr); if (!ndev) return NULL; @@ -446,7 +474,10 @@ void chtls_destroy_sock(struct sock *sk) free_tls_keyid(sk); kref_put(&csk->kref, chtls_sock_release); csk->cdev = NULL; - sk->sk_prot = &tcp_prot; + if (sk->sk_family == AF_INET) + sk->sk_prot = &tcp_prot; + else + sk->sk_prot = &tcpv6_prot; sk->sk_prot->destroy(sk); } @@ -473,7 +504,8 @@ static void chtls_disconnect_acceptq(struct sock *listen_sk) while (*pprev) { struct request_sock *req = *pprev; - if (req->rsk_ops == &chtls_rsk_ops) { + if (req->rsk_ops == &chtls_rsk_ops || + req->rsk_ops == &chtls_rsk_opsv6) { struct sock *child = req->sk; *pprev = req->dl_next; @@ -600,14 +632,13 @@ int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) struct listen_ctx *ctx; struct adapter *adap; struct port_info *pi; + bool clip_valid; int stid; int ret; - if (sk->sk_family != PF_INET) - return -EAGAIN; - + clip_valid = false; rcu_read_lock(); - ndev = chtls_ipv4_netdev(cdev, sk); + ndev = chtls_find_netdev(cdev, sk); rcu_read_unlock(); if (!ndev) return -EBADF; @@ -638,16 +669,35 @@ int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) if (!listen_hash_add(cdev, sk, stid)) goto free_stid; - ret = cxgb4_create_server(ndev, stid, - inet_sk(sk)->inet_rcv_saddr, - inet_sk(sk)->inet_sport, 0, - cdev->lldi->rxq_ids[0]); + if (sk->sk_family == PF_INET) { + ret = cxgb4_create_server(ndev, stid, + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_sport, 0, + cdev->lldi->rxq_ids[0]); + } else { + int addr_type; + + addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + if (addr_type != IPV6_ADDR_ANY) { + ret = cxgb4_clip_get(ndev, (const u32 *) + &sk->sk_v6_rcv_saddr, 1); + if (ret) + goto del_hash; + clip_valid = true; + } + ret = cxgb4_create_server6(ndev, stid, + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_sport, + cdev->lldi->rxq_ids[0]); + } if (ret > 0) ret = net_xmit_errno(ret); if (ret) goto del_hash; return 0; del_hash: + if (clip_valid) + cxgb4_clip_release(ndev, (const u32 *)&sk->sk_v6_rcv_saddr, 1); listen_hash_del(cdev, sk); free_stid: cxgb4_free_stid(cdev->tids, stid, sk->sk_family); @@ -661,6 +711,8 @@ free_ctx: void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) { struct listen_ctx *listen_ctx; + struct chtls_sock *csk; + int addr_type = 0; int stid; stid = listen_hash_del(cdev, sk); @@ -671,7 +723,16 @@ void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) chtls_reset_synq(listen_ctx); cxgb4_remove_server(cdev->lldi->ports[0], stid, - cdev->lldi->rxq_ids[0], 0); + cdev->lldi->rxq_ids[0], sk->sk_family == PF_INET6); + + if (sk->sk_family == PF_INET6) { + csk = rcu_dereference_sk_user_data(sk); + addr_type = ipv6_addr_type((const struct in6_addr *) + &sk->sk_v6_rcv_saddr); + if (addr_type != IPV6_ADDR_ANY) + cxgb4_clip_release(csk->egress_dev, (const u32 *) + &sk->sk_v6_rcv_saddr, 1); + } chtls_disconnect_acceptq(sk); } @@ -880,7 +941,10 @@ static unsigned int chtls_select_mss(const struct chtls_sock *csk, tp = tcp_sk(sk); tcpoptsz = 0; - iphdrsz = sizeof(struct iphdr) + sizeof(struct tcphdr); + if (sk->sk_family == AF_INET6) + iphdrsz = sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + else + iphdrsz = sizeof(struct iphdr) + sizeof(struct tcphdr); if (req->tcpopt.tstamp) tcpoptsz += round_up(TCPOLEN_TIMESTAMP, 4); @@ -1045,11 +1109,29 @@ static struct sock *chtls_recv_sock(struct sock *lsk, if (!newsk) goto free_oreq; - dst = inet_csk_route_child_sock(lsk, newsk, oreq); - if (!dst) - goto free_sk; + if (lsk->sk_family == AF_INET) { + dst = inet_csk_route_child_sock(lsk, newsk, oreq); + if (!dst) + goto free_sk; - n = dst_neigh_lookup(dst, &iph->saddr); + n = dst_neigh_lookup(dst, &iph->saddr); + } else { + const struct ipv6hdr *ip6h; + struct flowi6 fl6; + + ip6h = (const struct ipv6hdr *)network_hdr; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = ip6h->daddr; + fl6.daddr = ip6h->saddr; + fl6.fl6_dport = inet_rsk(oreq)->ir_rmt_port; + fl6.fl6_sport = htons(inet_rsk(oreq)->ir_num); + security_req_classify_flow(oreq, flowi6_to_flowi(&fl6)); + dst = ip6_dst_lookup_flow(sock_net(lsk), lsk, &fl6, NULL); + if (IS_ERR(dst)) + goto free_sk; + n = dst_neigh_lookup(dst, &ip6h->saddr); + } if (!n) goto free_sk; @@ -1072,9 +1154,28 @@ static struct sock *chtls_recv_sock(struct sock *lsk, tp = tcp_sk(newsk); newinet = inet_sk(newsk); - newinet->inet_daddr = iph->saddr; - newinet->inet_rcv_saddr = iph->daddr; - newinet->inet_saddr = iph->daddr; + if (iph->version == 0x4) { + newinet->inet_daddr = iph->saddr; + newinet->inet_rcv_saddr = iph->daddr; + newinet->inet_saddr = iph->daddr; + } else { + struct tcp6_sock *newtcp6sk = (struct tcp6_sock *)newsk; + struct inet_request_sock *treq = inet_rsk(oreq); + struct ipv6_pinfo *newnp = inet6_sk(newsk); + struct ipv6_pinfo *np = inet6_sk(lsk); + + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + newsk->sk_v6_daddr = treq->ir_v6_rmt_addr; + newsk->sk_v6_rcv_saddr = treq->ir_v6_loc_addr; + inet6_sk(newsk)->saddr = treq->ir_v6_loc_addr; + newnp->ipv6_fl_list = NULL; + newnp->pktoptions = NULL; + newsk->sk_bound_dev_if = treq->ir_iif; + newinet->inet_opt = NULL; + newinet->inet_daddr = LOOPBACK4_IPV6; + newinet->inet_saddr = LOOPBACK4_IPV6; + } oreq->ts_recent = PASS_OPEN_TID_G(ntohl(req->tos_stid)); sk_setup_caps(newsk, dst); @@ -1156,6 +1257,7 @@ static void chtls_pass_accept_request(struct sock *sk, struct sk_buff *reply_skb; struct chtls_sock *csk; struct chtls_dev *cdev; + struct ipv6hdr *ip6h; struct tcphdr *tcph; struct sock *newsk; struct ethhdr *eh; @@ -1196,37 +1298,50 @@ static void chtls_pass_accept_request(struct sock *sk, if (sk_acceptq_is_full(sk)) goto reject; - oreq = inet_reqsk_alloc(&chtls_rsk_ops, sk, true); - if (!oreq) - goto reject; - - oreq->rsk_rcv_wnd = 0; - oreq->rsk_window_clamp = 0; - oreq->cookie_ts = 0; - oreq->mss = 0; - oreq->ts_recent = 0; eth_hdr_len = T6_ETH_HDR_LEN_G(ntohl(req->hdr_len)); if (eth_hdr_len == ETH_HLEN) { eh = (struct ethhdr *)(req + 1); iph = (struct iphdr *)(eh + 1); + ip6h = (struct ipv6hdr *)(eh + 1); network_hdr = (void *)(eh + 1); } else { vlan_eh = (struct vlan_ethhdr *)(req + 1); iph = (struct iphdr *)(vlan_eh + 1); + ip6h = (struct ipv6hdr *)(vlan_eh + 1); network_hdr = (void *)(vlan_eh + 1); } - if (iph->version != 0x4) - goto free_oreq; - tcph = (struct tcphdr *)(iph + 1); - skb_set_network_header(skb, (void *)iph - (void *)req); + if (iph->version == 0x4) { + tcph = (struct tcphdr *)(iph + 1); + skb_set_network_header(skb, (void *)iph - (void *)req); + oreq = inet_reqsk_alloc(&chtls_rsk_ops, sk, true); + } else { + tcph = (struct tcphdr *)(ip6h + 1); + skb_set_network_header(skb, (void *)ip6h - (void *)req); + oreq = inet_reqsk_alloc(&chtls_rsk_opsv6, sk, false); + } + + if (!oreq) + goto reject; + + oreq->rsk_rcv_wnd = 0; + oreq->rsk_window_clamp = 0; + oreq->cookie_ts = 0; + oreq->mss = 0; + oreq->ts_recent = 0; tcp_rsk(oreq)->tfo_listener = false; tcp_rsk(oreq)->rcv_isn = ntohl(tcph->seq); chtls_set_req_port(oreq, tcph->source, tcph->dest); - chtls_set_req_addr(oreq, iph->daddr, iph->saddr); - ip_dsfield = ipv4_get_dsfield(iph); + if (iph->version == 0x4) { + chtls_set_req_addr(oreq, iph->daddr, iph->saddr); + ip_dsfield = ipv4_get_dsfield(iph); + } else { + inet_rsk(oreq)->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + inet_rsk(oreq)->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ip_dsfield = ipv6_get_dsfield(ipv6_hdr(skb)); + } if (req->tcpopt.wsf <= 14 && sock_net(sk)->ipv4.sysctl_tcp_window_scaling) { inet_rsk(oreq)->wscale_ok = 1; @@ -1243,7 +1358,7 @@ static void chtls_pass_accept_request(struct sock *sk, newsk = chtls_recv_sock(sk, oreq, network_hdr, req, cdev); if (!newsk) - goto reject; + goto free_oreq; if (chtls_get_module(newsk)) goto reject; diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.h b/drivers/crypto/chelsio/chtls/chtls_cm.h index 3fac0c74a41f..47ba81e42f5d 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.h +++ b/drivers/crypto/chelsio/chtls/chtls_cm.h @@ -79,6 +79,7 @@ enum { typedef void (*defer_handler_t)(struct chtls_dev *dev, struct sk_buff *skb); extern struct request_sock_ops chtls_rsk_ops; +extern struct request_sock_ops chtls_rsk_opsv6; struct deferred_skb_cb { defer_handler_t handler; diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c index 2110d0893bc7..7dfffdde9593 100644 --- a/drivers/crypto/chelsio/chtls/chtls_main.c +++ b/drivers/crypto/chelsio/chtls/chtls_main.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -30,8 +32,8 @@ static DEFINE_MUTEX(cdev_mutex); static DEFINE_MUTEX(notify_mutex); static RAW_NOTIFIER_HEAD(listen_notify_list); -static struct proto chtls_cpl_prot; -struct request_sock_ops chtls_rsk_ops; +static struct proto chtls_cpl_prot, chtls_cpl_protv6; +struct request_sock_ops chtls_rsk_ops, chtls_rsk_opsv6; static uint send_page_order = (14 - PAGE_SHIFT < 0) ? 0 : 14 - PAGE_SHIFT; static void register_listen_notifier(struct notifier_block *nb) @@ -586,7 +588,10 @@ static struct cxgb4_uld_info chtls_uld_info = { void chtls_install_cpl_ops(struct sock *sk) { - sk->sk_prot = &chtls_cpl_prot; + if (sk->sk_family == AF_INET) + sk->sk_prot = &chtls_cpl_prot; + else + sk->sk_prot = &chtls_cpl_protv6; } static void __init chtls_init_ulp_ops(void) @@ -603,6 +608,9 @@ static void __init chtls_init_ulp_ops(void) chtls_cpl_prot.recvmsg = chtls_recvmsg; chtls_cpl_prot.setsockopt = chtls_setsockopt; chtls_cpl_prot.getsockopt = chtls_getsockopt; + chtls_cpl_protv6 = chtls_cpl_prot; + chtls_init_rsk_ops(&chtls_cpl_protv6, &chtls_rsk_opsv6, + &tcpv6_prot, PF_INET6); } static int __init chtls_register(void) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b7415ca75c2d..f67d45ff00b4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = { #endif .diag_destroy = tcp_abort, }; +EXPORT_SYMBOL_GPL(tcpv6_prot); /* thinking of making this const? Don't. * early_demux can change based on sysctl. -- cgit v1.2.3-59-g8ed1b From efd7ed0f5f2d07ccbb1853c5d46656cdfa1371fb Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 1 Jun 2020 19:45:52 +0100 Subject: sfc: add missing annotation for efx_ef10_try_update_nic_stats_vf() Sparse reports a warning at efx_ef10_try_update_nic_stats_vf() warning: context imbalance in efx_ef10_try_update_nic_stats_vf() - unexpected unlock The root cause is the missing annotation at efx_ef10_try_update_nic_stats_vf() Add the missing _must_hold(&efx->stats_lock) annotation Signed-off-by: Jules Irenge Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 964c5e842cec..4b0e3695a71a 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -1819,6 +1819,7 @@ static size_t efx_ef10_update_stats_pf(struct efx_nic *efx, u64 *full_stats, } static int efx_ef10_try_update_nic_stats_vf(struct efx_nic *efx) + __must_hold(&efx->stats_lock) { MCDI_DECLARE_BUF(inbuf, MC_CMD_MAC_STATS_IN_LEN); struct efx_ef10_nic_data *nic_data = efx->nic_data; -- cgit v1.2.3-59-g8ed1b From 836e66c218f355ec01ba57671c85abf32961dcea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:32 +0200 Subject: bpf: Fix up bpf_skb_adjust_room helper's skb csum setting Lorenz recently reported: In our TC classifier cls_redirect [0], we use the following sequence of helper calls to decapsulate a GUE (basically IP + UDP + custom header) encapsulated packet: bpf_skb_adjust_room(skb, -encap_len, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_FIXED_GSO) bpf_redirect(skb->ifindex, BPF_F_INGRESS) It seems like some checksums of the inner headers are not validated in this case. For example, a TCP SYN packet with invalid TCP checksum is still accepted by the network stack and elicits a SYN ACK. [...] That is, we receive the following packet from the driver: | ETH | IP | UDP | GUE | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY ip_summed is CHECKSUM_UNNECESSARY because our NICs do rx checksum offloading. On this packet we run skb_adjust_room_mac(-encap_len), and get the following: | ETH | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY Note that ip_summed is still CHECKSUM_UNNECESSARY. After bpf_redirect()'ing into the ingress, we end up in tcp_v4_rcv(). There, skb_checksum_init() is turned into a no-op due to CHECKSUM_UNNECESSARY. The bpf_skb_adjust_room() helper is not aware of protocol specifics. Internally, it handles the CHECKSUM_COMPLETE case via skb_postpull_rcsum(), but that does not cover CHECKSUM_UNNECESSARY. In this case skb->csum_level of the original skb prior to bpf_skb_adjust_room() call was 0, that is, covering UDP. Right now there is no way to adjust the skb->csum_level. NICs that have checksum offload disabled (CHECKSUM_NONE) or that support CHECKSUM_COMPLETE are not affected. Use a safe default for CHECKSUM_UNNECESSARY by resetting to CHECKSUM_NONE and add a flag to the helper called BPF_F_ADJ_ROOM_NO_CSUM_RESET that allows users from opting out. Opting out is useful for the case where we don't remove/add full protocol headers, or for the case where a user wants to adjust the csum level manually e.g. through bpf_csum_level() helper that is added in subsequent patch. The bpf_skb_proto_{4_to_6,6_to_4}() for NAT64/46 translation from the BPF bpf_skb_change_proto() helper uses bpf_skb_net_hdr_{push,pop}() pair internally as well but doesn't change layers, only transitions between v4 to v6 and vice versa, therefore no adoption is required there. [0] https://lore.kernel.org/bpf/20200424185556.7358-1-lmb@cloudflare.com/ Fixes: 2be7e212d541 ("bpf: add bpf_skb_adjust_room helper") Reported-by: Lorenz Bauer Reported-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/CACAyw9-uU_52esMd1JjuA80fRPHJv5vsSg8GnfW3t_qDU4aVKQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/11a90472e7cce83e76ddbfce81fdfce7bfc68808.1591108731.git.daniel@iogearbox.net --- include/linux/skbuff.h | 8 ++++++++ include/uapi/linux/bpf.h | 8 ++++++++ net/core/filter.c | 8 ++++++-- tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a0d5c2760103..0c0377fc00c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3919,6 +3919,14 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + skb->ip_summed = CHECKSUM_NONE; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ae82bcb03124..278dcc0af961 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3113,7 +3113,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; - if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3163,7 +3164,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3191,6 +3193,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { -- cgit v1.2.3-59-g8ed1b From 7cdec54f9713256bb170873a1fc5c75c9127c9d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:33 +0200 Subject: bpf: Add csum_level helper for fixing up csum levels Add a bpf_csum_level() helper which BPF programs can use in combination with bpf_skb_adjust_room() when they pass in BPF_F_ADJ_ROOM_NO_CSUM_RESET flag to the latter to avoid falling back to CHECKSUM_NONE. The bpf_csum_level() allows to adjust CHECKSUM_UNNECESSARY skb->csum_levels via BPF_CSUM_LEVEL_{INC,DEC} which calls __skb_{incr,decr}_checksum_unnecessary() on the skb. The helper also allows a BPF_CSUM_LEVEL_RESET which sets the skb's csum to CHECKSUM_NONE as well as a BPF_CSUM_LEVEL_QUERY to just return the current level. Without this helper, there is no way to otherwise adjust the skb->csum_level. I did not add an extra dummy flags as there is plenty of free bitspace in level argument itself iff ever needed in future. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/279ae3717cb3d03c0ffeb511493c93c450a01e1a.1591108731.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 38 +++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), diff --git a/net/core/filter.c b/net/core/filter.c index 278dcc0af961..d01a244b5087 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2015,6 +2015,40 @@ static const struct bpf_func_proto bpf_csum_update_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? + skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -6280,6 +6314,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6613,6 +6649,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), -- cgit v1.2.3-59-g8ed1b From c4ba153b6501fa7ccfdc7e57946fb1d6011e36e8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:34 +0200 Subject: bpf, selftests: Adapt cls_redirect to call csum_level helper Adapt bpf_skb_adjust_room() to pass in BPF_F_ADJ_ROOM_NO_CSUM_RESET flag and use the new bpf_csum_level() helper to inc/dec the checksum level by one after the encap/decap. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/e7458f10e3f3d795307cbc5ad870112671d9c6f7.1591108731.git.daniel@iogearbox.net --- tools/testing/selftests/bpf/progs/test_cls_redirect.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index 1668b993eb86..f0b72e86bee5 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -380,9 +380,10 @@ static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) } if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, - BPF_F_ADJ_ROOM_FIXED_GSO)) { + BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || + bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC)) return TC_ACT_SHOT; - } return bpf_redirect(skb->ifindex, BPF_F_INGRESS); } @@ -472,7 +473,9 @@ static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, } if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, - BPF_F_ADJ_ROOM_FIXED_GSO)) { + BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || + bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) { metrics->errors_total_encap_adjust_failed++; return TC_ACT_SHOT; } -- cgit v1.2.3-59-g8ed1b From 9a5f25ad30e5bb40a2e0c61c991594d3e6529c0a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Jun 2020 22:03:49 -0700 Subject: selftests/bpf: Fix sample_cnt shared between two threads Make sample_cnt volatile to fix possible selftests failure due to compiler optimization preventing latest sample_cnt value to be visible to main thread. sample_cnt is incremented in background thread, which is then joined into main thread. So in terms of visibility sample_cnt update is ok. But because it's not volatile, compiler might make optimizations that would prevent main thread to see latest updated value. Fix this by marking global variable volatile. Fixes: cb1c9ddd5525 ("selftests/bpf: Add BPF ringbuf selftests") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200602050349.215037-1-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index bb8541f240e2..2bba908dfa63 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -25,7 +25,7 @@ struct sample { char comm[16]; }; -static int sample_cnt; +static volatile int sample_cnt; static int process_sample(void *ctx, void *data, size_t len) { -- cgit v1.2.3-59-g8ed1b From 7cec0b927142f510a1fac88033017616cce44c26 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Jun 2020 11:57:43 -0700 Subject: selftests/bpf: Fix verifier test Adjust verifier test due to addition of new field. Fixes: c3c16f2ea6d2 ("bpf: Add rx_queue_mapping to bpf_sock") Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index 0bc51ad9e0fb..b1aac2641498 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -222,7 +222,7 @@ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, state)), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, rx_queue_mapping)), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, -- cgit v1.2.3-59-g8ed1b From effe5be17706167ee968fa28afe40dec9c6f71db Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jun 2020 19:43:39 +0200 Subject: s390/bpf: Maintain 8-byte stack alignment Certain kernel functions (e.g. get_vtimer/set_vtimer) cause kernel panic when the stack is not 8-byte aligned. Currently JITed BPF programs may trigger this by allocating stack frames with non-rounded sizes and then being interrupted. Fix by using rounded fp->aux->stack_depth. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200602174339.2501066-1-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 8d2134136290..0f37a1b635f8 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -594,7 +594,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) * stack space for the large switch statement. */ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, - int i, bool extra_pass) + int i, bool extra_pass, u32 stack_depth) { struct bpf_insn *insn = &fp->insnsi[i]; u32 dst_reg = insn->dst_reg; @@ -1207,7 +1207,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, */ if (jit->seen & SEEN_STACK) - off = STK_OFF_TCCNT + STK_OFF + fp->aux->stack_depth; + off = STK_OFF_TCCNT + STK_OFF + stack_depth; else off = STK_OFF_TCCNT; /* lhi %w0,1 */ @@ -1249,7 +1249,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* * Restore registers before calling function */ - save_restore_regs(jit, REGS_RESTORE, fp->aux->stack_depth); + save_restore_regs(jit, REGS_RESTORE, stack_depth); /* * goto *(prog->bpf_func + tail_call_start); @@ -1519,7 +1519,7 @@ static int bpf_set_addr(struct bpf_jit *jit, int i) * Compile eBPF program into s390x code */ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, - bool extra_pass) + bool extra_pass, u32 stack_depth) { int i, insn_count, lit32_size, lit64_size; @@ -1527,18 +1527,18 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, jit->lit64 = jit->lit64_start; jit->prg = 0; - bpf_jit_prologue(jit, fp->aux->stack_depth); + bpf_jit_prologue(jit, stack_depth); if (bpf_set_addr(jit, 0) < 0) return -1; for (i = 0; i < fp->len; i += insn_count) { - insn_count = bpf_jit_insn(jit, fp, i, extra_pass); + insn_count = bpf_jit_insn(jit, fp, i, extra_pass, stack_depth); if (insn_count < 0) return -1; /* Next instruction address */ if (bpf_set_addr(jit, i + insn_count) < 0) return -1; } - bpf_jit_epilogue(jit, fp->aux->stack_depth); + bpf_jit_epilogue(jit, stack_depth); lit32_size = jit->lit32 - jit->lit32_start; lit64_size = jit->lit64 - jit->lit64_start; @@ -1569,6 +1569,7 @@ struct s390_jit_data { */ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { + u32 stack_depth = round_up(fp->aux->stack_depth, 8); struct bpf_prog *tmp, *orig_fp = fp; struct bpf_binary_header *header; struct s390_jit_data *jit_data; @@ -1621,7 +1622,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) * - 3: Calculate program size and addrs arrray */ for (pass = 1; pass <= 3; pass++) { - if (bpf_jit_prog(&jit, fp, extra_pass)) { + if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { fp = orig_fp; goto free_addrs; } @@ -1635,7 +1636,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) goto free_addrs; } skip_init_ctx: - if (bpf_jit_prog(&jit, fp, extra_pass)) { + if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { bpf_jit_binary_free(header); fp = orig_fp; goto free_addrs; -- cgit v1.2.3-59-g8ed1b From 33d21f18204cb33b43ca6c78c8180949f6dc7227 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jun 2020 19:45:55 +0200 Subject: s390/bpf: Use bcr 0,%0 as tail call nop filler Currently used 0x0000 filler confuses bfd disassembler, making bpftool prog dump xlated output nearly useless. Fix by using a real instruction. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200602174555.2501389-1-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 0f37a1b635f8..f4242b894cf2 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -503,7 +503,8 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) } else { /* j tail_call_start: NOP if no tail calls are used */ EMIT4_PCREL(0xa7f40000, 6); - _EMIT2(0); + /* bcr 0,%0 */ + EMIT2(0x0700, 0, REG_0); } /* Tail calls have to skip above initialization */ jit->tail_call_start = jit->prg; -- cgit v1.2.3-59-g8ed1b From 9bc499befeef07a4d79f4924bfca05634ad8fc97 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jun 2020 19:44:48 +0200 Subject: bpf, selftests: Use bpf_probe_read_kernel Since commit 0ebeea8ca8a4 ("bpf: Restrict bpf_probe_read{, str}() only to archs where they work") 44 verifier tests fail on s390 due to not having bpf_probe_read anymore. Fix by using bpf_probe_read_kernel. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200602174448.2501214-1-iii@linux.ibm.com --- tools/testing/selftests/bpf/verifier/const_or.c | 8 ++-- .../selftests/bpf/verifier/helper_access_var_len.c | 44 +++++++++++----------- .../selftests/bpf/verifier/helper_value_access.c | 36 +++++++++--------- tools/testing/selftests/bpf/verifier/precise.c | 8 ++-- 4 files changed, 48 insertions(+), 48 deletions(-) diff --git a/tools/testing/selftests/bpf/verifier/const_or.c b/tools/testing/selftests/bpf/verifier/const_or.c index 84446dfc7c1d..6c214c58e8d4 100644 --- a/tools/testing/selftests/bpf/verifier/const_or.c +++ b/tools/testing/selftests/bpf/verifier/const_or.c @@ -6,7 +6,7 @@ BPF_MOV64_IMM(BPF_REG_2, 34), BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 13), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -20,7 +20,7 @@ BPF_MOV64_IMM(BPF_REG_2, 34), BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 24), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid stack type R1 off=-48 access_size=58", @@ -36,7 +36,7 @@ BPF_MOV64_IMM(BPF_REG_4, 13), BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -51,7 +51,7 @@ BPF_MOV64_IMM(BPF_REG_4, 24), BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid stack type R1 off=-48 access_size=58", diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 5a605ae131a9..87c4e7900083 100644 --- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -36,7 +36,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid indirect read from stack off -64+0 size 64", @@ -55,7 +55,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -84,7 +84,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -112,7 +112,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -132,7 +132,7 @@ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 3), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -152,7 +152,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -171,7 +171,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -190,7 +190,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -208,7 +208,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -233,7 +233,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -259,7 +259,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -286,7 +286,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -313,7 +313,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -468,7 +468,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "R1 type=inv expected=fp", @@ -481,7 +481,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "R1 type=inv expected=fp", @@ -495,7 +495,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -513,7 +513,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -534,7 +534,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -554,7 +554,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -580,7 +580,7 @@ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_EXIT_INSN(), }, @@ -607,7 +607,7 @@ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 32), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 32), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_EXIT_INSN(), }, diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c index 961f28139b96..1c7882ddfa63 100644 --- a/tools/testing/selftests/bpf/verifier/helper_value_access.c +++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c @@ -10,7 +10,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -29,7 +29,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -67,7 +67,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -87,7 +87,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -109,7 +109,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -129,7 +129,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -170,7 +170,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -191,7 +191,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -212,7 +212,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -235,7 +235,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -256,7 +256,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -300,7 +300,7 @@ sizeof(struct test_val) - offsetof(struct test_val, foo) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -322,7 +322,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -344,7 +344,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -368,7 +368,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -390,7 +390,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -433,7 +433,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -458,7 +458,7 @@ sizeof(struct test_val) - offsetof(struct test_val, foo) + 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 02151f8c940f..6dc8003ffc70 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -31,14 +31,14 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, .fixup_map_array_48b = { 1 }, .result = VERBOSE_ACCEPT, .errstr = - "26: (85) call bpf_probe_read#4\ + "26: (85) call bpf_probe_read_kernel#113\ last_idx 26 first_idx 20\ regs=4 stack=0 before 25\ regs=4 stack=0 before 24\ @@ -91,7 +91,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -99,7 +99,7 @@ .result = VERBOSE_ACCEPT, .flags = BPF_F_TEST_STATE_FREQ, .errstr = - "26: (85) call bpf_probe_read#4\ + "26: (85) call bpf_probe_read_kernel#113\ last_idx 26 first_idx 22\ regs=4 stack=0 before 25\ regs=4 stack=0 before 24\ -- cgit v1.2.3-59-g8ed1b From d70a6be1e2ab98f13688e4a529b326e8e11230d0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jun 2020 19:56:48 +0200 Subject: tools/bpf: Don't use $(COMPILE.c) When using make kselftest TARGETS=bpf, tools/bpf is built with MAKEFLAGS=rR, which causes $(COMPILE.c) to be undefined, which in turn causes the build to fail with CC kselftest/bpf/tools/build/bpftool/map_perf_ring.o /bin/sh: 1: -MMD: not found Fix by using $(CC) $(CFLAGS) -c instead of $(COMPILE.c). Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200602175649.2501580-2-iii@linux.ibm.com --- tools/bpf/Makefile | 6 +++--- tools/bpf/bpftool/Makefile | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index f897eeeb0b4f..77472e28c8fd 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -64,12 +64,12 @@ $(OUTPUT)%.lex.c: $(srctree)/tools/bpf/%.l $(QUIET_FLEX)$(LEX) -o $@ $< $(OUTPUT)%.o: $(srctree)/tools/bpf/%.c - $(QUIET_CC)$(COMPILE.c) -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $< $(OUTPUT)%.yacc.o: $(OUTPUT)%.yacc.c - $(QUIET_CC)$(COMPILE.c) -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $< $(OUTPUT)%.lex.o: $(OUTPUT)%.lex.c - $(QUIET_CC)$(COMPILE.c) -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $< PROGS = $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 2759f9cc3289..9e85f101be85 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -126,7 +126,7 @@ else endif $(OUTPUT)_prog.o: prog.c - $(QUIET_CC)$(COMPILE.c) -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $< $(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(_OBJS) $(LIBS) @@ -141,10 +141,10 @@ profiler.skel.h: $(OUTPUT)_bpftool skeleton/profiler.bpf.o $(QUIET_GEN)$(OUTPUT)./_bpftool gen skeleton skeleton/profiler.bpf.o > $@ $(OUTPUT)prog.o: prog.c profiler.skel.h - $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $< $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c - $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $< $(OUTPUT)feature.o: | zdep @@ -152,7 +152,7 @@ $(OUTPUT)bpftool: $(__OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(__OBJS) $(LIBS) $(OUTPUT)%.o: %.c - $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $< clean: $(LIBBPF)-clean $(call QUIET_CLEAN, bpftool) -- cgit v1.2.3-59-g8ed1b From e7ad28e6fdbffa2b9b1bd376431fb81a5403bcfd Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jun 2020 19:56:49 +0200 Subject: selftests/bpf: Add a default $(CXX) value When using make kselftest TARGETS=bpf, tools/bpf is built with MAKEFLAGS=rR, which causes $(CXX) to be undefined, which in turn causes the build to fail with CXX test_cpp /bin/sh: 2: g: not found Fix by adding a default $(CXX) value, like tools/build/feature/Makefile already does. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200602175649.2501580-3-iii@linux.ibm.com --- tools/testing/selftests/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3ce548eff8a8..22aaec74ea0a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -2,6 +2,8 @@ include ../../../../scripts/Kbuild.include include ../../../scripts/Makefile.arch +CXX ?= $(CROSS_COMPILE)g++ + CURDIR := $(abspath .) TOOLSDIR := $(abspath ../../..) LIBDIR := $(TOOLSDIR)/lib -- cgit v1.2.3-59-g8ed1b From 2eed5a8b614bc0197b29da7b21a78d2c564a7098 Mon Sep 17 00:00:00 2001 From: Luo bin Date: Tue, 2 Jun 2020 08:40:32 +0800 Subject: hinic: add set_channels ethtool_ops support add support to change TX/RX queue number with "ethtool -L combined". V5 -> V6: remove check for carrier in hinic_xmit_frame V4 -> V5: change time zone in patch header V3 -> V4: update date in patch header V2 -> V3: remove check for zero channels->combined_count V1 -> V2: update commit message("ethtool -L" to "ethtool -L combined") V0 -> V1: remove check for channels->tx_count/rx_count/other_count Signed-off-by: Luo bin Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_ethtool.c | 40 ++++++++++++++++++----- drivers/net/ethernet/huawei/hinic/hinic_main.c | 2 +- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c index ace18d258049..efb02e03e7da 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c @@ -619,14 +619,37 @@ static void hinic_get_channels(struct net_device *netdev, struct hinic_dev *nic_dev = netdev_priv(netdev); struct hinic_hwdev *hwdev = nic_dev->hwdev; - channels->max_rx = hwdev->nic_cap.max_qps; - channels->max_tx = hwdev->nic_cap.max_qps; - channels->max_other = 0; - channels->max_combined = 0; - channels->rx_count = hinic_hwdev_num_qps(hwdev); - channels->tx_count = hinic_hwdev_num_qps(hwdev); - channels->other_count = 0; - channels->combined_count = 0; + channels->max_combined = nic_dev->max_qps; + channels->combined_count = hinic_hwdev_num_qps(hwdev); +} + +static int hinic_set_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct hinic_dev *nic_dev = netdev_priv(netdev); + unsigned int count = channels->combined_count; + int err; + + netif_info(nic_dev, drv, netdev, "Set max combined queue number from %d to %d\n", + hinic_hwdev_num_qps(nic_dev->hwdev), count); + + if (netif_running(netdev)) { + netif_info(nic_dev, drv, netdev, "Restarting netdev\n"); + hinic_close(netdev); + + nic_dev->hwdev->nic_cap.num_qps = count; + + err = hinic_open(netdev); + if (err) { + netif_err(nic_dev, drv, netdev, + "Failed to open netdev\n"); + return -EFAULT; + } + } else { + nic_dev->hwdev->nic_cap.num_qps = count; + } + + return 0; } static int hinic_get_rss_hash_opts(struct hinic_dev *nic_dev, @@ -1219,6 +1242,7 @@ static const struct ethtool_ops hinic_ethtool_ops = { .get_ringparam = hinic_get_ringparam, .set_ringparam = hinic_set_ringparam, .get_channels = hinic_get_channels, + .set_channels = hinic_set_channels, .get_rxnfc = hinic_get_rxnfc, .set_rxnfc = hinic_set_rxnfc, .get_rxfh_key_size = hinic_get_rxfh_key_size, diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index c8ab129a7ae8..e9e6f4c9309a 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -326,7 +326,6 @@ static void hinic_enable_rss(struct hinic_dev *nic_dev) int i, node, err = 0; u16 num_cpus = 0; - nic_dev->max_qps = hinic_hwdev_max_num_qps(hwdev); if (nic_dev->max_qps <= 1) { nic_dev->flags &= ~HINIC_RSS_ENABLE; nic_dev->rss_limit = nic_dev->max_qps; @@ -1031,6 +1030,7 @@ static int nic_dev_init(struct pci_dev *pdev) nic_dev->rq_depth = HINIC_RQ_DEPTH; nic_dev->sriov_info.hwdev = hwdev; nic_dev->sriov_info.pdev = pdev; + nic_dev->max_qps = num_qps; sema_init(&nic_dev->mgmt_lock, 1); -- cgit v1.2.3-59-g8ed1b From 11e877b2a8cfd282a1b81f9d4c594b900889a5d8 Mon Sep 17 00:00:00 2001 From: Ronak Doshi Date: Mon, 1 Jun 2020 20:02:39 -0700 Subject: vmxnet3: allow rx flow hash ops only when rss is enabled It makes sense to allow changes to get/set rx flow hash callback only when rss is enabled. This patch restricts get_rss_hash_opts and set_rss_hash_opts methods to allow querying and configuring different Rx flow hash configurations only when rss is enabled Signed-off-by: Ronak Doshi Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_ethtool.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index 6acaafe169de..def27afa1c69 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -899,6 +899,12 @@ vmxnet3_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info, err = -EOPNOTSUPP; break; } +#ifdef VMXNET3_RSS + if (!adapter->rss) { + err = -EOPNOTSUPP; + break; + } +#endif err = vmxnet3_get_rss_hash_opts(adapter, info); break; default: @@ -919,6 +925,12 @@ vmxnet3_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *info) err = -EOPNOTSUPP; goto done; } +#ifdef VMXNET3_RSS + if (!adapter->rss) { + err = -EOPNOTSUPP; + goto done; + } +#endif switch (info->cmd) { case ETHTOOL_SRXFH: -- cgit v1.2.3-59-g8ed1b From 049fa17f7ae6b0971ad41b761479962facafea4b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 2 Jun 2020 11:46:40 +0700 Subject: Revert "tipc: Fix potential tipc_node refcnt leak in tipc_rcv" This reverts commit de058420767df21e2b6b0f3bb36d1616fb962032. There is no actual tipc_node refcnt leak as stated in the above commit. The refcnt is hold carefully for the case of an asynchronous decryption (i.e. -EINPROGRESS/-EBUSY and skb = NULL is returned), so that the node object cannot be freed in the meantime. The counter will be re-balanced when the operation's callback arrives with the decrypted buffer if any. In other cases, e.g. a synchronous crypto the counter will be decreased immediately when it is done. Now with that commit, a kernel panic will occur when there is no node found (i.e. n = NULL) in the 'tipc_rcv()' or a premature release of the node object. This commit solves the issues by reverting the said commit, but keeping one valid case that the 'skb_linearize()' is failed. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Tested-by: Hoang Le Signed-off-by: David S. Miller --- net/tipc/node.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index 0312fb181d94..a4c2816c3746 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2038,7 +2038,6 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) n = tipc_node_find_by_id(net, ehdr->id); } tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); - tipc_node_put(n); if (!skb) return; -- cgit v1.2.3-59-g8ed1b From a275727b1899d14d33d6c8c70f303b73f01cdbc6 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 2 Jun 2020 11:46:41 +0700 Subject: Revert "tipc: Fix potential tipc_aead refcnt leak in tipc_crypto_rcv" This reverts commit 441870ee4240cf67b5d3ab8e16216a9ff42eb5d6. Like the previous patch in this series, we revert the above commit that causes similar issues with the 'aead' object. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/crypto.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 8c47ded2edb6..c8c47fc72653 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1712,7 +1712,6 @@ exit: case -EBUSY: this_cpu_inc(stats->stat[STAT_ASYNC]); *skb = NULL; - tipc_aead_put(aead); return rc; default: this_cpu_inc(stats->stat[STAT_NOK]); -- cgit v1.2.3-59-g8ed1b From e8224bfe77293494626f6eec1884fee7b87d0ced Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 2 Jun 2020 15:55:26 +0300 Subject: net_failover: fixed rollback in net_failover_open() found by smatch: drivers/net/net_failover.c:65 net_failover_open() error: we previously assumed 'primary_dev' could be null (see line 43) Fixes: cfc80d9a1163 ("net: Introduce net_failover driver") Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- drivers/net/net_failover.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index b16a1221d19b..fb182bec8f06 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -61,7 +61,8 @@ static int net_failover_open(struct net_device *dev) return 0; err_standby_open: - dev_close(primary_dev); + if (primary_dev) + dev_close(primary_dev); err_primary_open: netif_tx_disable(dev); return err; -- cgit v1.2.3-59-g8ed1b From 065fcfd49763ec71ae345bb5c5a74f961031e70e Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 2 Jun 2020 15:38:37 -0300 Subject: selftests: net: ip_defrag: ignore EPERM When running with conntrack rules, the dropped overlap fragments may cause EPERM to be returned to sendto. Instead of completely failing, just ignore those errors and continue. If this causes packets with overlap fragments to be dropped as expected, that is okay. And if it causes packets that are expected to be received to be dropped, which should not happen, it will be detected as failure. Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: David S. Miller --- tools/testing/selftests/net/ip_defrag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/ip_defrag.c b/tools/testing/selftests/net/ip_defrag.c index c0c9ecb891e1..f9ed749fd8c7 100644 --- a/tools/testing/selftests/net/ip_defrag.c +++ b/tools/testing/selftests/net/ip_defrag.c @@ -192,9 +192,9 @@ static void send_fragment(int fd_raw, struct sockaddr *addr, socklen_t alen, } res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen); - if (res < 0) + if (res < 0 && errno != EPERM) error(1, errno, "send_fragment"); - if (res != frag_len) + if (res >= 0 && res != frag_len) error(1, 0, "send_fragment: %d vs %d", res, frag_len); frag_counter++; @@ -313,9 +313,9 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr, iphdr->ip_len = htons(frag_len); } res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen); - if (res < 0) + if (res < 0 && errno != EPERM) error(1, errno, "sendto overlap: %d", frag_len); - if (res != frag_len) + if (res >= 0 && res != frag_len) error(1, 0, "sendto overlap: %d vs %d", (int)res, frag_len); frag_counter++; } -- cgit v1.2.3-59-g8ed1b